wa-lang.org/wazero@v1.0.2/internal/engine/compiler/impl_amd64.go

     1  package compiler
     2  
     3  // This file implements the compiler for amd64/x86_64 target.
     4  // Please refer to https://www.felixcloutier.com/x86/index.html
     5  // if unfamiliar with amd64 instructions used here.
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"math"
    11  	"runtime"
    12  
    13  	"wa-lang.org/wazero/internal/asm"
    14  	"wa-lang.org/wazero/internal/asm/amd64"
    15  	"wa-lang.org/wazero/internal/platform"
    16  	"wa-lang.org/wazero/internal/u32"
    17  	"wa-lang.org/wazero/internal/u64"
    18  	"wa-lang.org/wazero/internal/wasm"
    19  	"wa-lang.org/wazero/internal/wazeroir"
    20  )
    21  
    22  var (
    23  	minimum32BitSignedInt                  int32  = math.MinInt32
    24  	maximum32BitSignedInt                  int32  = math.MaxInt32
    25  	maximum32BitUnsignedInt                uint32 = math.MaxUint32
    26  	minimum64BitSignedInt                  int64  = math.MinInt64
    27  	maximum64BitSignedInt                  int64  = math.MaxInt64
    28  	maximum64BitUnsignedInt                uint64 = math.MaxUint64
    29  	float32SignBitMask                     uint32 = 1 << 31
    30  	float32RestBitMask                            = ^float32SignBitMask
    31  	float64SignBitMask                     uint64 = 1 << 63
    32  	float64RestBitMask                            = ^float64SignBitMask
    33  	float32ForMinimumSigned32bitInteger           = uint32(0xCF00_0000)
    34  	float64ForMinimumSigned32bitInteger           = uint64(0xC1E0_0000_0020_0000)
    35  	float32ForMinimumSigned64bitInteger           = uint32(0xDF00_0000)
    36  	float64ForMinimumSigned64bitInteger           = uint64(0xC3E0_0000_0000_0000)
    37  	float32ForMaximumSigned32bitIntPlusOne        = uint32(0x4F00_0000)
    38  	float64ForMaximumSigned32bitIntPlusOne        = uint64(0x41E0_0000_0000_0000)
    39  	float32ForMaximumSigned64bitIntPlusOne        = uint32(0x5F00_0000)
    40  	float64ForMaximumSigned64bitIntPlusOne        = uint64(0x43E0_0000_0000_0000)
    41  )
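        // For reference, these are the raw IEEE 754 bit patterns of the bounds presumably used by
        // the float-to-integer truncation code later in this file. A quick sanity check (a sketch,
        // not used by the compiler):
        //
        //	math.Float32frombits(float32ForMinimumSigned32bitInteger) == float32(math.MinInt32)      // -2^31
        //	math.Float64frombits(float64ForMaximumSigned32bitIntPlusOne) == float64(math.MaxInt32)+1 // 2^31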
    42  
    43  var (
    44  	// amd64ReservedRegisterForCallEngine: pointer to callEngine (i.e. *callEngine as uintptr)
    45  	amd64ReservedRegisterForCallEngine = amd64.RegR13
    46  	// amd64ReservedRegisterForStackBasePointerAddress: stack base pointer's address (callEngine.stackBasePointer) in the current function call.
    47  	amd64ReservedRegisterForStackBasePointerAddress = amd64.RegR14
    48  	// amd64ReservedRegisterForMemory: pointer to the memory slice's data (i.e. &memory.Buffer[0] as uintptr).
    49  	amd64ReservedRegisterForMemory = amd64.RegR15
    50  )
    51  
    52  var (
    53  	amd64UnreservedVectorRegisters = []asm.Register{ // nolint
    54  		amd64.RegX0, amd64.RegX1, amd64.RegX2, amd64.RegX3,
    55  		amd64.RegX4, amd64.RegX5, amd64.RegX6, amd64.RegX7,
    56  		amd64.RegX8, amd64.RegX9, amd64.RegX10, amd64.RegX11,
    57  		amd64.RegX12, amd64.RegX13, amd64.RegX14, amd64.RegX15,
    58  	}
    59  	// Note that we never invoke the "call" instruction,
    60  	// so we don't need to care about the calling convention.
    61  	// TODO: it may be safe to just save rbp and rsp somewhere
    62  	// in Go-allocated variables, reuse these registers
    63  	// in compiled functions, and write them back before returning.
    64  	amd64UnreservedGeneralPurposeRegisters = []asm.Register{ // nolint
    65  		amd64.RegAX, amd64.RegCX, amd64.RegDX, amd64.RegBX,
    66  		amd64.RegSI, amd64.RegDI, amd64.RegR8, amd64.RegR9,
    67  		amd64.RegR10, amd64.RegR11, amd64.RegR12,
    68  	}
    69  )
    70  
    71  // amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister holds the address of the *wasm.ModuleInstance
    72  // of the function instance to be executed next. The value is set when making function calls or function returns,
    73  // and is used in the module context initialization. See compileModuleContextInitialization.
    74  var amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister = amd64.RegR12
    75  
    76  func (c *amd64Compiler) String() string {
    77  	return c.locationStack.String()
    78  }
    79  
    80  type amd64Compiler struct {
    81  	assembler amd64.Assembler
    82  	ir        *wazeroir.CompilationResult
    83  	// locationStack holds the state of the wazeroir virtual stack,
    84  	// where each item is placed either in a register or on the actual memory stack.
    85  	locationStack *runtimeValueLocationStack
    86  	// labels holds per-label information for the wazeroir labels in this function.
    87  	labels map[string]*amd64LabelInfo
    88  	// stackPointerCeil is the greatest stack pointer value (from runtimeValueLocationStack) seen during compilation.
    89  	stackPointerCeil uint64
    90  	// currentLabel holds a currently compiled wazeroir label key. For debugging only.
    91  	currentLabel string
    92  	// onStackPointerCeilDeterminedCallBack holds a callback which is called when the max stack pointer is determined BEFORE generating native code.
    93  	onStackPointerCeilDeterminedCallBack func(stackPointerCeil uint64)
    94  	withListener                         bool
    95  }
    96  
    97  func newAmd64Compiler(ir *wazeroir.CompilationResult, withListener bool) (compiler, error) {
    98  	c := &amd64Compiler{
    99  		assembler:     amd64.NewAssembler(),
   100  		locationStack: newRuntimeValueLocationStack(),
   101  		currentLabel:  wazeroir.EntrypointLabel,
   102  		ir:            ir,
   103  		labels:        map[string]*amd64LabelInfo{},
   104  		withListener:  withListener,
   105  	}
   106  	return c, nil
   107  }
   108  
   109  // runtimeValueLocationStack implements compilerImpl.runtimeValueLocationStack for the amd64 architecture.
   110  func (c *amd64Compiler) runtimeValueLocationStack() *runtimeValueLocationStack {
   111  	return c.locationStack
   112  }
   113  
   114  // setLocationStack sets the given runtimeValueLocationStack to the .locationStack field,
   115  // while allowing us to track runtimeValueLocationStack.stackPointerCeil across multiple stacks.
   116  // This is called when we branch into a different block.
   117  func (c *amd64Compiler) setLocationStack(newStack *runtimeValueLocationStack) {
   118  	if c.stackPointerCeil < c.locationStack.stackPointerCeil {
   119  		c.stackPointerCeil = c.locationStack.stackPointerCeil
   120  	}
   121  	c.locationStack = newStack
   122  }
   123  
   124  // pushRuntimeValueLocationOnRegister implements compiler.pushRuntimeValueLocationOnRegister for amd64.
   125  func (c *amd64Compiler) pushRuntimeValueLocationOnRegister(reg asm.Register, vt runtimeValueType) (ret *runtimeValueLocation) {
   126  	ret = c.locationStack.pushRuntimeValueLocationOnRegister(reg, vt)
   127  	c.locationStack.markRegisterUsed(reg)
   128  	return
   129  }
   130  
   131  // pushVectorRuntimeValueLocationOnRegister implements compiler.pushVectorRuntimeValueLocationOnRegister for amd64.
   132  func (c *amd64Compiler) pushVectorRuntimeValueLocationOnRegister(reg asm.Register) (lowerBitsLocation *runtimeValueLocation) {
   133  	lowerBitsLocation = c.locationStack.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeV128Lo)
   134  	c.locationStack.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeV128Hi)
   135  	c.locationStack.markRegisterUsed(reg)
   136  	return
   137  }
   138  
   139  type amd64LabelInfo struct {
   140  	// initialInstruction is the initial instruction for this label so other blocks can jump into it.
   141  	initialInstruction asm.Node
   142  	// initialStack is the initial value location stack from which we start compiling this label.
   143  	initialStack *runtimeValueLocationStack
   144  	// labelBeginningCallbacks holds callbacks that should be called with initialInstruction
   145  	labelBeginningCallbacks []func(asm.Node)
   146  }
   147  
   148  func (c *amd64Compiler) label(labelKey string) *amd64LabelInfo {
   149  	ret, ok := c.labels[labelKey]
   150  	if ok {
   151  		return ret
   152  	}
   153  	c.labels[labelKey] = &amd64LabelInfo{}
   154  	return c.labels[labelKey]
   155  }
   156  
   157  // compileGoDefinedHostFunction constructs the entire code to enter the host function implementation,
   158  // and return to the caller.
   159  func (c *amd64Compiler) compileGoDefinedHostFunction() error {
   160  	// First we must update the location stack to reflect the number of host function inputs.
   161  	c.locationStack.init(c.ir.Signature)
   162  
   163  	if c.withListener {
   164  		if err := c.compileCallBuiltinFunction(builtinFunctionIndexFunctionListenerBefore); err != nil {
   165  			return err
   166  		}
   167  	}
   168  
   169  	if err := c.compileCallGoHostFunction(); err != nil {
   170  		return err
   171  	}
   172  
   173  	// Initializes the reserved stack base pointer which is used to retrieve the call frame stack.
   174  	c.compileReservedStackBasePointerInitialization()
   175  	return c.compileReturnFunction()
   176  }
   177  
   178  // compile implements compiler.compile for the amd64 architecture.
   179  func (c *amd64Compiler) compile() (code []byte, stackPointerCeil uint64, err error) {
   180  	// c.stackPointerCeil tracks the stack pointer ceiling (max seen) value across all runtimeValueLocationStack(s)
   181  	// used for all labels (via setLocationStack), excluding the current one.
   182  	// Hence, we check here whether the final block's ceiling exceeds the current c.stackPointerCeil.
   183  	stackPointerCeil = c.stackPointerCeil
   184  	if stackPointerCeil < c.locationStack.stackPointerCeil {
   185  		stackPointerCeil = c.locationStack.stackPointerCeil
   186  	}
   187  
   188  	// Now that the max stack pointer is determined, we are invoking the callback.
   189  	// Note this MUST be called before Assemble() below.
   190  	if c.onStackPointerCeilDeterminedCallBack != nil {
   191  		c.onStackPointerCeilDeterminedCallBack(stackPointerCeil)
   192  		c.onStackPointerCeilDeterminedCallBack = nil
   193  	}
   194  
   195  	code, err = c.assembler.Assemble()
   196  	if err != nil {
   197  		return
   198  	}
   199  
   200  	code, err = platform.MmapCodeSegment(bytes.NewReader(code), len(code))
   201  	return
   202  }
   203  
   204  // compileUnreachable implements compiler.compileUnreachable for the amd64 architecture.
   205  func (c *amd64Compiler) compileUnreachable() error {
   206  	c.compileExitFromNativeCode(nativeCallStatusCodeUnreachable)
   207  	return nil
   208  }
   209  
   210  // compileSet implements compiler.compileSet for the amd64 architecture.
   211  func (c *amd64Compiler) compileSet(o *wazeroir.OperationSet) error {
   212  	setTargetIndex := int(c.locationStack.sp) - 1 - o.Depth
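        	// For illustration (a sketch with assumed values): with sp == 5 and o.Depth == 1,
        	// setTargetIndex == 3, i.e. the slot just below the value that is popped next
        	// receives the popped value's register.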
   213  
   214  	if o.IsTargetVector {
   215  		_ = c.locationStack.pop() // ignore the higher 64-bits.
   216  	}
   217  	v := c.locationStack.pop()
   218  	if err := c.compileEnsureOnRegister(v); err != nil {
   219  		return err
   220  	}
   221  
   222  	targetLocation := c.locationStack.stack[setTargetIndex]
   223  	if targetLocation.onRegister() {
   224  		// We no longer need the register previously used by the target location.
   225  		c.locationStack.markRegisterUnused(targetLocation.register)
   226  	}
   227  
   228  	reg := v.register
   229  	targetLocation.setRegister(reg)
   230  	if o.IsTargetVector {
   231  		c.locationStack.stack[setTargetIndex+1].setRegister(reg)
   232  	}
   233  	return nil
   234  }
   235  
   236  // compileGlobalGet implements compiler.compileGlobalGet for the amd64 architecture.
   237  func (c *amd64Compiler) compileGlobalGet(o *wazeroir.OperationGlobalGet) error {
   238  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
   239  		return err
   240  	}
   241  
   242  	intReg, err := c.allocateRegister(registerTypeGeneralPurpose)
   243  	if err != nil {
   244  		return err
   245  	}
   246  
   247  	// First, move the pointer to the global slice into the allocated register.
   248  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset, intReg)
   249  
   250  	// Now, move the location of the global instance into the register.
   251  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, intReg, 8*int64(o.Index), intReg)
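        	// At this point, in Go terms (a rough sketch, assuming Globals is a slice of *wasm.GlobalInstance
        	// so that each element is an 8-byte pointer), intReg roughly holds
        	// uintptr(unsafe.Pointer(moduleInstance.Globals[o.Index])).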
   252  
   253  	// When an integer, reuse the pointer register for the value. Otherwise, allocate a float register for it.
   254  	valueReg := intReg
   255  	var vt runtimeValueType
   256  	var inst asm.Instruction
   257  	switch c.ir.Globals[o.Index].ValType {
   258  	case wasm.ValueTypeI32:
   259  		inst = amd64.MOVL
   260  		vt = runtimeValueTypeI32
   261  	case wasm.ValueTypeI64, wasm.ValueTypeExternref, wasm.ValueTypeFuncref:
   262  		inst = amd64.MOVQ
   263  		vt = runtimeValueTypeI64
   264  	case wasm.ValueTypeF32:
   265  		inst = amd64.MOVL
   266  		vt = runtimeValueTypeF32
   267  		valueReg, err = c.allocateRegister(registerTypeVector)
   268  		if err != nil {
   269  			return err
   270  		}
   271  	case wasm.ValueTypeF64:
   272  		inst = amd64.MOVQ
   273  		vt = runtimeValueTypeF64
   274  		valueReg, err = c.allocateRegister(registerTypeVector)
   275  		if err != nil {
   276  			return err
   277  		}
   278  	case wasm.ValueTypeV128:
   279  		inst = amd64.MOVDQU
   280  		vt = runtimeValueTypeV128Lo
   281  		valueReg, err = c.allocateRegister(registerTypeVector)
   282  		if err != nil {
   283  			return err
   284  		}
   285  	default:
   286  		panic("BUG: unknown runtime value type")
   287  	}
   288  
   289  	// Using the register holding the pointer to the target instance, move its value into a register.
   290  	c.assembler.CompileMemoryToRegister(inst, intReg, globalInstanceValueOffset, valueReg)
   291  
   292  	// Record that the retrieved global value on the top of the stack is now in a register.
   293  	if vt == runtimeValueTypeV128Lo {
   294  		c.pushVectorRuntimeValueLocationOnRegister(valueReg)
   295  	} else {
   296  		c.pushRuntimeValueLocationOnRegister(valueReg, vt)
   297  	}
   298  	return nil
   299  }
   300  
   301  // compileGlobalSet implements compiler.compileGlobalSet for the amd64 architecture.
   302  func (c *amd64Compiler) compileGlobalSet(o *wazeroir.OperationGlobalSet) error {
   303  	wasmValueType := c.ir.Globals[o.Index].ValType
   304  	isV128 := wasmValueType == wasm.ValueTypeV128
   305  
   306  	// First, move the value to set into a temporary register.
   307  	val := c.locationStack.pop()
   308  	if isV128 {
   309  		// The previously popped val is the higher 64 bits; we have to use the lower 64 bits' runtimeValueLocation for allocation, etc.
   310  		val = c.locationStack.pop()
   311  	}
   312  	if err := c.compileEnsureOnRegister(val); err != nil {
   313  		return err
   314  	}
   315  
   316  	// Allocate a register to hold the memory location of the target global instance.
   317  	intReg, err := c.allocateRegister(registerTypeGeneralPurpose)
   318  	if err != nil {
   319  		return err
   320  	}
   321  
   322  	// First, move the pointer to the global slice into the allocated register.
   323  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset, intReg)
   324  
   325  	// Now, move the location of the global instance into the register.
   326  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, intReg, 8*int64(o.Index), intReg)
   327  
   328  	// Now ready to write the value to the global instance location.
   329  	var inst asm.Instruction
   330  	if isV128 {
   331  		inst = amd64.MOVDQU
   332  	} else if wasmValueType == wasm.ValueTypeI32 || wasmValueType == wasm.ValueTypeF32 {
   333  		inst = amd64.MOVL
   334  	} else {
   335  		inst = amd64.MOVQ
   336  	}
   337  	c.assembler.CompileRegisterToMemory(inst, val.register, intReg, globalInstanceValueOffset)
   338  
   339  	// Since the value is now written to memory, release the value register.
   340  	c.locationStack.releaseRegister(val)
   341  	return nil
   342  }
   343  
   344  // compileBr implements compiler.compileBr for the amd64 architecture.
   345  func (c *amd64Compiler) compileBr(o *wazeroir.OperationBr) error {
   346  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
   347  		return err
   348  	}
   349  	return c.branchInto(o.Target)
   350  }
   351  
   352  // branchInto adds instruction necessary to jump into the given branch target.
   353  func (c *amd64Compiler) branchInto(target *wazeroir.BranchTarget) error {
   354  	if target.IsReturnTarget() {
   355  		return c.compileReturnFunction()
   356  	} else {
   357  		labelKey := target.String()
   358  		if c.ir.LabelCallers[labelKey] > 1 {
   359  			// We can only re-use the register state when there's a single call-site.
   360  			// If there are multiple call-sites, release the existing values on registers to the stack
   361  			// so that the value location state is consistent at the beginning of the label.
   362  			if err := c.compileReleaseAllRegistersToStack(); err != nil {
   363  				return err
   364  			}
   365  		}
   366  		// Set the initial stack of the target label, so we can start compiling the label
   367  		// with the appropriate value locations. Note we clone the stack here as we may
   368  		// manipulate the stack before the compiler reaches the label.
   369  		targetLabel := c.label(labelKey)
   370  		if targetLabel.initialStack == nil {
   371  			// It seems unnecessary to clone as branchInto is always the tail of the current block.
   372  			// TODO: verify ^^.
   373  			targetLabel.initialStack = c.locationStack.clone()
   374  		}
   375  		jmp := c.assembler.CompileJump(amd64.JMP)
   376  		c.assignJumpTarget(labelKey, jmp)
   377  	}
   378  	return nil
   379  }
   380  
   381  // compileBrIf implements compiler.compileBrIf for the amd64 architecture.
   382  func (c *amd64Compiler) compileBrIf(o *wazeroir.OperationBrIf) error {
   383  	cond := c.locationStack.pop()
   384  	var jmpWithCond asm.Node
   385  	if cond.onConditionalRegister() {
   386  		var inst asm.Instruction
   387  		switch cond.conditionalRegister {
   388  		case amd64.ConditionalRegisterStateE:
   389  			inst = amd64.JEQ
   390  		case amd64.ConditionalRegisterStateNE:
   391  			inst = amd64.JNE
   392  		case amd64.ConditionalRegisterStateS:
   393  			inst = amd64.JMI
   394  		case amd64.ConditionalRegisterStateNS:
   395  			inst = amd64.JPL
   396  		case amd64.ConditionalRegisterStateG:
   397  			inst = amd64.JGT
   398  		case amd64.ConditionalRegisterStateGE:
   399  			inst = amd64.JGE
   400  		case amd64.ConditionalRegisterStateL:
   401  			inst = amd64.JLT
   402  		case amd64.ConditionalRegisterStateLE:
   403  			inst = amd64.JLE
   404  		case amd64.ConditionalRegisterStateA:
   405  			inst = amd64.JHI
   406  		case amd64.ConditionalRegisterStateAE:
   407  			inst = amd64.JCC
   408  		case amd64.ConditionalRegisterStateB:
   409  			inst = amd64.JCS
   410  		case amd64.ConditionalRegisterStateBE:
   411  			inst = amd64.JLS
   412  		}
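        		// For example (a sketch): a comparison that left ConditionalRegisterStateL on the
        		// conditional register is compiled into a JLT here, so the boolean never needs to be
        		// materialized into a general-purpose register.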
   413  		jmpWithCond = c.assembler.CompileJump(inst)
   414  	} else {
   415  		// Usually the comparison operand for br_if is on the conditional register,
   416  		// but in some cases it is on the stack or in a general-purpose register.
   417  		// For example, the following code
   418  		// 		i64.const 1
   419  		//      local.get 1
   420  		//      i64.add
   421  		//      br_if ....
   422  		// will try to use the result of i64.add, which resides on the (virtual) stack,
   423  		// as the operand for br_if instruction.
   424  		if err := c.compileEnsureOnRegister(cond); err != nil {
   425  			return err
   426  		}
   427  		// Check if the value does not equal zero.
   428  		c.assembler.CompileRegisterToConst(amd64.CMPQ, cond.register, 0)
   429  
   430  		// Emit a jump instruction which jumps when the value does not equal zero.
   431  		jmpWithCond = c.assembler.CompileJump(amd64.JNE)
   432  		c.locationStack.markRegisterUnused(cond.register)
   433  	}
   434  
   435  	// Make sure that the next coming label is the else jump target.
   436  	thenTarget, elseTarget := o.Then, o.Else
   437  
   438  	// Here's the diagram of how we organize the instructions necessarily for brif operation.
   439  	//
   440  	// jmp_with_cond -> jmp (.Else) -> Then operations...
   441  	//    |---------(satisfied)------------^^^
   442  	//
   443  	// Note that the .Else branch doesn't have ToDrop, as .Else in reality
   444  	// corresponds to either an If's Else block or a br_if's else block in Wasm.
   445  
   446  	// Emit for else branches
   447  	saved := c.locationStack
   448  	c.setLocationStack(saved.clone())
   449  	if elseTarget.Target.IsReturnTarget() {
   450  		if err := c.compileReturnFunction(); err != nil {
   451  			return err
   452  		}
   453  	} else {
   454  		elseLabelKey := elseTarget.Target.Label.String()
   455  		if c.ir.LabelCallers[elseLabelKey] > 1 {
   456  			// We can only re-use the register state when there's a single call-site.
   457  			// If there are multiple call-sites, release the existing values on registers to the stack
   458  			// so that the value location state is consistent at the beginning of the label.
   459  			if err := c.compileReleaseAllRegistersToStack(); err != nil {
   460  				return err
   461  			}
   462  		}
   463  		// Set the initial stack of the target label, so we can start compiling the label
   464  		// with the appropriate value locations. Note we clone the stack here as we may
   465  		// manipulate the stack before the compiler reaches the label.
   466  		labelInfo := c.label(elseLabelKey)
   467  		if labelInfo.initialStack == nil {
   468  			labelInfo.initialStack = c.locationStack
   469  		}
   470  
   471  		elseJmp := c.assembler.CompileJump(amd64.JMP)
   472  		c.assignJumpTarget(elseLabelKey, elseJmp)
   473  	}
   474  
   475  	// Handle then branch.
   476  	c.assembler.SetJumpTargetOnNext(jmpWithCond)
   477  	c.setLocationStack(saved)
   478  	if err := compileDropRange(c, thenTarget.ToDrop); err != nil {
   479  		return err
   480  	}
   481  	if thenTarget.Target.IsReturnTarget() {
   482  		return c.compileReturnFunction()
   483  	} else {
   484  		thenLabelKey := thenTarget.Target.Label.String()
   485  		if c.ir.LabelCallers[thenLabelKey] > 1 {
   486  			// We can only re-use the register state when there's a single call-site.
   487  			// If there are multiple call-sites, release the existing values on registers to the stack
   488  			// so that the value location state is consistent at the beginning of the label.
   489  			if err := c.compileReleaseAllRegistersToStack(); err != nil {
   490  				return err
   491  			}
   492  		}
   493  		// Set the initial stack of the target label, so we can start compiling the label
   494  		// with the appropriate value locations. Note we clone the stack here as we may
   495  		// manipulate the stack before the compiler reaches the label.
   496  		labelInfo := c.label(thenLabelKey)
   497  		if labelInfo.initialStack == nil {
   498  			labelInfo.initialStack = c.locationStack
   499  		}
   500  		thenJmp := c.assembler.CompileJump(amd64.JMP)
   501  		c.assignJumpTarget(thenLabelKey, thenJmp)
   502  		return nil
   503  	}
   504  }
   505  
   506  // compileBrTable implements compiler.compileBrTable for the amd64 architecture.
   507  func (c *amd64Compiler) compileBrTable(o *wazeroir.OperationBrTable) error {
   508  	index := c.locationStack.pop()
   509  
   510  	// If the operation only consists of the default target, we branch into it and return early.
   511  	if len(o.Targets) == 0 {
   512  		c.locationStack.releaseRegister(index)
   513  		if err := compileDropRange(c, o.Default.ToDrop); err != nil {
   514  			return err
   515  		}
   516  		return c.branchInto(o.Default.Target)
   517  	}
   518  
   519  	// Otherwise, we jump into the selected branch.
   520  	if err := c.compileEnsureOnRegister(index); err != nil {
   521  		return err
   522  	}
   523  
   524  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
   525  	if err != nil {
   526  		return err
   527  	}
   528  
   529  	// First, we move the length of target list into the tmp register.
   530  	c.assembler.CompileConstToRegister(amd64.MOVQ, int64(len(o.Targets)), tmp)
   531  
   532  	// Then, we compare the value with the length of targets.
   533  	c.assembler.CompileRegisterToRegister(amd64.CMPL, tmp, index.register)
   534  
   535  	// If the value is larger than the length,
   536  	// we round the index to the length, as the spec states that
   537  	// if the index is larger than or equal to the length of the list,
   538  	// we branch into the default branch.
   539  	c.assembler.CompileRegisterToRegister(amd64.CMOVQCS, tmp, index.register)
   540  
   541  	// We prepare the static data which holds the offset of
   542  	// each target's first instruction (incl. default)
   543  	// relative to the beginning of label tables.
   544  	//
   545  	// For example, if we have targets=[L0, L1] and default=L_DEFAULT,
   546  	// we emit the code like this at [Emit the code for each targets and default branch] below.
   547  	//
   548  	// L0:
   549  	//  0x123001: XXXX, ...
   550  	//  .....
   551  	// L1:
   552  	//  0x123006: YYY, ...
   553  	//  .....
   554  	// L_DEFAULT:
   555  	//  0x123009: ZZZ, ...
   556  	//
   557  	// then offsetData becomes like [0x0, 0x5, 0x8].
   558  	// By using this offset list, we could jump into the label for the index by
   559  	// "jmp offsetData[index]+0x123001" and "0x123001" can be acquired by "LEA"
   560  	// instruction.
   561  	//
   562  	// Note: We store each offset as a 32-bit unsigned integer in 4 consecutive (little-endian) bytes. So more precisely,
   563  	// the above example's offsetData would be [0x0, 0x0, 0x0, 0x0, 0x5, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0, 0x0].
   564  	//
   565  	// Note: this is similar to how GCC implements Switch statements in C.
   566  	offsetData := asm.NewStaticConst(make([]byte, 4*(len(o.Targets)+1)))
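        	// Continuing the example above (a sketch): with index == 1, the code below reads the
        	// 4 bytes at offsetData[4:8] (== 0x5), adds them to the address of L0 (0x123001), and
        	// jumps to 0x123006, the first instruction of L1.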
   567  
   568  	// Load the offsetData's address into tmp.
   569  	if err = c.assembler.CompileStaticConstToRegister(amd64.LEAQ, offsetData, tmp); err != nil {
   570  		return err
   571  	}
   572  
   573  	// Now we have the address of the first byte of offsetData in the tmp register.
   574  	// So the target offset's first byte is at tmp+index*4, as we store
   575  	// each offset as 4 bytes (a 32-bit integer).
   576  	// Here, we store the offset into the index.register.
   577  	c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVL, tmp, 0, index.register, 4, index.register)
   578  
   579  	// Now we read the address of the beginning of the jump table.
   580  	// In the above example, this corresponds to reading the address of 0x123001.
   581  	c.assembler.CompileReadInstructionAddress(tmp, amd64.JMP)
   582  
   583  	// Now we have the address of L0 in tmp register, and the offset to the target label in the index.register.
   584  	// So we could achieve the br_table jump by adding them and jump into the resulting address.
   585  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, index.register, tmp)
   586  
   587  	c.assembler.CompileJumpToRegister(amd64.JMP, tmp)
   588  
   589  	// We no longer need the index's register, so mark it unused.
   590  	c.locationStack.markRegisterUnused(index.register)
   591  
   592  	// [Emit the code for each targets and default branch]
   593  	labelInitialInstructions := make([]asm.Node, len(o.Targets)+1)
   594  	saved := c.locationStack
   595  	for i := range labelInitialInstructions {
   596  		// Emit the initial instruction of each target.
   597  		// We use NOP as we don't yet know the next instruction in each label.
   598  		// Assembler would optimize out this NOP during code generation, so this is harmless.
   599  		labelInitialInstructions[i] = c.assembler.CompileStandAlone(amd64.NOP)
   600  
   601  		var locationStack *runtimeValueLocationStack
   602  		var target *wazeroir.BranchTargetDrop
   603  		if i < len(o.Targets) {
   604  			target = o.Targets[i]
   605  			// Clone the location stack so the branch-specific code doesn't
   606  			// affect others.
   607  			locationStack = saved.clone()
   608  		} else {
   609  			target = o.Default
   610  			// If this is the default branch, we use the original one
   611  			// as this is the last code in this block.
   612  			locationStack = saved
   613  		}
   614  		c.setLocationStack(locationStack)
   615  		if err := compileDropRange(c, target.ToDrop); err != nil {
   616  			return err
   617  		}
   618  		if err := c.branchInto(target.Target); err != nil {
   619  			return err
   620  		}
   621  	}
   622  
   623  	c.assembler.BuildJumpTable(offsetData, labelInitialInstructions)
   624  	return nil
   625  }
   626  
   627  func (c *amd64Compiler) assignJumpTarget(labelKey string, jmpInstruction asm.Node) {
   628  	jmpTargetLabel := c.label(labelKey)
   629  	if jmpTargetLabel.initialInstruction != nil {
   630  		jmpInstruction.AssignJumpTarget(jmpTargetLabel.initialInstruction)
   631  	} else {
   632  		jmpTargetLabel.labelBeginningCallbacks = append(jmpTargetLabel.labelBeginningCallbacks, func(labelInitialInstruction asm.Node) {
   633  			jmpInstruction.AssignJumpTarget(labelInitialInstruction)
   634  		})
   635  	}
   636  }
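        // In other words (a sketch of the mechanism): for a backward branch the target label's
        // initial instruction already exists, so the jump target is assigned immediately; for a
        // forward branch the assignment is deferred via labelBeginningCallbacks until compileLabel
        // emits the label's NOP and invokes the callbacks.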
   637  
   638  // compileLabel implements compiler.compileLabel for the amd64 architecture.
   639  func (c *amd64Compiler) compileLabel(o *wazeroir.OperationLabel) (skipLabel bool) {
   640  	if false {
   641  		fmt.Printf("[label %s ends]\n\n", c.currentLabel)
   642  	}
   643  
   644  	labelKey := o.Label.String()
   645  	labelInfo := c.label(labelKey)
   646  
   647  	// If initialStack is not set, that means this label has never been reached.
   648  	if labelInfo.initialStack == nil {
   649  		skipLabel = true
   650  		c.currentLabel = ""
   651  		return
   652  	}
   653  
   654  	// We use a NOP as the first instruction in a label.
   655  	labelBegin := c.assembler.CompileStandAlone(amd64.NOP)
   656  
   657  	// Save the instruction so that backward branching
   658  	// instructions can jump to this label.
   659  	labelInfo.initialInstruction = labelBegin
   660  
   661  	// Set the initial stack.
   662  	c.setLocationStack(labelInfo.initialStack)
   663  
   664  	// Invoke callbacks to notify that the forward branching
   665  	// instructions can now properly jump to this label.
   666  	for _, cb := range labelInfo.labelBeginningCallbacks {
   667  		cb(labelBegin)
   668  	}
   669  
   670  	// Clear the callbacks for debugging purposes now that they have been invoked.
   671  	labelInfo.labelBeginningCallbacks = nil
   672  
   673  	if false {
   674  		fmt.Printf("[label %s (num callers=%d)]\n%s\n", labelKey, c.ir.LabelCallers[labelKey], c.locationStack)
   675  	}
   676  	c.currentLabel = labelKey
   677  	return
   678  }
   679  
   680  // compileCall implements compiler.compileCall for the amd64 architecture.
   681  func (c *amd64Compiler) compileCall(o *wazeroir.OperationCall) error {
   682  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
   683  		return err
   684  	}
   685  
   686  	target := c.ir.Functions[o.FunctionIndex]
   687  	targetType := c.ir.Types[target]
   688  
   689  	targetAddressRegister, err := c.allocateRegister(registerTypeGeneralPurpose)
   690  	if err != nil {
   691  		return err
   692  	}
   693  
   694  	// First, we read the address of the first item of the callEngine.functions slice (= &callEngine.functions[0])
   695  	// into targetAddressRegister.
   696  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine,
   697  		callEngineModuleContextFunctionsElement0AddressOffset, targetAddressRegister)
   698  
   699  	// Next, read the address of the target function (= callEngine.functions[o.FunctionIndex])
   700  	// into targetAddressRegister.
   701  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
   702  		// Note: FunctionIndex is limited to at most 2^27, so this offset never exceeds the 32-bit integer range.
   703  		// *8 because each element occupies 8 bytes (a pointer).
   704  		targetAddressRegister, int64(o.FunctionIndex)*8,
   705  		targetAddressRegister,
   706  	)
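        	// For illustration (assumed index): with o.FunctionIndex == 3, the load above reads
        	// the 8 bytes at &callEngine.functions[0] + 24, i.e. the entry for callEngine.functions[3].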
   707  
   708  	if err := c.compileCallFunctionImpl(targetAddressRegister, targetType); err != nil {
   709  		return err
   710  	}
   711  	return nil
   712  }
   713  
   714  // compileCallIndirect implements compiler.compileCallIndirect for the amd64 architecture.
   715  func (c *amd64Compiler) compileCallIndirect(o *wazeroir.OperationCallIndirect) error {
   716  	offset := c.locationStack.pop()
   717  	if err := c.compileEnsureOnRegister(offset); err != nil {
   718  		return err
   719  	}
   720  
   721  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
   722  	if err != nil {
   723  		return err
   724  	}
   725  	c.locationStack.markRegisterUsed(tmp)
   726  
   727  	tmp2, err := c.allocateRegister(registerTypeGeneralPurpose)
   728  	if err != nil {
   729  		return err
   730  	}
   731  	c.locationStack.markRegisterUsed(tmp2)
   732  
   733  	// Load the address of the target table: tmp = &module.Tables[0]
   734  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
   735  	// tmp = &module.Tables[0] + Index*8 = &module.Tables[0] + sizeOf(*TableInstance)*index = module.Tables[o.TableIndex].
   736  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(o.TableIndex*8), tmp)
   737  
   738  	// Then, we need to check that the offset doesn't exceed the length of the table.
   739  	c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, offset.register)
   740  	notLengthExceedJump := c.assembler.CompileJump(amd64.JHI)
   741  
   742  	// If it exceeds, we return the function with nativeCallStatusCodeInvalidTableAccess.
   743  	c.compileExitFromNativeCode(nativeCallStatusCodeInvalidTableAccess)
   744  	c.assembler.SetJumpTargetOnNext(notLengthExceedJump)
   745  
   746  	// Next we check whether the target's type matches the operation's type.
   747  	// In order to get the target entry's address, we have to multiply the offset
   748  	// by 8, as the offset is an index into the table backed by a Go []uintptr,
   749  	// and the size of uintptr equals 8 bytes (2^3).
   750  	c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, offset.register)
   751  
   752  	// Add the address of the table's element 0 (wasm.Table[0]) to the offset.
   753  	c.assembler.CompileMemoryToRegister(amd64.ADDQ,
   754  		tmp, tableInstanceTableOffset, offset.register)
   755  
   756  	// "offset = (*offset) (== table[offset]  == *code type)"
   757  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, offset.register, 0, offset.register)
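        	// In Go terms (a rough sketch): offset.register now holds table[originalIndex], an entry
        	// of the table's underlying []uintptr, which is either 0 (uninitialized) or the address
        	// of the target function.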
   758  
   759  	// At this point offset.register holds the address of *code (as uintptr) at wasm.Table[offset].
   760  	//
   761  	// Check if the value of table[offset] equals zero, meaning that the target is uninitialized.
   762  	c.assembler.CompileRegisterToConst(amd64.CMPQ, offset.register, 0)
   763  
   764  	// Jump if the target is an initialized element.
   765  	jumpIfInitialized := c.assembler.CompileJump(amd64.JNE)
   766  
   767  	// If not initialized, we return the function with nativeCallStatusCodeInvalidTableAccess.
   768  	c.compileExitFromNativeCode(nativeCallStatusCodeInvalidTableAccess)
   769  
   770  	c.assembler.SetJumpTargetOnNext(jumpIfInitialized)
   771  
   772  	// Next we need to check that the type matches, i.e. table[offset].source.TypeID == targetFunctionType's typeID.
   773  	//
   774  	// "tmp = table[offset].source ( == *FunctionInstance type)"
   775  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, offset.register, functionSourceOffset, tmp)
   776  
   777  	// "tmp2 = [&moduleInstance.TypeIDs[0] + index * 4] (== moduleInstance.TypeIDs[index])"
   778  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
   779  		amd64ReservedRegisterForCallEngine, callEngineModuleContextTypeIDsElement0AddressOffset,
   780  		tmp2)
   781  	c.assembler.CompileMemoryToRegister(amd64.MOVL, tmp2, int64(o.TypeIndex)*4, tmp2)
   782  
   783  	// Compare the type IDs, and jump if they match.
   784  	c.assembler.CompileMemoryToRegister(amd64.CMPL, tmp, functionInstanceTypeIDOffset, tmp2)
   785  	jumpIfTypeMatch := c.assembler.CompileJump(amd64.JEQ)
   786  
   787  	// Otherwise, exit with type mismatch status.
   788  	c.compileExitFromNativeCode(nativeCallStatusCodeTypeMismatchOnIndirectCall)
   789  
   790  	c.assembler.SetJumpTargetOnNext(jumpIfTypeMatch)
   791  	targetFunctionType := c.ir.Types[o.TypeIndex]
   792  	if err = c.compileCallFunctionImpl(offset.register, targetFunctionType); err != nil {
   793  		return err
   794  	}
   795  
   796  	// The offset register should be marked as unused, as it was consumed by the function call.
   797  	c.locationStack.markRegisterUnused(offset.register, tmp, tmp2)
   798  	return nil
   799  }
   800  
   801  // compileDrop implements compiler.compileDrop for the amd64 architecture.
   802  func (c *amd64Compiler) compileDrop(o *wazeroir.OperationDrop) error {
   803  	return compileDropRange(c, o.Depth)
   804  }
   805  
   806  // compileSelectV128Impl implements compileSelect for vector values.
   807  func (c *amd64Compiler) compileSelectV128Impl(selectorReg asm.Register) error {
   808  	x2 := c.locationStack.popV128()
   809  	if err := c.compileEnsureOnRegister(x2); err != nil {
   810  		return err
   811  	}
   812  
   813  	x1 := c.locationStack.popV128()
   814  	if err := c.compileEnsureOnRegister(x1); err != nil {
   815  		return err
   816  	}
   817  
   818  	// Compare the conditional value with zero.
   819  	c.assembler.CompileRegisterToConst(amd64.CMPQ, selectorReg, 0)
   820  
   821  	// Emit a conditional jump that is taken if the selector value is not zero.
   822  	jmpIfNotZero := c.assembler.CompileJump(amd64.JNE)
   823  
   824  	// In this branch, we select the value of x2, so we move the value into x1.register so that
   825  	// we can have the result in x1.register regardless of the selection.
   826  	c.assembler.CompileRegisterToRegister(amd64.MOVDQU, x2.register, x1.register)
   827  
   828  	// Else, we don't need to adjust value, just need to jump to the next instruction.
   829  	c.assembler.SetJumpTargetOnNext(jmpIfNotZero)
   830  
   831  	// As noted, the result exists in x1.register regardless of the selector.
   832  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
   833  	// Plus, x2.register is no longer used.
   834  	c.locationStack.markRegisterUnused(x2.register)
   835  	c.locationStack.markRegisterUnused(selectorReg)
   836  	return nil
   837  }
   838  
   839  // compileSelect implements compiler.compileSelect for the amd64 architecture.
   840  //
   841  // The emitted native code depends on whether the values are on
   842  // the physical registers or memory stack, or maybe conditional register.
   843  func (c *amd64Compiler) compileSelect(o *wazeroir.OperationSelect) error {
   844  	cv := c.locationStack.pop()
   845  	if err := c.compileEnsureOnRegister(cv); err != nil {
   846  		return err
   847  	}
   848  
   849  	if o.IsTargetVector {
   850  		return c.compileSelectV128Impl(cv.register)
   851  	}
   852  
   853  	x2 := c.locationStack.pop()
   854  	// We do not consume x1 here, but modify the value according to
   855  	// the conditional value "cv" above.
   856  	peekedX1 := c.locationStack.peek()
   857  
   858  	// Compare the conditional value with zero.
   859  	c.assembler.CompileRegisterToConst(amd64.CMPQ, cv.register, 0)
   860  
   861  	// Now we can use cv.register as a temporary location.
   862  	// We alias it here for readability.
   863  	tmpRegister := cv.register
   864  
   865  	// Emit a conditional jump that is taken if cv is not zero.
   866  	jmpIfNotZero := c.assembler.CompileJump(amd64.JNE)
   867  
   868  	// If the value is zero, we must place the value of x2 onto the stack position of x1.
   869  
   870  	// First we copy the value of x2 to the temporary register if x2 is not currently on a register.
   871  	if x2.onStack() {
   872  		x2.register = tmpRegister
   873  		c.compileLoadValueOnStackToRegister(x2)
   874  	}
   875  
   876  	//
   877  	// At this point x2's value is always on a register.
   878  	//
   879  
   880  	// Then release the value in the x2's register to the x1's stack position.
   881  	if peekedX1.onRegister() {
   882  		c.assembler.CompileRegisterToRegister(amd64.MOVQ, x2.register, peekedX1.register)
   883  	} else {
   884  		peekedX1.register = x2.register
   885  		c.compileReleaseRegisterToStack(peekedX1) // Note inside we mark the register unused!
   886  	}
   887  
   888  	// Else, we don't need to adjust value, just need to jump to the next instruction.
   889  	c.assembler.SetJumpTargetOnNext(jmpIfNotZero)
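        	// At this point, x1's location holds x1 when cv != 0 and x2 otherwise,
        	// which matches Wasm's select semantics.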
   890  
   891  	// In any case, we don't need x2 and cv anymore!
   892  	c.locationStack.releaseRegister(x2)
   893  	c.locationStack.releaseRegister(cv)
   894  	return nil
   895  }
   896  
   897  // compilePick implements compiler.compilePick for the amd64 architecture.
   898  func (c *amd64Compiler) compilePick(o *wazeroir.OperationPick) error {
   899  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
   900  		return err
   901  	}
   902  
   903  	// TODO: if we track the type of values on the stack,
   904  	// we could optimize the instruction according to the bit size of the value.
   905  	// For now, we just move the entire register i.e. as a quad word (8 bytes).
   906  	pickTarget := c.locationStack.stack[c.locationStack.sp-1-uint64(o.Depth)]
   907  	reg, err := c.allocateRegister(pickTarget.getRegisterType())
   908  	if err != nil {
   909  		return err
   910  	}
   911  
   912  	if pickTarget.onRegister() {
   913  		var inst asm.Instruction
   914  		if o.IsTargetVector {
   915  			inst = amd64.MOVDQU
   916  		} else if pickTarget.valueType == runtimeValueTypeI32 { // amd64 cannot copy single-precisions between registers.
   917  			inst = amd64.MOVL
   918  		} else {
   919  			inst = amd64.MOVQ
   920  		}
   921  		c.assembler.CompileRegisterToRegister(inst, pickTarget.register, reg)
   922  	} else if pickTarget.onStack() {
   923  		// Copy the value from the stack.
   924  		var inst asm.Instruction
   925  		if o.IsTargetVector {
   926  			inst = amd64.MOVDQU
   927  		} else if pickTarget.valueType == runtimeValueTypeI32 || pickTarget.valueType == runtimeValueTypeF32 {
   928  			inst = amd64.MOVL
   929  		} else {
   930  			inst = amd64.MOVQ
   931  		}
   932  		// Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range.
   933  		c.assembler.CompileMemoryToRegister(inst, amd64ReservedRegisterForStackBasePointerAddress,
   934  			int64(pickTarget.stackPointer)*8, reg)
   935  	}
   936  	// Now we already placed the picked value on the register,
   937  	// so push the location onto the stack.
   938  	if o.IsTargetVector {
   939  		c.pushVectorRuntimeValueLocationOnRegister(reg)
   940  	} else {
   941  		c.pushRuntimeValueLocationOnRegister(reg, pickTarget.valueType)
   942  	}
   943  	return nil
   944  }
   945  
   946  // compileAdd implements compiler.compileAdd for the amd64 architecture.
   947  func (c *amd64Compiler) compileAdd(o *wazeroir.OperationAdd) error {
   948  	// TODO: if the previous instruction is const, then
   949  	// this can be optimized. Same goes for other arithmetic instructions.
   950  
   951  	var instruction asm.Instruction
   952  	switch o.Type {
   953  	case wazeroir.UnsignedTypeI32:
   954  		instruction = amd64.ADDL
   955  	case wazeroir.UnsignedTypeI64:
   956  		instruction = amd64.ADDQ
   957  	case wazeroir.UnsignedTypeF32:
   958  		instruction = amd64.ADDSS
   959  	case wazeroir.UnsignedTypeF64:
   960  		instruction = amd64.ADDSD
   961  	}
   962  
   963  	x2 := c.locationStack.pop()
   964  	if err := c.compileEnsureOnRegister(x2); err != nil {
   965  		return err
   966  	}
   967  
   968  	x1 := c.locationStack.peek() // Note this is peek, not pop!
   969  	if err := c.compileEnsureOnRegister(x1); err != nil {
   970  		return err
   971  	}
   972  
   973  	// x1 += x2.
   974  	c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)
   975  
   976  	// We no longer need x2 register after ADD operation here,
   977  	// so we release it.
   978  	c.locationStack.releaseRegister(x2)
   979  	return nil
   980  }
   981  
   982  // compileSub implements compiler.compileSub for the amd64 architecture.
   983  func (c *amd64Compiler) compileSub(o *wazeroir.OperationSub) error {
   984  	// TODO: if the previous instruction is const, then
   985  	// this can be optimized. Same goes for other arithmetic instructions.
   986  
   987  	var instruction asm.Instruction
   988  	switch o.Type {
   989  	case wazeroir.UnsignedTypeI32:
   990  		instruction = amd64.SUBL
   991  	case wazeroir.UnsignedTypeI64:
   992  		instruction = amd64.SUBQ
   993  	case wazeroir.UnsignedTypeF32:
   994  		instruction = amd64.SUBSS
   995  	case wazeroir.UnsignedTypeF64:
   996  		instruction = amd64.SUBSD
   997  	}
   998  
   999  	x2 := c.locationStack.pop()
  1000  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1001  		return err
  1002  	}
  1003  
  1004  	x1 := c.locationStack.peek() // Note this is peek, not pop!
  1005  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1006  		return err
  1007  	}
  1008  
  1009  	// x1 -= x2.
  1010  	c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)
  1011  
  1012  	// We no longer need the x2 register after the SUB operation here,
  1013  	// so we release it.
  1014  	c.locationStack.releaseRegister(x2)
  1015  	return nil
  1016  }
  1017  
  1018  // compileMul implements compiler.compileMul for the amd64 architecture.
  1019  func (c *amd64Compiler) compileMul(o *wazeroir.OperationMul) (err error) {
  1020  	switch o.Type {
  1021  	case wazeroir.UnsignedTypeI32:
  1022  		err = c.compileMulForInts(true, amd64.MULL)
  1023  	case wazeroir.UnsignedTypeI64:
  1024  		err = c.compileMulForInts(false, amd64.MULQ)
  1025  	case wazeroir.UnsignedTypeF32:
  1026  		err = c.compileMulForFloats(amd64.MULSS)
  1027  	case wazeroir.UnsignedTypeF64:
  1028  		err = c.compileMulForFloats(amd64.MULSD)
  1029  	}
  1030  	return
  1031  }
  1032  
  1033  // compileMulForInts emits instructions to perform integer multiplication for
  1034  // top two values on the stack. If unfamiliar with the convention for integer
  1035  // multiplication on x86, see https://www.felixcloutier.com/x86/mul.
  1036  //
  1037  // In summary, one of the values must be on the AX register,
  1038  // and the mul instruction stores the overflow info in DX register which we don't use.
  1039  // By "the overflow info", we mean the 65th and higher bits of the result in the 64-bit case.
  1040  //
  1041  // So, we have to ensure that
  1042  //  1. The value previously located on DX must be saved to the memory stack. That is because
  1043  //     the existing value will be overwritten by the mul execution.
  1044  //  2. One of the operands (x1 or x2) must be on the AX register.
  1045  //
  1046  // See https://www.felixcloutier.com/x86/mul#description for detail semantics.
  1047  func (c *amd64Compiler) compileMulForInts(is32Bit bool, mulInstruction asm.Instruction) error {
  1048  	const (
  1049  		resultRegister   = amd64.RegAX
  1050  		reservedRegister = amd64.RegDX
  1051  	)
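        	// As a concrete instance of the convention above (a sketch): for the 64-bit case the CPU
        	// computes RDX:RAX = RAX * operand and we keep only RAX (the low 64 bits), while RDX
        	// receives the high bits, which is why any live value on DX must be evacuated first.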
  1052  
  1053  	x2 := c.locationStack.pop()
  1054  	x1 := c.locationStack.pop()
  1055  
  1056  	var valueOnAX *runtimeValueLocation
  1057  	if x1.register == resultRegister {
  1058  		valueOnAX = x1
  1059  	} else if x2.register == resultRegister {
  1060  		valueOnAX = x2
  1061  	} else {
  1062  		valueOnAX = x2
  1063  		// In this case we move x2 to the AX register.
  1064  		c.onValueReleaseRegisterToStack(resultRegister)
  1065  		if x2.onConditionalRegister() {
  1066  			c.compileMoveConditionalToGeneralPurposeRegister(x2, resultRegister)
  1067  		} else if x2.onStack() {
  1068  			x2.setRegister(resultRegister)
  1069  			c.compileLoadValueOnStackToRegister(x2)
  1070  			c.locationStack.markRegisterUsed(resultRegister)
  1071  		} else {
  1072  			var inst asm.Instruction
  1073  			if is32Bit {
  1074  				inst = amd64.MOVL
  1075  			} else {
  1076  				inst = amd64.MOVQ
  1077  			}
  1078  			c.assembler.CompileRegisterToRegister(inst, x2.register, resultRegister)
  1079  
  1080  			// We no longer use the previous register of x2.
  1081  			c.locationStack.releaseRegister(x2)
  1082  			x2.setRegister(resultRegister)
  1083  			c.locationStack.markRegisterUsed(resultRegister)
  1084  		}
  1085  	}
  1086  
  1087  	// At this point, both operands must be on registers.
  1088  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1089  		return err
  1090  	}
  1091  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1092  		return err
  1093  	}
  1094  
  1095  	// We have to save the existing value on DX.
  1096  	// If the DX register is used by either x1 or x2, we don't need to
  1097  	// save the value because it is consumed by mul anyway.
  1098  	if x1.register != reservedRegister && x2.register != reservedRegister {
  1099  		c.onValueReleaseRegisterToStack(reservedRegister)
  1100  	}
  1101  
  1102  	// Now ready to emit the mul instruction.
  1103  	if x1 == valueOnAX {
  1104  		c.assembler.CompileRegisterToNone(mulInstruction, x2.register)
  1105  	} else {
  1106  		c.assembler.CompileRegisterToNone(mulInstruction, x1.register)
  1107  	}
  1108  
  1109  	c.locationStack.markRegisterUnused(x2.register)
  1110  	c.locationStack.markRegisterUnused(x1.register)
  1111  
  1112  	// Now we have the result in the AX register,
  1113  	// so we record it.
  1114  	c.pushRuntimeValueLocationOnRegister(resultRegister, x1.valueType)
  1115  	return nil
  1116  }
  1117  
  1118  func (c *amd64Compiler) compileMulForFloats(instruction asm.Instruction) error {
  1119  	x2 := c.locationStack.pop()
  1120  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1121  		return err
  1122  	}
  1123  
  1124  	x1 := c.locationStack.peek() // Note this is peek!
  1125  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1126  		return err
  1127  	}
  1128  
  1129  	// x1 *= x2.
  1130  	c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)
  1131  
  1132  	// We no longer need x2 register after MUL operation here,
  1133  	// so we release it.
  1134  	c.locationStack.releaseRegister(x2)
  1135  	return nil
  1136  }
  1137  
  1138  // compileClz implements compiler.compileClz for the amd64 architecture.
  1139  func (c *amd64Compiler) compileClz(o *wazeroir.OperationClz) error {
  1140  	target := c.locationStack.pop()
  1141  	if err := c.compileEnsureOnRegister(target); err != nil {
  1142  		return err
  1143  	}
  1144  
  1145  	if runtime.GOOS != "darwin" && runtime.GOOS != "freebsd" {
  1146  		if o.Type == wazeroir.UnsignedInt32 {
  1147  			c.assembler.CompileRegisterToRegister(amd64.LZCNTL, target.register, target.register)
  1148  		} else {
  1149  			c.assembler.CompileRegisterToRegister(amd64.LZCNTQ, target.register, target.register)
  1150  		}
  1151  	} else {
  1152  		// On darwin and freebsd amd64, we cannot use LZCNT as it always results in zero.
  1153  		// Instead we combine BSR (calculating most significant set bit)
  1154  		// with XOR. This logic is described in
  1155  		// "Replace Raw Assembly Code with Builtin Intrinsics" section in:
  1156  		// https://developer.apple.com/documentation/apple-silicon/addressing-architectural-differences-in-your-macos-code.
  1157  
  1158  		// First, we have to check if the target is non-zero as BSR is undefined
  1159  		// on zero. See https://www.felixcloutier.com/x86/bsr.
  1160  		c.assembler.CompileRegisterToConst(amd64.CMPQ, target.register, 0)
  1161  		jmpIfNonZero := c.assembler.CompileJump(amd64.JNE)
  1162  
  1163  		// If the value is zero, we just push the const value.
  1164  		if o.Type == wazeroir.UnsignedInt32 {
  1165  			c.assembler.CompileConstToRegister(amd64.MOVL, int64(32), target.register)
  1166  		} else {
  1167  			c.assembler.CompileConstToRegister(amd64.MOVL, int64(64), target.register)
  1168  		}
  1169  
  1170  		// Emit the jmp instruction to jump to the position right after
  1171  		// the non-zero case.
  1172  		jmpAtEndOfZero := c.assembler.CompileJump(amd64.JMP)
  1173  
  1174  		// Start emitting non-zero case.
  1175  		c.assembler.SetJumpTargetOnNext(jmpIfNonZero)
  1176  		// First, we calculate the most significant set bit.
  1177  		if o.Type == wazeroir.UnsignedInt32 {
  1178  			c.assembler.CompileRegisterToRegister(amd64.BSRL, target.register, target.register)
  1179  		} else {
  1180  			c.assembler.CompileRegisterToRegister(amd64.BSRQ, target.register, target.register)
  1181  		}
  1182  
  1183  		// Now we XOR the value with the bit length minus one.
  1184  		if o.Type == wazeroir.UnsignedInt32 {
  1185  			c.assembler.CompileConstToRegister(amd64.XORL, 31, target.register)
  1186  		} else {
  1187  			c.assembler.CompileConstToRegister(amd64.XORQ, 63, target.register)
  1188  		}
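        		// Worked example (a sketch): for the 32-bit value 0x0000_0800, BSR yields 11,
        		// and 11 XOR 31 == 20, which is indeed the number of leading zeros.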
  1189  
  1190  		// Finally the end jump instruction of zero case must target towards
  1191  		// the next instruction.
  1192  		c.assembler.SetJumpTargetOnNext(jmpAtEndOfZero)
  1193  	}
  1194  
  1195  	// We reused the same register of target for the result.
  1196  	c.locationStack.markRegisterUnused(target.register)
  1197  	c.pushRuntimeValueLocationOnRegister(target.register, target.valueType)
  1198  	return nil
  1199  }
  1200  
  1201  // compileCtz implements compiler.compileCtz for the amd64 architecture.
  1202  func (c *amd64Compiler) compileCtz(o *wazeroir.OperationCtz) error {
  1203  	target := c.locationStack.pop()
  1204  	if err := c.compileEnsureOnRegister(target); err != nil {
  1205  		return err
  1206  	}
  1207  
  1208  	if runtime.GOOS != "darwin" && runtime.GOOS != "freebsd" {
  1209  		if o.Type == wazeroir.UnsignedInt32 {
  1210  			c.assembler.CompileRegisterToRegister(amd64.TZCNTL, target.register, target.register)
  1211  		} else {
  1212  			c.assembler.CompileRegisterToRegister(amd64.TZCNTQ, target.register, target.register)
  1213  		}
  1214  	} else {
  1215  		// Somehow, if the target value is zero, TZCNT always returns zero here, which is wrong.
  1216  		// Therefore we need separate branches for the zero and non-zero cases on these platforms.
  1217  		// TODO: find the reference to this behavior and put the link here.
  1218  
  1219  		// First we compare the target with zero.
  1220  		c.assembler.CompileRegisterToConst(amd64.CMPQ, target.register, 0)
  1221  		jmpIfNonZero := c.assembler.CompileJump(amd64.JNE)
  1222  
  1223  		// If the value is zero, we just push the const value.
  1224  		if o.Type == wazeroir.UnsignedInt32 {
  1225  			c.assembler.CompileConstToRegister(amd64.MOVL, int64(32), target.register)
  1226  		} else {
  1227  			c.assembler.CompileConstToRegister(amd64.MOVL, int64(64), target.register)
  1228  		}
  1229  
  1230  		// Emit the jmp instruction to jump to the position right after
  1231  		// the non-zero case.
  1232  		jmpAtEndOfZero := c.assembler.CompileJump(amd64.JMP)
  1233  
  1234  		// Otherwise, emit the TZCNT.
  1235  		c.assembler.SetJumpTargetOnNext(jmpIfNonZero)
  1236  		if o.Type == wazeroir.UnsignedInt32 {
  1237  			c.assembler.CompileRegisterToRegister(amd64.TZCNTL, target.register, target.register)
  1238  		} else {
  1239  			c.assembler.CompileRegisterToRegister(amd64.TZCNTQ, target.register, target.register)
  1240  		}
  1241  
  1242  		// Finally the end jump instruction of zero case must target towards
  1243  		// the next instruction.
  1244  		c.assembler.SetJumpTargetOnNext(jmpAtEndOfZero)
  1245  	}
  1246  
  1247  	// We reused the same register of target for the result.
  1248  	c.locationStack.markRegisterUnused(target.register)
  1249  	c.pushRuntimeValueLocationOnRegister(target.register, target.valueType)
  1250  	return nil
  1251  }
  1252  
  1253  // compilePopcnt implements compiler.compilePopcnt for the amd64 architecture.
  1254  func (c *amd64Compiler) compilePopcnt(o *wazeroir.OperationPopcnt) error {
  1255  	target := c.locationStack.pop()
  1256  	if err := c.compileEnsureOnRegister(target); err != nil {
  1257  		return err
  1258  	}
  1259  
  1260  	if o.Type == wazeroir.UnsignedInt32 {
  1261  		c.assembler.CompileRegisterToRegister(amd64.POPCNTL, target.register, target.register)
  1262  	} else {
  1263  		c.assembler.CompileRegisterToRegister(amd64.POPCNTQ, target.register, target.register)
  1264  	}
  1265  
  1266  	// We reuse the target's register for the result.
  1267  	c.locationStack.markRegisterUnused(target.register)
  1268  	c.pushRuntimeValueLocationOnRegister(target.register, target.valueType)
  1269  	return nil
  1270  }
  1271  
  1272  // compileDiv implements compiler.compileDiv for the amd64 architecture.
  1273  func (c *amd64Compiler) compileDiv(o *wazeroir.OperationDiv) (err error) {
  1274  	switch o.Type {
  1275  	case wazeroir.SignedTypeUint32:
  1276  		err = c.compileDivForInts(true, false)
  1277  	case wazeroir.SignedTypeUint64:
  1278  		err = c.compileDivForInts(false, false)
  1279  	case wazeroir.SignedTypeInt32:
  1280  		err = c.compileDivForInts(true, true)
  1281  	case wazeroir.SignedTypeInt64:
  1282  		err = c.compileDivForInts(false, true)
  1283  	case wazeroir.SignedTypeFloat32:
  1284  		err = c.compileDivForFloats(true)
  1285  	case wazeroir.SignedTypeFloat64:
  1286  		err = c.compileDivForFloats(false)
  1287  	}
  1288  	return
  1289  }
  1290  
  1291  // compileDivForInts emits the instructions to perform division on the top
  1292  // two values of integer type on the stack and puts the quotient of the result
  1293  // onto the stack. For example, stack [..., 10, 3] results in [..., 3] where
  1294  // the remainder is discarded.
  1295  func (c *amd64Compiler) compileDivForInts(is32Bit bool, signed bool) error {
  1296  	if err := c.performDivisionOnInts(false, is32Bit, signed); err != nil {
  1297  		return err
  1298  	}
  1299  	// Now we have the quotient of the division result in the AX register,
  1300  	// so we record it.
  1301  	if is32Bit {
  1302  		c.pushRuntimeValueLocationOnRegister(amd64.RegAX, runtimeValueTypeI32)
  1303  	} else {
  1304  		c.pushRuntimeValueLocationOnRegister(amd64.RegAX, runtimeValueTypeI64)
  1305  	}
  1306  	return nil
  1307  }
  1308  
  1309  // compileRem implements compiler.compileRem for the amd64 architecture.
  1310  func (c *amd64Compiler) compileRem(o *wazeroir.OperationRem) (err error) {
  1311  	var vt runtimeValueType
  1312  	switch o.Type {
  1313  	case wazeroir.SignedInt32:
  1314  		err = c.performDivisionOnInts(true, true, true)
  1315  		vt = runtimeValueTypeI32
  1316  	case wazeroir.SignedInt64:
  1317  		err = c.performDivisionOnInts(true, false, true)
  1318  		vt = runtimeValueTypeI64
  1319  	case wazeroir.SignedUint32:
  1320  		err = c.performDivisionOnInts(true, true, false)
  1321  		vt = runtimeValueTypeI32
  1322  	case wazeroir.SignedUint64:
  1323  		err = c.performDivisionOnInts(true, false, false)
  1324  		vt = runtimeValueTypeI64
  1325  	}
  1326  	if err != nil {
  1327  		return err
  1328  	}
  1329  
  1330  	// Now we have the remainder of the division result in the DX register,
  1331  	// so we record it.
  1332  	c.pushRuntimeValueLocationOnRegister(amd64.RegDX, vt)
  1333  	return
  1334  }
  1335  
  1336  // performDivisionOnInts emits the instructions to do divisions on top two integers on the stack
  1337  // via DIV (unsigned div) and IDIV (signed div) instructions.
  1338  // See the following explanation of these instructions' semantics from https://www.lri.fr/~filliatr/ens/compil/x86-64.pdf
  1339  //
  1340  // >> Division requires special arrangements: idiv (signed) and div (unsigned) operate on a 2n-byte dividend and
  1341  // >> an n-byte divisor to produce an n-byte quotient and n-byte remainder. The dividend always lives in a fixed pair of
  1342  // >> registers (%edx and %eax for the 32-bit case; %rdx and %rax for the 64-bit case); the divisor is specified as the
  1343  // >> source operand in the instruction. The quotient goes in %eax (resp. %rax); the remainder in %edx (resp. %rdx). For
  1344  // >> signed division, the cltd (resp. ctqo) instruction is used to prepare %edx (resp. %rdx) with the sign extension of
  1345  // >> %eax (resp. %rax). For example, if a,b, c are memory locations holding quad words, then we could set c = a/b
  1346  // >> using the sequence: movq a(%rip), %rax; ctqo; idivq b(%rip); movq %rax, c(%rip).
  1347  //
  1348  // tl;dr is that the division result is placed in AX and DX registers after instructions emitted by this function
  1349  // where AX holds the quotient while DX the remainder of the division result.
  1350  func (c *amd64Compiler) performDivisionOnInts(isRem, is32Bit, signed bool) error {
  1351  	const (
  1352  		quotientRegister  = amd64.RegAX
  1353  		remainderRegister = amd64.RegDX
  1354  	)
  1355  
  1356  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  1357  		return err
  1358  	}
  1359  
  1360  	// Ensures that previous values on these registers are saved to memory.
  1361  	c.onValueReleaseRegisterToStack(quotientRegister)
  1362  	c.onValueReleaseRegisterToStack(remainderRegister)
  1363  
  1364  	// In order to ensure x2 is placed on a temporary register other than AX and DX,
  1365  	// we mark both of them as used here.
  1366  	c.locationStack.markRegisterUsed(quotientRegister)
  1367  	c.locationStack.markRegisterUsed(remainderRegister)
  1368  
  1369  	// Ensure that x2 is placed on a register which is not either AX or DX.
  1370  	x2 := c.locationStack.pop()
  1371  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1372  		return err
  1373  	}
  1374  
  1375  	// Now we successfully place x2 on a temp register, so we no longer need to
  1376  	// mark these registers used.
  1377  	c.locationStack.markRegisterUnused(quotientRegister)
  1378  	c.locationStack.markRegisterUnused(remainderRegister)
  1379  
  1380  	// Check if the x2 equals zero.
  1381  	if is32Bit {
  1382  		c.assembler.CompileRegisterToConst(amd64.CMPL, x2.register, 0)
  1383  	} else {
  1384  		c.assembler.CompileRegisterToConst(amd64.CMPQ, x2.register, 0)
  1385  	}
  1386  
  1387  	// Jump if the divisor is not zero.
  1388  	jmpIfNotZero := c.assembler.CompileJump(amd64.JNE)
  1389  
  1390  	// Otherwise, we return with nativeCallStatusIntegerDivisionByZero status.
  1391  	c.compileExitFromNativeCode(nativeCallStatusIntegerDivisionByZero)
  1392  
  1393  	c.assembler.SetJumpTargetOnNext(jmpIfNotZero)
  1394  
  1395  	// Next, we ensure that x1 is placed on AX.
  1396  	x1 := c.locationStack.pop()
  1397  	if x1.onRegister() && x1.register != quotientRegister {
  1398  		// Move x1 to quotientRegister.
  1399  		if is32Bit {
  1400  			c.assembler.CompileRegisterToRegister(amd64.MOVL, x1.register, quotientRegister)
  1401  		} else {
  1402  			c.assembler.CompileRegisterToRegister(amd64.MOVQ, x1.register, quotientRegister)
  1403  		}
  1404  		c.locationStack.markRegisterUnused(x1.register)
  1405  		x1.setRegister(quotientRegister)
  1406  	} else if x1.onStack() {
  1407  		x1.setRegister(quotientRegister)
  1408  		c.compileLoadValueOnStackToRegister(x1)
  1409  	}
  1410  
  1411  	// Note: at this point, x1 is placed on AX, x2 is on a register which is not AX or DX.
  1412  
  1413  	isSignedRem := isRem && signed
  1414  	isSignedDiv := !isRem && signed
  1415  	var signedRemMinusOneDivisorJmp asm.Node
  1416  	if isSignedRem {
  1417  		// If this is for getting the remainder of a signed division,
  1418  		// we have to treat the special case where the divisor equals -1.
  1419  		// For example, in the 32-bit case, (-2^31) / -1 has quotient 2^31 (and remainder 0),
  1420  		// and that quotient doesn't fit in the 32-bit signed range whose maximum is 2^31-1.
  1421  		// x86 raises the divide-error exception (observed as SIGFPE) in this case, but according to the Wasm spec
  1422  		// the remainder for a divisor of -1 must be zero (not a trap!), unlike the signed division
  1423  		// (-2^31) / -1 which must trap, so we branch around the IDIV instruction and return zero instead.
  1424  		// For detail, please refer to https://stackoverflow.com/questions/56303282/why-idiv-with-1-causes-floating-point-exception
  1425  
  1426  		// First we compare the division with -1.
  1427  		if is32Bit {
  1428  			c.assembler.CompileRegisterToConst(amd64.CMPL, x2.register, -1)
  1429  		} else {
  1430  			c.assembler.CompileRegisterToConst(amd64.CMPQ, x2.register, -1)
  1431  		}
  1432  
  1433  		// If it doesn't equal minus one, we jump to the normal case.
  1434  		okJmp := c.assembler.CompileJump(amd64.JNE)
  1435  
  1436  		// Otherwise, we store zero into the remainder result register (DX).
  1437  		if is32Bit {
  1438  			c.assembler.CompileRegisterToRegister(amd64.XORL, remainderRegister, remainderRegister)
  1439  		} else {
  1440  			c.assembler.CompileRegisterToRegister(amd64.XORQ, remainderRegister, remainderRegister)
  1441  		}
  1442  
  1443  		// Emit the exit jump instruction for the divisor -1 case so
  1444  		// we skip the normal case.
  1445  		signedRemMinusOneDivisorJmp = c.assembler.CompileJump(amd64.JMP)
  1446  
  1447  		// Set the normal case's jump target.
  1448  		c.assembler.SetJumpTargetOnNext(okJmp)
  1449  	} else if isSignedDiv {
  1450  		// For signed division, we have to add branches for the "math.MinInt{32,64} / -1"
  1451  		// case, which raises the hardware divide-error exception (observed as SIGFPE) because
  1452  		// the resulting value exceeds the maximum of the signed int.
  1453  
  1454  		// First we compare the division with -1.
  1455  		if is32Bit {
  1456  			c.assembler.CompileRegisterToConst(amd64.CMPL, x2.register, -1)
  1457  		} else {
  1458  			c.assembler.CompileRegisterToConst(amd64.CMPQ, x2.register, -1)
  1459  		}
  1460  
  1461  		// If it doesn't equal minus one, we jump to the normal case.
  1462  		nonMinusOneDivisorJmp := c.assembler.CompileJump(amd64.JNE)
  1463  
  1464  		// Next we check if the dividend is the most negative value for the signed integer,
  1465  		// i.e. whether we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1) respectively.
  1466  		if is32Bit {
  1467  			if err := c.assembler.CompileRegisterToStaticConst(amd64.CMPL, x1.register,
  1468  				asm.NewStaticConst(u32.LeBytes(uint32(minimum32BitSignedInt)))); err != nil {
  1469  				return err
  1470  			}
  1471  		} else {
  1472  			if err := c.assembler.CompileRegisterToStaticConst(amd64.CMPQ, x1.register,
  1473  				asm.NewStaticConst(u64.LeBytes(uint64(minimum64BitSignedInt)))); err != nil {
  1474  				return err
  1475  			}
  1476  		}
  1477  
  1478  		// If it doesn't equal the minimum, we jump to the normal case.
  1479  		jmpOK := c.assembler.CompileJump(amd64.JNE)
  1480  
  1481  		// Otherwise, we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1),
  1482  		// and that is an integer overflow in division as the result becomes 2^31 (or 2^63), which is larger than
  1483  		// the maximum of the signed 32-bit (or 64-bit) int.
  1484  		c.compileExitFromNativeCode(nativeCallStatusIntegerOverflow)
  1485  
  1486  		// Set the normal case's jump target.
  1487  		c.assembler.SetJumpTargetOnNext(nonMinusOneDivisorJmp, jmpOK)
  1488  	}
  1489  
  1490  	// Now ready to emit the div instruction.
  1491  	// Since the div instruction takes a 2n-byte dividend placed in the DX:AX registers...
  1492  	// * signed case - we need to sign-extend the dividend into DX register via CDQ (32 bit) or CQO (64 bit).
  1493  	// * unsigned case - we need to zero DX register via "XOR DX DX"
  1494  	if is32Bit && signed {
  1495  		// Emit sign-extension to have 64 bit dividend over DX and AX registers.
  1496  		c.assembler.CompileStandAlone(amd64.CDQ)
  1497  		c.assembler.CompileRegisterToNone(amd64.IDIVL, x2.register)
  1498  	} else if is32Bit && !signed {
  1499  		// Zeros DX register to have 64 bit dividend over DX and AX registers.
  1500  		c.assembler.CompileRegisterToRegister(amd64.XORQ, amd64.RegDX, amd64.RegDX)
  1501  		c.assembler.CompileRegisterToNone(amd64.DIVL, x2.register)
  1502  	} else if !is32Bit && signed {
  1503  		// Emits sign-extension to have 128 bit dividend over DX and AX registers.
  1504  		c.assembler.CompileStandAlone(amd64.CQO)
  1505  		c.assembler.CompileRegisterToNone(amd64.IDIVQ, x2.register)
  1506  	} else if !is32Bit && !signed {
  1507  		// Zeros DX register to have 128 bit dividend over DX and AX registers.
  1508  		c.assembler.CompileRegisterToRegister(amd64.XORQ, amd64.RegDX, amd64.RegDX)
  1509  		c.assembler.CompileRegisterToNone(amd64.DIVQ, x2.register)
  1510  	}
  1511  
  1512  	// If this is a signed rem instruction, we must set the jump target of
  1513  	// the exit jump from the divisor == -1 case to the next instruction.
  1514  	if signedRemMinusOneDivisorJmp != nil {
  1515  		c.assembler.SetJumpTargetOnNext(signedRemMinusOneDivisorJmp)
  1516  	}
  1517  
  1518  	// We mark them as unused so that we can push one of them onto the location stack at call sites.
  1519  	c.locationStack.markRegisterUnused(remainderRegister)
  1520  	c.locationStack.markRegisterUnused(quotientRegister)
  1521  	c.locationStack.markRegisterUnused(x2.register)
  1522  	return nil
  1523  }
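
// Illustrative sketch (not part of the emitted code): the edge cases handled
// above correspond to the following plain-Go model of i32.div_s / i32.rem_s.
// The helper names wasmI32DivS and wasmI32RemS are illustrative assumptions.
//
//	func wasmI32DivS(x1, x2 int32) int32 {
//		if x2 == 0 {
//			panic("integer division by zero") // nativeCallStatusIntegerDivisionByZero
//		}
//		if x1 == math.MinInt32 && x2 == -1 {
//			panic("integer overflow") // nativeCallStatusIntegerOverflow
//		}
//		return x1 / x2 // quotient ends up in AX
//	}
//
//	func wasmI32RemS(x1, x2 int32) int32 {
//		if x2 == 0 {
//			panic("integer division by zero")
//		}
//		if x2 == -1 {
//			return 0 // defined as zero by the spec, no trap
//		}
//		return x1 % x2 // remainder ends up in DX
//	}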
  1524  
  1525  // compileDivForFloats emits the instructions to perform division
  1526  // on the top two values of float type on the stack, placing the result back onto the stack.
  1527  // For example, stack [..., 1.0, 4.0] results in [..., 0.25].
  1528  func (c *amd64Compiler) compileDivForFloats(is32Bit bool) error {
  1529  	if is32Bit {
  1530  		return c.compileSimpleBinaryOp(amd64.DIVSS)
  1531  	} else {
  1532  		return c.compileSimpleBinaryOp(amd64.DIVSD)
  1533  	}
  1534  }
  1535  
  1536  // compileAnd implements compiler.compileAnd for the amd64 architecture.
  1537  func (c *amd64Compiler) compileAnd(o *wazeroir.OperationAnd) (err error) {
  1538  	switch o.Type {
  1539  	case wazeroir.UnsignedInt32:
  1540  		err = c.compileSimpleBinaryOp(amd64.ANDL)
  1541  	case wazeroir.UnsignedInt64:
  1542  		err = c.compileSimpleBinaryOp(amd64.ANDQ)
  1543  	}
  1544  	return
  1545  }
  1546  
  1547  // compileOr implements compiler.compileOr for the amd64 architecture.
  1548  func (c *amd64Compiler) compileOr(o *wazeroir.OperationOr) (err error) {
  1549  	switch o.Type {
  1550  	case wazeroir.UnsignedInt32:
  1551  		err = c.compileSimpleBinaryOp(amd64.ORL)
  1552  	case wazeroir.UnsignedInt64:
  1553  		err = c.compileSimpleBinaryOp(amd64.ORQ)
  1554  	}
  1555  	return
  1556  }
  1557  
  1558  // compileXor implements compiler.compileXor for the amd64 architecture.
  1559  func (c *amd64Compiler) compileXor(o *wazeroir.OperationXor) (err error) {
  1560  	switch o.Type {
  1561  	case wazeroir.UnsignedInt32:
  1562  		err = c.compileSimpleBinaryOp(amd64.XORL)
  1563  	case wazeroir.UnsignedInt64:
  1564  		err = c.compileSimpleBinaryOp(amd64.XORQ)
  1565  	}
  1566  	return
  1567  }
  1568  
  1569  // compileSimpleBinaryOp emits instructions to pop two values from the stack
  1570  // and perform the given instruction on these two values and push the result
  1571  // onto the stack.
  1572  func (c *amd64Compiler) compileSimpleBinaryOp(instruction asm.Instruction) error {
  1573  	x2 := c.locationStack.pop()
  1574  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1575  		return err
  1576  	}
  1577  
  1578  	x1 := c.locationStack.pop()
  1579  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1580  		return err
  1581  	}
  1582  
  1583  	c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)
  1584  
  1585  	// We consumed x2 register after the operation here,
  1586  	// so we release it.
  1587  	c.locationStack.releaseRegister(x2)
  1588  
  1589  	// We already stored the result in the register used by x1
  1590  	// so we record it.
  1591  	c.locationStack.markRegisterUnused(x1.register)
  1592  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1593  	return nil
  1594  }
  1595  
  1596  // compileShl implements compiler.compileShl for the amd64 architecture.
  1597  func (c *amd64Compiler) compileShl(o *wazeroir.OperationShl) (err error) {
  1598  	switch o.Type {
  1599  	case wazeroir.UnsignedInt32:
  1600  		err = c.compileShiftOp(amd64.SHLL, false)
  1601  	case wazeroir.UnsignedInt64:
  1602  		err = c.compileShiftOp(amd64.SHLQ, true)
  1603  	}
  1604  	return
  1605  }
  1606  
  1607  // compileShr implements compiler.compileShr for the amd64 architecture.
  1608  func (c *amd64Compiler) compileShr(o *wazeroir.OperationShr) (err error) {
  1609  	switch o.Type {
  1610  	case wazeroir.SignedInt32:
  1611  		err = c.compileShiftOp(amd64.SARL, true)
  1612  	case wazeroir.SignedInt64:
  1613  		err = c.compileShiftOp(amd64.SARQ, false)
  1614  	case wazeroir.SignedUint32:
  1615  		err = c.compileShiftOp(amd64.SHRL, true)
  1616  	case wazeroir.SignedUint64:
  1617  		err = c.compileShiftOp(amd64.SHRQ, false)
  1618  	}
  1619  	return
  1620  }
  1621  
  1622  // compileRotl implements compiler.compileRotl for the amd64 architecture.
  1623  func (c *amd64Compiler) compileRotl(o *wazeroir.OperationRotl) (err error) {
  1624  	switch o.Type {
  1625  	case wazeroir.UnsignedInt32:
  1626  		err = c.compileShiftOp(amd64.ROLL, true)
  1627  	case wazeroir.UnsignedInt64:
  1628  		err = c.compileShiftOp(amd64.ROLQ, false)
  1629  	}
  1630  	return
  1631  }
  1632  
  1633  // compileRotr implements compiler.compileRotr for the amd64 architecture.
  1634  func (c *amd64Compiler) compileRotr(o *wazeroir.OperationRotr) (err error) {
  1635  	switch o.Type {
  1636  	case wazeroir.UnsignedInt32:
  1637  		err = c.compileShiftOp(amd64.RORL, true)
  1638  	case wazeroir.UnsignedInt64:
  1639  		err = c.compileShiftOp(amd64.RORQ, false)
  1640  	}
  1641  	return
  1642  }
  1643  
  1644  // compileShiftOp adds instructions for shift operations (SHR, SHL, ROTR, ROTL)
  1645  // where we have to place the second value (shift counts) on the CX register.
  1646  func (c *amd64Compiler) compileShiftOp(instruction asm.Instruction, is32Bit bool) error {
  1647  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  1648  		return err
  1649  	}
  1650  
  1651  	x2 := c.locationStack.pop()
  1652  
  1653  	// Ensures that x2 (holding shift counts) is placed on the CX register.
  1654  	const shiftCountRegister = amd64.RegCX
  1655  	if (x2.onRegister() && x2.register != shiftCountRegister) || x2.onStack() {
  1656  		// If another value lives on the CX register, we release it to the stack.
  1657  		c.onValueReleaseRegisterToStack(shiftCountRegister)
  1658  
  1659  		if x2.onRegister() {
  1660  			// If x2 lives on a register, we move the value to CX.
  1661  			if is32Bit {
  1662  				c.assembler.CompileRegisterToRegister(amd64.MOVL, x2.register, shiftCountRegister)
  1663  			} else {
  1664  				c.assembler.CompileRegisterToRegister(amd64.MOVQ, x2.register, shiftCountRegister)
  1665  			}
  1666  			// We no longer place any value on the original register, so we record it.
  1667  			c.locationStack.markRegisterUnused(x2.register)
  1668  			// Instead, we've already placed the value on the CX register.
  1669  			x2.setRegister(shiftCountRegister)
  1670  		} else {
  1671  			// If it is on stack, we just move the memory allocated value to the CX register.
  1672  			x2.setRegister(shiftCountRegister)
  1673  			c.compileLoadValueOnStackToRegister(x2)
  1674  		}
  1675  		c.locationStack.markRegisterUsed(shiftCountRegister)
  1676  	}
  1677  
  1678  	x1 := c.locationStack.peek() // Note this is peek!
  1679  
  1680  	if x1.onRegister() {
  1681  		c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)
  1682  	} else {
  1683  		// Shift target can be placed on a memory location.
  1684  		// Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range.
  1685  		c.assembler.CompileRegisterToMemory(instruction, x2.register, amd64ReservedRegisterForStackBasePointerAddress, int64(x1.stackPointer)*8)
  1686  	}
  1687  
  1688  	// We consumed x2 register after the operation here,
  1689  	// so we release it.
  1690  	c.locationStack.releaseRegister(x2)
  1691  	return nil
  1692  }
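
// Illustrative sketch (not part of the emitted code): Wasm shifts take the
// shift count modulo the bit width, which matches what the hardware does when
// the count is in CL (it masks the count to 5 or 6 bits). In plain Go that is
// roughly the following (the helper names are illustrative assumptions):
//
//	func wasmI32Shl(x1, x2 uint32) uint32  { return x1 << (x2 % 32) }
//	func wasmI32ShrU(x1, x2 uint32) uint32 { return x1 >> (x2 % 32) }
//	func wasmI32ShrS(x1 int32, x2 uint32) int32 { return x1 >> (x2 % 32) }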
  1693  
  1694  // compileAbs implements compiler.compileAbs for the amd64 architecture.
  1695  //
  1696  // See the following discussions for how we could take the abs of floats on x86 assembly.
  1697  // https://stackoverflow.com/questions/32408665/fastest-way-to-compute-absolute-value-using-sse/32422471#32422471
  1698  // https://stackoverflow.com/questions/44630015/how-would-fabsdouble-be-implemented-on-x86-is-it-an-expensive-operation
  1699  func (c *amd64Compiler) compileAbs(o *wazeroir.OperationAbs) (err error) {
  1700  	target := c.locationStack.peek() // Note this is peek!
  1701  	if err = c.compileEnsureOnRegister(target); err != nil {
  1702  		return err
  1703  	}
  1704  
  1705  	// Shift left by one and then logically shift right by one, which clears only the sign bit.
  1706  	if o.Type == wazeroir.Float32 {
  1707  		c.assembler.CompileConstToRegister(amd64.PSLLD, 1, target.register)
  1708  		c.assembler.CompileConstToRegister(amd64.PSRLD, 1, target.register)
  1709  	} else {
  1710  		c.assembler.CompileConstToRegister(amd64.PSLLQ, 1, target.register)
  1711  		c.assembler.CompileConstToRegister(amd64.PSRLQ, 1, target.register)
  1712  	}
  1713  	return nil
  1714  }
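
// Illustrative sketch (not part of the emitted code): the effect of the
// PSLL/PSRL pair above is simply to clear the sign bit, i.e. roughly the
// plain-Go equivalent below (the helper name wasmF32Abs is an illustrative
// assumption):
//
//	func wasmF32Abs(v float32) float32 {
//		bits := math.Float32bits(v)
//		return math.Float32frombits(bits &^ (1 << 31)) // clear only the sign bit
//	}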
  1715  
  1716  // compileNeg implements compiler.compileNeg for the amd64 architecture.
  1717  func (c *amd64Compiler) compileNeg(o *wazeroir.OperationNeg) (err error) {
  1718  	target := c.locationStack.peek() // Note this is peek!
  1719  	if err := c.compileEnsureOnRegister(target); err != nil {
  1720  		return err
  1721  	}
  1722  
  1723  	tmpReg, err := c.allocateRegister(registerTypeVector)
  1724  	if err != nil {
  1725  		return err
  1726  	}
  1727  
  1728  	// First we move the sign-bit mask (placed in memory) to the tmp register,
  1729  	// since we cannot XOR a float register with a constant directly.
  1730  	// Then we negate the value by XORing it with the sign-bit mask.
  1731  	if o.Type == wazeroir.Float32 {
  1732  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, asm.NewStaticConst(u32.LeBytes(float32SignBitMask)), tmpReg)
  1733  		if err != nil {
  1734  			return err
  1735  		}
  1736  		c.assembler.CompileRegisterToRegister(amd64.XORPS, tmpReg, target.register)
  1737  	} else {
  1738  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, asm.NewStaticConst(u64.LeBytes(float64SignBitMask)), tmpReg)
  1739  		if err != nil {
  1740  			return err
  1741  		}
  1742  		c.assembler.CompileRegisterToRegister(amd64.XORPD, tmpReg, target.register)
  1743  	}
  1744  	return nil
  1745  }
  1746  
  1747  // compileCeil implements compiler.compileCeil for the amd64 architecture.
  1748  func (c *amd64Compiler) compileCeil(o *wazeroir.OperationCeil) (err error) {
  1749  	// Internally, ceil can be performed via ROUND instruction with 0x02 mode.
  1750  	// See https://android.googlesource.com/platform/bionic/+/882b8af/libm/x86_64/ceilf.S for example.
  1751  	return c.compileRoundInstruction(o.Type == wazeroir.Float32, 0x02)
  1752  }
  1753  
  1754  // compileFloor implements compiler.compileFloor for the amd64 architecture.
  1755  func (c *amd64Compiler) compileFloor(o *wazeroir.OperationFloor) (err error) {
  1756  	// Internally, floor can be performed via ROUND instruction with 0x01 mode.
  1757  	// See https://android.googlesource.com/platform/bionic/+/882b8af/libm/x86_64/floorf.S for example.
  1758  	return c.compileRoundInstruction(o.Type == wazeroir.Float32, 0x01)
  1759  }
  1760  
  1761  // compileTrunc implements compiler.compileTrunc for the amd64 architecture.
  1762  func (c *amd64Compiler) compileTrunc(o *wazeroir.OperationTrunc) error {
  1763  	// Internally, trunc can be performed via ROUND instruction with 0x03 mode.
  1764  	// See https://android.googlesource.com/platform/bionic/+/882b8af/libm/x86_64/truncf.S for example.
  1765  	return c.compileRoundInstruction(o.Type == wazeroir.Float32, 0x03)
  1766  }
  1767  
  1768  // compileNearest implements compiler.compileNearest for the amd64 architecture.
  1769  func (c *amd64Compiler) compileNearest(o *wazeroir.OperationNearest) error {
  1770  	// Nearest can be performed via ROUND instruction with 0x00 mode.
  1771  	return c.compileRoundInstruction(o.Type == wazeroir.Float32, 0x00)
  1772  }
  1773  
  1774  func (c *amd64Compiler) compileRoundInstruction(is32Bit bool, mode int64) error {
  1775  	target := c.locationStack.peek() // Note this is peek!
  1776  	if err := c.compileEnsureOnRegister(target); err != nil {
  1777  		return err
  1778  	}
  1779  
  1780  	if is32Bit {
  1781  		c.assembler.CompileRegisterToRegisterWithArg(amd64.ROUNDSS, target.register, target.register, byte(mode))
  1782  	} else {
  1783  		c.assembler.CompileRegisterToRegisterWithArg(amd64.ROUNDSD, target.register, target.register, byte(mode))
  1784  	}
  1785  	return nil
  1786  }
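
// Illustrative sketch (not part of the emitted code): the ROUNDSS/ROUNDSD
// immediate values used above correspond to the following plain-Go rounding
// functions (mode 0x00 rounds to nearest, ties to even):
//
//	var roundModeEquivalents = map[byte]func(float64) float64{
//		0x00: math.RoundToEven, // compileNearest
//		0x01: math.Floor,       // compileFloor
//		0x02: math.Ceil,        // compileCeil
//		0x03: math.Trunc,       // compileTrunc
//	}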
  1787  
  1788  // compileMin implements compiler.compileMin for the amd64 architecture.
  1789  func (c *amd64Compiler) compileMin(o *wazeroir.OperationMin) error {
  1790  	is32Bit := o.Type == wazeroir.Float32
  1791  	if is32Bit {
  1792  		return c.compileMinOrMax(is32Bit, true, amd64.MINSS)
  1793  	} else {
  1794  		return c.compileMinOrMax(is32Bit, true, amd64.MINSD)
  1795  	}
  1796  }
  1797  
  1798  // compileMax implements compiler.compileMax for the amd64 architecture.
  1799  func (c *amd64Compiler) compileMax(o *wazeroir.OperationMax) error {
  1800  	is32Bit := o.Type == wazeroir.Float32
  1801  	if is32Bit {
  1802  		return c.compileMinOrMax(is32Bit, false, amd64.MAXSS)
  1803  	} else {
  1804  		return c.compileMinOrMax(is32Bit, false, amd64.MAXSD)
  1805  	}
  1806  }
  1807  
  1808  // compileMinOrMax adds instructions to pop two values from the stack, and push back either the minimum or
  1809  // the maximum of these two values onto the stack according to the minOrMaxInstruction argument.
  1810  // minOrMaxInstruction must be one of MAXSS, MAXSD, MINSS or MINSD.
  1811  // Note: these native min/max instructions are almost compatible with min/max in the Wasm specification,
  1812  // but they differ slightly with respect to NaN handling.
  1813  // If either operand is NaN, the native instructions simply return the second source operand,
  1814  // so, for example, MINSS(NaN, 5.0) yields 5.0 rather than NaN.
  1815  // However, WebAssembly specifies that min/max must always return NaN if either value is NaN.
  1816  // Therefore, in this function, we have to add conditional jumps to check if one of the values is NaN before
  1817  // the native min/max, which is why we cannot simply emit a native min/max instruction here.
  1818  //
  1819  // For the semantics, see wazeroir.Min and wazeroir.Max for detail.
  1820  func (c *amd64Compiler) compileMinOrMax(is32Bit, isMin bool, minOrMaxInstruction asm.Instruction) error {
  1821  	x2 := c.locationStack.pop()
  1822  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1823  		return err
  1824  	}
  1825  	x1 := c.locationStack.pop()
  1826  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1827  		return err
  1828  	}
  1829  
  1830  	// Check if this is (either x1 or x2 is NaN) or (x1 equals x2) case
  1831  	if is32Bit {
  1832  		c.assembler.CompileRegisterToRegister(amd64.UCOMISS, x2.register, x1.register)
  1833  	} else {
  1834  		c.assembler.CompileRegisterToRegister(amd64.UCOMISD, x2.register, x1.register)
  1835  	}
  1836  
  1837  	// At this point, we have the three cases of conditional flags below
  1838  	// (See https://www.felixcloutier.com/x86/ucomiss#operation for detail.)
  1839  	//
  1840  	// 1) The two values are NaN-free and different: ZF is cleared.
  1841  	// 2) The two values are NaN-free and equal: only the ZF flag is set.
  1842  	// 3) One of the two values is NaN: the ZF, PF and CF flags are all set.
  1843  
  1844  	// Jump to handle case 1) by checking the ZF flag,
  1845  	// as ZF is set only in cases 2) and 3).
  1846  	nanFreeOrDiffJump := c.assembler.CompileJump(amd64.JNE)
  1847  
  1848  	// Start handling 2) and 3).
  1849  
  1850  	// Jump if one of two values is NaN by checking the parity flag (PF).
  1851  	includeNaNJmp := c.assembler.CompileJump(amd64.JPS)
  1852  
  1853  	// Start handling 2).
  1854  
  1855  	// Before we exit this case, we have to ensure that positive zero (or negative zero for the min instruction) is
  1856  	// returned when the two values are positive and negative zeros.
  1857  	var inst asm.Instruction
  1858  	switch {
  1859  	case is32Bit && isMin:
  1860  		inst = amd64.ORPS
  1861  	case !is32Bit && isMin:
  1862  		inst = amd64.ORPD
  1863  	case is32Bit && !isMin:
  1864  		inst = amd64.ANDPS
  1865  	case !is32Bit && !isMin:
  1866  		inst = amd64.ANDPD
  1867  	}
  1868  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1869  
  1870  	sameExitJmp := c.assembler.CompileJump(amd64.JMP)
  1871  
  1872  	// Start handling 3).
  1873  	c.assembler.SetJumpTargetOnNext(includeNaNJmp)
  1874  
  1875  	// We emit the ADD instruction to produce the NaN in x1.
  1876  	if is32Bit {
  1877  		c.assembler.CompileRegisterToRegister(amd64.ADDSS, x2.register, x1.register)
  1878  	} else {
  1879  		c.assembler.CompileRegisterToRegister(amd64.ADDSD, x2.register, x1.register)
  1880  	}
  1881  
  1882  	// Exit from the NaN case branch.
  1883  	nanExitJmp := c.assembler.CompileJump(amd64.JMP)
  1884  
  1885  	// Start handling 1).
  1886  	c.assembler.SetJumpTargetOnNext(nanFreeOrDiffJump)
  1887  
  1888  	// Now handle the NaN-free and different values case.
  1889  	c.assembler.CompileRegisterToRegister(minOrMaxInstruction, x2.register, x1.register)
  1890  
  1891  	// Set the jump target of 1) and 2) cases to the next instruction after 3) case.
  1892  	c.assembler.SetJumpTargetOnNext(nanExitJmp, sameExitJmp)
  1893  
  1894  	// Record that we consumed the x2 and placed the minOrMax result in the x1's register.
  1895  	c.locationStack.markRegisterUnused(x2.register)
  1896  	c.locationStack.markRegisterUnused(x1.register)
  1897  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1898  	return nil
  1899  }
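
// Illustrative sketch (not part of the emitted code): the Wasm f64.min
// semantics implemented above (NaN propagation and the -0 vs +0 rule) could be
// written in plain Go roughly as below. The helper name wasmF64Min is an
// illustrative assumption.
//
//	func wasmF64Min(x1, x2 float64) float64 {
//		if math.IsNaN(x1) || math.IsNaN(x2) {
//			return math.NaN() // case 3): NaN always propagates
//		}
//		if x1 == x2 {
//			// case 2): in particular +0 == -0, where min must return -0.
//			if x1 == 0 && (math.Signbit(x1) || math.Signbit(x2)) {
//				return math.Copysign(0, -1)
//			}
//			return x1
//		}
//		if x1 < x2 { // case 1): NaN-free and different
//			return x1
//		}
//		return x2
//	}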
  1900  
  1901  // compileCopysign implements compiler.compileCopysign for the amd64 architecture.
  1902  func (c *amd64Compiler) compileCopysign(o *wazeroir.OperationCopysign) error {
  1903  	is32Bit := o.Type == wazeroir.Float32
  1904  
  1905  	x2 := c.locationStack.pop()
  1906  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1907  		return err
  1908  	}
  1909  	x1 := c.locationStack.pop()
  1910  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1911  		return err
  1912  	}
  1913  	tmpReg, err := c.allocateRegister(registerTypeVector)
  1914  	if err != nil {
  1915  		return err
  1916  	}
  1917  
  1918  	// Move the rest bit mask to the temp register.
  1919  	if is32Bit {
  1920  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, asm.NewStaticConst(u32.LeBytes(float32RestBitMask)), tmpReg)
  1921  	} else {
  1922  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, asm.NewStaticConst(u64.LeBytes(float64RestBitMask)), tmpReg)
  1923  	}
  1924  	if err != nil {
  1925  		return err
  1926  	}
  1927  
  1928  	// Clear the sign bit of x1 via AND with the mask.
  1929  	if is32Bit {
  1930  		c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmpReg, x1.register)
  1931  	} else {
  1932  		c.assembler.CompileRegisterToRegister(amd64.ANDPD, tmpReg, x1.register)
  1933  	}
  1934  
  1935  	// Move the sign bit mask to the temp register.
  1936  	if is32Bit {
  1937  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, asm.NewStaticConst(u32.LeBytes(float32SignBitMask)), tmpReg)
  1938  	} else {
  1939  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, asm.NewStaticConst(u64.LeBytes(float64SignBitMask)), tmpReg)
  1940  	}
  1941  	if err != nil {
  1942  		return err
  1943  	}
  1944  
  1945  	// Clear the non-sign bits of x2 via AND with the mask.
  1946  	if is32Bit {
  1947  		c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmpReg, x2.register)
  1948  	} else {
  1949  		c.assembler.CompileRegisterToRegister(amd64.ANDPD, tmpReg, x2.register)
  1950  	}
  1951  
  1952  	// Finally, copy the sign bit of x2 to x1.
  1953  	if is32Bit {
  1954  		c.assembler.CompileRegisterToRegister(amd64.ORPS, x2.register, x1.register)
  1955  	} else {
  1956  		c.assembler.CompileRegisterToRegister(amd64.ORPD, x2.register, x1.register)
  1957  	}
  1958  
  1959  	// Record that we consumed the x2 and placed the copysign result in the x1's register.
  1960  	c.locationStack.markRegisterUnused(x2.register)
  1961  	c.locationStack.markRegisterUnused(x1.register)
  1962  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1963  	return nil
  1964  }
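
// Illustrative sketch (not part of the emitted code): the mask-and-combine
// sequence above implements the usual copysign bit trick, i.e. roughly the
// plain-Go version below (the helper name wasmF64Copysign is an illustrative
// assumption; the standard library's math.Copysign does the same thing):
//
//	func wasmF64Copysign(x1, x2 float64) float64 {
//		const signMask = uint64(1) << 63
//		b1 := math.Float64bits(x1) &^ signMask // clear the sign bit of x1
//		b2 := math.Float64bits(x2) & signMask  // keep only the sign bit of x2
//		return math.Float64frombits(b1 | b2)   // combine
//	}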
  1965  
  1966  // compileSqrt implements compiler.compileSqrt for the amd64 architecture.
  1967  func (c *amd64Compiler) compileSqrt(o *wazeroir.OperationSqrt) error {
  1968  	target := c.locationStack.peek() // Note this is peek!
  1969  	if err := c.compileEnsureOnRegister(target); err != nil {
  1970  		return err
  1971  	}
  1972  	if o.Type == wazeroir.Float32 {
  1973  		c.assembler.CompileRegisterToRegister(amd64.SQRTSS, target.register, target.register)
  1974  	} else {
  1975  		c.assembler.CompileRegisterToRegister(amd64.SQRTSD, target.register, target.register)
  1976  	}
  1977  	return nil
  1978  }
  1979  
  1980  // compileI32WrapFromI64 implements compiler.compileI32WrapFromI64 for the amd64 architecture.
  1981  func (c *amd64Compiler) compileI32WrapFromI64() error {
  1982  	target := c.locationStack.peek() // Note this is peek!
  1983  	if err := c.compileEnsureOnRegister(target); err != nil {
  1984  		return err
  1985  	}
  1986  	c.assembler.CompileRegisterToRegister(amd64.MOVL, target.register, target.register)
  1987  	target.valueType = runtimeValueTypeI32
  1988  	return nil
  1989  }
  1990  
  1991  // compileITruncFromF implements compiler.compileITruncFromF for the amd64 architecture.
  1992  //
  1993  // Note: in the following implementation, we use CVTSS2SI and CVTSD2SI to convert floats to signed integers.
  1994  // According to the Intel manual ([1],[2]), if the source float value is either +-Inf or NaN, or it exceeds the representable range
  1995  // of the target signed integer, then the instruction returns the "integer indefinite" value float32SignBitMask (or float64SignBitMask for the 64-bit case).
  1996  // [1] Chapter 11.5.2, SIMD Floating-Point Exception Conditions in "Vol 1, Intel® 64 and IA-32 Architectures Manual"
  1997  //
  1998  //	https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-1-manual.html
  1999  //
  2000  // [2] https://xem.github.io/minix86/manual/intel-x86-and-64-manual-vol1/o_7281d5ea06a5b67a-268.html
  2001  func (c *amd64Compiler) compileITruncFromF(o *wazeroir.OperationITruncFromF) (err error) {
  2002  	if o.InputType == wazeroir.Float32 && o.OutputType == wazeroir.SignedInt32 {
  2003  		err = c.emitSignedI32TruncFromFloat(true, o.NonTrapping)
  2004  	} else if o.InputType == wazeroir.Float32 && o.OutputType == wazeroir.SignedInt64 {
  2005  		err = c.emitSignedI64TruncFromFloat(true, o.NonTrapping)
  2006  	} else if o.InputType == wazeroir.Float64 && o.OutputType == wazeroir.SignedInt32 {
  2007  		err = c.emitSignedI32TruncFromFloat(false, o.NonTrapping)
  2008  	} else if o.InputType == wazeroir.Float64 && o.OutputType == wazeroir.SignedInt64 {
  2009  		err = c.emitSignedI64TruncFromFloat(false, o.NonTrapping)
  2010  	} else if o.InputType == wazeroir.Float32 && o.OutputType == wazeroir.SignedUint32 {
  2011  		err = c.emitUnsignedI32TruncFromFloat(true, o.NonTrapping)
  2012  	} else if o.InputType == wazeroir.Float32 && o.OutputType == wazeroir.SignedUint64 {
  2013  		err = c.emitUnsignedI64TruncFromFloat(true, o.NonTrapping)
  2014  	} else if o.InputType == wazeroir.Float64 && o.OutputType == wazeroir.SignedUint32 {
  2015  		err = c.emitUnsignedI32TruncFromFloat(false, o.NonTrapping)
  2016  	} else if o.InputType == wazeroir.Float64 && o.OutputType == wazeroir.SignedUint64 {
  2017  		err = c.emitUnsignedI64TruncFromFloat(false, o.NonTrapping)
  2018  	}
  2019  	return
  2020  }
  2021  
  2022  // emitUnsignedI32TruncFromFloat implements compileITruncFromF when the destination type is a 32-bit unsigned integer.
  2023  func (c *amd64Compiler) emitUnsignedI32TruncFromFloat(isFloat32Bit, nonTrapping bool) error {
  2024  	source := c.locationStack.pop()
  2025  	if err := c.compileEnsureOnRegister(source); err != nil {
  2026  		return err
  2027  	}
  2028  
  2029  	result, err := c.allocateRegister(registerTypeGeneralPurpose)
  2030  	if err != nil {
  2031  		return err
  2032  	}
  2033  
  2034  	// First, we check whether the source float value is above or equal to math.MaxInt32+1.
  2035  	if isFloat32Bit {
  2036  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS,
  2037  			asm.NewStaticConst(u32.LeBytes(float32ForMaximumSigned32bitIntPlusOne)), source.register)
  2038  	} else {
  2039  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD,
  2040  			asm.NewStaticConst(u64.LeBytes(float64ForMaximumSigned32bitIntPlusOne)), source.register)
  2041  	}
  2042  	if err != nil {
  2043  		return err
  2044  	}
  2045  
  2046  	// Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception.
  2047  	jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set.
  2048  
  2049  	var nonTrappingNaNJump asm.Node
  2050  	if !nonTrapping {
  2051  		c.compileExitFromNativeCode(nativeCallStatusCodeInvalidFloatToIntConversion)
  2052  	} else {
  2053  		// In the non-trapping case, NaN is converted to zero.
  2054  		// Zero out the result register by XORing it with itself.
  2055  		c.assembler.CompileRegisterToRegister(amd64.XORL, result, result)
  2056  		nonTrappingNaNJump = c.assembler.CompileJump(amd64.JMP)
  2057  	}
  2058  
  2059  	c.assembler.SetJumpTargetOnNext(jmpIfNotNaN)
  2060  
  2061  	// Jump if the source float value is above or equal math.MaxInt32+1.
  2062  	jmpAboveOrEqualMaxInt32PlusOne := c.assembler.CompileJump(amd64.JCC)
  2063  
  2064  	// Next, we convert the value to a signed integer.
  2065  	if isFloat32Bit {
  2066  		c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SL, source.register, result)
  2067  	} else {
  2068  		c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SL, source.register, result)
  2069  	}
  2070  
  2071  	// Then, if the result is negative, the conversion was from a negative float (incl. -Inf), which is invalid.
  2072  	c.assembler.CompileRegisterToRegister(amd64.TESTL, result, result)
  2073  	jmpIfNotMinusOrMinusInf := c.assembler.CompileJump(amd64.JPL)
  2074  
  2075  	var nonTrappingMinusJump asm.Node
  2076  	if !nonTrapping {
  2077  		c.compileExitFromNativeCode(nativeCallStatusIntegerOverflow)
  2078  	} else {
  2079  		// In the non-trapping case, a negative value is converted to zero.
  2080  		// Zero out the result register by XORing it with itself.
  2081  		c.assembler.CompileRegisterToRegister(amd64.XORL, result, result)
  2082  		nonTrappingMinusJump = c.assembler.CompileJump(amd64.JMP)
  2083  	}
  2084  
  2085  	c.assembler.SetJumpTargetOnNext(jmpIfNotMinusOrMinusInf)
  2086  
  2087  	// Otherwise, the value is valid.
  2088  	okJmpForLessThanMaxInt32PlusOne := c.assembler.CompileJump(amd64.JMP)
  2089  
  2090  	// Now, start handling the case where the original float value is above or equal math.MaxInt32+1.
  2091  	//
  2092  	// First, we subtract the math.MaxInt32+1 from the original value so it can fit in signed 32-bit integer.
  2093  	c.assembler.SetJumpTargetOnNext(jmpAboveOrEqualMaxInt32PlusOne)
  2094  	if isFloat32Bit {
  2095  		err = c.assembler.CompileStaticConstToRegister(amd64.SUBSS,
  2096  			asm.NewStaticConst(u32.LeBytes(float32ForMaximumSigned32bitIntPlusOne)), source.register)
  2097  	} else {
  2098  		err = c.assembler.CompileStaticConstToRegister(amd64.SUBSD,
  2099  			asm.NewStaticConst(u64.LeBytes(float64ForMaximumSigned32bitIntPlusOne)), source.register)
  2100  	}
  2101  	if err != nil {
  2102  		return err
  2103  	}
  2104  
  2105  	// Then, convert the subtracted value as a signed 32-bit integer.
  2106  	if isFloat32Bit {
  2107  		c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SL, source.register, result)
  2108  	} else {
  2109  		c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SL, source.register, result)
  2110  	}
  2111  
  2112  	// Next, we have to check whether the value came from NaN or +Inf.
  2113  	// The NaN and +Inf cases result in 0x8000_0000 according to the semantics of the conversion,
  2114  	// so we check whether the resulting int value is negative.
  2115  	c.assembler.CompileRegisterToRegister(amd64.TESTL, result, result)
  2116  
  2117  	// If the result is minus, the conversion is invalid (from NaN or +Inf)
  2118  	jmpIfPlusInf := c.assembler.CompileJump(amd64.JMI)
  2119  
  2120  	// Otherwise, we successfully converted (source float - (math.MaxInt32+1)) to int.
  2121  	// So, we recover the original value by adding math.MaxInt32+1 (the sign mask bit pattern) back to the integer result.
  2122  	if err = c.assembler.CompileStaticConstToRegister(amd64.ADDL,
  2123  		asm.NewStaticConst(u32.LeBytes(float32SignBitMask)), result); err != nil {
  2124  		return err
  2125  	}
  2126  
  2127  	okJmpForAboveOrEqualMaxInt32PlusOne := c.assembler.CompileJump(amd64.JMP)
  2128  
  2129  	c.assembler.SetJumpTargetOnNext(jmpIfPlusInf)
  2130  	if !nonTrapping {
  2131  		c.compileExitFromNativeCode(nativeCallStatusIntegerOverflow)
  2132  	} else {
  2133  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVL,
  2134  			asm.NewStaticConst(u32.LeBytes(maximum32BitUnsignedInt)), result)
  2135  		if err != nil {
  2136  			return err
  2137  		}
  2138  	}
  2139  
  2140  	// We jump to the next instructions for valid cases.
  2141  	c.assembler.SetJumpTargetOnNext(okJmpForLessThanMaxInt32PlusOne, okJmpForAboveOrEqualMaxInt32PlusOne)
  2142  	if nonTrapping {
  2143  		c.assembler.SetJumpTargetOnNext(nonTrappingMinusJump, nonTrappingNaNJump)
  2144  	}
  2145  
  2146  	// We consumed the source's register and placed the conversion result
  2147  	// in the result register.
  2148  	c.locationStack.markRegisterUnused(source.register)
  2149  	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
  2150  	return nil
  2151  }
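
// Illustrative sketch (not part of the emitted code): the branches above
// implement the following plain-Go model of i32.trunc_f64_u (trapping variant;
// the non-trapping variant saturates to 0 / math.MaxUint32 instead). The
// helper name wasmI32TruncF64U is an illustrative assumption.
//
//	func wasmI32TruncF64U(v float64) uint32 {
//		if math.IsNaN(v) {
//			panic("invalid conversion") // nativeCallStatusCodeInvalidFloatToIntConversion
//		}
//		if v <= -1 || v >= float64(math.MaxUint32)+1 {
//			panic("integer overflow") // nativeCallStatusIntegerOverflow
//		}
//		return uint32(v) // the compiled code reaches this via CVTTSD2SI plus the MaxInt32+1 adjustment
//	}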
  2152  
  2153  // emitUnsignedI64TruncFromFloat implements compileITruncFromF when the destination type is a 64-bit unsigned integer.
  2154  func (c *amd64Compiler) emitUnsignedI64TruncFromFloat(isFloat32Bit, nonTrapping bool) error {
  2155  	source := c.locationStack.pop()
  2156  	if err := c.compileEnsureOnRegister(source); err != nil {
  2157  		return err
  2158  	}
  2159  
  2160  	result, err := c.allocateRegister(registerTypeGeneralPurpose)
  2161  	if err != nil {
  2162  		return err
  2163  	}
  2164  
  2165  	// First, we check whether the source float value is above or equal to math.MaxInt64+1.
  2166  	if isFloat32Bit {
  2167  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS,
  2168  			asm.NewStaticConst(u32.LeBytes(float32ForMaximumSigned64bitIntPlusOne)), source.register)
  2169  	} else {
  2170  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD,
  2171  			asm.NewStaticConst(u64.LeBytes(float64ForMaximumSigned64bitIntPlusOne)), source.register)
  2172  	}
  2173  	if err != nil {
  2174  		return err
  2175  	}
  2176  
  2177  	// Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception.
  2178  	jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set.
  2179  
  2180  	var nonTrappingNaNJump asm.Node
  2181  	if !nonTrapping {
  2182  		c.compileExitFromNativeCode(nativeCallStatusCodeInvalidFloatToIntConversion)
  2183  	} else {
  2184  		// In the non-trapping case, NaN is converted to zero.
  2185  		// Zero out the result register by XORing it with itself.
  2186  		c.assembler.CompileRegisterToRegister(amd64.XORQ, result, result)
  2187  		nonTrappingNaNJump = c.assembler.CompileJump(amd64.JMP)
  2188  	}
  2189  
  2190  	c.assembler.SetJumpTargetOnNext(jmpIfNotNaN)
  2191  
  2192  	// Jump if the source float value is above or equal to math.MaxInt64+1.
  2193  	jmpAboveOrEqualMaxInt64PlusOne := c.assembler.CompileJump(amd64.JCC)
  2194  
  2195  	// Next, we convert the value to a signed integer.
  2196  	if isFloat32Bit {
  2197  		c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SQ, source.register, result)
  2198  	} else {
  2199  		c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SQ, source.register, result)
  2200  	}
  2201  
  2202  	// Then, if the result is negative, the conversion was from a negative float (incl. -Inf), which is invalid.
  2203  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, result, result)
  2204  	jmpIfNotMinusOrMinusInf := c.assembler.CompileJump(amd64.JPL)
  2205  
  2206  	var nonTrappingMinusJump asm.Node
  2207  	if !nonTrapping {
  2208  		c.compileExitFromNativeCode(nativeCallStatusIntegerOverflow)
  2209  	} else {
  2210  		// In the non-trapping case, a negative value is converted to zero.
  2211  		// Zero out the result register by XORing it with itself.
  2212  		c.assembler.CompileRegisterToRegister(amd64.XORQ, result, result)
  2213  		nonTrappingMinusJump = c.assembler.CompileJump(amd64.JMP)
  2214  	}
  2215  
  2216  	c.assembler.SetJumpTargetOnNext(jmpIfNotMinusOrMinusInf)
  2217  
  2218  	// Otherwise, the value is valid.
  2219  	okJmpForLessThanMaxInt64PlusOne := c.assembler.CompileJump(amd64.JMP)
  2220  
  2221  	// Now, start handling the case where the original float value is above or equal math.MaxInt64+1.
  2222  	//
  2223  	// First, we subtract the math.MaxInt64+1 from the original value so it can fit in signed 64-bit integer.
  2224  	c.assembler.SetJumpTargetOnNext(jmpAboveOrEqualMaxInt64PlusOne)
  2225  	if isFloat32Bit {
  2226  		err = c.assembler.CompileStaticConstToRegister(amd64.SUBSS,
  2227  			asm.NewStaticConst(u32.LeBytes(float32ForMaximumSigned64bitIntPlusOne)), source.register)
  2228  	} else {
  2229  		err = c.assembler.CompileStaticConstToRegister(amd64.SUBSD,
  2230  			asm.NewStaticConst(u64.LeBytes(float64ForMaximumSigned64bitIntPlusOne)), source.register)
  2231  	}
  2232  	if err != nil {
  2233  		return err
  2234  	}
  2235  
  2236  	// Then, convert the subtracted value as a signed 64-bit integer.
  2237  	if isFloat32Bit {
  2238  		c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SQ, source.register, result)
  2239  	} else {
  2240  		c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SQ, source.register, result)
  2241  	}
  2242  
  2243  	// Next, we have to check whether the value came from NaN or +Inf.
  2244  	// The NaN and +Inf cases result in 0x8000_0000_0000_0000 according to the semantics of the conversion,
  2245  	// so we check whether the resulting int value is negative.
  2246  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, result, result)
  2247  
  2248  	// If the result is minus, the conversion is invalid (from NaN or +Inf)
  2249  	jmpIfPlusInf := c.assembler.CompileJump(amd64.JMI)
  2250  
  2251  	// Otherwise, we successfully converted (source float - (math.MaxInt64+1)) to int.
  2252  	// So, we recover the original value by adding math.MaxInt64+1 (the sign mask bit pattern) back to the integer result.
  2253  	if err = c.assembler.CompileStaticConstToRegister(amd64.ADDQ,
  2254  		asm.NewStaticConst(u64.LeBytes(float64SignBitMask)), result); err != nil {
  2255  		return err
  2256  	}
  2257  
  2258  	okJmpForAboveOrEqualMaxInt64PlusOne := c.assembler.CompileJump(amd64.JMP)
  2259  
  2260  	c.assembler.SetJumpTargetOnNext(jmpIfPlusInf)
  2261  	if !nonTrapping {
  2262  		c.compileExitFromNativeCode(nativeCallStatusIntegerOverflow)
  2263  	} else {
  2264  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ,
  2265  			asm.NewStaticConst(u64.LeBytes(maximum64BitUnsignedInt)), result)
  2266  		if err != nil {
  2267  			return err
  2268  		}
  2269  	}
  2270  
  2271  	// We jump to the next instructions for valid cases.
  2272  	c.assembler.SetJumpTargetOnNext(okJmpForLessThanMaxInt64PlusOne, okJmpForAboveOrEqualMaxInt64PlusOne)
  2273  	if nonTrapping {
  2274  		c.assembler.SetJumpTargetOnNext(nonTrappingMinusJump, nonTrappingNaNJump)
  2275  	}
  2276  
  2277  	// We consumed the source's register and placed the conversion result
  2278  	// in the result register.
  2279  	c.locationStack.markRegisterUnused(source.register)
  2280  	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI64)
  2281  	return nil
  2282  }
  2283  
  2284  // emitSignedI32TruncFromFloat implements compileITruncFromF when the destination type is a 32-bit signed integer.
  2285  func (c *amd64Compiler) emitSignedI32TruncFromFloat(isFloat32Bit, nonTrapping bool) error {
  2286  	source := c.locationStack.pop()
  2287  	if err := c.compileEnsureOnRegister(source); err != nil {
  2288  		return err
  2289  	}
  2290  
  2291  	result, err := c.allocateRegister(registerTypeGeneralPurpose)
  2292  	if err != nil {
  2293  		return err
  2294  	}
  2295  
  2296  	// First we unconditionally convert source to integer via CVTTSS2SI (CVTTSD2SI for 64bit float).
  2297  	if isFloat32Bit {
  2298  		c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SL, source.register, result)
  2299  	} else {
  2300  		c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SL, source.register, result)
  2301  	}
  2302  
  2303  	// We compare the conversion result with the sign bit mask to check whether either
  2304  	// 1) the source float value is +-Inf or NaN, or it exceeds the representable range of the 32-bit signed integer, or
  2305  	// 2) the source equals the minimum signed 32-bit value (=-2147483648.000000) whose bit pattern is float32ForMinimumSigned32bitInteger for a 32-bit float
  2306  	// 	  or float64ForMinimumSigned32bitInteger for a 64-bit float.
  2307  	err = c.assembler.CompileStaticConstToRegister(amd64.CMPL, asm.NewStaticConst(u32.LeBytes(float32SignBitMask)), result)
  2308  	if err != nil {
  2309  		return err
  2310  	}
  2311  
  2312  	// Otherwise, jump to exit as the result is valid.
  2313  	okJmp := c.assembler.CompileJump(amd64.JNE)
  2314  
  2315  	// Start handling the case of 1) and 2).
  2316  	// First, check if the value is NaN.
  2317  	if isFloat32Bit {
  2318  		c.assembler.CompileRegisterToRegister(amd64.UCOMISS, source.register, source.register)
  2319  	} else {
  2320  		c.assembler.CompileRegisterToRegister(amd64.UCOMISD, source.register, source.register)
  2321  	}
  2322  
  2323  	// Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception.
  2324  	jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set.
  2325  
  2326  	var nontrappingNanJump asm.Node
  2327  	if !nonTrapping {
  2328  		// If the value is NaN, we return the function with nativeCallStatusCodeInvalidFloatToIntConversion.
  2329  		c.compileExitFromNativeCode(nativeCallStatusCodeInvalidFloatToIntConversion)
  2330  	} else {
  2331  		// In the non-trapping case, NaN is converted to zero.
  2332  		// Zero out the result register by XORing it with itself.
  2333  		c.assembler.CompileRegisterToRegister(amd64.XORL, result, result)
  2334  		nontrappingNanJump = c.assembler.CompileJump(amd64.JMP)
  2335  	}
  2336  
  2337  	// Compare the value against the minimum 32-bit signed integer value (as a float)
  2338  	// to determine whether it exceeds the lower bound of the 32-bit signed integer range.
  2339  	c.assembler.SetJumpTargetOnNext(jmpIfNotNaN)
  2340  	if isFloat32Bit {
  2341  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS,
  2342  			asm.NewStaticConst(u32.LeBytes(float32ForMinimumSigned32bitInteger)), source.register)
  2343  	} else {
  2344  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD,
  2345  			asm.NewStaticConst(u64.LeBytes(float64ForMinimumSigned32bitInteger)), source.register)
  2346  	}
  2347  	if err != nil {
  2348  		return err
  2349  	}
  2350  
  2351  	if !nonTrapping {
  2352  		// Jump if the value exceeds the lower bound.
  2353  		var jmpIfExceedsLowerBound asm.Node
  2354  		if isFloat32Bit {
  2355  			jmpIfExceedsLowerBound = c.assembler.CompileJump(amd64.JCS)
  2356  		} else {
  2357  			jmpIfExceedsLowerBound = c.assembler.CompileJump(amd64.JLS)
  2358  		}
  2359  
  2360  		// At this point, the value is the minimum signed 32-bit int (=-2147483648.000000) or larger than 32-bit maximum.
  2361  		// So, check if the value equals the minimum signed 32-bit int.
  2362  		if isFloat32Bit {
  2363  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS,
  2364  				asm.NewStaticConst([]byte{0, 0, 0, 0}), source.register)
  2365  		} else {
  2366  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD,
  2367  				asm.NewStaticConst([]byte{0, 0, 0, 0, 0, 0, 0, 0}), source.register)
  2368  		}
  2369  		if err != nil {
  2370  			return err
  2371  		}
  2372  
  2373  		jmpIfMinimumSignedInt := c.assembler.CompileJump(amd64.JCS) // jump if the value is minus (= the minimum signed 32-bit int).
  2374  
  2375  		c.assembler.SetJumpTargetOnNext(jmpIfExceedsLowerBound)
  2376  		c.compileExitFromNativeCode(nativeCallStatusIntegerOverflow)
  2377  
  2378  		// We jump to the next instructions for valid cases.
  2379  		c.assembler.SetJumpTargetOnNext(okJmp, jmpIfMinimumSignedInt)
  2380  	} else {
  2381  		// Jump if the value does not exceed the lower bound.
  2382  		var jmpIfNotExceedsLowerBound asm.Node
  2383  		if isFloat32Bit {
  2384  			jmpIfNotExceedsLowerBound = c.assembler.CompileJump(amd64.JCC)
  2385  		} else {
  2386  			jmpIfNotExceedsLowerBound = c.assembler.CompileJump(amd64.JHI)
  2387  		}
  2388  
  2389  		// If the value exceeds the lower bound, we "saturate" it to the minimum.
  2390  		if err = c.assembler.CompileStaticConstToRegister(amd64.MOVL,
  2391  			asm.NewStaticConst(u32.LeBytes(uint32(minimum32BitSignedInt))), result); err != nil {
  2392  			return err
  2393  		}
  2394  		nonTrappingSaturatedMinimumJump := c.assembler.CompileJump(amd64.JMP)
  2395  
  2396  		// Otherwise, the value is the minimum signed 32-bit int (=-2147483648.000000) or larger than 32-bit maximum.
  2397  		c.assembler.SetJumpTargetOnNext(jmpIfNotExceedsLowerBound)
  2398  		if isFloat32Bit {
  2399  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS,
  2400  				asm.NewStaticConst([]byte{0, 0, 0, 0}), source.register)
  2401  		} else {
  2402  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD,
  2403  				asm.NewStaticConst([]byte{0, 0, 0, 0, 0, 0, 0, 0}), source.register)
  2404  		}
  2405  		if err != nil {
  2406  			return err
  2407  		}
  2408  		jmpIfMinimumSignedInt := c.assembler.CompileJump(amd64.JCS) // jump if the value is minus (= the minimum signed 32-bit int).
  2409  
  2410  		// If the value exceeds signed 32-bit maximum, we saturate it to the maximum.
  2411  		if err = c.assembler.CompileStaticConstToRegister(amd64.MOVL,
  2412  			asm.NewStaticConst(u32.LeBytes(uint32(maximum32BitSignedInt))), result); err != nil {
  2413  			return err
  2414  		}
  2415  
  2416  		c.assembler.SetJumpTargetOnNext(okJmp, nontrappingNanJump, nonTrappingSaturatedMinimumJump, jmpIfMinimumSignedInt)
  2417  	}
  2418  
  2419  	// We consumed the source's register and placed the conversion result
  2420  	// in the result register.
  2421  	c.locationStack.markRegisterUnused(source.register)
  2422  	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
  2423  	return nil
  2424  }
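
// Illustrative sketch (not part of the emitted code): the non-trapping branch
// above implements the saturating i32.trunc_sat_f64_s semantics, which in
// plain Go looks roughly like the following. The helper name
// wasmI32TruncSatF64S is an illustrative assumption.
//
//	func wasmI32TruncSatF64S(v float64) int32 {
//		switch {
//		case math.IsNaN(v):
//			return 0 // NaN saturates to zero
//		case v < math.MinInt32:
//			return math.MinInt32 // saturate to the minimum
//		case v >= float64(math.MaxInt32)+1:
//			return math.MaxInt32 // saturate to the maximum
//		default:
//			return int32(v)
//		}
//	}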
  2425  
  2426  // emitSignedI64TruncFromFloat implements compileITruncFromF when the destination type is a 64-bit signed integer.
  2427  func (c *amd64Compiler) emitSignedI64TruncFromFloat(isFloat32Bit, nonTrapping bool) error {
  2428  	source := c.locationStack.pop()
  2429  	if err := c.compileEnsureOnRegister(source); err != nil {
  2430  		return err
  2431  	}
  2432  
  2433  	result, err := c.allocateRegister(registerTypeGeneralPurpose)
  2434  	if err != nil {
  2435  		return err
  2436  	}
  2437  
  2438  	// First we unconditionally convert source to integer via CVTTSS2SI (CVTTSD2SI for 64bit float).
  2439  	if isFloat32Bit {
  2440  		c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SQ, source.register, result)
  2441  	} else {
  2442  		c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SQ, source.register, result)
  2443  	}
  2444  
  2445  	// We compare the conversion result with the sign bit mask to check if it is either
  2446  	// 1) the source float value is either +-Inf or NaN, or it exceeds the representable range of a 64-bit signed integer, or
  2447  	// 2) the source equals the minimum signed 64-bit int (=-9223372036854775808.0) whose bit pattern is float32ForMinimumSigned64bitInteger for 32-bit floats
  2448  	// 	  or float64ForMinimumSigned64bitInteger for 64-bit floats.
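        	// Note: on an invalid conversion (NaN, +-Inf, or an out-of-range input) CVTTSS2SI/CVTTSD2SI
        	// write the "integer indefinite" value 0x8000_0000_0000_0000, which is exactly float64SignBitMask.
        	// So this single comparison catches every invalid case plus the one in-range input
        	// (the minimum signed 64-bit int) that shares the same bit pattern.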
  2449  	err = c.assembler.CompileStaticConstToRegister(amd64.CMPQ,
  2450  		asm.NewStaticConst(u64.LeBytes(float64SignBitMask)), result)
  2451  	if err != nil {
  2452  		return err
  2453  	}
  2454  
  2455  	// If the result does not equal the sign bit mask, the conversion was valid, so we simply jump to the exit.
  2456  	okJmp := c.assembler.CompileJump(amd64.JNE)
  2457  
  2458  	// Start handling the case of 1) and 2).
  2459  	// First, check if the value is NaN.
  2460  	if isFloat32Bit {
  2461  		c.assembler.CompileRegisterToRegister(amd64.UCOMISS, source.register, source.register)
  2462  	} else {
  2463  		c.assembler.CompileRegisterToRegister(amd64.UCOMISD, source.register, source.register)
  2464  	}
  2465  
  2466  	// Check the parity flag, which is set when the value is NaN. If it is set, we either trap or saturate to zero below.
  2467  	jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set.
  2468  
  2469  	var nontrappingNanJump asm.Node
  2470  	if !nonTrapping {
  2471  		c.compileExitFromNativeCode(nativeCallStatusCodeInvalidFloatToIntConversion)
  2472  	} else {
  2473  		// In the non-trapping case, NaN is converted to zero.
  2474  		// Zero out the result register by XORing it with itself.
  2475  		c.assembler.CompileRegisterToRegister(amd64.XORQ, result, result)
  2476  		nontrappingNanJump = c.assembler.CompileJump(amd64.JMP)
  2477  	}
  2478  
  2479  	// Check if the value is less than the minimum 64-bit signed integer value,
  2480  	// meaning that it exceeds the lower bound of the 64-bit signed integer range.
  2481  	c.assembler.SetJumpTargetOnNext(jmpIfNotNaN)
  2482  	if isFloat32Bit {
  2483  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS,
  2484  			asm.NewStaticConst(u32.LeBytes(float32ForMinimumSigned64bitInteger)), source.register)
  2485  	} else {
  2486  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD,
  2487  			asm.NewStaticConst(u64.LeBytes(float64ForMinimumSigned64bitInteger)), source.register)
  2488  	}
  2489  	if err != nil {
  2490  		return err
  2491  	}
  2492  
  2493  	if !nonTrapping {
  2494  		// Jump if the value exceeds the lower bound (e.g. it is -Inf).
  2495  		jmpIfExceedsLowerBound := c.assembler.CompileJump(amd64.JCS)
  2496  
  2497  		// At this point, the value is the minimum signed 64-bit int (=-9223372036854775808.0) or larger than 64-bit maximum.
  2498  		// So, check if the value equals the minimum signed 64-bit int.
  2499  		if isFloat32Bit {
  2500  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS,
  2501  				asm.NewStaticConst([]byte{0, 0, 0, 0}), source.register)
  2502  		} else {
  2503  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD,
  2504  				asm.NewStaticConst([]byte{0, 0, 0, 0, 0, 0, 0, 0}), source.register)
  2505  		}
  2506  		if err != nil {
  2507  			return err
  2508  		}
  2509  
  2510  		jmpIfMinimumSignedInt := c.assembler.CompileJump(amd64.JCS) // jump if the value is negative (i.e. the minimum signed 64-bit int).
  2511  
  2512  		c.assembler.SetJumpTargetOnNext(jmpIfExceedsLowerBound)
  2513  		c.compileExitFromNativeCode(nativeCallStatusIntegerOverflow)
  2514  
  2515  		// We jump to the next instructions for valid cases.
  2516  		c.assembler.SetJumpTargetOnNext(okJmp, jmpIfMinimumSignedInt)
  2517  	} else {
  2518  		// Jump if the value does not exceed the lower bound.
  2519  		jmpIfNotExceedsLowerBound := c.assembler.CompileJump(amd64.JCC)
  2520  
  2521  		// If the value exceeds the lower bound, we "saturate" it to the minimum.
  2522  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ,
  2523  			asm.NewStaticConst(u64.LeBytes(uint64(minimum64BitSignedInt))), result)
  2524  		if err != nil {
  2525  			return err
  2526  		}
  2527  
  2528  		nonTrappingSaturatedMinimumJump := c.assembler.CompileJump(amd64.JMP)
  2529  
  2530  		// Otherwise, the value is the minimum signed 64-bit int (=-9223372036854775808.0) or larger than 64-bit maximum.
  2531  		// So, check if the value equals the minimum signed 64-bit int.
  2532  		c.assembler.SetJumpTargetOnNext(jmpIfNotExceedsLowerBound)
  2533  		if isFloat32Bit {
  2534  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, asm.NewStaticConst([]byte{0, 0, 0, 0}), source.register)
  2535  		} else {
  2536  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, asm.NewStaticConst([]byte{0, 0, 0, 0, 0, 0, 0, 0}), source.register)
  2537  		}
  2538  		if err != nil {
  2539  			return err
  2540  		}
  2541  
  2542  		jmpIfMinimumSignedInt := c.assembler.CompileJump(amd64.JCS) // jump if the value is negative (i.e. the minimum signed 64-bit int).
  2543  
  2544  		// If the value exceeds signed 64-bit maximum, we saturate it to the maximum.
  2545  		if err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, asm.NewStaticConst(u64.LeBytes(uint64(maximum64BitSignedInt))), result); err != nil {
  2546  			return err
  2547  		}
  2548  
  2549  		c.assembler.SetJumpTargetOnNext(okJmp, jmpIfMinimumSignedInt, nonTrappingSaturatedMinimumJump, nontrappingNanJump)
  2550  	}
  2551  
  2552  	// We consumed the source's register and placed the conversion result
  2553  	// in the result register.
  2554  	c.locationStack.markRegisterUnused(source.register)
  2555  	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI64)
  2556  	return nil
  2557  }
  2558  
  2559  // compileFConvertFromI implements compiler.compileFConvertFromI for the amd64 architecture.
  2560  func (c *amd64Compiler) compileFConvertFromI(o *wazeroir.OperationFConvertFromI) (err error) {
  2561  	if o.OutputType == wazeroir.Float32 && o.InputType == wazeroir.SignedInt32 {
  2562  		err = c.compileSimpleConversion(amd64.CVTSL2SS, registerTypeVector, runtimeValueTypeF32) // = CVTSI2SS for 32bit int
  2563  	} else if o.OutputType == wazeroir.Float32 && o.InputType == wazeroir.SignedInt64 {
  2564  		err = c.compileSimpleConversion(amd64.CVTSQ2SS, registerTypeVector, runtimeValueTypeF32) // = CVTSI2SS for 64bit int
  2565  	} else if o.OutputType == wazeroir.Float64 && o.InputType == wazeroir.SignedInt32 {
  2566  		err = c.compileSimpleConversion(amd64.CVTSL2SD, registerTypeVector, runtimeValueTypeF64) // = CVTSI2SD for 32bit int
  2567  	} else if o.OutputType == wazeroir.Float64 && o.InputType == wazeroir.SignedInt64 {
  2568  		err = c.compileSimpleConversion(amd64.CVTSQ2SD, registerTypeVector, runtimeValueTypeF64) // = CVTSI2SD for 64bit int
  2569  	} else if o.OutputType == wazeroir.Float32 && o.InputType == wazeroir.SignedUint32 {
  2570  		// See the following link for why we use 64bit conversion for unsigned 32bit integer sources:
  2571  		// https://stackoverflow.com/questions/41495498/fpu-operations-generated-by-gcc-during-casting-integer-to-float.
  2572  		//
  2573  		// Here's the summary:
  2574  		// >> CVTSI2SS is indeed designed for converting a signed integer to a scalar single-precision float,
  2575  		// >> not an unsigned integer like you have here. So what gives? Well, a 64-bit processor has 64-bit wide
  2576  		// >> registers available, so the unsigned 32-bit input values can be stored as signed 64-bit intermediate values,
  2577  		// >> which allows CVTSI2SS to be used after all.
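        		//
        		// For example, uint32(0xFFFF_FFFF) is first widened to int64(4294967295),
        		// which CVTSI2SS/CVTSI2SD then convert as an ordinary signed value.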
  2578  		err = c.compileSimpleConversion(amd64.CVTSQ2SS, registerTypeVector, runtimeValueTypeF32) // = CVTSI2SS for 64bit int.
  2579  	} else if o.OutputType == wazeroir.Float64 && o.InputType == wazeroir.SignedUint32 {
  2580  		// For the same reason above, we use 64bit conversion for unsigned 32bit.
  2581  		err = c.compileSimpleConversion(amd64.CVTSQ2SD, registerTypeVector, runtimeValueTypeF64) // = CVTSI2SD for 64bit int.
  2582  	} else if o.OutputType == wazeroir.Float32 && o.InputType == wazeroir.SignedUint64 {
  2583  		err = c.emitUnsignedInt64ToFloatConversion(true)
  2584  	} else if o.OutputType == wazeroir.Float64 && o.InputType == wazeroir.SignedUint64 {
  2585  		err = c.emitUnsignedInt64ToFloatConversion(false)
  2586  	}
  2587  	return
  2588  }
  2589  
  2590  // emitUnsignedInt64ToFloatConversion handles the conversion of an unsigned 64-bit integer source
  2591  // in compileFConvertFromI.
  2592  func (c *amd64Compiler) emitUnsignedInt64ToFloatConversion(isFloat32bit bool) error {
  2593  	// The logic here is exactly the same as GCC emits for the following code:
  2594  	//
  2595  	// float convert(int num) {
  2596  	//     float foo;
  2597  	//     uint64_t ptr1 = 100;
  2598  	//     foo = (float)(ptr1);
  2599  	//     return foo;
  2600  	// }
  2601  	//
  2602  	// which is compiled by GCC as
  2603  	//
  2604  	// convert:
  2605  	// 	   push    rbp
  2606  	// 	   mov     rbp, rsp
  2607  	// 	   mov     DWORD PTR [rbp-20], edi
  2608  	// 	   mov     DWORD PTR [rbp-4], 100
  2609  	// 	   mov     eax, DWORD PTR [rbp-4]
  2610  	// 	   test    rax, rax
  2611  	// 	   js      .handle_sign_bit_case
  2612  	// 	   cvtsi2ss        xmm0, rax
  2613  	// 	   jmp     .exit
  2614  	// .handle_sign_bit_case:
  2615  	// 	   mov     rdx, rax
  2616  	// 	   shr     rdx
  2617  	// 	   and     eax, 1
  2618  	// 	   or      rdx, rax
  2619  	// 	   cvtsi2ss        xmm0, rdx
  2620  	// 	   addsd   xmm0, xmm0
  2621  	// .exit: ...
  2622  	//
  2623  	// tl;dr: we branch depending on whether or not the sign bit is set.
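        	//
        	// For reference, the same trick can be expressed in plain Go as follows
        	// (an illustrative sketch, not part of the generated code):
        	//
        	//	func uint64ToFloat32(v uint64) float32 {
        	//		if int64(v) >= 0 {
        	//			return float32(int64(v)) // Sign bit clear: a signed conversion works directly.
        	//		}
        	//		// Halve the value but keep the dropped low bit so rounding stays correct,
        	//		// convert as signed, then double the result.
        	//		half := (v >> 1) | (v & 1)
        	//		return float32(int64(half)) * 2
        	//	}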
  2624  
  2625  	origin := c.locationStack.pop()
  2626  	if err := c.compileEnsureOnRegister(origin); err != nil {
  2627  		return err
  2628  	}
  2629  
  2630  	dest, err := c.allocateRegister(registerTypeVector)
  2631  	if err != nil {
  2632  		return err
  2633  	}
  2634  
  2635  	c.locationStack.markRegisterUsed(dest)
  2636  
  2637  	tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  2638  	if err != nil {
  2639  		return err
  2640  	}
  2641  
  2642  	// Check if the most significant bit (sign bit) is set.
  2643  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, origin.register, origin.register)
  2644  
  2645  	// Jump if the sign bit is set.
  2646  	jmpIfSignbitSet := c.assembler.CompileJump(amd64.JMI)
  2647  
  2648  	// Otherwise, the value fits in the signed 64-bit integer range,
  2649  	// so we convert it directly and emit a jump to exit from this branch.
  2650  	if isFloat32bit {
  2651  		c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SS, origin.register, dest)
  2652  	} else {
  2653  		c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SD, origin.register, dest)
  2654  	}
  2655  	exitFromSignbitUnSet := c.assembler.CompileJump(amd64.JMP)
  2656  
  2657  	// Now handle the case where the sign bit is set.
  2658  	// We emit the following sequence:
  2659  	// 	   mov     tmpReg, origin
  2660  	// 	   shr     tmpReg, 1
  2661  	// 	   and     origin, 1
  2662  	// 	   or      tmpReg, origin
  2663  	// 	   cvtsi2ss        xmm0, tmpReg
  2664  	// 	   addsd   xmm0, xmm0
  2665  
  2666  	c.assembler.SetJumpTargetOnNext(jmpIfSignbitSet)
  2667  	c.assembler.CompileRegisterToRegister(amd64.MOVQ, origin.register, tmpReg)
  2668  	c.assembler.CompileConstToRegister(amd64.SHRQ, 1, tmpReg)
  2669  	c.assembler.CompileConstToRegister(amd64.ANDQ, 1, origin.register)
  2670  	c.assembler.CompileRegisterToRegister(amd64.ORQ, origin.register, tmpReg)
  2671  	if isFloat32bit {
  2672  		c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SS, tmpReg, dest)
  2673  	} else {
  2674  		c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SD, tmpReg, dest)
  2675  	}
  2676  	if isFloat32bit {
  2677  		c.assembler.CompileRegisterToRegister(amd64.ADDSS, dest, dest)
  2678  	} else {
  2679  		c.assembler.CompileRegisterToRegister(amd64.ADDSD, dest, dest)
  2680  	}
  2681  
  2682  	// Now we have finished the sign-bit-set branch.
  2683  	// Make the exit jump of the sign-bit-unset branch
  2684  	// target the next instruction.
  2685  	c.assembler.SetJumpTargetOnNext(exitFromSignbitUnSet)
  2686  
  2687  	// We consumed the origin's register and placed the conversion result
  2688  	// in the dest register.
  2689  	c.locationStack.markRegisterUnused(origin.register)
  2690  	if isFloat32bit {
  2691  		c.pushRuntimeValueLocationOnRegister(dest, runtimeValueTypeF32)
  2692  	} else {
  2693  		c.pushRuntimeValueLocationOnRegister(dest, runtimeValueTypeF64)
  2694  	}
  2695  	return nil
  2696  }
  2697  
  2698  // compileSimpleConversion pops a value from the stack, applies the
  2699  // given instruction to it, and pushes the result onto a register of the given type.
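        //
        // For example, compileFConvertFromI uses it with amd64.CVTSL2SS to lower a signed i32 to f32.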
  2700  func (c *amd64Compiler) compileSimpleConversion(convInstruction asm.Instruction,
  2701  	destinationRegisterType registerType, destinationValueType runtimeValueType,
  2702  ) error {
  2703  	origin := c.locationStack.pop()
  2704  	if err := c.compileEnsureOnRegister(origin); err != nil {
  2705  		return err
  2706  	}
  2707  
  2708  	dest, err := c.allocateRegister(destinationRegisterType)
  2709  	if err != nil {
  2710  		return err
  2711  	}
  2712  
  2713  	c.assembler.CompileRegisterToRegister(convInstruction, origin.register, dest)
  2714  
  2715  	c.locationStack.markRegisterUnused(origin.register)
  2716  	c.pushRuntimeValueLocationOnRegister(dest, destinationValueType)
  2717  	return nil
  2718  }
  2719  
  2720  // compileF32DemoteFromF64 implements compiler.compileF32DemoteFromF64 for the amd64 architecture.
  2721  func (c *amd64Compiler) compileF32DemoteFromF64() error {
  2722  	target := c.locationStack.peek() // Note this is peek!
  2723  	if err := c.compileEnsureOnRegister(target); err != nil {
  2724  		return err
  2725  	}
  2726  
  2727  	c.assembler.CompileRegisterToRegister(amd64.CVTSD2SS, target.register, target.register)
  2728  	target.valueType = runtimeValueTypeF32
  2729  	return nil
  2730  }
  2731  
  2732  // compileF64PromoteFromF32 implements compiler.compileF64PromoteFromF32 for the amd64 architecture.
  2733  func (c *amd64Compiler) compileF64PromoteFromF32() error {
  2734  	target := c.locationStack.peek() // Note this is peek!
  2735  	if err := c.compileEnsureOnRegister(target); err != nil {
  2736  		return err
  2737  	}
  2738  
  2739  	c.assembler.CompileRegisterToRegister(amd64.CVTSS2SD, target.register, target.register)
  2740  	target.valueType = runtimeValueTypeF64
  2741  	return nil
  2742  }
  2743  
  2744  // compileI32ReinterpretFromF32 implements compiler.compileI32ReinterpretFromF32 for the amd64 architecture.
  2745  func (c *amd64Compiler) compileI32ReinterpretFromF32() error {
  2746  	if peek := c.locationStack.peek(); peek.onStack() {
  2747  		// If the value is on the stack, this is a no-op as reinterpretation only changes the recorded value type.
  2748  		peek.valueType = runtimeValueTypeI32
  2749  		return nil
  2750  	}
  2751  	return c.compileSimpleConversion(amd64.MOVL, registerTypeGeneralPurpose, runtimeValueTypeI32)
  2752  }
  2753  
  2754  // compileI64ReinterpretFromF64 implements compiler.compileI64ReinterpretFromF64 for the amd64 architecture.
  2755  func (c *amd64Compiler) compileI64ReinterpretFromF64() error {
  2756  	if peek := c.locationStack.peek(); peek.onStack() {
  2757  		// If the value is on the stack, this is a no-op as reinterpretation only changes the recorded value type.
  2758  		peek.valueType = runtimeValueTypeI64
  2759  		return nil
  2760  	}
  2761  	return c.compileSimpleConversion(amd64.MOVQ, registerTypeGeneralPurpose, runtimeValueTypeI64)
  2762  }
  2763  
  2764  // compileF32ReinterpretFromI32 implements compiler.compileF32ReinterpretFromI32 for the amd64 architecture.
  2765  func (c *amd64Compiler) compileF32ReinterpretFromI32() error {
  2766  	if peek := c.locationStack.peek(); peek.onStack() {
  2767  		// If the value is on the stack, this is a no-op as reinterpretation only changes the recorded value type.
  2768  		peek.valueType = runtimeValueTypeF32
  2769  		return nil
  2770  	}
  2771  	return c.compileSimpleConversion(amd64.MOVL, registerTypeVector, runtimeValueTypeF32)
  2772  }
  2773  
  2774  // compileF64ReinterpretFromI64 implements compiler.compileF64ReinterpretFromI64 for the amd64 architecture.
  2775  func (c *amd64Compiler) compileF64ReinterpretFromI64() error {
  2776  	if peek := c.locationStack.peek(); peek.onStack() {
  2777  		// If the value is on the stack, this is a no-op as reinterpretation only changes the recorded value type.
  2778  		peek.valueType = runtimeValueTypeF64
  2779  		return nil
  2780  	}
  2781  	return c.compileSimpleConversion(amd64.MOVQ, registerTypeVector, runtimeValueTypeF64)
  2782  }
  2783  
  2784  // compileExtend implements compiler.compileExtend for the amd64 architecture.
  2785  func (c *amd64Compiler) compileExtend(o *wazeroir.OperationExtend) error {
  2786  	var inst asm.Instruction
  2787  	if o.Signed {
  2788  		inst = amd64.MOVLQSX // = MOVSXD https://www.felixcloutier.com/x86/movsx:movsxd
  2789  	} else {
  2790  		inst = amd64.MOVL
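        		// Note: on amd64 a 32-bit MOV zero-extends into the upper 32 bits of the destination,
        		// e.g. 0xFFFF_FFFF becomes 0x0000_0000_FFFF_FFFF, so no explicit zero-extension is needed.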
  2791  	}
  2792  	return c.compileExtendImpl(inst, runtimeValueTypeI64)
  2793  }
  2794  
  2795  // compileSignExtend32From8 implements compiler.compileSignExtend32From8 for the amd64 architecture.
  2796  func (c *amd64Compiler) compileSignExtend32From8() error {
  2797  	return c.compileExtendImpl(amd64.MOVBLSX, runtimeValueTypeI32)
  2798  }
  2799  
  2800  // compileSignExtend32From16 implements compiler.compileSignExtend32From16 for the amd64 architecture.
  2801  func (c *amd64Compiler) compileSignExtend32From16() error {
  2802  	return c.compileExtendImpl(amd64.MOVWLSX, runtimeValueTypeI32)
  2803  }
  2804  
  2805  // compileSignExtend64From8 implements compiler.compileSignExtend64From8 for the amd64 architecture.
  2806  func (c *amd64Compiler) compileSignExtend64From8() error {
  2807  	return c.compileExtendImpl(amd64.MOVBQSX, runtimeValueTypeI64)
  2808  }
  2809  
  2810  // compileSignExtend64From16 implements compiler.compileSignExtend64From16 for the amd64 architecture.
  2811  func (c *amd64Compiler) compileSignExtend64From16() error {
  2812  	return c.compileExtendImpl(amd64.MOVWQSX, runtimeValueTypeI64)
  2813  }
  2814  
  2815  // compileSignExtend64From32 implements compiler.compileSignExtend64From32 for the amd64 architecture.
  2816  func (c *amd64Compiler) compileSignExtend64From32() error {
  2817  	return c.compileExtendImpl(amd64.MOVLQSX, runtimeValueTypeI64)
  2818  }
  2819  
  2820  func (c *amd64Compiler) compileExtendImpl(inst asm.Instruction, destinationType runtimeValueType) error {
  2821  	target := c.locationStack.peek() // Note this is peek!
  2822  	if err := c.compileEnsureOnRegister(target); err != nil {
  2823  		return err
  2824  	}
  2825  
  2826  	c.assembler.CompileRegisterToRegister(inst, target.register, target.register)
  2827  	target.valueType = destinationType
  2828  	return nil
  2829  }
  2830  
  2831  // compileEq implements compiler.compileEq for the amd64 architecture.
  2832  func (c *amd64Compiler) compileEq(o *wazeroir.OperationEq) error {
  2833  	return c.compileEqOrNe(o.Type, true)
  2834  }
  2835  
  2836  // compileNe implements compiler.compileNe for the amd64 architecture.
  2837  func (c *amd64Compiler) compileNe(o *wazeroir.OperationNe) error {
  2838  	return c.compileEqOrNe(o.Type, false)
  2839  }
  2840  
  2841  func (c *amd64Compiler) compileEqOrNe(t wazeroir.UnsignedType, shouldEqual bool) (err error) {
  2842  	x2 := c.locationStack.pop()
  2843  	if err := c.compileEnsureOnRegister(x2); err != nil {
  2844  		return err
  2845  	}
  2846  
  2847  	x1 := c.locationStack.pop()
  2848  	if err := c.compileEnsureOnRegister(x1); err != nil {
  2849  		return err
  2850  	}
  2851  
  2852  	switch t {
  2853  	case wazeroir.UnsignedTypeI32:
  2854  		err = c.compileEqOrNeForInts(x1.register, x2.register, amd64.CMPL, shouldEqual)
  2855  	case wazeroir.UnsignedTypeI64:
  2856  		err = c.compileEqOrNeForInts(x1.register, x2.register, amd64.CMPQ, shouldEqual)
  2857  	case wazeroir.UnsignedTypeF32:
  2858  		err = c.compileEqOrNeForFloats(x1.register, x2.register, amd64.UCOMISS, shouldEqual)
  2859  	case wazeroir.UnsignedTypeF64:
  2860  		err = c.compileEqOrNeForFloats(x1.register, x2.register, amd64.UCOMISD, shouldEqual)
  2861  	}
  2862  	if err != nil {
  2863  		return
  2864  	}
  2865  
  2866  	// x1 and x2 are temporary registers only used for the cmp operation. Release them.
  2867  	c.locationStack.releaseRegister(x1)
  2868  	c.locationStack.releaseRegister(x2)
  2869  	return
  2870  }
  2871  
  2872  func (c *amd64Compiler) compileEqOrNeForInts(x1Reg, x2Reg asm.Register, cmpInstruction asm.Instruction,
  2873  	shouldEqual bool,
  2874  ) error {
  2875  	c.assembler.CompileRegisterToRegister(cmpInstruction, x2Reg, x1Reg)
  2876  
  2877  	// Record that the result is on the conditional register.
  2878  	var condReg asm.ConditionalRegisterState
  2879  	if shouldEqual {
  2880  		condReg = amd64.ConditionalRegisterStateE
  2881  	} else {
  2882  		condReg = amd64.ConditionalRegisterStateNE
  2883  	}
  2884  	loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(condReg)
  2885  	loc.valueType = runtimeValueTypeI32
  2886  	return nil
  2887  }
  2888  
  2889  // For float EQ and NE, we have to take NaN values into account.
  2890  // Notably, the Wasm specification states that if either operand is NaN,
  2891  // the result must be zero for EQ and one for NE.
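        //
        // In Go terms, the value produced below is roughly (an illustrative sketch, where b2i is a
        // hypothetical bool-to-0/1 helper):
        //
        //	eq := b2i(!math.IsNaN(x1) && !math.IsNaN(x2) && x1 == x2)
        //	ne := b2i(math.IsNaN(x1) || math.IsNaN(x2) || x1 != x2)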
  2892  func (c *amd64Compiler) compileEqOrNeForFloats(x1Reg, x2Reg asm.Register, cmpInstruction asm.Instruction, shouldEqual bool) error {
  2893  	// Before we allocate the result, we have to reserve two int registers.
  2894  	nanFragReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  2895  	if err != nil {
  2896  		return err
  2897  	}
  2898  	c.locationStack.markRegisterUsed(nanFragReg)
  2899  	cmpResultReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  2900  	if err != nil {
  2901  		return err
  2902  	}
  2903  
  2904  	// Then, execute the comparison.
  2905  	c.assembler.CompileRegisterToRegister(cmpInstruction, x2Reg, x1Reg)
  2906  
  2907  	// First, we get the parity flag, which indicates whether either value was NaN.
  2908  	if shouldEqual {
  2909  		// Set 1 if neither value is NaN.
  2910  		c.assembler.CompileNoneToRegister(amd64.SETPC, nanFragReg)
  2911  	} else {
  2912  		// Set 1 if either value is NaN.
  2913  		c.assembler.CompileNoneToRegister(amd64.SETPS, nanFragReg)
  2914  	}
  2915  
  2916  	// Next, we get the usual comparison flag.
  2917  	if shouldEqual {
  2918  		// Set 1 if equal.
  2919  		c.assembler.CompileNoneToRegister(amd64.SETEQ, cmpResultReg)
  2920  	} else {
  2921  		// Set 1 if not equal.
  2922  		c.assembler.CompileNoneToRegister(amd64.SETNE, cmpResultReg)
  2923  	}
  2924  
  2925  	// Do "and" or "or" operations on these two flags to get the actual result.
  2926  	if shouldEqual {
  2927  		c.assembler.CompileRegisterToRegister(amd64.ANDL, nanFragReg, cmpResultReg)
  2928  	} else {
  2929  		c.assembler.CompileRegisterToRegister(amd64.ORL, nanFragReg, cmpResultReg)
  2930  	}
  2931  
  2932  	// Clear the unnecessary bits by zero-extending the lowest byte.
  2933  	// This is necessary because SET* writes only the lowest byte, leaving the upper bits of the destination register unchanged.
  2934  	c.assembler.CompileRegisterToRegister(amd64.MOVBLZX, cmpResultReg, cmpResultReg)
  2935  
  2936  	// Now we have the result in cmpResultReg register, so we record it.
  2937  	c.pushRuntimeValueLocationOnRegister(cmpResultReg, runtimeValueTypeI32)
  2938  	// Also, we no longer need nanFragReg.
  2939  	c.locationStack.markRegisterUnused(nanFragReg)
  2940  	return nil
  2941  }
  2942  
  2943  // compileEqz implements compiler.compileEqz for the amd64 architecture.
  2944  func (c *amd64Compiler) compileEqz(o *wazeroir.OperationEqz) (err error) {
  2945  	v := c.locationStack.pop()
  2946  	if err = c.compileEnsureOnRegister(v); err != nil {
  2947  		return err
  2948  	}
  2949  
  2950  	switch o.Type {
  2951  	case wazeroir.UnsignedInt32:
  2952  		err = c.assembler.CompileStaticConstToRegister(amd64.CMPL, asm.NewStaticConst([]byte{0, 0, 0, 0}), v.register)
  2953  	case wazeroir.UnsignedInt64:
  2954  		err = c.assembler.CompileStaticConstToRegister(amd64.CMPQ, asm.NewStaticConst([]byte{0, 0, 0, 0, 0, 0, 0, 0}), v.register)
  2955  	}
  2956  	if err != nil {
  2957  		return err
  2958  	}
  2959  
  2960  	// v is consumed by the cmp operation so release it.
  2961  	c.locationStack.releaseRegister(v)
  2962  
  2963  	// Finally, record that the result is on the conditional register.
  2964  	loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(amd64.ConditionalRegisterStateE)
  2965  	loc.valueType = runtimeValueTypeI32
  2966  	return nil
  2967  }
  2968  
  2969  // compileLt implements compiler.compileLt for the amd64 architecture.
  2970  func (c *amd64Compiler) compileLt(o *wazeroir.OperationLt) error {
  2971  	x2 := c.locationStack.pop()
  2972  	if err := c.compileEnsureOnRegister(x2); err != nil {
  2973  		return err
  2974  	}
  2975  
  2976  	x1 := c.locationStack.pop()
  2977  	if err := c.compileEnsureOnRegister(x1); err != nil {
  2978  		return err
  2979  	}
  2980  
  2981  	// Emit the compare instruction.
  2982  	var resultConditionState asm.ConditionalRegisterState
  2983  	var inst asm.Instruction
  2984  	switch o.Type {
  2985  	case wazeroir.SignedTypeInt32:
  2986  		resultConditionState = amd64.ConditionalRegisterStateL
  2987  		inst = amd64.CMPL
  2988  	case wazeroir.SignedTypeUint32:
  2989  		resultConditionState = amd64.ConditionalRegisterStateB
  2990  		inst = amd64.CMPL
  2991  	case wazeroir.SignedTypeInt64:
  2992  		inst = amd64.CMPQ
  2993  		resultConditionState = amd64.ConditionalRegisterStateL
  2994  	case wazeroir.SignedTypeUint64:
  2995  		resultConditionState = amd64.ConditionalRegisterStateB
  2996  		inst = amd64.CMPQ
  2997  	case wazeroir.SignedTypeFloat32:
  2998  		resultConditionState = amd64.ConditionalRegisterStateA
  2999  		inst = amd64.COMISS
  3000  	case wazeroir.SignedTypeFloat64:
  3001  		resultConditionState = amd64.ConditionalRegisterStateA
  3002  		inst = amd64.COMISD
  3003  	}
  3004  	c.assembler.CompileRegisterToRegister(inst, x1.register, x2.register)
  3005  
  3006  	// x1 and x2 are temporary registers only used for the cmp operation. Release them.
  3007  	c.locationStack.releaseRegister(x1)
  3008  	c.locationStack.releaseRegister(x2)
  3009  
  3010  	// Finally, record that the result is on the conditional register.
  3011  	loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState)
  3012  	loc.valueType = runtimeValueTypeI32
  3013  	return nil
  3014  }
  3015  
  3016  // compileGt implements compiler.compileGt for the amd64 architecture.
  3017  func (c *amd64Compiler) compileGt(o *wazeroir.OperationGt) error {
  3018  	x2 := c.locationStack.pop()
  3019  	if err := c.compileEnsureOnRegister(x2); err != nil {
  3020  		return err
  3021  	}
  3022  
  3023  	x1 := c.locationStack.pop()
  3024  	if err := c.compileEnsureOnRegister(x1); err != nil {
  3025  		return err
  3026  	}
  3027  
  3028  	// Emit the compare instruction.
  3029  	var resultConditionState asm.ConditionalRegisterState
  3030  	switch o.Type {
  3031  	case wazeroir.SignedTypeInt32:
  3032  		resultConditionState = amd64.ConditionalRegisterStateG
  3033  		c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register)
  3034  	case wazeroir.SignedTypeUint32:
  3035  		c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register)
  3036  		resultConditionState = amd64.ConditionalRegisterStateA
  3037  	case wazeroir.SignedTypeInt64:
  3038  		c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register)
  3039  		resultConditionState = amd64.ConditionalRegisterStateG
  3040  	case wazeroir.SignedTypeUint64:
  3041  		c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register)
  3042  		resultConditionState = amd64.ConditionalRegisterStateA
  3043  	case wazeroir.SignedTypeFloat32:
  3044  		c.assembler.CompileRegisterToRegister(amd64.UCOMISS, x2.register, x1.register)
  3045  		resultConditionState = amd64.ConditionalRegisterStateA
  3046  	case wazeroir.SignedTypeFloat64:
  3047  		c.assembler.CompileRegisterToRegister(amd64.UCOMISD, x2.register, x1.register)
  3048  		resultConditionState = amd64.ConditionalRegisterStateA
  3049  	}
  3050  
  3051  	// x1 and x2 are temporary registers only used for the cmp operation. Release them.
  3052  	c.locationStack.releaseRegister(x1)
  3053  	c.locationStack.releaseRegister(x2)
  3054  
  3055  	// Finally, record that the result is on the conditional register.
  3056  	loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState)
  3057  	loc.valueType = runtimeValueTypeI32
  3058  	return nil
  3059  }
  3060  
  3061  // compileLe implements compiler.compileLe for the amd64 architecture.
  3062  func (c *amd64Compiler) compileLe(o *wazeroir.OperationLe) error {
  3063  	x2 := c.locationStack.pop()
  3064  	if err := c.compileEnsureOnRegister(x2); err != nil {
  3065  		return err
  3066  	}
  3067  
  3068  	x1 := c.locationStack.pop()
  3069  	if err := c.compileEnsureOnRegister(x1); err != nil {
  3070  		return err
  3071  	}
  3072  
  3073  	// Emit the compare instruction.
  3074  	var inst asm.Instruction
  3075  	var resultConditionState asm.ConditionalRegisterState
  3076  	switch o.Type {
  3077  	case wazeroir.SignedTypeInt32:
  3078  		resultConditionState = amd64.ConditionalRegisterStateLE
  3079  		inst = amd64.CMPL
  3080  	case wazeroir.SignedTypeUint32:
  3081  		resultConditionState = amd64.ConditionalRegisterStateBE
  3082  		inst = amd64.CMPL
  3083  	case wazeroir.SignedTypeInt64:
  3084  		resultConditionState = amd64.ConditionalRegisterStateLE
  3085  		inst = amd64.CMPQ
  3086  	case wazeroir.SignedTypeUint64:
  3087  		resultConditionState = amd64.ConditionalRegisterStateBE
  3088  		inst = amd64.CMPQ
  3089  	case wazeroir.SignedTypeFloat32:
  3090  		resultConditionState = amd64.ConditionalRegisterStateAE
  3091  		inst = amd64.UCOMISS
  3092  	case wazeroir.SignedTypeFloat64:
  3093  		resultConditionState = amd64.ConditionalRegisterStateAE
  3094  		inst = amd64.UCOMISD
  3095  	}
  3096  	c.assembler.CompileRegisterToRegister(inst, x1.register, x2.register)
  3097  
  3098  	// x1 and x2 are temporary registers only used for the cmp operation. Release them.
  3099  	c.locationStack.releaseRegister(x1)
  3100  	c.locationStack.releaseRegister(x2)
  3101  
  3102  	// Finally, record that the result is on the conditional register.
  3103  	loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState)
  3104  	loc.valueType = runtimeValueTypeI32
  3105  	return nil
  3106  }
  3107  
  3108  // compileGe implements compiler.compileGe for the amd64 architecture.
  3109  func (c *amd64Compiler) compileGe(o *wazeroir.OperationGe) error {
  3110  	x2 := c.locationStack.pop()
  3111  	if err := c.compileEnsureOnRegister(x2); err != nil {
  3112  		return err
  3113  	}
  3114  
  3115  	x1 := c.locationStack.pop()
  3116  	if err := c.compileEnsureOnRegister(x1); err != nil {
  3117  		return err
  3118  	}
  3119  
  3120  	// Emit the compare instruction.
  3121  	var resultConditionState asm.ConditionalRegisterState
  3122  	switch o.Type {
  3123  	case wazeroir.SignedTypeInt32:
  3124  		c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register)
  3125  		resultConditionState = amd64.ConditionalRegisterStateGE
  3126  	case wazeroir.SignedTypeUint32:
  3127  		c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register)
  3128  		resultConditionState = amd64.ConditionalRegisterStateAE
  3129  	case wazeroir.SignedTypeInt64:
  3130  		c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register)
  3131  		resultConditionState = amd64.ConditionalRegisterStateGE
  3132  	case wazeroir.SignedTypeUint64:
  3133  		c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register)
  3134  		resultConditionState = amd64.ConditionalRegisterStateAE
  3135  	case wazeroir.SignedTypeFloat32:
  3136  		c.assembler.CompileRegisterToRegister(amd64.COMISS, x2.register, x1.register)
  3137  		resultConditionState = amd64.ConditionalRegisterStateAE
  3138  	case wazeroir.SignedTypeFloat64:
  3139  		c.assembler.CompileRegisterToRegister(amd64.COMISD, x2.register, x1.register)
  3140  		resultConditionState = amd64.ConditionalRegisterStateAE
  3141  	}
  3142  
  3143  	// x1 and x2 are temporary registers only used for the cmp operation. Release them.
  3144  	c.locationStack.releaseRegister(x1)
  3145  	c.locationStack.releaseRegister(x2)
  3146  
  3147  	// Finally, record that the result is on the conditional register.
  3148  	loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState)
  3149  	loc.valueType = runtimeValueTypeI32
  3150  	return nil
  3151  }
  3152  
  3153  // compileLoad implements compiler.compileLoad for the amd64 architecture.
  3154  func (c *amd64Compiler) compileLoad(o *wazeroir.OperationLoad) error {
  3155  	var (
  3156  		isIntType         bool
  3157  		movInst           asm.Instruction
  3158  		targetSizeInBytes int64
  3159  		vt                runtimeValueType
  3160  	)
  3161  	switch o.Type {
  3162  	case wazeroir.UnsignedTypeI32:
  3163  		isIntType = true
  3164  		movInst = amd64.MOVL
  3165  		targetSizeInBytes = 32 / 8
  3166  		vt = runtimeValueTypeI32
  3167  	case wazeroir.UnsignedTypeI64:
  3168  		isIntType = true
  3169  		movInst = amd64.MOVQ
  3170  		targetSizeInBytes = 64 / 8
  3171  		vt = runtimeValueTypeI64
  3172  	case wazeroir.UnsignedTypeF32:
  3173  		isIntType = false
  3174  		movInst = amd64.MOVL
  3175  		targetSizeInBytes = 32 / 8
  3176  		vt = runtimeValueTypeF32
  3177  	case wazeroir.UnsignedTypeF64:
  3178  		isIntType = false
  3179  		movInst = amd64.MOVQ
  3180  		targetSizeInBytes = 64 / 8
  3181  		vt = runtimeValueTypeF64
  3182  	}
  3183  
  3184  	reg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, targetSizeInBytes)
  3185  	if err != nil {
  3186  		return err
  3187  	}
  3188  
  3189  	if isIntType {
  3190  		// For integer types, read the corresponding bytes from the offset to the memory
  3191  		// and store the value to the int register.
  3192  		c.assembler.CompileMemoryWithIndexToRegister(movInst,
  3193  			// we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
  3194  			amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  3195  			reg)
  3196  		c.pushRuntimeValueLocationOnRegister(reg, vt)
  3197  	} else {
  3198  		// For float types, we read the value to the float register.
  3199  		floatReg, err := c.allocateRegister(registerTypeVector)
  3200  		if err != nil {
  3201  			return err
  3202  		}
  3203  		c.assembler.CompileMemoryWithIndexToRegister(movInst,
  3204  			// we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
  3205  			amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  3206  			floatReg)
  3207  		c.pushRuntimeValueLocationOnRegister(floatReg, vt)
  3208  		// We no longer need the int register so mark it unused.
  3209  		c.locationStack.markRegisterUnused(reg)
  3210  	}
  3211  	return nil
  3212  }
  3213  
  3214  // compileLoad8 implements compiler.compileLoad8 for the amd64 architecture.
  3215  func (c *amd64Compiler) compileLoad8(o *wazeroir.OperationLoad8) error {
  3216  	const targetSizeInBytes = 1
  3217  	reg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, targetSizeInBytes)
  3218  	if err != nil {
  3219  		return err
  3220  	}
  3221  
  3222  	// Then move a byte at the offset to the register.
  3223  	// Note that Load8 is only for integer types.
  3224  	var inst asm.Instruction
  3225  	var vt runtimeValueType
  3226  	switch o.Type {
  3227  	case wazeroir.SignedInt32:
  3228  		inst = amd64.MOVBLSX
  3229  		vt = runtimeValueTypeI32
  3230  	case wazeroir.SignedUint32:
  3231  		inst = amd64.MOVBLZX
  3232  		vt = runtimeValueTypeI32
  3233  	case wazeroir.SignedInt64:
  3234  		inst = amd64.MOVBQSX
  3235  		vt = runtimeValueTypeI64
  3236  	case wazeroir.SignedUint64:
  3237  		inst = amd64.MOVBQZX
  3238  		vt = runtimeValueTypeI64
  3239  	}
  3240  
  3241  	c.assembler.CompileMemoryWithIndexToRegister(inst,
  3242  		// we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
  3243  		amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  3244  		reg)
  3245  
  3246  	c.pushRuntimeValueLocationOnRegister(reg, vt)
  3247  	return nil
  3248  }
  3249  
  3250  // compileLoad16 implements compiler.compileLoad16 for the amd64 architecture.
  3251  func (c *amd64Compiler) compileLoad16(o *wazeroir.OperationLoad16) error {
  3252  	const targetSizeInBytes = 16 / 8
  3253  	reg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, targetSizeInBytes)
  3254  	if err != nil {
  3255  		return err
  3256  	}
  3257  
  3258  	// Then move 2 bytes at the offset to the register.
  3259  	// Note that Load16 is only for integer types.
  3260  	var inst asm.Instruction
  3261  	var vt runtimeValueType
  3262  	switch o.Type {
  3263  	case wazeroir.SignedInt32:
  3264  		inst = amd64.MOVWLSX
  3265  		vt = runtimeValueTypeI32
  3266  	case wazeroir.SignedInt64:
  3267  		inst = amd64.MOVWQSX
  3268  		vt = runtimeValueTypeI64
  3269  	case wazeroir.SignedUint32:
  3270  		inst = amd64.MOVWLZX
  3271  		vt = runtimeValueTypeI32
  3272  	case wazeroir.SignedUint64:
  3273  		inst = amd64.MOVWQZX
  3274  		vt = runtimeValueTypeI64
  3275  	}
  3276  
  3277  	c.assembler.CompileMemoryWithIndexToRegister(inst,
  3278  		// we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
  3279  		amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  3280  		reg)
  3281  
  3282  	c.pushRuntimeValueLocationOnRegister(reg, vt)
  3283  	return nil
  3284  }
  3285  
  3286  // compileLoad32 implements compiler.compileLoad32 for the amd64 architecture.
  3287  func (c *amd64Compiler) compileLoad32(o *wazeroir.OperationLoad32) error {
  3288  	const targetSizeInBytes = 32 / 8
  3289  	reg, err := c.compileMemoryAccessCeilSetup(o.Arg.Offset, targetSizeInBytes)
  3290  	if err != nil {
  3291  		return err
  3292  	}
  3293  
  3294  	// Then move 4 bytes at the offset to the register.
  3295  	var inst asm.Instruction
  3296  	if o.Signed {
  3297  		inst = amd64.MOVLQSX
  3298  	} else {
  3299  		inst = amd64.MOVLQZX
  3300  	}
  3301  	c.assembler.CompileMemoryWithIndexToRegister(inst,
  3302  		// We access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
  3303  		amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  3304  		reg)
  3305  	c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI64)
  3306  	return nil
  3307  }
  3308  
  3309  // compileMemoryAccessCeilSetup pops the top value from the stack (called "base"), stores "base + offsetArg + targetSizeInBytes"
  3310  // into a register, and returns the stored register. We call the result "ceil" because we access the memory
  3311  // as memory.Buffer[ceil-targetSizeInBytes: ceil].
  3312  //
  3313  // Note: this also emits the instructions to check the out-of-bounds memory access.
  3314  // In other words, if the ceil exceeds the memory size, the code exits with nativeCallStatusCodeMemoryOutOfBounds status.
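        //
        // For example, an i32.load with offsetArg=8 on base=100 computes ceil = 100 + 8 + 4 = 112, and the
        // subsequent load reads memory.Buffer[108:112] provided 112 does not exceed the memory length.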
  3315  func (c *amd64Compiler) compileMemoryAccessCeilSetup(offsetArg uint32, targetSizeInBytes int64) (asm.Register, error) {
  3316  	base := c.locationStack.pop()
  3317  	if err := c.compileEnsureOnRegister(base); err != nil {
  3318  		return asm.NilRegister, err
  3319  	}
  3320  
  3321  	result := base.register
  3322  	if offsetConst := int64(offsetArg) + targetSizeInBytes; offsetConst <= math.MaxInt32 {
  3323  		c.assembler.CompileConstToRegister(amd64.ADDQ, offsetConst, result)
  3324  	} else if offsetConst <= math.MaxUint32 {
  3325  		// Note: in practice this branch rarely happens, as it means the wasm binary assumes
  3326  		// a memory larger than 2 GiB, or at least tries to access a region above 2 GiB.
  3327  		//
  3328  		// In this case, we cannot directly add the offset to a register with an ADDQ(const) instruction.
  3329  		// That is because the imm32 const is sign-extended to 64-bit in ADDQ(const), and we would end up
  3330  		// turning offsetConst into a negative number, which is wrong.
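        		// For example, offsetConst = 0x8000_0000 used as an ADDQ imm32 would be
        		// sign-extended to -2147483648 rather than +2147483648.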
  3331  		tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3332  		if err != nil {
  3333  			return asm.NilRegister, err
  3334  		}
  3335  		c.assembler.CompileConstToRegister(amd64.MOVL, int64(uint32(offsetConst)), tmp)
  3336  		c.assembler.CompileRegisterToRegister(amd64.ADDQ, tmp, result)
  3337  	} else {
  3338  		// If the offset const is too large, we exit with nativeCallStatusCodeMemoryOutOfBounds.
  3339  		c.compileExitFromNativeCode(nativeCallStatusCodeMemoryOutOfBounds)
  3340  		return result, nil
  3341  	}
  3342  
  3343  	// Now we compare the value with the memory length which is held by callEngine.
  3344  	c.assembler.CompileMemoryToRegister(amd64.CMPQ,
  3345  		amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset, result)
  3346  
  3347  	// Jump if the value is within the memory length.
  3348  	okJmp := c.assembler.CompileJump(amd64.JCC)
  3349  
  3350  	// Otherwise, we exit the function with out-of-bounds status code.
  3351  	c.compileExitFromNativeCode(nativeCallStatusCodeMemoryOutOfBounds)
  3352  
  3353  	c.assembler.SetJumpTargetOnNext(okJmp)
  3354  
  3355  	c.locationStack.markRegisterUnused(result)
  3356  	return result, nil
  3357  }
  3358  
  3359  // compileStore implements compiler.compileStore for the amd64 architecture.
  3360  func (c *amd64Compiler) compileStore(o *wazeroir.OperationStore) error {
  3361  	var movInst asm.Instruction
  3362  	var targetSizeInByte int64
  3363  	switch o.Type {
  3364  	case wazeroir.UnsignedTypeI32, wazeroir.UnsignedTypeF32:
  3365  		movInst = amd64.MOVL
  3366  		targetSizeInByte = 32 / 8
  3367  	case wazeroir.UnsignedTypeI64, wazeroir.UnsignedTypeF64:
  3368  		movInst = amd64.MOVQ
  3369  		targetSizeInByte = 64 / 8
  3370  	}
  3371  	return c.compileStoreImpl(o.Arg.Offset, movInst, targetSizeInByte)
  3372  }
  3373  
  3374  // compileStore8 implements compiler.compileStore8 for the amd64 architecture.
  3375  func (c *amd64Compiler) compileStore8(o *wazeroir.OperationStore8) error {
  3376  	return c.compileStoreImpl(o.Arg.Offset, amd64.MOVB, 1)
  3377  }
  3378  
  3379  // compileStore16 implements compiler.compileStore16 for the amd64 architecture.
  3380  func (c *amd64Compiler) compileStore16(o *wazeroir.OperationStore16) error {
  3381  	return c.compileStoreImpl(o.Arg.Offset, amd64.MOVW, 16/8)
  3382  }
  3383  
  3384  // compileStore32 implements compiler.compileStore32 for the amd64 architecture.
  3385  func (c *amd64Compiler) compileStore32(o *wazeroir.OperationStore32) error {
  3386  	return c.compileStoreImpl(o.Arg.Offset, amd64.MOVL, 32/8)
  3387  }
  3388  
  3389  func (c *amd64Compiler) compileStoreImpl(offsetConst uint32, inst asm.Instruction, targetSizeInBytes int64) error {
  3390  	val := c.locationStack.pop()
  3391  	if err := c.compileEnsureOnRegister(val); err != nil {
  3392  		return err
  3393  	}
  3394  
  3395  	reg, err := c.compileMemoryAccessCeilSetup(offsetConst, targetSizeInBytes)
  3396  	if err != nil {
  3397  		return err
  3398  	}
  3399  
  3400  	c.assembler.CompileRegisterToMemoryWithIndex(
  3401  		inst, val.register,
  3402  		amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  3403  	)
  3404  
  3405  	// We no longer need both the value and base registers.
  3406  	c.locationStack.releaseRegister(val)
  3407  	c.locationStack.markRegisterUnused(reg)
  3408  	return nil
  3409  }
  3410  
  3411  // compileMemoryGrow implements compiler.compileMemoryGrow for the amd64 architecture.
  3412  func (c *amd64Compiler) compileMemoryGrow() error {
  3413  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3414  		return err
  3415  	}
  3416  
  3417  	if err := c.compileCallBuiltinFunction(builtinFunctionIndexMemoryGrow); err != nil {
  3418  		return err
  3419  	}
  3420  
  3421  	// After the function call, we have to initialize the stack base pointer and memory reserved registers.
  3422  	c.compileReservedStackBasePointerInitialization()
  3423  	c.compileReservedMemoryPointerInitialization()
  3424  	return nil
  3425  }
  3426  
  3427  // compileMemorySize implements compiler.compileMemorySize for the amd64 architecture.
  3428  func (c *amd64Compiler) compileMemorySize() error {
  3429  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3430  		return err
  3431  	}
  3432  
  3433  	reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  3434  	if err != nil {
  3435  		return err
  3436  	}
  3437  	loc := c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI32)
  3438  
  3439  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset, loc.register)
  3440  
  3441  	// WebAssembly's memory.size returns the current memory size in units of pages (a page is 65536 bytes).
  3442  	// That is equivalent to dividing the length of the memory slice by 65536,
  3443  	// which can be computed as SHR by 16 bits since 65536 = 2^16.
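        	// For example, a 2 MiB memory has len(memory.Buffer) = 2097152, and 2097152 >> 16 = 32 pages.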
  3444  	c.assembler.CompileConstToRegister(amd64.SHRQ, wasm.MemoryPageSizeInBits, loc.register)
  3445  	return nil
  3446  }
  3447  
  3448  // compileMemoryInit implements compiler.compileMemoryInit for the amd64 architecture.
  3449  func (c *amd64Compiler) compileMemoryInit(o *wazeroir.OperationMemoryInit) error {
  3450  	return c.compileInitImpl(false, o.DataIndex, 0)
  3451  }
  3452  
  3453  // compileInitImpl implements compileTableInit and compileMemoryInit.
  3454  //
  3455  // TODO: the compiled code in this function should be reused and compiled at once, as
  3456  // the code is independent of any module.
  3457  func (c *amd64Compiler) compileInitImpl(isTable bool, index, tableIndex uint32) error {
  3458  	outOfBoundsErrorStatus := nativeCallStatusCodeMemoryOutOfBounds
  3459  	if isTable {
  3460  		outOfBoundsErrorStatus = nativeCallStatusCodeInvalidTableAccess
  3461  	}
  3462  
  3463  	copySize := c.locationStack.pop()
  3464  	if err := c.compileEnsureOnRegister(copySize); err != nil {
  3465  		return err
  3466  	}
  3467  
  3468  	sourceOffset := c.locationStack.pop()
  3469  	if err := c.compileEnsureOnRegister(sourceOffset); err != nil {
  3470  		return err
  3471  	}
  3472  
  3473  	destinationOffset := c.locationStack.pop()
  3474  	if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
  3475  		return err
  3476  	}
  3477  
  3478  	instanceAddr, err := c.allocateRegister(registerTypeGeneralPurpose)
  3479  	if err != nil {
  3480  		return err
  3481  	}
  3482  	c.locationStack.markRegisterUsed(instanceAddr)
  3483  	if isTable {
  3484  		c.compileLoadElemInstanceAddress(index, instanceAddr)
  3485  	} else {
  3486  		c.compileLoadDataInstanceAddress(index, instanceAddr)
  3487  	}
  3488  
  3489  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3490  	if err != nil {
  3491  		return err
  3492  	}
  3493  	c.locationStack.markRegisterUsed(tmp)
  3494  
  3495  	// sourceOffset += size.
  3496  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, sourceOffset.register)
  3497  	// destinationOffset += size.
  3498  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)
  3499  
  3500  	// Check the source instance bounds and, if the source offset exceeds the length, exit with an out-of-bounds error.
  3501  	c.assembler.CompileMemoryToRegister(amd64.CMPQ,
  3502  		instanceAddr, 8, // DataInstance and ElementInstance store their length at offset 8.
  3503  		sourceOffset.register)
  3504  	sourceBoundOKJump := c.assembler.CompileJump(amd64.JCC)
  3505  	c.compileExitFromNativeCode(outOfBoundsErrorStatus)
  3506  	c.assembler.SetJumpTargetOnNext(sourceBoundOKJump)
  3507  
  3508  	// Check the destination bounds and, if the destination offset exceeds the length, exit with an out-of-bounds error.
  3509  	if isTable {
  3510  		// Load the target table's address.
  3511  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
  3512  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(tableIndex*8), tmp)
  3513  		// Compare length.
  3514  		c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, destinationOffset.register)
  3515  	} else {
  3516  		c.assembler.CompileMemoryToRegister(amd64.CMPQ,
  3517  			amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
  3518  			destinationOffset.register)
  3519  	}
  3520  
  3521  	destinationBoundOKJump := c.assembler.CompileJump(amd64.JCC)
  3522  	c.compileExitFromNativeCode(outOfBoundsErrorStatus)
  3523  	c.assembler.SetJumpTargetOnNext(destinationBoundOKJump)
  3524  
  3525  	// Otherwise, we are ready to copy the values from source to destination.
  3526  	//
  3527  	// If the copy size equals zero, we skip the instructions below entirely.
  3528  	c.assembler.CompileRegisterToConst(amd64.CMPQ, copySize.register, 0)
  3529  	skipJump := c.assembler.CompileJump(amd64.JEQ)
  3530  
  3531  	var scale int16
  3532  	var memToReg, regToMem asm.Instruction
  3533  	if isTable {
  3534  		// Each element is of type uintptr; 2^3 = 1 << pointerSizeLog2.
  3535  		c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, sourceOffset.register)
  3536  		c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, destinationOffset.register)
  3537  		// destinationOffset += table buffer's absolute address.
  3538  		c.assembler.CompileMemoryToRegister(amd64.ADDQ,
  3539  			tmp, tableInstanceTableOffset, destinationOffset.register)
  3540  		// sourceOffset += data buffer's absolute address.
  3541  		c.assembler.CompileMemoryToRegister(amd64.ADDQ,
  3542  			instanceAddr, 0, sourceOffset.register)
  3543  
  3544  		// For tables, we move 8 bytes at once.
  3545  		memToReg = amd64.MOVQ
  3546  		regToMem = memToReg
  3547  		scale = 8
  3548  	} else {
  3549  		// destinationOffset += memory buffer's absolute address.
  3550  		c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, destinationOffset.register)
  3551  
  3552  		// sourceOffset += data buffer's absolute address.
  3553  		c.assembler.CompileMemoryToRegister(amd64.ADDQ, instanceAddr, 0, sourceOffset.register)
  3554  
  3555  		// Move one byte at once.
  3556  		memToReg = amd64.MOVBQZX
  3557  		regToMem = amd64.MOVB
  3558  		scale = 1
  3559  	}
  3560  
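        	// The copy below walks a counter from -copySize up to zero, which in Go terms is roughly
        	// (an illustrative sketch over plain byte slices, not part of the generated code):
        	//
        	//	func copyForward(dst, src []byte) {
        	//		n := len(src)
        	//		for i := -n; i != 0; i++ {
        	//			dst[n+i] = src[n+i]
        	//		}
        	//	}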
  3561  	// Negate the counter.
  3562  	c.assembler.CompileNoneToRegister(amd64.NEGQ, copySize.register)
  3563  
  3564  	beginCopyLoop := c.assembler.CompileStandAlone(amd64.NOP)
  3565  
  3566  	c.assembler.CompileMemoryWithIndexToRegister(memToReg,
  3567  		sourceOffset.register, 0, copySize.register, scale,
  3568  		tmp)
  3569  	// [destinationOffset + copySize] = tmp.
  3570  	c.assembler.CompileRegisterToMemoryWithIndex(regToMem,
  3571  		tmp,
  3572  		destinationOffset.register, 0, copySize.register, scale,
  3573  	)
  3574  
  3575  	// copySize += 1.
  3576  	c.assembler.CompileNoneToRegister(amd64.INCQ, copySize.register)
  3577  	c.assembler.CompileJump(amd64.JMI).AssignJumpTarget(beginCopyLoop)
  3578  
  3579  	c.locationStack.markRegisterUnused(copySize.register, sourceOffset.register,
  3580  		destinationOffset.register, instanceAddr, tmp)
  3581  	c.assembler.SetJumpTargetOnNext(skipJump)
  3582  	return nil
  3583  }
  3584  
  3585  // compileDataDrop implements compiler.compileDataDrop for the amd64 architecture.
  3586  func (c *amd64Compiler) compileDataDrop(o *wazeroir.OperationDataDrop) error {
  3587  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3588  		return err
  3589  	}
  3590  
  3591  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3592  	if err != nil {
  3593  		return err
  3594  	}
  3595  
  3596  	c.compileLoadDataInstanceAddress(o.DataIndex, tmp)
  3597  
  3598  	// Clear the content of the DataInstance at o.DataIndex (a []byte).
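        	// A []byte header occupies 24 bytes on amd64 (data pointer, length, capacity),
        	// hence the three 8-byte stores below.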
  3599  	c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 0)
  3600  	c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 8)
  3601  	c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 16)
  3602  	return nil
  3603  }
  3604  
  3605  func (c *amd64Compiler) compileLoadDataInstanceAddress(dataIndex uint32, dst asm.Register) {
  3606  	// dst = dataIndex * dataInstanceStructSize.
  3607  	c.assembler.CompileConstToRegister(amd64.MOVQ, int64(dataIndex)*dataInstanceStructSize, dst)
  3608  
  3609  	// dst = &moduleInstance.DataInstances[0] + dst
  3610  	//     = &moduleInstance.DataInstances[0] + dataIndex*dataInstanceStructSize
  3611  	//     = &moduleInstance.DataInstances[dataIndex]
  3612  	c.assembler.CompileMemoryToRegister(amd64.ADDQ,
  3613  		amd64ReservedRegisterForCallEngine, callEngineModuleContextDataInstancesElement0AddressOffset,
  3614  		dst,
  3615  	)
  3616  }
  3617  
  3618  // compileCopyLoopImpl implements a REP MOVSQ memory copy for the given range with support for both directions.
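        // REP MOVSQ copies RCX quadwords from [RSI] to [RDI], advancing both pointers by 8 each iteration;
        // with the direction flag set via STD it walks backwards instead.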
  3619  func (c *amd64Compiler) compileCopyLoopImpl(destinationOffset, sourceOffset, copySize *runtimeValueLocation, backwards bool, bwOffset uint8) {
  3620  	// Skip if there is nothing to copy.
  3621  	c.assembler.CompileRegisterToConst(amd64.CMPQ, copySize.register, 0)
  3622  	emptyEightGroupsJump := c.assembler.CompileJump(amd64.JEQ)
  3623  
  3624  	// Prepare registers for swaps. There will never be more than 3 XCHGs in total.
  3625  	restoreCrossing := c.compilePreventCrossedTargetRegisters(
  3626  		[]*runtimeValueLocation{destinationOffset, sourceOffset, copySize},
  3627  		[]asm.Register{amd64.RegDI, amd64.RegSI, amd64.RegCX})
  3628  
  3629  	// Prepare registers for REP MOVSQ: copy from rsi to rdi, rcx times.
  3630  	c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
  3631  	c.compileMaybeSwapRegisters(sourceOffset.register, amd64.RegSI)
  3632  	c.compileMaybeSwapRegisters(copySize.register, amd64.RegCX)
  3633  
  3634  	// Point at the first byte of the first quadword to copy.
  3635  	if backwards {
  3636  		c.assembler.CompileConstToRegister(amd64.ADDQ, -int64(bwOffset), amd64.RegDI)
  3637  		c.assembler.CompileConstToRegister(amd64.ADDQ, -int64(bwOffset), amd64.RegSI)
  3638  		// Set REP prefix direction backwards.
  3639  		c.assembler.CompileStandAlone(amd64.STD)
  3640  	}
  3641  
  3642  	c.assembler.CompileStandAlone(amd64.REPMOVSQ)
  3643  
  3644  	if backwards {
  3645  		// Reset direction.
  3646  		c.assembler.CompileStandAlone(amd64.CLD)
  3647  	}
  3648  
  3649  	// Restore registers.
  3650  	c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
  3651  	c.compileMaybeSwapRegisters(sourceOffset.register, amd64.RegSI)
  3652  	c.compileMaybeSwapRegisters(copySize.register, amd64.RegCX)
  3653  	restoreCrossing()
  3654  
  3655  	c.assembler.SetJumpTargetOnNext(emptyEightGroupsJump)
  3656  	c.assembler.CompileStandAlone(amd64.NOP)
  3657  }
  3658  
  3659  // compileMemoryCopyLoopImpl is used for directly copying after bounds/direction check.
  3660  func (c *amd64Compiler) compileMemoryCopyLoopImpl(destinationOffset, sourceOffset, copySize *runtimeValueLocation, tmp asm.Register, backwards bool) {
  3661  	// Point at the first byte to be copied, depending on the direction.
  3662  	if backwards {
  3663  		c.assembler.CompileNoneToRegister(amd64.DECQ, sourceOffset.register)
  3664  		c.assembler.CompileNoneToRegister(amd64.DECQ, destinationOffset.register)
  3665  	} else {
  3666  		c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, sourceOffset.register)
  3667  		c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, destinationOffset.register)
  3668  	}
  3669  
  3670  	// destinationOffset += memory buffer's absolute address.
  3671  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, destinationOffset.register)
  3672  	// sourceOffset += memory buffer's absolute address.
  3673  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, sourceOffset.register)
  3674  
  3675  	// Copy copySize % 8 bytes in a byte-by-byte loop so the rest can be copied in 8-byte groups afterward.
  3676  	beginLoop := c.assembler.CompileStandAlone(amd64.NOP)
  3677  
  3678  	// Check copySize % 8 == 0.
  3679  	c.assembler.CompileConstToRegister(amd64.TESTQ, 7, copySize.register)
  3680  	breakLoop := c.assembler.CompileJump(amd64.JEQ)
  3681  
  3682  	c.assembler.CompileMemoryToRegister(amd64.MOVBQZX, sourceOffset.register, 0, tmp)
  3683  	c.assembler.CompileRegisterToMemory(amd64.MOVB, tmp, destinationOffset.register, 0)
  3684  
  3685  	if backwards {
  3686  		c.assembler.CompileNoneToRegister(amd64.DECQ, sourceOffset.register)
  3687  		c.assembler.CompileNoneToRegister(amd64.DECQ, destinationOffset.register)
  3688  	} else {
  3689  		c.assembler.CompileNoneToRegister(amd64.INCQ, sourceOffset.register)
  3690  		c.assembler.CompileNoneToRegister(amd64.INCQ, destinationOffset.register)
  3691  	}
  3692  
  3693  	c.assembler.CompileNoneToRegister(amd64.DECQ, copySize.register)
  3694  	c.assembler.CompileJump(amd64.JMP).AssignJumpTarget(beginLoop)
  3695  	c.assembler.SetJumpTargetOnNext(breakLoop)
  3696  
  3697  	// compileCopyLoopImpl counts in groups of 8 bytes, so we have to divide the copySize by 8.
  3698  	c.assembler.CompileConstToRegister(amd64.SHRQ, 3, copySize.register)
  3699  
  3700  	c.compileCopyLoopImpl(destinationOffset, sourceOffset, copySize, backwards, 7)
  3701  }
  3702  
  3703  // compileMemoryCopy implements compiler.compileMemoryCopy for the amd64 architecture.
  3704  //
  3705  // This uses efficient `REP MOVSQ` instructions to copy in quadword (8 bytes) batches. The remaining bytes
  3706  // are copied with a simple `MOV` loop. It uses backward copying for overlapped segments.
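        //
        // The direction check is roughly equivalent to the following sketch, where dst, src and n are
        // the destination offset, source offset and copy size:
        //
        //	if dst <= src || src+n <= dst {
        //		copyForwards()  // the ranges do not overlap in a way that forward copying would clobber
        //	} else {
        //		copyBackwards() // dst lies inside [src, src+n), so copy starting from the end
        //	}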
  3707  func (c *amd64Compiler) compileMemoryCopy() error {
  3708  	copySize := c.locationStack.pop()
  3709  	if err := c.compileEnsureOnRegister(copySize); err != nil {
  3710  		return err
  3711  	}
  3712  
  3713  	sourceOffset := c.locationStack.pop()
  3714  	if err := c.compileEnsureOnRegister(sourceOffset); err != nil {
  3715  		return err
  3716  	}
  3717  
  3718  	destinationOffset := c.locationStack.pop()
  3719  	if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
  3720  		return err
  3721  	}
  3722  
  3723  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3724  	if err != nil {
  3725  		return err
  3726  	}
  3727  	c.locationStack.markRegisterUsed(tmp)
  3728  
  3729  	// sourceOffset += size.
  3730  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, sourceOffset.register)
  3731  	// destinationOffset += size.
  3732  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)
  3733  
  3734  	// Check the source bounds and, if the offset exceeds the memory length, exit with an out of bounds error.
  3735  	c.assembler.CompileMemoryToRegister(amd64.CMPQ,
  3736  		amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset, sourceOffset.register)
  3737  	sourceBoundOKJump := c.assembler.CompileJump(amd64.JCC)
  3738  	c.compileExitFromNativeCode(nativeCallStatusCodeMemoryOutOfBounds)
  3739  	c.assembler.SetJumpTargetOnNext(sourceBoundOKJump)
  3740  
  3741  	// Check the destination bounds and, if the offset exceeds the memory length, exit with an out of bounds error.
  3742  	c.assembler.CompileMemoryToRegister(amd64.CMPQ,
  3743  		amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset, destinationOffset.register)
  3744  	destinationBoundOKJump := c.assembler.CompileJump(amd64.JCC)
  3745  	c.compileExitFromNativeCode(nativeCallStatusCodeMemoryOutOfBounds)
  3746  	c.assembler.SetJumpTargetOnNext(destinationBoundOKJump)
  3747  
  3748  	// Skip zero size.
  3749  	c.assembler.CompileRegisterToConst(amd64.CMPQ, copySize.register, 0)
  3750  	skipJump := c.assembler.CompileJump(amd64.JEQ)
  3751  
  3752  	// If dest < source, we can copy forwards.
  3753  	c.assembler.CompileRegisterToRegister(amd64.CMPQ, destinationOffset.register, sourceOffset.register)
  3754  	destLowerThanSourceJump := c.assembler.CompileJump(amd64.JLS)
  3755  
  3756  	// If source + size < dest, we can copy forwards.
  3757  	c.assembler.CompileRegisterToRegister(amd64.MOVQ, destinationOffset.register, tmp)
  3758  	c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, tmp)
  3759  	c.assembler.CompileRegisterToRegister(amd64.CMPQ, sourceOffset.register, tmp)
  3760  	sourceBoundLowerThanDestJump := c.assembler.CompileJump(amd64.JLS)
  3761  
  3762  	// Copy backwards.
  3763  	c.compileMemoryCopyLoopImpl(destinationOffset, sourceOffset, copySize, tmp, true)
  3764  	endJump := c.assembler.CompileJump(amd64.JMP)
  3765  
  3766  	// Copy forwards.
  3767  	c.assembler.SetJumpTargetOnNext(destLowerThanSourceJump, sourceBoundLowerThanDestJump)
  3768  	c.compileMemoryCopyLoopImpl(destinationOffset, sourceOffset, copySize, tmp, false)
  3769  
  3770  	c.locationStack.markRegisterUnused(copySize.register, sourceOffset.register,
  3771  		destinationOffset.register, tmp)
  3772  	c.assembler.SetJumpTargetOnNext(skipJump, endJump)
  3773  
  3774  	return nil
  3775  }
  3776  
  3777  // compileFillLoopImpl implements a REP STOSQ fill loop.
  3778  func (c *amd64Compiler) compileFillLoopImpl(destinationOffset, value, fillSize *runtimeValueLocation, tmp asm.Register, replicateByte bool) {
  3779  	// Skip if nothing to fill.
  3780  	c.assembler.CompileRegisterToConst(amd64.CMPQ, fillSize.register, 0)
  3781  	emptyEightGroupsJump := c.assembler.CompileJump(amd64.JEQ)
  3782  
  3783  	if replicateByte {
  3784  		// Replicate single byte onto full 8-byte register.
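        		// For example, if value.register holds 0x00000000000000AB, it holds 0xABABABABABABABAB
        		// after the IMULQ below: multiplying by 0x0101010101010101 adds the byte shifted into
        		// every byte position, with no carries as long as the value fits in a single byte.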
  3785  		c.assembler.CompileConstToRegister(amd64.MOVQ, 0x0101010101010101, tmp)
  3786  		c.assembler.CompileRegisterToRegister(amd64.IMULQ, tmp, value.register)
  3787  	}
  3788  
  3789  	// Prepare registers for swaps. There will never be more than 3 XCHGs in total.
  3790  	restoreCrossing := c.compilePreventCrossedTargetRegisters(
  3791  		[]*runtimeValueLocation{destinationOffset, value, fillSize},
  3792  		[]asm.Register{amd64.RegDI, amd64.RegAX, amd64.RegCX})
  3793  
  3794  	// Prepare registers for REP STOSQ: fill at [rdi] with rax, rcx times.
  3795  	c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
  3796  	c.compileMaybeSwapRegisters(value.register, amd64.RegAX)
  3797  	c.compileMaybeSwapRegisters(fillSize.register, amd64.RegCX)
  3798  
  3799  	c.assembler.CompileStandAlone(amd64.REPSTOSQ)
  3800  
  3801  	// Restore registers.
  3802  	c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
  3803  	c.compileMaybeSwapRegisters(value.register, amd64.RegAX)
  3804  	c.compileMaybeSwapRegisters(fillSize.register, amd64.RegCX)
  3805  	restoreCrossing()
  3806  
  3807  	c.assembler.SetJumpTargetOnNext(emptyEightGroupsJump)
  3808  }
  3809  
  3810  // compileFillImpl implements the fill logic shared by compileMemoryFill and compileTableFill
  3811  // for the amd64 architecture.
  3812  //
  3813  // This function uses efficient `REP STOSQ` instructions to fill in quadword (8 bytes) batches
  3814  // if the size is above 15 bytes. For smaller sizes, a simple MOVB fill loop is the best option.
  3815  //
  3816  // TODO: the compiled code in this function should be reused and compiled at once, as
  3817  // the code is independent of any module.
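        //
        // For the memory case, the emitted code behaves roughly like the following sketch, where dst,
        // v and n are the destination offset, fill value and fill size:
        //
        //	for n%16 != 0 { *dst = byte(v); dst++; n-- } // byte-by-byte prologue
        //	v *= 0x0101010101010101                      // replicate the byte across all 8 bytes
        //	n >>= 3                                      // count quadwords
        //	REP STOSQ                                    // store n quadwords of v starting at dst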
  3818  func (c *amd64Compiler) compileFillImpl(isTable bool, tableIndex uint32) error {
  3819  	copySize := c.locationStack.pop()
  3820  	if err := c.compileEnsureOnRegister(copySize); err != nil {
  3821  		return err
  3822  	}
  3823  
  3824  	value := c.locationStack.pop()
  3825  	if err := c.compileEnsureOnRegister(value); err != nil {
  3826  		return err
  3827  	}
  3828  
  3829  	destinationOffset := c.locationStack.pop()
  3830  	if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
  3831  		return err
  3832  	}
  3833  
  3834  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3835  	if err != nil {
  3836  		return err
  3837  	}
  3838  	c.locationStack.markRegisterUsed(tmp)
  3839  
  3840  	// destinationOffset += size.
  3841  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)
  3842  
  3843  	// Check the destination bounds and, if the offset exceeds the length, exit with an out of bounds (or invalid table access) error.
  3844  	if isTable {
  3845  		// tmp = &tables[0]
  3846  		c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  3847  			amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3848  			tmp)
  3849  
  3850  		// tmp = [tmp + TableIndex*8]
  3851  		//     = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
  3852  		//     = [&tables[TableIndex]] = tables[TableIndex].
  3853  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(tableIndex)*8, tmp)
  3854  
  3855  		c.assembler.CompileMemoryToRegister(amd64.CMPQ,
  3856  			tmp, tableInstanceTableLenOffset,
  3857  			destinationOffset.register)
  3858  	} else {
  3859  		c.assembler.CompileMemoryToRegister(amd64.CMPQ,
  3860  			amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
  3861  			destinationOffset.register)
  3862  	}
  3863  	destinationBoundOKJump := c.assembler.CompileJump(amd64.JCC)
  3864  	if isTable {
  3865  		c.compileExitFromNativeCode(nativeCallStatusCodeInvalidTableAccess)
  3866  	} else {
  3867  		c.compileExitFromNativeCode(nativeCallStatusCodeMemoryOutOfBounds)
  3868  	}
  3869  	c.assembler.SetJumpTargetOnNext(destinationBoundOKJump)
  3870  
  3871  	// Otherwise, we are ready to fill the destination with the value.
  3872  	//
  3873  	// If the fill size equals zero, we skip all the instructions below.
  3874  	c.assembler.CompileRegisterToConst(amd64.CMPQ, copySize.register, 0)
  3875  	skipJump := c.assembler.CompileJump(amd64.JEQ)
  3876  
  3877  	// destinationOffset -= size.
  3878  	c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, destinationOffset.register)
  3879  
  3880  	if isTable {
  3881  		// Each element is of type uintptr; 2^3 = 1 << pointerSizeLog2.
  3882  		c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, destinationOffset.register)
  3883  		// destinationOffset += table buffer's absolute address.
  3884  		c.assembler.CompileMemoryToRegister(amd64.ADDQ, tmp, tableInstanceTableOffset, destinationOffset.register)
  3885  
  3886  	} else {
  3887  		// destinationOffset += memory buffer's absolute address.
  3888  		c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, destinationOffset.register)
  3889  
  3890  		// Fill the leading bytes one at a time with MOVB until copySize is a multiple of 16.
  3891  		beginCopyLoop := c.assembler.CompileStandAlone(amd64.NOP)
  3892  		c.assembler.CompileConstToRegister(amd64.TESTQ, 15, copySize.register)
  3893  		breakLoop := c.assembler.CompileJump(amd64.JEQ)
  3894  
  3895  		c.assembler.CompileRegisterToMemory(amd64.MOVB, value.register, destinationOffset.register, 0)
  3896  
  3897  		c.assembler.CompileNoneToRegister(amd64.INCQ, destinationOffset.register)
  3898  		c.assembler.CompileNoneToRegister(amd64.DECQ, copySize.register)
  3899  		c.assembler.CompileJump(amd64.JMP).AssignJumpTarget(beginCopyLoop)
  3900  
  3901  		c.assembler.SetJumpTargetOnNext(breakLoop)
  3902  		// compileFillLoopImpl counts in groups of 8 bytes, so we have to divide the copySize by 8.
  3903  		c.assembler.CompileConstToRegister(amd64.SHRQ, 3, copySize.register)
  3904  	}
  3905  
  3906  	c.compileFillLoopImpl(destinationOffset, value, copySize, tmp, !isTable)
  3907  
  3908  	c.locationStack.markRegisterUnused(copySize.register, value.register,
  3909  		destinationOffset.register, tmp)
  3910  	c.assembler.SetJumpTargetOnNext(skipJump)
  3911  	return nil
  3912  }
  3913  
  3914  // compileMemoryFill implements compiler.compileMemoryFill for the amd64 architecture.
  3915  //
  3916  // TODO: the compiled code in this function should be reused and compiled at once, as
  3917  // the code is independent of any module.
  3918  func (c *amd64Compiler) compileMemoryFill() error {
  3919  	return c.compileFillImpl(false, 0)
  3920  }
  3921  
  3922  // compileTableInit implements compiler.compileTableInit for the amd64 architecture.
  3923  func (c *amd64Compiler) compileTableInit(o *wazeroir.OperationTableInit) error {
  3924  	return c.compileInitImpl(true, o.ElemIndex, o.TableIndex)
  3925  }
  3926  
  3927  // compileTableCopyLoopImpl is used for directly copying after bounds/direction check.
  3928  func (c *amd64Compiler) compileTableCopyLoopImpl(o *wazeroir.OperationTableCopy, destinationOffset, sourceOffset, copySize *runtimeValueLocation, tmp asm.Register, backwards bool) {
  3929  	// Point to the first element to be copied when copying forwards.
  3930  	if !backwards {
  3931  		c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, sourceOffset.register)
  3932  		c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, destinationOffset.register)
  3933  	}
  3934  
  3935  	// Each element is of type uintptr; 2^3 = 1 << pointerSizeLog2.
  3936  	c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, sourceOffset.register)
  3937  	c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, destinationOffset.register)
  3938  	// destinationOffset += table buffer's absolute address.
  3939  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
  3940  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(o.DstTableIndex*8), tmp)
  3941  	c.assembler.CompileMemoryToRegister(amd64.ADDQ, tmp, tableInstanceTableOffset, destinationOffset.register)
  3942  	// sourceOffset += table buffer's absolute address.
  3943  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
  3944  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(o.SrcTableIndex*8), tmp)
  3945  	c.assembler.CompileMemoryToRegister(amd64.ADDQ, tmp, tableInstanceTableOffset, sourceOffset.register)
  3946  
  3947  	c.compileCopyLoopImpl(destinationOffset, sourceOffset, copySize, backwards, 8)
  3948  }
  3949  
  3950  // compileTableCopy implements compiler.compileTableCopy for the amd64 architecture.
  3951  //
  3952  // It uses efficient `REP MOVSQ` instructions for optimized copying. It uses backward copying for
  3953  // overlapped segments.
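        //
        // Note that table elements are 8-byte references, so the element offsets are shifted left by
        // pointerSizeLog2 (i.e. multiplied by 8) before being added to the tables' base addresses, and
        // the copy always proceeds in whole quadwords.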
  3954  func (c *amd64Compiler) compileTableCopy(o *wazeroir.OperationTableCopy) error {
  3955  	copySize := c.locationStack.pop()
  3956  	if err := c.compileEnsureOnRegister(copySize); err != nil {
  3957  		return err
  3958  	}
  3959  
  3960  	sourceOffset := c.locationStack.pop()
  3961  	if err := c.compileEnsureOnRegister(sourceOffset); err != nil {
  3962  		return err
  3963  	}
  3964  
  3965  	destinationOffset := c.locationStack.pop()
  3966  	if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
  3967  		return err
  3968  	}
  3969  
  3970  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3971  	if err != nil {
  3972  		return err
  3973  	}
  3974  
  3975  	// sourceOffset += size.
  3976  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, sourceOffset.register)
  3977  	// destinationOffset += size.
  3978  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)
  3979  
  3980  	// Check the source bounds and, if the offset exceeds the table length, exit with an invalid table access error.
  3981  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
  3982  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(o.SrcTableIndex*8), tmp)
  3983  	c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, sourceOffset.register)
  3984  	sourceBoundOKJump := c.assembler.CompileJump(amd64.JCC)
  3985  	c.compileExitFromNativeCode(nativeCallStatusCodeInvalidTableAccess)
  3986  	c.assembler.SetJumpTargetOnNext(sourceBoundOKJump)
  3987  
  3988  	// Check the destination bounds and, if the offset exceeds the table length, exit with an invalid table access error.
  3989  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
  3990  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(o.DstTableIndex*8), tmp)
  3991  	c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, destinationOffset.register)
  3992  	destinationBoundOKJump := c.assembler.CompileJump(amd64.JCC)
  3993  	c.compileExitFromNativeCode(nativeCallStatusCodeInvalidTableAccess)
  3994  	c.assembler.SetJumpTargetOnNext(destinationBoundOKJump)
  3995  
  3996  	// Skip zero size.
  3997  	c.assembler.CompileRegisterToConst(amd64.CMPQ, copySize.register, 0)
  3998  	skipJump := c.assembler.CompileJump(amd64.JEQ)
  3999  
  4000  	// If dest < source, we can copy forwards.
  4001  	c.assembler.CompileRegisterToRegister(amd64.CMPQ, destinationOffset.register, sourceOffset.register)
  4002  	destLowerThanSourceJump := c.assembler.CompileJump(amd64.JLS)
  4003  
  4004  	// If source + size < dest, we can copy forwards.
  4005  	c.assembler.CompileRegisterToRegister(amd64.MOVQ, destinationOffset.register, tmp)
  4006  	c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, tmp)
  4007  	c.assembler.CompileRegisterToRegister(amd64.CMPQ, sourceOffset.register, tmp)
  4008  	sourceBoundLowerThanDestJump := c.assembler.CompileJump(amd64.JLS)
  4009  
  4010  	// Copy backwards.
  4011  	c.compileTableCopyLoopImpl(o, destinationOffset, sourceOffset, copySize, tmp, true)
  4012  	endJump := c.assembler.CompileJump(amd64.JMP)
  4013  
  4014  	// Copy forwards.
  4015  	c.assembler.SetJumpTargetOnNext(destLowerThanSourceJump, sourceBoundLowerThanDestJump)
  4016  	c.compileTableCopyLoopImpl(o, destinationOffset, sourceOffset, copySize, tmp, false)
  4017  
  4018  	c.locationStack.markRegisterUnused(copySize.register, sourceOffset.register,
  4019  		destinationOffset.register, tmp)
  4020  	c.assembler.SetJumpTargetOnNext(skipJump, endJump)
  4021  	return nil
  4022  }
  4023  
  4024  // compileElemDrop implements compiler.compileElemDrop for the amd64 architecture.
  4025  func (c *amd64Compiler) compileElemDrop(o *wazeroir.OperationElemDrop) error {
  4026  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4027  		return err
  4028  	}
  4029  
  4030  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  4031  	if err != nil {
  4032  		return err
  4033  	}
  4034  
  4035  	c.compileLoadElemInstanceAddress(o.ElemIndex, tmp)
  4036  
  4037  	// Clear ElementInstances[o.ElemIndex].References (a []uintptr) by zeroing its data pointer, length, and capacity.
  4038  	c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 0)
  4039  	c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 8)
  4040  	c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 16)
  4041  	return nil
  4042  }
  4043  
  4044  func (c *amd64Compiler) compileLoadElemInstanceAddress(elemIndex uint32, dst asm.Register) {
  4045  	// dst = elemIndex * elementInstanceStructSize
  4046  	c.assembler.CompileConstToRegister(amd64.MOVQ, int64(elemIndex)*elementInstanceStructSize, dst)
  4047  
  4048  	// dst = &moduleInstance.ElementInstances[0] + dst
  4049  	//     = &moduleInstance.ElementInstances[0] + elemIndex*elementInstanceStructSize
  4050  	//     = &moduleInstance.ElementInstances[elemIndex]
  4051  	c.assembler.CompileMemoryToRegister(amd64.ADDQ,
  4052  		amd64ReservedRegisterForCallEngine, callEngineModuleContextElementInstancesElement0AddressOffset,
  4053  		dst,
  4054  	)
  4055  }
  4056  
  4057  // compileTableGet implements compiler.compileTableGet for the amd64 architecture.
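        //
        // The emitted code corresponds roughly to the following sketch, where t is tables[o.TableIndex]:
        //
        //	if offset >= len(t.References) {
        //		exit(nativeCallStatusCodeInvalidTableAccess)
        //	}
        //	push(t.References[offset]) // an opaque 64-bit reference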
  4058  func (c *amd64Compiler) compileTableGet(o *wazeroir.OperationTableGet) error {
  4059  	ref, err := c.allocateRegister(registerTypeGeneralPurpose)
  4060  	if err != nil {
  4061  		return err
  4062  	}
  4063  
  4064  	c.locationStack.markRegisterUsed(ref)
  4065  
  4066  	offset := c.locationStack.pop()
  4067  	if err := c.compileEnsureOnRegister(offset); err != nil {
  4068  		return err
  4069  	}
  4070  
  4071  	// ref = &tables[0]
  4072  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4073  		amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  4074  		ref)
  4075  
  4076  	// ref = [ref + TableIndex*8]
  4077  	//     = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
  4078  	//     = [&tables[TableIndex]] = tables[TableIndex].
  4079  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, ref, int64(o.TableIndex)*8, ref)
  4080  
  4081  	// Out of bounds check.
  4082  	c.assembler.CompileMemoryToRegister(amd64.CMPQ, ref, tableInstanceTableLenOffset, offset.register)
  4083  	boundOKJmp := c.assembler.CompileJump(amd64.JHI)
  4084  	c.compileExitFromNativeCode(nativeCallStatusCodeInvalidTableAccess)
  4085  	c.assembler.SetJumpTargetOnNext(boundOKJmp)
  4086  
  4087  	// ref = [&tables[TableIndex] + tableInstanceTableOffset] = &tables[TableIndex].References[0]
  4088  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, ref, tableInstanceTableOffset, ref)
  4089  
  4090  	// ref = [ref + 0 + offset.register * 8]
  4091  	//     = [&tables[TableIndex].References[0] + sizeOf(uintptr) * offset]
  4092  	//     = [&tables[TableIndex].References[offset]]
  4093  	//     = tables[TableIndex].References[offset]
  4094  	c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVQ, ref,
  4095  		0, offset.register, 8, ref,
  4096  	)
  4097  
  4098  	c.locationStack.markRegisterUnused(offset.register)
  4099  	c.pushRuntimeValueLocationOnRegister(ref, runtimeValueTypeI64) // table elements are opaque 64-bit values at runtime.
  4100  	return nil
  4101  }
  4102  
  4103  // compileTableSet implements compiler.compileTableSet for the amd64 architecture.
  4104  func (c *amd64Compiler) compileTableSet(o *wazeroir.OperationTableSet) error {
  4105  	ref := c.locationStack.pop()
  4106  	if err := c.compileEnsureOnRegister(ref); err != nil {
  4107  		return err
  4108  	}
  4109  
  4110  	offset := c.locationStack.pop()
  4111  	if err := c.compileEnsureOnRegister(offset); err != nil {
  4112  		return err
  4113  	}
  4114  
  4115  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  4116  	if err != nil {
  4117  		return err
  4118  	}
  4119  
  4120  	// tmp = &tables[0]
  4121  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4122  		amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  4123  		tmp)
  4124  
  4125  	// tmp = [tmp + TableIndex*8]
  4126  	//     = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
  4127  	//     = [&tables[TableIndex]] = tables[TableIndex].
  4128  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(o.TableIndex)*8, tmp)
  4129  
  4130  	// Out of bounds check.
  4131  	c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, offset.register)
  4132  	boundOKJmp := c.assembler.CompileJump(amd64.JHI)
  4133  	c.compileExitFromNativeCode(nativeCallStatusCodeInvalidTableAccess)
  4134  	c.assembler.SetJumpTargetOnNext(boundOKJmp)
  4135  
  4136  	// tmp = [&tables[TableIndex] + tableInstanceTableOffset] = &tables[TableIndex].References[0]
  4137  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, tableInstanceTableOffset, tmp)
  4138  
  4139  	// [tmp + 0 + offset.register * 8] = ref
  4140  	// [&tables[TableIndex].References[0] + sizeOf(uintptr) * offset] = ref
  4141  	// [&tables[TableIndex].References[offset]] = ref
  4142  	// tables[TableIndex].References[offset] = ref
  4143  	c.assembler.CompileRegisterToMemoryWithIndex(amd64.MOVQ,
  4144  		ref.register,
  4145  		tmp, 0, offset.register, 8)
  4146  
  4147  	c.locationStack.markRegisterUnused(offset.register, ref.register)
  4148  	return nil
  4149  }
  4150  
  4151  // compileTableGrow implements compiler.compileTableGrow for the amd64 architecture.
  4152  func (c *amd64Compiler) compileTableGrow(o *wazeroir.OperationTableGrow) error {
  4153  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4154  		return err
  4155  	}
  4156  
  4157  	// Pushes the table index.
  4158  	if err := c.compileConstI32(&wazeroir.OperationConstI32{Value: o.TableIndex}); err != nil {
  4159  		return err
  4160  	}
  4161  
  4162  	// Table grow, just like memory grow, cannot be done in assembly as it involves allocation in Go.
  4163  	// Therefore, call out to the builtin function for this purpose.
  4164  	if err := c.compileCallBuiltinFunction(builtinFunctionIndexTableGrow); err != nil {
  4165  		return err
  4166  	}
  4167  
  4168  	// TableGrow consumes three values (table index, number of items, initial value).
  4169  	for i := 0; i < 3; i++ {
  4170  		c.locationStack.pop()
  4171  	}
  4172  
  4173  	// Then, the previous length was pushed as the result.
  4174  	loc := c.locationStack.pushRuntimeValueLocationOnStack()
  4175  	loc.valueType = runtimeValueTypeI32
  4176  
  4177  	// After return, we re-initialize reserved registers just like preamble of functions.
  4178  	c.compileReservedStackBasePointerInitialization()
  4179  	c.compileReservedMemoryPointerInitialization()
  4180  	return nil
  4181  }
  4182  
  4183  // compileTableSize implements compiler.compileTableSize for the amd64 architecture.
  4184  func (c *amd64Compiler) compileTableSize(o *wazeroir.OperationTableSize) error {
  4185  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4186  		return err
  4187  	}
  4188  
  4189  	result, err := c.allocateRegister(registerTypeGeneralPurpose)
  4190  	if err != nil {
  4191  		return err
  4192  	}
  4193  
  4194  	// result = &tables[0]
  4195  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4196  		amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  4197  		result)
  4198  
  4199  	// result = [result + TableIndex*8]
  4200  	//        = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
  4201  	//        = [&tables[TableIndex]] = tables[TableIndex].
  4202  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, result, int64(o.TableIndex)*8, result)
  4203  
  4204  	// result = [result + tableInstanceTableLenOffset]
  4205  	//        = [tables[TableIndex] + tableInstanceTableLenOffset]
  4206  	//        = len(tables[TableIndex])
  4207  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, result, tableInstanceTableLenOffset, result)
  4208  
  4209  	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
  4210  	return nil
  4211  }
  4212  
  4213  // compileTableFill implements compiler.compileTableFill for the amd64 architecture.
  4214  func (c *amd64Compiler) compileTableFill(o *wazeroir.OperationTableFill) error {
  4215  	return c.compileFillImpl(true, o.TableIndex)
  4216  }
  4217  
  4218  // compileRefFunc implements compiler.compileRefFunc for the amd64 architecture.
  4219  func (c *amd64Compiler) compileRefFunc(o *wazeroir.OperationRefFunc) error {
  4220  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4221  		return err
  4222  	}
  4223  
  4224  	ref, err := c.allocateRegister(registerTypeGeneralPurpose)
  4225  	if err != nil {
  4226  		return err
  4227  	}
  4228  
  4229  	// ref = [amd64ReservedRegisterForCallEngine + callEngineModuleContextFunctionsElement0AddressOffset]
  4230  	//     = &moduleEngine.functions[0]
  4231  	c.assembler.CompileMemoryToRegister(
  4232  		amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextFunctionsElement0AddressOffset,
  4233  		ref,
  4234  	)
  4235  
  4236  	// ref = [ref +  int64(o.FunctionIndex)*8]
  4237  	//     = [&moduleEngine.functions[0] + sizeOf(*function) * index]
  4238  	//     = moduleEngine.functions[index]
  4239  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4240  		ref, int64(o.FunctionIndex)*8, // * 8 because the size of *code equals 8 bytes.
  4241  		ref,
  4242  	)
  4243  	c.pushRuntimeValueLocationOnRegister(ref, runtimeValueTypeI64)
  4244  	return nil
  4245  }
  4246  
  4247  // compileConstI32 implements compiler.compileConstI32 for the amd64 architecture.
  4248  func (c *amd64Compiler) compileConstI32(o *wazeroir.OperationConstI32) error {
  4249  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4250  		return err
  4251  	}
  4252  
  4253  	reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  4254  	if err != nil {
  4255  		return err
  4256  	}
  4257  	c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI32)
  4258  	c.assembler.CompileConstToRegister(amd64.MOVL, int64(o.Value), reg)
  4259  	return nil
  4260  }
  4261  
  4262  // compileConstI64 implements compiler.compileConstI64 for the amd64 architecture.
  4263  func (c *amd64Compiler) compileConstI64(o *wazeroir.OperationConstI64) error {
  4264  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4265  		return err
  4266  	}
  4267  
  4268  	reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  4269  	if err != nil {
  4270  		return err
  4271  	}
  4272  	c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI64)
  4273  
  4274  	c.assembler.CompileConstToRegister(amd64.MOVQ, int64(o.Value), reg)
  4275  	return nil
  4276  }
  4277  
  4278  // compileConstF32 implements compiler.compileConstF32 for the amd64 architecture.
  4279  func (c *amd64Compiler) compileConstF32(o *wazeroir.OperationConstF32) error {
  4280  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4281  		return err
  4282  	}
  4283  
  4284  	reg, err := c.allocateRegister(registerTypeVector)
  4285  	if err != nil {
  4286  		return err
  4287  	}
  4288  	c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeF32)
  4289  
  4290  	// We cannot load an immediate value directly into a float register,
  4291  	// so we place it in an integer register first.
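        	// For example, for o.Value == 1.5, math.Float32bits(1.5) == 0x3FC00000, and it is that raw
        	// bit pattern that ends up in the float register.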
  4292  	tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  4293  	if err != nil {
  4294  		return err
  4295  	}
  4296  
  4297  	c.assembler.CompileConstToRegister(amd64.MOVL, int64(math.Float32bits(o.Value)), tmpReg)
  4298  	c.assembler.CompileRegisterToRegister(amd64.MOVL, tmpReg, reg)
  4299  	return nil
  4300  }
  4301  
  4302  // compileConstF64 implements compiler.compileConstF64 for the amd64 architecture.
  4303  func (c *amd64Compiler) compileConstF64(o *wazeroir.OperationConstF64) error {
  4304  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4305  		return err
  4306  	}
  4307  
  4308  	reg, err := c.allocateRegister(registerTypeVector)
  4309  	if err != nil {
  4310  		return err
  4311  	}
  4312  	c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeF64)
  4313  
  4314  	// We cannot load an immediate value directly into a float register,
  4315  	// so we place it in an integer register first.
  4316  	tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  4317  	if err != nil {
  4318  		return err
  4319  	}
  4320  
  4321  	c.assembler.CompileConstToRegister(amd64.MOVQ, int64(math.Float64bits(o.Value)), tmpReg)
  4322  	c.assembler.CompileRegisterToRegister(amd64.MOVQ, tmpReg, reg)
  4323  	return nil
  4324  }
  4325  
  4326  // compileLoadValueOnStackToRegister implements compiler.compileLoadValueOnStackToRegister for amd64.
  4327  func (c *amd64Compiler) compileLoadValueOnStackToRegister(loc *runtimeValueLocation) {
  4328  	var inst asm.Instruction
  4329  	switch loc.valueType {
  4330  	case runtimeValueTypeV128Lo:
  4331  		inst = amd64.MOVDQU
  4332  	case runtimeValueTypeV128Hi:
  4333  		panic("BUG: V128Hi must be loaded to a register along with V128Lo")
  4334  	case runtimeValueTypeI32, runtimeValueTypeF32:
  4335  		inst = amd64.MOVL
  4336  	case runtimeValueTypeI64, runtimeValueTypeF64:
  4337  		inst = amd64.MOVQ
  4338  	default:
  4339  		panic("BUG: unknown runtime value type")
  4340  	}
  4341  
  4342  	// Copy the value from the stack.
  4343  	c.assembler.CompileMemoryToRegister(inst,
  4344  		// Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range.
  4345  		amd64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8,
  4346  		loc.register)
  4347  
  4348  	if loc.valueType == runtimeValueTypeV128Lo {
  4349  		// Higher 64-bits are loaded as well ^^.
  4350  		hi := c.locationStack.stack[loc.stackPointer+1]
  4351  		hi.setRegister(loc.register)
  4352  	}
  4353  }
  4354  
  4355  // maybeCompileMoveTopConditionalToGeneralPurposeRegister moves the top value on the stack
  4356  // to a general purpose register if the value is located on a conditional register.
  4357  //
  4358  // This is usually called at the beginning of methods on the compiler interface where we might
  4359  // compile instructions without saving the conditional register value.
  4360  // The compileXXX functions that do not call this function instead save the conditional
  4361  // value to the stack or a register by invoking compileEnsureOnRegister on the top value.
  4362  func (c *amd64Compiler) maybeCompileMoveTopConditionalToGeneralPurposeRegister() (err error) {
  4363  	if c.locationStack.sp > 0 {
  4364  		if loc := c.locationStack.peek(); loc.onConditionalRegister() {
  4365  			if err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc); err != nil {
  4366  				return err
  4367  			}
  4368  		}
  4369  	}
  4370  	return
  4371  }
  4372  
  4373  // compileLoadConditionalRegisterToGeneralPurposeRegister allocates a general purpose register and
  4374  // saves the conditional register value into it.
  4375  func (c *amd64Compiler) compileLoadConditionalRegisterToGeneralPurposeRegister(loc *runtimeValueLocation) error {
  4376  	reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  4377  	if err != nil {
  4378  		return err
  4379  	}
  4380  	c.compileMoveConditionalToGeneralPurposeRegister(loc, reg)
  4381  	return nil
  4382  }
  4383  
  4384  func (c *amd64Compiler) compileMoveConditionalToGeneralPurposeRegister(loc *runtimeValueLocation, reg asm.Register) {
  4385  	// Set the flag bit to the destination. See
  4386  	// - https://c9x.me/x86/html/file_module_x86_id_288.html
  4387  	// - https://github.com/golang/go/blob/master/src/cmd/internal/obj/x86/asm6.go#L1453-L1468
  4388  	// to translate conditionalRegisterState* to amd64.SET*
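        	// For example, if the preceding comparison set the "equal" flag, the sequence below is
        	// roughly SETEQ reg followed by ANDQ $1, reg: SETcc only writes the lowest byte, and the
        	// masking leaves reg holding 1 if the comparison was equal and 0 otherwise.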
  4389  	var inst asm.Instruction
  4390  	switch loc.conditionalRegister {
  4391  	case amd64.ConditionalRegisterStateE:
  4392  		inst = amd64.SETEQ
  4393  	case amd64.ConditionalRegisterStateNE:
  4394  		inst = amd64.SETNE
  4395  	case amd64.ConditionalRegisterStateS:
  4396  		inst = amd64.SETMI
  4397  	case amd64.ConditionalRegisterStateNS:
  4398  		inst = amd64.SETPL
  4399  	case amd64.ConditionalRegisterStateG:
  4400  		inst = amd64.SETGT
  4401  	case amd64.ConditionalRegisterStateGE:
  4402  		inst = amd64.SETGE
  4403  	case amd64.ConditionalRegisterStateL:
  4404  		inst = amd64.SETLT
  4405  	case amd64.ConditionalRegisterStateLE:
  4406  		inst = amd64.SETLE
  4407  	case amd64.ConditionalRegisterStateA:
  4408  		inst = amd64.SETHI
  4409  	case amd64.ConditionalRegisterStateAE:
  4410  		inst = amd64.SETCC
  4411  	case amd64.ConditionalRegisterStateB:
  4412  		inst = amd64.SETCS
  4413  	case amd64.ConditionalRegisterStateBE:
  4414  		inst = amd64.SETLS
  4415  	}
  4416  
  4417  	c.assembler.CompileNoneToRegister(inst, reg)
  4418  
  4419  	// Then we reset the unnecessary bit.
  4420  	c.assembler.CompileConstToRegister(amd64.ANDQ, 0x1, reg)
  4421  
  4422  	// Mark that this location now uses the register.
  4423  	loc.setRegister(reg)
  4424  	c.locationStack.markRegisterUsed(reg)
  4425  }
  4426  
  4427  // allocateRegister implements compiler.allocateRegister for amd64.
  4428  func (c *amd64Compiler) allocateRegister(t registerType) (reg asm.Register, err error) {
  4429  	var ok bool
  4430  	// Try to get the unused register.
  4431  	reg, ok = c.locationStack.takeFreeRegister(t)
  4432  	if ok {
  4433  		return
  4434  	}
  4435  
  4436  	// If not found, we have to steal the register.
  4437  	stealTarget, ok := c.locationStack.takeStealTargetFromUsedRegister(t)
  4438  	if !ok {
  4439  		err = fmt.Errorf("cannot steal register")
  4440  		return
  4441  	}
  4442  
  4443  	// Release the steal target register value onto stack location.
  4444  	reg = stealTarget.register
  4445  	c.compileReleaseRegisterToStack(stealTarget)
  4446  	return
  4447  }
  4448  
  4449  // compileCallFunctionImpl adds instructions to call the function whose address is held in functionAddressRegister.
  4450  //
  4451  // Note: this is the counterpart of compileReturnFunction; see the comments there as well
  4452  // to understand how function calls are achieved.
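        //
        // Conceptually, the emitted call sequence is (sketch only):
        //
        //	push callFrame{returnAddress, callerStackBasePointerInBytes, callerFunction}
        //	callEngine.stackBasePointerInBytes += (sp - len(params)) * 8
        //	callEngine.moduleContext.fn = target
        //	jmp target.codeInitialAddress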
  4453  func (c *amd64Compiler) compileCallFunctionImpl(functionAddressRegister asm.Register, functype *wasm.FunctionType) error {
  4454  	// Release all the registers as our calling convention requires the caller-save.
  4455  	if err := c.compileReleaseAllRegistersToStack(); err != nil {
  4456  		return err
  4457  	}
  4458  
  4459  	c.locationStack.markRegisterUsed(functionAddressRegister)
  4460  
  4461  	// Obtain a temporary register to be used in the following steps.
  4462  	tmpRegister, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  4463  	if !found {
  4464  		// This in theory never happens as all the registers must be free except functionAddressRegister.
  4465  		return fmt.Errorf("could not find enough free registers")
  4466  	}
  4467  
  4468  	// The stack should look like:
  4469  	//
  4470  	//               reserved slots for results (if len(results) > len(args))
  4471  	//                      |     |
  4472  	//    ,arg0, ..., argN, ..., _, .returnAddress, .returnStackBasePointerInBytes, .function, ....
  4473  	//      |                       |                                                        |
  4474  	//      |             callFrame{^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^}
  4475  	//      |
  4476  	// nextStackBasePointerOffset
  4477  	//
  4478  	// where callFrame is used to return to this currently executed function.
  4479  
  4480  	nextStackBasePointerOffset := int64(c.locationStack.sp) - int64(functype.ParamNumInUint64)
  4481  
  4482  	callFrameReturnAddressLoc, callFrameStackBasePointerInBytesLoc, callFrameFunctionLoc := c.locationStack.pushCallFrame(functype)
  4483  
  4484  	// Save the current stack base pointer at callFrameStackBasePointerInBytesLoc.
  4485  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4486  		amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
  4487  		tmpRegister)
  4488  	callFrameStackBasePointerInBytesLoc.setRegister(tmpRegister)
  4489  	c.compileReleaseRegisterToStack(callFrameStackBasePointerInBytesLoc)
  4490  
  4491  	// Set callEngine.stackContext.stackBasePointer for the next function.
  4492  	c.assembler.CompileConstToRegister(amd64.ADDQ, nextStackBasePointerOffset<<3, tmpRegister)
  4493  
  4494  	// Write the calculated value to callEngine.stackContext.stackBasePointer.
  4495  	c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister,
  4496  		amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset)
  4497  
  4498  	// Save the currently executed *function (placed at callEngine.moduleContext.fn) into callFrameFunctionLoc.
  4499  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4500  		amd64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset,
  4501  		tmpRegister)
  4502  	callFrameFunctionLoc.setRegister(tmpRegister)
  4503  	c.compileReleaseRegisterToStack(callFrameFunctionLoc)
  4504  
  4505  	// Set callEngine.moduleContext.fn to the next *function.
  4506  	c.assembler.CompileRegisterToMemory(amd64.MOVQ, functionAddressRegister,
  4507  		amd64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset)
  4508  
  4509  	// Write the return address into callFrameReturnAddressLoc.
  4510  	c.assembler.CompileReadInstructionAddress(tmpRegister, amd64.JMP)
  4511  	callFrameReturnAddressLoc.setRegister(tmpRegister)
  4512  	c.compileReleaseRegisterToStack(callFrameReturnAddressLoc)
  4513  
  4514  	if amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister == functionAddressRegister {
  4515  		// In this case, we must move the value in functionAddressRegister to another register, otherwise
  4516  		// the address (the jump target below) would be modified and result in a segfault.
  4517  		// See #526.
  4518  		c.assembler.CompileRegisterToRegister(amd64.MOVQ, functionAddressRegister, tmpRegister)
  4519  		functionAddressRegister = tmpRegister
  4520  	}
  4521  
  4522  	// Also, we have to put the target function's *wasm.ModuleInstance into amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister.
  4523  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, functionAddressRegister, functionModuleInstanceAddressOffset,
  4524  		amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
  4525  
  4526  	// And jump into the initial address of the target function.
  4527  	c.assembler.CompileJumpToMemory(amd64.JMP, functionAddressRegister, functionCodeInitialAddressOffset)
  4528  
  4529  	// All the registers used are temporary, so we mark them unused.
  4530  	c.locationStack.markRegisterUnused(tmpRegister, functionAddressRegister)
  4531  
  4532  	// On the function return, we have to initialize the state.
  4533  	if err := c.compileModuleContextInitialization(); err != nil {
  4534  		return err
  4535  	}
  4536  
  4537  	// Due to the change to callEngine.stackContext.stackBasePointer.
  4538  	c.compileReservedStackBasePointerInitialization()
  4539  
  4540  	// Due to the change to callEngine.moduleContext.moduleInstanceAddress as that might result in
  4541  	// the memory instance manipulation.
  4542  	c.compileReservedMemoryPointerInitialization()
  4543  
  4544  	// We consumed the function parameters, the call frame stack and reserved slots during the call.
  4545  	c.locationStack.sp = uint64(nextStackBasePointerOffset)
  4546  
  4547  	// Now the function results are pushed by the call.
  4548  	for _, t := range functype.Results {
  4549  		loc := c.locationStack.pushRuntimeValueLocationOnStack()
  4550  		switch t {
  4551  		case wasm.ValueTypeI32:
  4552  			loc.valueType = runtimeValueTypeI32
  4553  		case wasm.ValueTypeI64, wasm.ValueTypeFuncref, wasm.ValueTypeExternref:
  4554  			loc.valueType = runtimeValueTypeI64
  4555  		case wasm.ValueTypeF32:
  4556  			loc.valueType = runtimeValueTypeF32
  4557  		case wasm.ValueTypeF64:
  4558  			loc.valueType = runtimeValueTypeF64
  4559  		case wasm.ValueTypeV128:
  4560  			loc.valueType = runtimeValueTypeV128Lo
  4561  			hi := c.locationStack.pushRuntimeValueLocationOnStack()
  4562  			hi.valueType = runtimeValueTypeV128Hi
  4563  		default:
  4564  			panic("BUG: invalid type: " + wasm.ValueTypeName(t))
  4565  		}
  4566  	}
  4567  	return nil
  4568  }
  4569  
  4570  // compileReturnFunction adds instructions to return from the current call frame back to the caller's frame.
  4571  // If the current frame is the origin, we return to callEngine.execWasmFunction with the Returned status.
  4572  // Otherwise, we jump to the caller's return address stored in callFrame.returnAddress while setting
  4573  // up all the necessary changes on the callEngine's state.
  4574  //
  4575  // Note: this is the counterpart of compileCallFunctionImpl; see the comments there as well
  4576  // to understand how function calls are achieved.
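        //
        // Conceptually, the emitted return sequence is (sketch only):
        //
        //	if returnAddress == 0 {
        //		exit(nativeCallStatusCodeReturned)
        //	}
        //	callEngine.stackBasePointerInBytes = callerStackBasePointerInBytes
        //	callEngine.moduleContext.fn = callerFunction
        //	jmp returnAddress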
  4577  func (c *amd64Compiler) compileReturnFunction() error {
  4578  	// Release all the registers as our calling convention requires the caller-save.
  4579  	if err := c.compileReleaseAllRegistersToStack(); err != nil {
  4580  		return err
  4581  	}
  4582  
  4583  	if c.withListener {
  4584  		if err := c.compileCallBuiltinFunction(builtinFunctionIndexFunctionListenerAfter); err != nil {
  4585  			return err
  4586  		}
  4587  		// After return, we re-initialize the stack base pointer as that is used to return to the caller below.
  4588  		c.compileReservedStackBasePointerInitialization()
  4589  	}
  4590  
  4591  	// amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister holds the module instance's address
  4592  	// so mark it used so that it won't be used as a free register.
  4593  	c.locationStack.markRegisterUsed(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
  4594  	defer c.locationStack.markRegisterUnused(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
  4595  
  4596  	// Obtain a temporary register to be used in the following.
  4597  	returnAddressRegister, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  4598  	if !found {
  4599  		panic("BUG: all the registers should be free at this point: " + c.locationStack.String())
  4600  	}
  4601  
  4602  	returnAddress, callerStackBasePointerInBytes, callerFunction := c.locationStack.getCallFrameLocations(c.ir.Signature)
  4603  
  4604  	// A zero return address means return from the execution.
  4605  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4606  		amd64ReservedRegisterForStackBasePointerAddress, int64(returnAddress.stackPointer)*8,
  4607  		returnAddressRegister,
  4608  	)
  4609  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, returnAddressRegister, returnAddressRegister)
  4610  
  4611  	jmpIfNotReturn := c.assembler.CompileJump(amd64.JNE)
  4612  	c.compileExitFromNativeCode(nativeCallStatusCodeReturned)
  4613  
  4614  	// Otherwise, we return to the caller.
  4615  	c.assembler.SetJumpTargetOnNext(jmpIfNotReturn)
  4616  
  4617  	// Alias for readability.
  4618  	tmpRegister := amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister
  4619  
  4620  	// First, restore the stackContext.stackBasePointerInBytesOffset from callerStackBasePointerInBytes.
  4621  	callerStackBasePointerInBytes.setRegister(tmpRegister)
  4622  	c.compileLoadValueOnStackToRegister(callerStackBasePointerInBytes)
  4623  	c.assembler.CompileRegisterToMemory(amd64.MOVQ,
  4624  		tmpRegister, amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset)
  4625  
  4626  	// Next, restore moduleContext.fn from callerFunction.
  4627  	callerFunction.setRegister(tmpRegister)
  4628  	c.compileLoadValueOnStackToRegister(callerFunction)
  4629  	c.assembler.CompileRegisterToMemory(amd64.MOVQ,
  4630  		tmpRegister, amd64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset)
  4631  
  4632  	// Also, we have to put the target function's *wasm.ModuleInstance into amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister.
  4633  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4634  		tmpRegister, functionModuleInstanceAddressOffset,
  4635  		amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
  4636  
  4637  	// Then, jump into the return address!
  4638  	c.assembler.CompileJumpToRegister(amd64.JMP, returnAddressRegister)
  4639  	return nil
  4640  }
  4641  
  4642  func (c *amd64Compiler) compileCallGoHostFunction() error {
  4643  	return c.compileCallGoFunction(nativeCallStatusCodeCallGoHostFunction)
  4644  }
  4645  
  4646  func (c *amd64Compiler) compileCallBuiltinFunction(index wasm.Index) error {
  4647  	// Set the builtin function index into callEngine.exitContext's builtin function call index.
  4648  	c.assembler.CompileConstToMemory(amd64.MOVL, int64(index), amd64ReservedRegisterForCallEngine, callEngineExitContextBuiltinFunctionCallIndexOffset)
  4649  	return c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction)
  4650  }
  4651  
  4652  func (c *amd64Compiler) compileCallGoFunction(compilerStatus nativeCallStatusCode) error {
  4653  	// Release all the registers as our calling convention requires the caller-save.
  4654  	if err := c.compileReleaseAllRegistersToStack(); err != nil {
  4655  		return err
  4656  	}
  4657  
  4658  	// Read the return address, and write it to callEngine.exitContext.returnAddress.
  4659  	returnAddressReg, ok := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  4660  	if !ok {
  4661  		panic("BUG: cannot take free register")
  4662  	}
  4663  	c.assembler.CompileReadInstructionAddress(returnAddressReg, amd64.RET)
  4664  	c.assembler.CompileRegisterToMemory(amd64.MOVQ,
  4665  		returnAddressReg, amd64ReservedRegisterForCallEngine, callEngineExitContextReturnAddressOffset)
  4666  
  4667  	c.compileExitFromNativeCode(compilerStatus)
  4668  	return nil
  4669  }
  4670  
  4671  // compileReleaseAllRegistersToStack adds the instructions to release all the live values
  4672  // in the value location stack at this point onto their stack memory locations.
  4673  func (c *amd64Compiler) compileReleaseAllRegistersToStack() (err error) {
  4674  	for i := uint64(0); i < c.locationStack.sp; i++ {
  4675  		if loc := c.locationStack.stack[i]; loc.onRegister() {
  4676  			c.compileReleaseRegisterToStack(loc)
  4677  		} else if loc.onConditionalRegister() {
  4678  			if err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc); err != nil {
  4679  				return
  4680  			}
  4681  			c.compileReleaseRegisterToStack(loc)
  4682  		}
  4683  	}
  4684  	return
  4685  }
  4686  
  4687  func (c *amd64Compiler) onValueReleaseRegisterToStack(reg asm.Register) {
  4688  	for i := uint64(0); i < c.locationStack.sp; i++ {
  4689  		prevValue := c.locationStack.stack[i]
  4690  		if prevValue.register == reg {
  4691  			c.compileReleaseRegisterToStack(prevValue)
  4692  			break
  4693  		}
  4694  	}
  4695  }
  4696  
  4697  // compileReleaseRegisterToStack implements compiler.compileReleaseRegisterToStack for amd64.
  4698  func (c *amd64Compiler) compileReleaseRegisterToStack(loc *runtimeValueLocation) {
  4699  	var inst asm.Instruction
  4700  	switch loc.valueType {
  4701  	case runtimeValueTypeV128Lo:
  4702  		inst = amd64.MOVDQU
  4703  	case runtimeValueTypeV128Hi:
  4704  		panic("BUG: V128Hi must be released to the stack along with V128Lo")
  4705  	case runtimeValueTypeI32, runtimeValueTypeF32:
  4706  		inst = amd64.MOVL
  4707  	case runtimeValueTypeI64, runtimeValueTypeF64:
  4708  		inst = amd64.MOVQ
  4709  	default:
  4710  		panic("BUG: unknown runtime value type")
  4711  	}
  4712  
  4713  	c.assembler.CompileRegisterToMemory(inst, loc.register,
  4714  		// Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range.
  4715  		amd64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)
  4716  
  4717  	// Mark the register as free.
  4718  	c.locationStack.releaseRegister(loc)
  4719  
  4720  	if loc.valueType == runtimeValueTypeV128Lo {
  4721  		// Higher 64-bits are released as well ^^.
  4722  		hi := c.locationStack.stack[loc.stackPointer+1]
  4723  		c.locationStack.releaseRegister(hi)
  4724  	}
  4725  }
  4726  
  4727  func (c *amd64Compiler) compileExitFromNativeCode(status nativeCallStatusCode) {
  4728  	c.assembler.CompileConstToMemory(amd64.MOVB, int64(status),
  4729  		amd64ReservedRegisterForCallEngine, callEngineExitContextNativeCallStatusCodeOffset)
  4730  
  4731  	// Write back the cached SP to the actual callEngine.stackContext.stackPointer.
  4732  	c.assembler.CompileConstToMemory(amd64.MOVQ, int64(c.locationStack.sp),
  4733  		amd64ReservedRegisterForCallEngine, callEngineStackContextStackPointerOffset)
  4734  
  4735  	c.assembler.CompileStandAlone(amd64.RET)
  4736  }
  4737  
  4738  func (c *amd64Compiler) compilePreamble() (err error) {
  4739  	// We assume all function parameters are already pushed onto the stack by
  4740  	// the caller.
  4741  	c.locationStack.init(c.ir.Signature)
  4742  
  4743  	if err := c.compileModuleContextInitialization(); err != nil {
  4744  		return err
  4745  	}
  4746  
  4747  	// Check if it's necessary to grow the value stack by using max stack pointer.
  4748  	if err = c.compileMaybeGrowStack(); err != nil {
  4749  		return err
  4750  	}
  4751  
  4752  	if c.withListener {
  4753  		if err = c.compileCallBuiltinFunction(builtinFunctionIndexFunctionListenerBefore); err != nil {
  4754  			return err
  4755  		}
  4756  	}
  4757  
  4758  	c.compileReservedStackBasePointerInitialization()
  4759  
  4760  	// Finally, we initialize the reserved memory register based on the module context.
  4761  	c.compileReservedMemoryPointerInitialization()
  4762  	return
  4763  }
  4764  
  4765  func (c *amd64Compiler) compileReservedStackBasePointerInitialization() {
  4766  	// First, make amd64ReservedRegisterForStackBasePointerAddress point to the beginning of the slice backing array.
  4767  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4768  		amd64ReservedRegisterForCallEngine, callEngineStackContextStackElement0AddressOffset,
  4769  		amd64ReservedRegisterForStackBasePointerAddress)
  4770  
  4771  	// Next, add the base pointer offset (callEngine.stackBasePointerInBytes) so the register points to the current frame's base.
  4772  	c.assembler.CompileMemoryToRegister(amd64.ADDQ,
  4773  		amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
  4774  		amd64ReservedRegisterForStackBasePointerAddress,
  4775  	)
  4776  }
  4777  
  4778  func (c *amd64Compiler) compileReservedMemoryPointerInitialization() {
  4779  	if c.ir.HasMemory || c.ir.UsesMemory {
  4780  		c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4781  			amd64ReservedRegisterForCallEngine, callEngineModuleContextMemoryElement0AddressOffset,
  4782  			amd64ReservedRegisterForMemory,
  4783  		)
  4784  	}
  4785  }
  4786  
  4787  // compileMaybeGrowStack adds instructions to check the necessity to grow the value stack,
  4788  // and if so, make the builtin function call to do so. These instructions are called in the function's
  4789  // preamble.
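        //
        // The emitted check is roughly:
        //
        //	if callEngine.stackLenInBytes-callEngine.stackBasePointerInBytes < stackPointerCeil*8 {
        //		builtinFunctionGrowStack()
        //	}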
  4790  func (c *amd64Compiler) compileMaybeGrowStack() error {
  4791  	tmpRegister, ok := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  4792  	if !ok {
  4793  		panic("BUG: cannot take free register")
  4794  	}
  4795  
  4796  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4797  		amd64ReservedRegisterForCallEngine, callEngineStackContextStackLenInBytesOffset, tmpRegister)
  4798  	c.assembler.CompileMemoryToRegister(amd64.SUBQ,
  4799  		amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset, tmpRegister)
  4800  
  4801  	// If stack base pointer + max stack pointer > stackLen, we need to grow the stack.
  4802  	cmpWithStackPointerCeil := c.assembler.CompileRegisterToConst(amd64.CMPQ, tmpRegister, 0)
  4803  	c.onStackPointerCeilDeterminedCallBack = func(stackPointerCeil uint64) {
  4804  		cmpWithStackPointerCeil.AssignDestinationConstant(int64(stackPointerCeil) << 3)
  4805  	}
  4806  
  4807  	// Jump if we have no need to grow.
  4808  	jmpIfNoNeedToGrowStack := c.assembler.CompileJump(amd64.JCC)
  4809  
  4810  	// Otherwise, we have to make the builtin function call to grow the call stack.
  4811  	if err := c.compileCallBuiltinFunction(builtinFunctionIndexGrowStack); err != nil {
  4812  		return err
  4813  	}
  4814  
  4815  	c.assembler.SetJumpTargetOnNext(jmpIfNoNeedToGrowStack)
  4816  	return nil
  4817  }
  4818  
  4819  // compileModuleContextInitialization adds instructions to initialize callEngine.ModuleContext's fields based on
  4820  // callEngine.ModuleContext.ModuleInstanceAddress.
  4821  // This is called in two cases: in function preamble, and on the return from (non-Go) function calls.
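        //
        // Roughly, the emitted code does the following (sketch only):
        //
        //	if callEngine.moduleContext.moduleInstanceAddress == currentModuleInstanceAddress {
        //		return // fast path: the module has not changed since the last call
        //	}
        //	// Otherwise refresh the cached pointers (globals, tables, memory, data/element instances, ...).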
  4822  func (c *amd64Compiler) compileModuleContextInitialization() error {
  4823  	// amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister holds the module instance's address
  4824  	// so mark it used so that it won't be used as a free register until the module context initialization finishes.
  4825  	c.locationStack.markRegisterUsed(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
  4826  	defer c.locationStack.markRegisterUnused(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
  4827  
  4828  	// Obtain the temporary registers to be used in the following steps.
  4829  	regs, found := c.locationStack.takeFreeRegisters(registerTypeGeneralPurpose, 2)
  4830  	if !found {
  4831  		// This in theory never happens, as all registers must be free except the module instance address register marked used above.
  4832  		return fmt.Errorf("could not find enough free registers")
  4833  	}
  4834  	c.locationStack.markRegisterUsed(regs...)
  4835  
  4836  	// Alias these free tmp registers for readability.
  4837  	tmpRegister, tmpRegister2 := regs[0], regs[1]
  4838  
  4839  	// If the module instance address stays the same, we can skip the entire code below.
  4840  	// The rationale for this is that, in almost all use cases, users instantiate a single
  4841  	// Wasm binary and run functions from it, rather than importing/exporting across multiple
  4842  	// binaries. As a result, the cmp-and-jmp sequence below should be easy for the x64 CPU
  4843  	// to branch-predict, since the jump is taken nearly 100% of the time across function calls.
  4844  	c.assembler.CompileMemoryToRegister(amd64.CMPQ,
  4845  		amd64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceAddressOffset, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
  4846  	jmpIfModuleNotChange := c.assembler.CompileJump(amd64.JEQ)
  4847  
  4848  	// If callEngine.moduleContext.ModuleInstanceAddress is not equal to the value in amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister,
  4849  	// we have to put the new value there.
  4850  	c.assembler.CompileRegisterToMemory(amd64.MOVQ, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister,
  4851  		amd64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceAddressOffset)
  4852  
  4853  	// Also, we have to update the following fields:
  4854  	// * callEngine.moduleContext.globalElement0Address
  4855  	// * callEngine.moduleContext.tablesElement0Address
  4856  	// * callEngine.moduleContext.memoryInstance
  4857  	// * callEngine.moduleContext.memoryElement0Address
  4858  	// * callEngine.moduleContext.memorySliceLen
  4859  	// * callEngine.moduleContext.functionsElement0Address
  4860  	// * callEngine.moduleContext.typeIDsElement0Address
  4861  	// * callEngine.moduleContext.dataInstancesElement0Address
  4862  	// * callEngine.moduleContext.elementInstancesElement0Address
  4863  
  4864  	// Update globalElement0Address.
  4865  	//
  4866  	// Note: if there's a global.get or global.set instruction in the function, the existence of the globals
  4867  	// is ensured by function validation at the module instantiation phase, and that's why it is ok to
  4868  	// skip the initialization if the module's globals slice is empty.
  4869  	if len(c.ir.Globals) > 0 {
  4870  		// Since ModuleInstance.Globals is a slice ([]*globalInstance), the slice header's data pointer,
  4871  		// i.e. the address of the first item in the underlying array, is stored at the globals offset.
  4872  		// See https://go.dev/blog/slices-intro if unfamiliar.
  4873  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceGlobalsOffset, tmpRegister)
  4874  
  4875  		c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister, amd64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset)
  4876  	}
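        	// For reference, a Go slice header occupies three words, roughly
        	//
        	//	type sliceHeader struct {
        	//		data unsafe.Pointer // &slice[0]
        	//		len  int
        	//		cap  int
        	//	}
        	//
        	// so a single 8-byte load at a slice field's offset yields the address of its first element.
        	// This layout assumption underlies every *Element0Address update in this function.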
  4877  
  4878  	// Update tablesElement0Address and typeIDsElement0Address.
  4879  	//
  4880  	// Note: if there's a table instruction in the function, the existence of the table
  4881  	// is ensured by function validation at the module instantiation phase, and that's
  4882  	// why it is ok to skip the initialization if the module has no table.
  4883  	if c.ir.HasTable {
  4884  		// First, read the data pointer of the ModuleInstance.Tables slice header, i.e. &ModuleInstance.Tables[0].
  4885  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceTablesOffset, tmpRegister)
  4886  
  4887  		// At this point, tmpRegister holds &ModuleInstance.Tables[0], so we can store it
  4888  		// directly into callEngine.moduleContext.tablesElement0Address without going through
  4889  		// a second temporary register.
  4890  		c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister,
  4891  			amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset)
  4892  
  4893  		// Finally, we put &ModuleInstance.TypeIDs[0] into moduleContext.typeIDsElement0Address.
  4894  		c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4895  			amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceTypeIDsOffset, tmpRegister)
  4896  		c.assembler.CompileRegisterToMemory(amd64.MOVQ,
  4897  			tmpRegister, amd64ReservedRegisterForCallEngine, callEngineModuleContextTypeIDsElement0AddressOffset)
  4898  	}
  4899  
  4900  	// Update memoryInstance, memoryElement0Address, and memorySliceLen.
  4901  	//
  4902  	// Note: if there's a memory instruction in the function, the memory instance must be non-nil.
  4903  	// That is ensured by function validation at the module instantiation phase, and that's
  4904  	// why it is ok to skip the initialization if the module's memory instance is nil.
  4905  	if c.ir.HasMemory {
  4906  		c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4907  			amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceMemoryOffset,
  4908  			tmpRegister)
  4909  
  4910  		// Set memory instance.
  4911  		c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister,
  4912  			amd64ReservedRegisterForCallEngine, callEngineModuleContextMemoryInstanceOffset)
  4913  
  4914  		// Set length.
  4915  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmpRegister, memoryInstanceBufferLenOffset, tmpRegister2)
  4916  		c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister2,
  4917  			amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset)
  4918  
  4919  		// Set element zero address.
  4920  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmpRegister, memoryInstanceBufferOffset, tmpRegister2)
  4921  		c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister2,
  4922  			amd64ReservedRegisterForCallEngine, callEngineModuleContextMemoryElement0AddressOffset)
  4923  	}
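        	// The loads and stores above correspond, in Go terms, to roughly the following (an
        	// illustrative sketch only; the field names paraphrase the offset constants used above):
        	//
        	//	mem := moduleInstance.Memory
        	//	ce.moduleContext.memoryInstance = mem
        	//	ce.moduleContext.memorySliceLen = uint64(len(mem.Buffer))
        	//	ce.moduleContext.memoryElement0Address = &mem.Buffer[0]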
  4924  
  4925  	// Update moduleContext.functionsElement0Address.
  4926  	{
  4927  		// "tmpRegister = [moduleInstanceAddressRegister + moduleInstanceEngineOffset + interfaceDataOffset] (== *moduleEngine)"
  4928  		//
  4929  		// A Go interface value is laid out in memory as two quad words, struct {tab, data uintptr},
  4930  		// where tab points to the interface table and data points to the actual
  4931  		// implementation of the interface. Here, we extract the "data" pointer as the *moduleEngine.
  4932  		// See the following references for detail:
  4933  		// * https://research.swtch.com/interfaces
  4934  		// * https://github.com/golang/go/blob/release-branch.go1.17/src/runtime/runtime2.go#L207-L210
  4935  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceEngineOffset+interfaceDataOffset, tmpRegister)
  4936  
  4937  		// "tmpRegister = [tmpRegister + moduleEngineFunctionsOffset] (== &moduleEngine.functions[0])"
  4938  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmpRegister, moduleEngineFunctionsOffset, tmpRegister)
  4939  
  4940  		// "callEngine.moduleContext.functionsElement0Address = tmpRegister".
  4941  		c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister, amd64ReservedRegisterForCallEngine,
  4942  			callEngineModuleContextFunctionsElement0AddressOffset)
  4943  	}
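        	// For reference, a Go interface value occupies two words, roughly
        	//
        	//	type iface struct {
        	//		tab  unsafe.Pointer // itab / type information
        	//		data unsafe.Pointer // pointer to the concrete value, here the *moduleEngine
        	//	}
        	//
        	// which is why interfaceDataOffset (the offset of the second word) is added to
        	// moduleInstanceEngineOffset in the first load of this block.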
  4944  
  4945  	// Update dataInstancesElement0Address.
  4946  	if c.ir.HasDataInstances {
  4947  		// "tmpRegister = &moduleInstance.DataInstances[0]"
  4948  		c.assembler.CompileMemoryToRegister(
  4949  			amd64.MOVQ,
  4950  			amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceDataInstancesOffset,
  4951  			tmpRegister,
  4952  		)
  4953  		// "callEngine.moduleContext.dataInstancesElement0Address = tmpRegister".
  4954  		c.assembler.CompileRegisterToMemory(
  4955  			amd64.MOVQ,
  4956  			tmpRegister,
  4957  			amd64ReservedRegisterForCallEngine, callEngineModuleContextDataInstancesElement0AddressOffset,
  4958  		)
  4959  	}
  4960  
  4961  	// Update callEngine.moduleContext.elementInstancesElement0Address
  4962  	if c.ir.HasElementInstances {
  4963  		// "tmpRegister = &moduleInstance.ElementInstances[0]"
  4964  		c.assembler.CompileMemoryToRegister(
  4965  			amd64.MOVQ,
  4966  			amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceElementInstancesOffset,
  4967  			tmpRegister,
  4968  		)
  4969  		// "callEngine.moduleContext.elementInstancesElement0Address = tmpRegister".
  4970  		c.assembler.CompileRegisterToMemory(
  4971  			amd64.MOVQ,
  4972  			tmpRegister,
  4973  			amd64ReservedRegisterForCallEngine, callEngineModuleContextElementInstancesElement0AddressOffset,
  4974  		)
  4975  	}
  4976  
  4977  	c.locationStack.markRegisterUnused(regs...)
  4978  
  4979  	// Set the jump target towards the next instruction for the case where module instance address hasn't changed.
  4980  	c.assembler.SetJumpTargetOnNext(jmpIfModuleNotChange)
  4981  	return nil
  4982  }
  4983  
  4984  // compileEnsureOnRegister ensures that the given value is located in a
  4985  // register of the appropriate type (general purpose or vector).
  4986  func (c *amd64Compiler) compileEnsureOnRegister(loc *runtimeValueLocation) (err error) {
  4987  	if loc.onStack() {
  4988  		// Allocate the register.
  4989  		reg, err := c.allocateRegister(loc.getRegisterType())
  4990  		if err != nil {
  4991  			return err
  4992  		}
  4993  
  4994  		// Record that the value now lives in the register, and mark the register as used.
  4995  		loc.setRegister(reg)
  4996  		c.locationStack.markRegisterUsed(reg)
  4997  
  4998  		c.compileLoadValueOnStackToRegister(loc)
  4999  	} else if loc.onConditionalRegister() {
  5000  		err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc)
  5001  	}
  5002  	return
  5003  }
  5004  
  5005  // compileMaybeSwapRegisters swaps two registers if they're not equal.
  5006  func (c *amd64Compiler) compileMaybeSwapRegisters(reg1, reg2 asm.Register) {
  5007  	if reg1 != reg2 {
  5008  		c.assembler.CompileRegisterToRegister(amd64.XCHGQ, reg1, reg2)
  5009  	}
  5010  }
  5011  
  5012  // compilePreventCrossedTargetRegisters swaps registers so that, for each runtimeValueLocation in locs, the
  5013  // register at the same index in targets is not occupied by a different runtimeValueLocation from locs. It returns a
  5014  // closure to restore the original register placement.
  5015  //
  5016  // This function makes it possible to safely exchange one set of registers with another, where a register might be in both sets.
  5017  // Each register will correspond either to itself or another register not present in its own set.
  5018  //
  5019  // For example, if we have locs = [AX, BX, CX], targets = [BX, SI, AX], then it'll do two swaps
  5020  // to make locs = [BX, CX, AX].
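        //
        // Tracing that example: i=0 finds BX at index 1 and swaps, giving [BX, AX, CX]; i=1 finds no
        // SI among locs, so nothing happens; i=2 finds AX at index 1 and swaps, giving [BX, CX, AX].
        // The returned closure replays the recorded swaps in reverse order to restore [AX, BX, CX].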
  5021  func (c *amd64Compiler) compilePreventCrossedTargetRegisters(locs []*runtimeValueLocation, targets []asm.Register) (restore func()) {
  5022  	type swap struct{ srcIndex, dstIndex int }
  5023  	var swaps []swap
  5024  	for i := range locs {
  5025  		targetLocation := -1 // -1 means not found.
  5026  		for j := range locs {
  5027  			if locs[j].register == targets[i] {
  5028  				targetLocation = j
  5029  				break
  5030  			}
  5031  		}
  5032  		if targetLocation != -1 && targetLocation != i {
  5033  			c.compileMaybeSwapRegisters(locs[i].register, locs[targetLocation].register)
  5034  			locs[i].register, locs[targetLocation].register = locs[targetLocation].register, locs[i].register
  5035  			swaps = append(swaps, swap{i, targetLocation})
  5036  		}
  5037  	}
  5038  	return func() {
  5039  		// Restore in reverse order because a register can be moved multiple times.
  5040  		for i := len(swaps) - 1; i >= 0; i -= 1 {
  5041  			r1, r2 := swaps[i].srcIndex, swaps[i].dstIndex
  5042  			c.compileMaybeSwapRegisters(locs[r1].register, locs[r2].register)
  5043  			locs[r1].register, locs[r2].register = locs[r2].register, locs[r1].register
  5044  		}
  5045  	}
  5046  }