github.com/wasilibs/wazerox@v0.0.0-20240124024944-4923be63ab5f/internal/engine/compiler/impl_amd64.go (about)

     1  package compiler
     2  
     3  // This file implements the compiler for amd64/x86_64 target.
     4  // Please refer to https://www.felixcloutier.com/x86/index.html
     5  // if unfamiliar with amd64 instructions used here.
     6  
     7  import (
     8  	"fmt"
     9  	"math"
    10  
    11  	"github.com/wasilibs/wazerox/internal/asm"
    12  	"github.com/wasilibs/wazerox/internal/asm/amd64"
    13  	"github.com/wasilibs/wazerox/internal/platform"
    14  	"github.com/wasilibs/wazerox/internal/u32"
    15  	"github.com/wasilibs/wazerox/internal/u64"
    16  	"github.com/wasilibs/wazerox/internal/wasm"
    17  	"github.com/wasilibs/wazerox/internal/wazeroir"
    18  )
    19  
    20  var (
    21  	_minimum32BitSignedInt                  int32  = math.MinInt32
    22  	_maximum32BitSignedInt                  int32  = math.MaxInt32
    23  	_maximum32BitUnsignedInt                uint32 = math.MaxUint32
    24  	_minimum64BitSignedInt                  int64  = math.MinInt64
    25  	_maximum64BitSignedInt                  int64  = math.MaxInt64
    26  	_maximum64BitUnsignedInt                uint64 = math.MaxUint64
    27  	_float32SignBitMask                     uint32 = 1 << 31
    28  	_float32RestBitMask                            = ^_float32SignBitMask
    29  	_float64SignBitMask                     uint64 = 1 << 63
    30  	_float64RestBitMask                            = ^_float64SignBitMask
    31  	_float32ForMinimumSigned32bitInteger           = uint32(0xCF00_0000)
    32  	_float64ForMinimumSigned32bitInteger           = uint64(0xC1E0_0000_0020_0000)
    33  	_float32ForMinimumSigned64bitInteger           = uint32(0xDF00_0000)
    34  	_float64ForMinimumSigned64bitInteger           = uint64(0xC3E0_0000_0000_0000)
    35  	_float32ForMaximumSigned32bitIntPlusOne        = uint32(0x4F00_0000)
    36  	_float64ForMaximumSigned32bitIntPlusOne        = uint64(0x41E0_0000_0000_0000)
    37  	_float32ForMaximumSigned64bitIntPlusOne        = uint32(0x5F00_0000)
    38  	_float64ForMaximumSigned64bitIntPlusOne        = uint64(0x43E0_0000_0000_0000)
    39  )
    40  
    41  var (
    42  	// amd64ReservedRegisterForCallEngine: pointer to callEngine (i.e. *callEngine as uintptr)
    43  	amd64ReservedRegisterForCallEngine = amd64.RegR13
    44  	// amd64ReservedRegisterForStackBasePointerAddress: stack base pointer's address (callEngine.stackBasePointer) in the current function call.
    45  	amd64ReservedRegisterForStackBasePointerAddress = amd64.RegR14
    46  	// amd64ReservedRegisterForMemory: pointer to the memory slice's data (i.e. &memory.Buffer[0] as uintptr).
    47  	amd64ReservedRegisterForMemory = amd64.RegR15
    48  )
    49  
    50  var (
    51  	amd64UnreservedVectorRegisters = []asm.Register{ //nolint
    52  		amd64.RegX0, amd64.RegX1, amd64.RegX2, amd64.RegX3,
    53  		amd64.RegX4, amd64.RegX5, amd64.RegX6, amd64.RegX7,
    54  		amd64.RegX8, amd64.RegX9, amd64.RegX10, amd64.RegX11,
    55  		amd64.RegX12, amd64.RegX13, amd64.RegX14, amd64.RegX15,
    56  	}
    57  	// Note that we never invoke "call" instruction,
    58  	// so we don't need to care about the calling convention.
    59  	// TODO: Maybe it is safe just save rbp, rsp somewhere
    60  	// in Go-allocated variables, and reuse these registers
    61  	// in compiled functions and write them back before returns.
    62  	amd64UnreservedGeneralPurposeRegisters = []asm.Register{ //nolint
    63  		amd64.RegAX, amd64.RegCX, amd64.RegDX, amd64.RegBX,
    64  		amd64.RegSI, amd64.RegDI, amd64.RegR8, amd64.RegR9,
    65  		amd64.RegR10, amd64.RegR11, amd64.RegR12,
    66  	}
    67  )
    68  
    69  // amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister holds *wasm.ModuleInstance of the
    70  // next executing function instance. The value is set and used when making function calls
    71  // or function returns in the ModuleContextInitialization. See compileModuleContextInitialization.
    72  var amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister = amd64.RegR12
    73  
    74  func (c *amd64Compiler) String() string {
    75  	return c.locationStack.String()
    76  }
    77  
    78  // compileNOP implements compiler.compileNOP for the amd64 architecture.
    79  func (c *amd64Compiler) compileNOP() asm.Node {
    80  	return c.assembler.CompileStandAlone(amd64.NOP)
    81  }
    82  
    83  type amd64Compiler struct {
    84  	assembler   amd64.Assembler
    85  	ir          *wazeroir.CompilationResult
    86  	cpuFeatures platform.CpuFeatureFlags
    87  	// locationStack holds the state of wazeroir virtual stack.
    88  	// and each item is either placed in register or the actual memory stack.
    89  	locationStack *runtimeValueLocationStack
    90  	// labels hold per wazeroir label specific information in this function.
    91  	labels [wazeroir.LabelKindNum][]amd64LabelInfo
    92  	// stackPointerCeil is the greatest stack pointer value (from runtimeValueLocationStack) seen during compilation.
    93  	stackPointerCeil uint64
    94  	// assignStackPointerCeilNeeded holds an asm.Node whose AssignDestinationConstant must be called with the determined stack pointer ceiling.
    95  	assignStackPointerCeilNeeded asm.Node
    96  	compiledTrapTargets          [nativeCallStatusModuleClosed]asm.Node
    97  	withListener                 bool
    98  	typ                          *wasm.FunctionType
    99  	// locationStackForEntrypoint is the initial location stack for all functions. To reuse the allocated stack,
   100  	// we cache it here, and reset and set to .locationStack in the Init method.
   101  	locationStackForEntrypoint runtimeValueLocationStack
   102  	// frameIDMax tracks the maximum value of frame id per function.
   103  	frameIDMax int
   104  	brTableTmp []runtimeValueLocation
   105  
   106  	fourZeros,
   107  	eightZeros,
   108  	minimum32BitSignedInt,
   109  	maximum32BitSignedInt,
   110  	maximum32BitUnsignedInt,
   111  	minimum64BitSignedInt,
   112  	maximum64BitSignedInt,
   113  	maximum64BitUnsignedInt,
   114  	float32SignBitMask,
   115  	float32RestBitMask,
   116  	float64SignBitMask,
   117  	float64RestBitMask,
   118  	float32ForMinimumSigned32bitInteger,
   119  	float64ForMinimumSigned32bitInteger,
   120  	float32ForMinimumSigned64bitInteger,
   121  	float64ForMinimumSigned64bitInteger,
   122  	float32ForMaximumSigned32bitIntPlusOne,
   123  	float64ForMaximumSigned32bitIntPlusOne,
   124  	float32ForMaximumSigned64bitIntPlusOne,
   125  	float64ForMaximumSigned64bitIntPlusOne *asm.StaticConst
   126  }
   127  
   128  func newAmd64Compiler() compiler {
   129  	c := &amd64Compiler{
   130  		assembler:                  amd64.NewAssembler(),
   131  		locationStackForEntrypoint: newRuntimeValueLocationStack(),
   132  		cpuFeatures:                platform.CpuFeatures,
   133  	}
   134  
   135  	c.fourZeros = asm.NewStaticConst([]byte{0, 0, 0, 0})
   136  	c.eightZeros = asm.NewStaticConst([]byte{0, 0, 0, 0, 0, 0, 0, 0})
   137  	c.minimum32BitSignedInt = asm.NewStaticConst(u32.LeBytes(uint32(_minimum32BitSignedInt)))
   138  	c.maximum32BitSignedInt = asm.NewStaticConst(u32.LeBytes(uint32(_maximum32BitSignedInt)))
   139  	c.maximum32BitUnsignedInt = asm.NewStaticConst(u32.LeBytes(_maximum32BitUnsignedInt))
   140  	c.minimum64BitSignedInt = asm.NewStaticConst(u64.LeBytes(uint64(_minimum64BitSignedInt)))
   141  	c.maximum64BitSignedInt = asm.NewStaticConst(u64.LeBytes(uint64(_maximum64BitSignedInt)))
   142  	c.maximum64BitUnsignedInt = asm.NewStaticConst(u64.LeBytes(_maximum64BitUnsignedInt))
   143  	c.float32SignBitMask = asm.NewStaticConst(u32.LeBytes(_float32SignBitMask))
   144  	c.float32RestBitMask = asm.NewStaticConst(u32.LeBytes(_float32RestBitMask))
   145  	c.float64SignBitMask = asm.NewStaticConst(u64.LeBytes(_float64SignBitMask))
   146  	c.float64RestBitMask = asm.NewStaticConst(u64.LeBytes(_float64RestBitMask))
   147  	c.float32ForMinimumSigned32bitInteger = asm.NewStaticConst(u32.LeBytes(_float32ForMinimumSigned32bitInteger))
   148  	c.float64ForMinimumSigned32bitInteger = asm.NewStaticConst(u64.LeBytes(_float64ForMinimumSigned32bitInteger))
   149  	c.float32ForMinimumSigned64bitInteger = asm.NewStaticConst(u32.LeBytes(_float32ForMinimumSigned64bitInteger))
   150  	c.float64ForMinimumSigned64bitInteger = asm.NewStaticConst(u64.LeBytes(_float64ForMinimumSigned64bitInteger))
   151  	c.float32ForMaximumSigned32bitIntPlusOne = asm.NewStaticConst(u32.LeBytes(_float32ForMaximumSigned32bitIntPlusOne))
   152  	c.float64ForMaximumSigned32bitIntPlusOne = asm.NewStaticConst(u64.LeBytes(_float64ForMaximumSigned32bitIntPlusOne))
   153  	c.float32ForMaximumSigned64bitIntPlusOne = asm.NewStaticConst(u32.LeBytes(_float32ForMaximumSigned64bitIntPlusOne))
   154  	c.float64ForMaximumSigned64bitIntPlusOne = asm.NewStaticConst(u64.LeBytes(_float64ForMaximumSigned64bitIntPlusOne))
   155  	return c
   156  }
   157  
   158  // Init implements compiler.Init.
   159  func (c *amd64Compiler) Init(typ *wasm.FunctionType, ir *wazeroir.CompilationResult, withListener bool) {
   160  	c.assembler.Reset()
   161  	c.locationStackForEntrypoint.reset()
   162  	c.resetLabels()
   163  	*c = amd64Compiler{
   164  		ir:                                     ir,
   165  		withListener:                           withListener,
   166  		typ:                                    typ,
   167  		assembler:                              c.assembler,
   168  		cpuFeatures:                            c.cpuFeatures,
   169  		labels:                                 c.labels,
   170  		locationStackForEntrypoint:             c.locationStackForEntrypoint,
   171  		brTableTmp:                             c.brTableTmp,
   172  		fourZeros:                              c.fourZeros,
   173  		eightZeros:                             c.eightZeros,
   174  		minimum32BitSignedInt:                  c.minimum32BitSignedInt,
   175  		maximum32BitSignedInt:                  c.maximum32BitSignedInt,
   176  		maximum32BitUnsignedInt:                c.maximum32BitUnsignedInt,
   177  		minimum64BitSignedInt:                  c.minimum64BitSignedInt,
   178  		maximum64BitSignedInt:                  c.maximum64BitSignedInt,
   179  		maximum64BitUnsignedInt:                c.maximum64BitUnsignedInt,
   180  		float32SignBitMask:                     c.float32SignBitMask,
   181  		float32RestBitMask:                     c.float32RestBitMask,
   182  		float64SignBitMask:                     c.float64SignBitMask,
   183  		float64RestBitMask:                     c.float64RestBitMask,
   184  		float32ForMinimumSigned32bitInteger:    c.float32ForMinimumSigned32bitInteger,
   185  		float64ForMinimumSigned32bitInteger:    c.float64ForMinimumSigned32bitInteger,
   186  		float32ForMinimumSigned64bitInteger:    c.float32ForMinimumSigned64bitInteger,
   187  		float64ForMinimumSigned64bitInteger:    c.float64ForMinimumSigned64bitInteger,
   188  		float32ForMaximumSigned32bitIntPlusOne: c.float32ForMaximumSigned32bitIntPlusOne,
   189  		float64ForMaximumSigned32bitIntPlusOne: c.float64ForMaximumSigned32bitIntPlusOne,
   190  		float32ForMaximumSigned64bitIntPlusOne: c.float32ForMaximumSigned64bitIntPlusOne,
   191  		float64ForMaximumSigned64bitIntPlusOne: c.float64ForMaximumSigned64bitIntPlusOne,
   192  	}
   193  
   194  	// Reuses the initial location stack for the compilation of subsequent functions.
   195  	c.locationStack = &c.locationStackForEntrypoint
   196  }
   197  
   198  // resetLabels resets the existing content in arm64Compiler.labels so that
   199  // we could reuse the allocated slices and stacks in the subsequent compilations.
   200  func (c *amd64Compiler) resetLabels() {
   201  	for i := range c.labels {
   202  		for j := range c.labels[i] {
   203  			if j > c.frameIDMax {
   204  				// Only need to reset until the maximum frame id. This makes the compilation faster for large binary.
   205  				break
   206  			}
   207  			l := &c.labels[i][j]
   208  			l.initialInstruction = nil
   209  			l.stackInitialized = false
   210  			l.initialStack.reset()
   211  		}
   212  	}
   213  }
   214  
   215  // runtimeValueLocationStack implements compilerImpl.runtimeValueLocationStack for the amd64 architecture.
   216  func (c *amd64Compiler) runtimeValueLocationStack() *runtimeValueLocationStack {
   217  	return c.locationStack
   218  }
   219  
   220  // setLocationStack sets the given runtimeValueLocationStack to .locationStack field,
   221  // while allowing us to track runtimeValueLocationStack.stackPointerCeil across multiple stacks.
   222  // This is called when we branch into different block.
   223  func (c *amd64Compiler) setLocationStack(newStack *runtimeValueLocationStack) {
   224  	if c.stackPointerCeil < c.locationStack.stackPointerCeil {
   225  		c.stackPointerCeil = c.locationStack.stackPointerCeil
   226  	}
   227  	c.locationStack = newStack
   228  }
   229  
   230  // pushRuntimeValueLocationOnRegister implements compiler.pushRuntimeValueLocationOnRegister for amd64.
   231  func (c *amd64Compiler) pushRuntimeValueLocationOnRegister(reg asm.Register, vt runtimeValueType) (ret *runtimeValueLocation) {
   232  	ret = c.locationStack.pushRuntimeValueLocationOnRegister(reg, vt)
   233  	c.locationStack.markRegisterUsed(reg)
   234  	return
   235  }
   236  
   237  // pushVectorRuntimeValueLocationOnRegister implements compiler.pushVectorRuntimeValueLocationOnRegister for amd64.
   238  func (c *amd64Compiler) pushVectorRuntimeValueLocationOnRegister(reg asm.Register) (lowerBitsLocation *runtimeValueLocation) {
   239  	lowerBitsLocation = c.locationStack.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeV128Lo)
   240  	c.locationStack.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeV128Hi)
   241  	c.locationStack.markRegisterUsed(reg)
   242  	return
   243  }
   244  
   245  type amd64LabelInfo struct {
   246  	// initialInstruction is the initial instruction for this label so other block can jump into it.
   247  	initialInstruction asm.Node
   248  	// initialStack is the initial value location stack from which we start compiling this label.
   249  	initialStack     runtimeValueLocationStack
   250  	stackInitialized bool
   251  }
   252  
   253  func (c *amd64Compiler) label(label wazeroir.Label) *amd64LabelInfo {
   254  	kind := label.Kind()
   255  	frames := c.labels[kind]
   256  	frameID := label.FrameID()
   257  	if c.frameIDMax < frameID {
   258  		c.frameIDMax = frameID
   259  	}
   260  	// If the frameID is not allocated yet, expand the slice by twice of the diff,
   261  	// so that we could reduce the allocation in the subsequent compilation.
   262  	if diff := frameID - len(frames) + 1; diff > 0 {
   263  		for i := 0; i < diff; i++ {
   264  			frames = append(frames, amd64LabelInfo{initialStack: newRuntimeValueLocationStack()})
   265  		}
   266  		c.labels[kind] = frames
   267  	}
   268  	return &frames[frameID]
   269  }
   270  
   271  // compileBuiltinFunctionCheckExitCode implements compiler.compileBuiltinFunctionCheckExitCode for the amd64 architecture.
   272  func (c *amd64Compiler) compileBuiltinFunctionCheckExitCode() error {
   273  	if err := c.compileCallBuiltinFunction(builtinFunctionIndexCheckExitCode); err != nil {
   274  		return err
   275  	}
   276  
   277  	// After the function call, we have to initialize the stack base pointer and memory reserved registers.
   278  	c.compileReservedStackBasePointerInitialization()
   279  	c.compileReservedMemoryPointerInitialization()
   280  	return nil
   281  }
   282  
   283  // compileGoDefinedHostFunction constructs the entire code to enter the host function implementation,
   284  // and return to the caller.
   285  func (c *amd64Compiler) compileGoDefinedHostFunction() error {
   286  	// First we must update the location stack to reflect the number of host function inputs.
   287  	c.locationStack.init(c.typ)
   288  
   289  	if c.withListener {
   290  		if err := c.compileCallBuiltinFunction(builtinFunctionIndexFunctionListenerBefore); err != nil {
   291  			return err
   292  		}
   293  	}
   294  
   295  	// Host function needs access to the caller's Function Instance, and the caller's information is stored in the stack
   296  	// (as described in the doc of callEngine.stack). Here, we get the caller's *wasm.FunctionInstance from the stack,
   297  	// and save it in callEngine.exitContext.callerFunctionInstance so we can pass it to the host function
   298  	// without sacrificing the performance.
   299  	c.compileReservedStackBasePointerInitialization()
   300  	// Alias for readability.
   301  	tmp := amd64.RegAX
   302  	// Get the location of the callerFunction (*function) in the stack, which depends on the signature.
   303  	_, _, callerFunction := c.locationStack.getCallFrameLocations(c.typ)
   304  	// Load the value into the tmp register: tmp = &function{..}
   305  	callerFunction.setRegister(tmp)
   306  	c.compileLoadValueOnStackToRegister(callerFunction)
   307  	// tmp = *(tmp+functionSourceOffset) = &wasm.ModuleInstance{...}
   308  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, functionModuleInstanceOffset, tmp)
   309  	// Load it onto callEngine.exitContext.callerFunctionInstance.
   310  	c.assembler.CompileRegisterToMemory(amd64.MOVQ,
   311  		tmp,
   312  		amd64ReservedRegisterForCallEngine, callEngineExitContextCallerModuleInstanceOffset)
   313  	// Reset the state of callerFunction value location so that we won't mess up subsequent code generation below.
   314  	c.locationStack.releaseRegister(callerFunction)
   315  
   316  	if err := c.compileCallGoHostFunction(); err != nil {
   317  		return err
   318  	}
   319  
   320  	// Initializes the reserved stack base pointer which is used to retrieve the call frame stack.
   321  	c.compileReservedStackBasePointerInitialization()
   322  
   323  	// Go function can change the module state in arbitrary way, so we have to force
   324  	// the callEngine.moduleContext initialization on the function return. To do so,
   325  	// we zero-out callEngine.moduleInstance.
   326  	c.assembler.CompileConstToMemory(amd64.MOVQ,
   327  		0, amd64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset)
   328  	return c.compileReturnFunction()
   329  }
   330  
   331  // compile implements compiler.compile for the amd64 architecture.
   332  func (c *amd64Compiler) compile(buf asm.Buffer) (stackPointerCeil uint64, err error) {
   333  	// c.stackPointerCeil tracks the stack pointer ceiling (max seen) value across all runtimeValueLocationStack(s)
   334  	// used for all labels (via setLocationStack), excluding the current one.
   335  	// Hence, we check here if the final block's max one exceeds the current c.stackPointerCeil.
   336  	stackPointerCeil = c.stackPointerCeil
   337  	if stackPointerCeil < c.locationStack.stackPointerCeil {
   338  		stackPointerCeil = c.locationStack.stackPointerCeil
   339  	}
   340  
   341  	// Now that the max stack pointer is determined, we are invoking the callback.
   342  	// Note this MUST be called before Assemble() below.
   343  	c.assignStackPointerCeil(stackPointerCeil)
   344  
   345  	err = c.assembler.Assemble(buf)
   346  	return
   347  }
   348  
   349  // compileUnreachable implements compiler.compileUnreachable for the amd64 architecture.
   350  func (c *amd64Compiler) compileUnreachable() error {
   351  	c.compileExitFromNativeCode(nativeCallStatusCodeUnreachable)
   352  	return nil
   353  }
   354  
   355  // assignStackPointerCeil implements compilerImpl.assignStackPointerCeil for the amd64 architecture.
   356  func (c *amd64Compiler) assignStackPointerCeil(ceil uint64) {
   357  	if c.assignStackPointerCeilNeeded != nil {
   358  		c.assignStackPointerCeilNeeded.AssignDestinationConstant(int64(ceil) << 3)
   359  	}
   360  }
   361  
   362  // compileSet implements compiler.compileSet for the amd64 architecture.
   363  func (c *amd64Compiler) compileSet(o *wazeroir.UnionOperation) error {
   364  	depth := int(o.U1)
   365  	isTargetVector := o.B3
   366  
   367  	setTargetIndex := int(c.locationStack.sp) - 1 - depth
   368  
   369  	if isTargetVector {
   370  		_ = c.locationStack.pop() // ignore the higher 64-bits.
   371  	}
   372  	v := c.locationStack.pop()
   373  	if err := c.compileEnsureOnRegister(v); err != nil {
   374  		return err
   375  	}
   376  
   377  	targetLocation := &c.locationStack.stack[setTargetIndex]
   378  	if targetLocation.onRegister() {
   379  		// We no longer need the register previously used by the target location.
   380  		c.locationStack.markRegisterUnused(targetLocation.register)
   381  	}
   382  
   383  	reg := v.register
   384  	targetLocation.setRegister(reg)
   385  	targetLocation.valueType = v.valueType
   386  	if isTargetVector {
   387  		hi := &c.locationStack.stack[setTargetIndex+1]
   388  		hi.setRegister(reg)
   389  	}
   390  	return nil
   391  }
   392  
   393  // compileGlobalGet implements compiler.compileGlobalGet for the amd64 architecture.
   394  func (c *amd64Compiler) compileGlobalGet(o *wazeroir.UnionOperation) error {
   395  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
   396  		return err
   397  	}
   398  
   399  	intReg, err := c.allocateRegister(registerTypeGeneralPurpose)
   400  	if err != nil {
   401  		return err
   402  	}
   403  
   404  	// First, move the pointer to the global slice into the allocated register.
   405  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset, intReg)
   406  
   407  	index := o.U1
   408  
   409  	// Now, move the location of the global instance into the register.
   410  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, intReg, 8*int64(index), intReg)
   411  
   412  	// When an integer, reuse the pointer register for the value. Otherwise, allocate a float register for it.
   413  	valueReg := intReg
   414  	var vt runtimeValueType
   415  	var inst asm.Instruction
   416  	switch c.ir.Globals[index].ValType {
   417  	case wasm.ValueTypeI32:
   418  		inst = amd64.MOVL
   419  		vt = runtimeValueTypeI32
   420  	case wasm.ValueTypeI64, wasm.ValueTypeExternref, wasm.ValueTypeFuncref:
   421  		inst = amd64.MOVQ
   422  		vt = runtimeValueTypeI64
   423  	case wasm.ValueTypeF32:
   424  		inst = amd64.MOVL
   425  		vt = runtimeValueTypeF32
   426  		valueReg, err = c.allocateRegister(registerTypeVector)
   427  		if err != nil {
   428  			return err
   429  		}
   430  	case wasm.ValueTypeF64:
   431  		inst = amd64.MOVQ
   432  		vt = runtimeValueTypeF64
   433  		valueReg, err = c.allocateRegister(registerTypeVector)
   434  		if err != nil {
   435  			return err
   436  		}
   437  	case wasm.ValueTypeV128:
   438  		inst = amd64.MOVDQU
   439  		vt = runtimeValueTypeV128Lo
   440  		valueReg, err = c.allocateRegister(registerTypeVector)
   441  		if err != nil {
   442  			return err
   443  		}
   444  	default:
   445  		panic("BUG: unknown runtime value type")
   446  	}
   447  
   448  	// Using the register holding the pointer to the target instance, move its value into a register.
   449  	c.assembler.CompileMemoryToRegister(inst, intReg, globalInstanceValueOffset, valueReg)
   450  
   451  	// Record that the retrieved global value on the top of the stack is now in a register.
   452  	if vt == runtimeValueTypeV128Lo {
   453  		c.pushVectorRuntimeValueLocationOnRegister(valueReg)
   454  	} else {
   455  		c.pushRuntimeValueLocationOnRegister(valueReg, vt)
   456  	}
   457  	return nil
   458  }
   459  
   460  // compileGlobalSet implements compiler.compileGlobalSet for the amd64 architecture.
   461  func (c *amd64Compiler) compileGlobalSet(o *wazeroir.UnionOperation) error {
   462  	index := o.U1
   463  
   464  	wasmValueType := c.ir.Globals[index].ValType
   465  	isV128 := wasmValueType == wasm.ValueTypeV128
   466  
   467  	// First, move the value to set into a temporary register.
   468  	val := c.locationStack.pop()
   469  	if isV128 {
   470  		// The previous val is higher 64-bits, and have to use lower 64-bit's runtimeValueLocation for allocation, etc.
   471  		val = c.locationStack.pop()
   472  	}
   473  	if err := c.compileEnsureOnRegister(val); err != nil {
   474  		return err
   475  	}
   476  
   477  	// Allocate a register to hold the memory location of the target global instance.
   478  	intReg, err := c.allocateRegister(registerTypeGeneralPurpose)
   479  	if err != nil {
   480  		return err
   481  	}
   482  
   483  	// First, move the pointer to the global slice into the allocated register.
   484  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset, intReg)
   485  
   486  	// Now, move the location of the global instance into the register.
   487  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, intReg, 8*int64(index), intReg)
   488  
   489  	// Now ready to write the value to the global instance location.
   490  	var inst asm.Instruction
   491  	if isV128 {
   492  		inst = amd64.MOVDQU
   493  	} else if wasmValueType == wasm.ValueTypeI32 || wasmValueType == wasm.ValueTypeF32 {
   494  		inst = amd64.MOVL
   495  	} else {
   496  		inst = amd64.MOVQ
   497  	}
   498  	c.assembler.CompileRegisterToMemory(inst, val.register, intReg, globalInstanceValueOffset)
   499  
   500  	// Since the value is now written to memory, release the value register.
   501  	c.locationStack.releaseRegister(val)
   502  	return nil
   503  }
   504  
   505  // compileBr implements compiler.compileBr for the amd64 architecture.
   506  func (c *amd64Compiler) compileBr(o *wazeroir.UnionOperation) error {
   507  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
   508  		return err
   509  	}
   510  	return c.branchInto(wazeroir.Label(o.U1))
   511  }
   512  
   513  // branchInto adds instruction necessary to jump into the given branch target.
   514  func (c *amd64Compiler) branchInto(target wazeroir.Label) error {
   515  	if target.IsReturnTarget() {
   516  		return c.compileReturnFunction()
   517  	} else {
   518  		if c.ir.LabelCallers[target] > 1 {
   519  			// We can only re-use register state if when there's a single call-site.
   520  			// Release existing values on registers to the stack if there's multiple ones to have
   521  			// the consistent value location state at the beginning of label.
   522  			if err := c.compileReleaseAllRegistersToStack(); err != nil {
   523  				return err
   524  			}
   525  		}
   526  		// Set the initial stack of the target label, so we can start compiling the label
   527  		// with the appropriate value locations. Note we clone the stack here as we maybe
   528  		// manipulate the stack before compiler reaches the label.
   529  		targetLabel := c.label(target)
   530  		if !targetLabel.stackInitialized {
   531  			targetLabel.initialStack.cloneFrom(*c.locationStack)
   532  			targetLabel.stackInitialized = true
   533  		}
   534  		jmp := c.assembler.CompileJump(amd64.JMP)
   535  		c.assignJumpTarget(target, jmp)
   536  	}
   537  	return nil
   538  }
   539  
   540  // compileBrIf implements compiler.compileBrIf for the amd64 architecture.
   541  func (c *amd64Compiler) compileBrIf(o *wazeroir.UnionOperation) error {
   542  	cond := c.locationStack.pop()
   543  	var jmpWithCond asm.Node
   544  	if cond.onConditionalRegister() {
   545  		var inst asm.Instruction
   546  		switch cond.conditionalRegister {
   547  		case amd64.ConditionalRegisterStateE:
   548  			inst = amd64.JEQ
   549  		case amd64.ConditionalRegisterStateNE:
   550  			inst = amd64.JNE
   551  		case amd64.ConditionalRegisterStateS:
   552  			inst = amd64.JMI
   553  		case amd64.ConditionalRegisterStateNS:
   554  			inst = amd64.JPL
   555  		case amd64.ConditionalRegisterStateG:
   556  			inst = amd64.JGT
   557  		case amd64.ConditionalRegisterStateGE:
   558  			inst = amd64.JGE
   559  		case amd64.ConditionalRegisterStateL:
   560  			inst = amd64.JLT
   561  		case amd64.ConditionalRegisterStateLE:
   562  			inst = amd64.JLE
   563  		case amd64.ConditionalRegisterStateA:
   564  			inst = amd64.JHI
   565  		case amd64.ConditionalRegisterStateAE:
   566  			inst = amd64.JCC
   567  		case amd64.ConditionalRegisterStateB:
   568  			inst = amd64.JCS
   569  		case amd64.ConditionalRegisterStateBE:
   570  			inst = amd64.JLS
   571  		}
   572  		jmpWithCond = c.assembler.CompileJump(inst)
   573  	} else {
   574  		// Usually the comparison operand for br_if is on the conditional register,
   575  		// but in some cases, they are on the stack or register.
   576  		// For example, the following code
   577  		// 		i64.const 1
   578  		//      local.get 1
   579  		//      i64.add
   580  		//      br_if ....
   581  		// will try to use the result of i64.add, which resides on the (virtual) stack,
   582  		// as the operand for br_if instruction.
   583  		if err := c.compileEnsureOnRegister(cond); err != nil {
   584  			return err
   585  		}
   586  		// Check if the value not equals zero.
   587  		c.assembler.CompileRegisterToRegister(amd64.TESTQ, cond.register, cond.register)
   588  
   589  		// Emit jump instruction which jumps when the value does not equals zero.
   590  		jmpWithCond = c.assembler.CompileJump(amd64.JNE)
   591  		c.locationStack.markRegisterUnused(cond.register)
   592  	}
   593  
   594  	// Make sure that the next coming label is the else jump target.
   595  	thenTarget := wazeroir.Label(o.U1)
   596  	elseTarget := wazeroir.Label(o.U2)
   597  	thenToDrop := o.U3
   598  
   599  	// Here's the diagram of how we organize the instructions necessarily for brif operation.
   600  	//
   601  	// jmp_with_cond -> jmp (.Else) -> Then operations...
   602  	//    |---------(satisfied)------------^^^
   603  	//
   604  	// Note that .Else branch doesn't have ToDrop as .Else is in reality
   605  	// corresponding to either If's Else block or Br_if's else block in Wasm.
   606  
   607  	// Emit the else branch.
   608  	if elseTarget.IsReturnTarget() {
   609  		if err := c.compileReturnFunction(); err != nil {
   610  			return err
   611  		}
   612  	} else {
   613  		labelInfo := c.label(elseTarget)
   614  		if !labelInfo.stackInitialized {
   615  			labelInfo.initialStack.cloneFrom(*c.locationStack)
   616  			labelInfo.stackInitialized = true
   617  		}
   618  
   619  		elseJmp := c.assembler.CompileJump(amd64.JMP)
   620  		c.assignJumpTarget(elseTarget, elseJmp)
   621  	}
   622  
   623  	// Handle then branch.
   624  	c.assembler.SetJumpTargetOnNext(jmpWithCond)
   625  	if err := compileDropRange(c, thenToDrop); err != nil {
   626  		return err
   627  	}
   628  	if thenTarget.IsReturnTarget() {
   629  		return c.compileReturnFunction()
   630  	} else {
   631  		thenLabel := thenTarget
   632  		if c.ir.LabelCallers[thenLabel] > 1 {
   633  			// We can only re-use register state if when there's a single call-site.
   634  			// Release existing values on registers to the stack if there's multiple ones to have
   635  			// the consistent value location state at the beginning of label.
   636  			if err := c.compileReleaseAllRegistersToStack(); err != nil {
   637  				return err
   638  			}
   639  		}
   640  		// Set the initial stack of the target label, so we can start compiling the label
   641  		// with the appropriate value locations. Note we clone the stack here as we maybe
   642  		// manipulate the stack before compiler reaches the label.
   643  		labelInfo := c.label(thenLabel)
   644  		if !labelInfo.stackInitialized {
   645  			labelInfo.initialStack.cloneFrom(*c.locationStack)
   646  			labelInfo.stackInitialized = true
   647  		}
   648  		thenJmp := c.assembler.CompileJump(amd64.JMP)
   649  		c.assignJumpTarget(thenLabel, thenJmp)
   650  		return nil
   651  	}
   652  }
   653  
   654  // compileBrTable implements compiler.compileBrTable for the amd64 architecture.
   655  func (c *amd64Compiler) compileBrTable(o *wazeroir.UnionOperation) error {
   656  	index := c.locationStack.pop()
   657  
   658  	// If the operation only consists of the default target, we branch into it and return early.
   659  	if len(o.Us) == 2 {
   660  		c.locationStack.releaseRegister(index)
   661  		if err := compileDropRange(c, o.Us[1]); err != nil {
   662  			return err
   663  		}
   664  		return c.branchInto(wazeroir.Label(o.Us[0]))
   665  	}
   666  
   667  	// Otherwise, we jump into the selected branch.
   668  	if err := c.compileEnsureOnRegister(index); err != nil {
   669  		return err
   670  	}
   671  
   672  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
   673  	if err != nil {
   674  		return err
   675  	}
   676  
   677  	// First, we move the length of target list into the tmp register.
   678  	c.assembler.CompileConstToRegister(amd64.MOVQ, int64(len(o.Us)/2-1), tmp)
   679  
   680  	// Then, we compare the value with the length of targets.
   681  	c.assembler.CompileRegisterToRegister(amd64.CMPL, tmp, index.register)
   682  
   683  	// If the value is larger than the length,
   684  	// we round the index to the length as the spec states that
   685  	// if the index is larger than or equal the length of list,
   686  	// branch into the default branch.
   687  	c.assembler.CompileRegisterToRegister(amd64.CMOVQCS, tmp, index.register)
   688  
   689  	// We prepare the static data which holds the offset of
   690  	// each target's first instruction (incl. default)
   691  	// relative to the beginning of label tables.
   692  	//
   693  	// For example, if we have targets=[L0, L1] and default=L_DEFAULT,
   694  	// we emit the the code like this at [Emit the code for each targets and default branch] below.
   695  	//
   696  	// L0:
   697  	//  0x123001: XXXX, ...
   698  	//  .....
   699  	// L1:
   700  	//  0x123005: YYY, ...
   701  	//  .....
   702  	// L_DEFAULT:
   703  	//  0x123009: ZZZ, ...
   704  	//
   705  	// then offsetData becomes like [0x0, 0x5, 0x8].
   706  	// By using this offset list, we could jump into the label for the index by
   707  	// "jmp offsetData[index]+0x123001" and "0x123001" can be acquired by "LEA"
   708  	// instruction.
   709  	//
   710  	// Note: We store each offset of 32-bite unsigned integer as 4 consecutive bytes. So more precisely,
   711  	// the above example's offsetData would be [0x0, 0x0, 0x0, 0x0, 0x5, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0, 0x0].
   712  	//
   713  	// Note: this is similar to how GCC implements Switch statements in C.
   714  	offsetData := asm.NewStaticConst(make([]byte, 4*(len(o.Us)/2)))
   715  
   716  	// Load the offsetData's address into tmp.
   717  	if err = c.assembler.CompileStaticConstToRegister(amd64.LEAQ, offsetData, tmp); err != nil {
   718  		return err
   719  	}
   720  
   721  	// Now we have the address of first byte of offsetData in tmp register.
   722  	// So the target offset's first byte is at tmp+index*4 as we store
   723  	// the offset as 4 bytes for a 32-byte integer.
   724  	// Here, we store the offset into the index.register.
   725  	c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVL, tmp, 0, index.register, 4, index.register)
   726  
   727  	// Now we read the address of the beginning of the jump table.
   728  	// In the above example, this corresponds to reading the address of 0x123001.
   729  	c.assembler.CompileReadInstructionAddress(tmp, amd64.JMP)
   730  
   731  	// Now we have the address of L0 in tmp register, and the offset to the target label in the index.register.
   732  	// So we could achieve the br_table jump by adding them and jump into the resulting address.
   733  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, index.register, tmp)
   734  
   735  	c.assembler.CompileJumpToRegister(amd64.JMP, tmp)
   736  
   737  	// We no longer need the index's register, so mark it unused.
   738  	c.locationStack.markRegisterUnused(index.register)
   739  
   740  	// [Emit the code for each targets and default branch]
   741  	labelInitialInstructions := make([]asm.Node, len(o.Us)/2)
   742  
   743  	// Since we might end up having the different stack state in each branch,
   744  	// we need to save the initial stack state here, and use the same initial state
   745  	// for each iteration.
   746  	initialLocationStack := c.getSavedTemporaryLocationStack()
   747  
   748  	for i := range labelInitialInstructions {
   749  		// Emit the initial instruction of each target.
   750  		// We use NOP as we don't yet know the next instruction in each label.
   751  		// Assembler would optimize out this NOP during code generation, so this is harmless.
   752  		labelInitialInstructions[i] = c.assembler.CompileStandAlone(amd64.NOP)
   753  
   754  		targetLabel := wazeroir.Label(o.Us[i*2])
   755  		targetToDrop := o.Us[i*2+1]
   756  		if err = compileDropRange(c, targetToDrop); err != nil {
   757  			return err
   758  		}
   759  		if err = c.branchInto(targetLabel); err != nil {
   760  			return err
   761  		}
   762  		// After the iteration, reset the stack's state with initialLocationStack.
   763  		c.locationStack.cloneFrom(initialLocationStack)
   764  	}
   765  
   766  	c.assembler.BuildJumpTable(offsetData, labelInitialInstructions)
   767  	return nil
   768  }
   769  
   770  func (c *amd64Compiler) getSavedTemporaryLocationStack() runtimeValueLocationStack {
   771  	initialLocationStack := *c.locationStack // Take copy!
   772  	// Use c.brTableTmp for the underlying stack so that we could reduce the allocations.
   773  	if diff := int(initialLocationStack.sp) - len(c.brTableTmp); diff > 0 {
   774  		c.brTableTmp = append(c.brTableTmp, make([]runtimeValueLocation, diff)...)
   775  	}
   776  	copy(c.brTableTmp, initialLocationStack.stack[:initialLocationStack.sp])
   777  	initialLocationStack.stack = c.brTableTmp
   778  	return initialLocationStack
   779  }
   780  
   781  func (c *amd64Compiler) assignJumpTarget(label wazeroir.Label, jmpInstruction asm.Node) {
   782  	jmpTargetLabel := c.label(label)
   783  	targetInst := jmpTargetLabel.initialInstruction
   784  	if targetInst == nil {
   785  		// If the label isn't compiled yet, allocate the NOP node, and set as the initial instruction.
   786  		targetInst = c.assembler.AllocateNOP()
   787  		jmpTargetLabel.initialInstruction = targetInst
   788  	}
   789  	jmpInstruction.AssignJumpTarget(targetInst)
   790  }
   791  
   792  // compileLabel implements compiler.compileLabel for the amd64 architecture.
   793  func (c *amd64Compiler) compileLabel(o *wazeroir.UnionOperation) (skipLabel bool) {
   794  	label := wazeroir.Label(o.U1)
   795  	labelInfo := c.label(label)
   796  
   797  	// If initialStack is not set, that means this label has never been reached.
   798  	if !labelInfo.stackInitialized {
   799  		skipLabel = true
   800  		return
   801  	}
   802  
   803  	// We use NOP as a beginning of instructions in a label.
   804  	if labelBegin := labelInfo.initialInstruction; labelBegin == nil {
   805  		// We use NOP as a beginning of instructions in a label.
   806  		// This should be eventually optimized out by assembler.
   807  		labelInfo.initialInstruction = c.assembler.CompileStandAlone(amd64.NOP)
   808  	} else {
   809  		c.assembler.Add(labelBegin)
   810  	}
   811  
   812  	// Set the initial stack.
   813  	c.setLocationStack(&labelInfo.initialStack)
   814  	return
   815  }
   816  
   817  // compileCall implements compiler.compileCall for the amd64 architecture.
   818  func (c *amd64Compiler) compileCall(o *wazeroir.UnionOperation) error {
   819  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
   820  		return err
   821  	}
   822  
   823  	functionIndex := o.U1
   824  
   825  	target := c.ir.Functions[functionIndex]
   826  	targetType := &c.ir.Types[target]
   827  
   828  	targetAddressRegister, err := c.allocateRegister(registerTypeGeneralPurpose)
   829  	if err != nil {
   830  		return err
   831  	}
   832  
   833  	// First, push the index to the callEngine.functionsElement0Address into the target register.
   834  	c.assembler.CompileConstToRegister(amd64.MOVQ, int64(functionIndex)*functionSize, targetAddressRegister)
   835  
   836  	// Next, we add the address of the first item of callEngine.functions slice (= &callEngine.functions[0])
   837  	// to the target register.
   838  	c.assembler.CompileMemoryToRegister(amd64.ADDQ, amd64ReservedRegisterForCallEngine,
   839  		callEngineModuleContextFunctionsElement0AddressOffset, targetAddressRegister)
   840  
   841  	if err := c.compileCallFunctionImpl(targetAddressRegister, targetType); err != nil {
   842  		return err
   843  	}
   844  	return nil
   845  }
   846  
   847  // compileCallIndirect implements compiler.compileCallIndirect for the amd64 architecture.
   848  func (c *amd64Compiler) compileCallIndirect(o *wazeroir.UnionOperation) error {
   849  	offset := c.locationStack.pop()
   850  	if err := c.compileEnsureOnRegister(offset); err != nil {
   851  		return nil
   852  	}
   853  	typeIndex := o.U1
   854  	tableIndex := o.U2
   855  
   856  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
   857  	if err != nil {
   858  		return err
   859  	}
   860  	c.locationStack.markRegisterUsed(tmp)
   861  
   862  	tmp2, err := c.allocateRegister(registerTypeGeneralPurpose)
   863  	if err != nil {
   864  		return err
   865  	}
   866  	c.locationStack.markRegisterUsed(tmp2)
   867  
   868  	// Load the address of the target table: tmp = &module.Tables[0]
   869  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
   870  	// tmp = &module.Tables[0] + Index*8 = &module.Tables[0] + sizeOf(*TableInstance)*index = module.Tables[o.TableIndex].
   871  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(tableIndex*8), tmp)
   872  
   873  	// Then, we need to trap if the offset exceeds the length of table.
   874  	c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, offset.register)
   875  	c.compileMaybeExitFromNativeCode(amd64.JHI, nativeCallStatusCodeInvalidTableAccess)
   876  
   877  	// next we check if the target's type matches the operation's one.
   878  	// In order to get the type instance's address, we have to multiply the offset
   879  	// by 8 as the offset is the "length" of table in Go's "[]uintptr{}",
   880  	// and size of uintptr equals 8 bytes == (2^3).
   881  	c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, offset.register)
   882  
   883  	// Adds the address of wasm.Table[0] stored as callEngine.tableElement0Address to the offset.
   884  	c.assembler.CompileMemoryToRegister(amd64.ADDQ,
   885  		tmp, tableInstanceTableOffset, offset.register)
   886  
   887  	// "offset = (*offset) (== table[offset]  == *code type)"
   888  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, offset.register, 0, offset.register)
   889  
   890  	// At this point offset.register holds the address of *code (as uintptr) at wasm.Table[offset].
   891  	//
   892  	// Check if the value of table[offset] equals zero, meaning that the target is uninitialized.
   893  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, offset.register, offset.register)
   894  
   895  	// Skipped if the target is initialized.
   896  	c.compileMaybeExitFromNativeCode(amd64.JNE, nativeCallStatusCodeInvalidTableAccess)
   897  
   898  	// Next, we need to check the type matches, i.e. table[offset].source.TypeID == targetFunctionType's typeID.
   899  	//
   900  	// "tmp2 = [&moduleInstance.TypeIDs[0] + index * 4] (== moduleInstance.TypeIDs[index])"
   901  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
   902  		amd64ReservedRegisterForCallEngine, callEngineModuleContextTypeIDsElement0AddressOffset,
   903  		tmp2)
   904  	c.assembler.CompileMemoryToRegister(amd64.MOVL, tmp2, int64(typeIndex)*4, tmp2)
   905  
   906  	// Skipped if the type matches.
   907  	c.assembler.CompileMemoryToRegister(amd64.CMPL, offset.register, functionTypeIDOffset, tmp2)
   908  	c.compileMaybeExitFromNativeCode(amd64.JEQ, nativeCallStatusCodeTypeMismatchOnIndirectCall)
   909  	targetFunctionType := &c.ir.Types[typeIndex]
   910  	if err = c.compileCallFunctionImpl(offset.register, targetFunctionType); err != nil {
   911  		return nil
   912  	}
   913  
   914  	// The offset register should be marked as un-used as we consumed in the function call.
   915  	c.locationStack.markRegisterUnused(offset.register, tmp, tmp2)
   916  	return nil
   917  }
   918  
   919  // compileDrop implements compiler.compileDrop for the amd64 architecture.
   920  func (c *amd64Compiler) compileDrop(o *wazeroir.UnionOperation) error {
   921  	return compileDropRange(c, o.U1)
   922  }
   923  
   924  // compileSelectV128Impl implements compileSelect for vector values.
   925  func (c *amd64Compiler) compileSelectV128Impl(selectorReg asm.Register) error {
   926  	x2 := c.locationStack.popV128()
   927  	if err := c.compileEnsureOnRegister(x2); err != nil {
   928  		return err
   929  	}
   930  
   931  	x1 := c.locationStack.popV128()
   932  	if err := c.compileEnsureOnRegister(x1); err != nil {
   933  		return err
   934  	}
   935  
   936  	// Compare the conditional value with zero.
   937  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, selectorReg, selectorReg)
   938  
   939  	// Set the jump if the top value is not zero.
   940  	jmpIfNotZero := c.assembler.CompileJump(amd64.JNE)
   941  
   942  	// In this branch, we select the value of x2, so we move the value into x1.register so that
   943  	// we can have the result in x1.register regardless of the selection.
   944  	c.assembler.CompileRegisterToRegister(amd64.MOVDQU, x2.register, x1.register)
   945  
   946  	// Else, we don't need to adjust value, just need to jump to the next instruction.
   947  	c.assembler.SetJumpTargetOnNext(jmpIfNotZero)
   948  
   949  	// As noted, the result exists in x1.register regardless of the selector.
   950  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
   951  	// Plus, x2.register is no longer used.
   952  	c.locationStack.markRegisterUnused(x2.register)
   953  	c.locationStack.markRegisterUnused(selectorReg)
   954  	return nil
   955  }
   956  
   957  // compileSelect implements compiler.compileSelect for the amd64 architecture.
   958  //
   959  // The emitted native code depends on whether the values are on
   960  // the physical registers or memory stack, or maybe conditional register.
   961  func (c *amd64Compiler) compileSelect(o *wazeroir.UnionOperation) error {
   962  	cv := c.locationStack.pop()
   963  	if err := c.compileEnsureOnRegister(cv); err != nil {
   964  		return err
   965  	}
   966  
   967  	isTargetVector := o.B3
   968  	if isTargetVector {
   969  		return c.compileSelectV128Impl(cv.register)
   970  	}
   971  
   972  	x2 := c.locationStack.pop()
   973  	// We do not consume x1 here, but modify the value according to
   974  	// the conditional value "c" above.
   975  	peekedX1 := c.locationStack.peek()
   976  
   977  	// Compare the conditional value with zero.
   978  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, cv.register, cv.register)
   979  
   980  	// Now we can use c.register as temporary location.
   981  	// We alias it here for readability.
   982  	tmpRegister := cv.register
   983  
   984  	// Set the jump if the top value is not zero.
   985  	jmpIfNotZero := c.assembler.CompileJump(amd64.JNE)
   986  
   987  	// If the value is zero, we must place the value of x2 onto the stack position of x1.
   988  
   989  	// First we copy the value of x2 to the temporary register if x2 is not currently on a register.
   990  	if x2.onStack() {
   991  		x2.register = tmpRegister
   992  		c.compileLoadValueOnStackToRegister(x2)
   993  	}
   994  
   995  	//
   996  	// At this point x2's value is always on a register.
   997  	//
   998  
   999  	// Then release the value in the x2's register to the x1's stack position.
  1000  	if peekedX1.onRegister() {
  1001  		c.assembler.CompileRegisterToRegister(amd64.MOVQ, x2.register, peekedX1.register)
  1002  	} else {
  1003  		peekedX1.register = x2.register
  1004  		c.compileReleaseRegisterToStack(peekedX1) // Note inside we mark the register unused!
  1005  	}
  1006  
  1007  	// Else, we don't need to adjust value, just need to jump to the next instruction.
  1008  	c.assembler.SetJumpTargetOnNext(jmpIfNotZero)
  1009  
  1010  	// In any case, we don't need x2 and c anymore!
  1011  	c.locationStack.releaseRegister(x2)
  1012  	c.locationStack.releaseRegister(cv)
  1013  	return nil
  1014  }
  1015  
  1016  // compilePick implements compiler.compilePick for the amd64 architecture.
  1017  func (c *amd64Compiler) compilePick(o *wazeroir.UnionOperation) error {
  1018  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  1019  		return err
  1020  	}
  1021  	depth := o.U1
  1022  	isTargetVector := o.B3
  1023  
  1024  	// TODO: if we track the type of values on the stack,
  1025  	// we could optimize the instruction according to the bit size of the value.
  1026  	// For now, we just move the entire register i.e. as a quad word (8 bytes).
  1027  	pickTarget := &c.locationStack.stack[c.locationStack.sp-1-uint64(depth)]
  1028  	reg, err := c.allocateRegister(pickTarget.getRegisterType())
  1029  	if err != nil {
  1030  		return err
  1031  	}
  1032  
  1033  	if pickTarget.onRegister() {
  1034  		var inst asm.Instruction
  1035  		if isTargetVector {
  1036  			inst = amd64.MOVDQU
  1037  		} else if pickTarget.valueType == runtimeValueTypeI32 { // amd64 cannot copy single-precisions between registers.
  1038  			inst = amd64.MOVL
  1039  		} else {
  1040  			inst = amd64.MOVQ
  1041  		}
  1042  		c.assembler.CompileRegisterToRegister(inst, pickTarget.register, reg)
  1043  	} else if pickTarget.onStack() {
  1044  		// Copy the value from the stack.
  1045  		var inst asm.Instruction
  1046  		if isTargetVector {
  1047  			inst = amd64.MOVDQU
  1048  		} else if pickTarget.valueType == runtimeValueTypeI32 || pickTarget.valueType == runtimeValueTypeF32 {
  1049  			inst = amd64.MOVL
  1050  		} else {
  1051  			inst = amd64.MOVQ
  1052  		}
  1053  		// Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range.
  1054  		c.assembler.CompileMemoryToRegister(inst, amd64ReservedRegisterForStackBasePointerAddress,
  1055  			int64(pickTarget.stackPointer)*8, reg)
  1056  	}
  1057  	// Now we already placed the picked value on the register,
  1058  	// so push the location onto the stack.
  1059  	if isTargetVector {
  1060  		c.pushVectorRuntimeValueLocationOnRegister(reg)
  1061  	} else {
  1062  		c.pushRuntimeValueLocationOnRegister(reg, pickTarget.valueType)
  1063  	}
  1064  	return nil
  1065  }
  1066  
  1067  // compileAdd implements compiler.compileAdd for the amd64 architecture.
  1068  func (c *amd64Compiler) compileAdd(o *wazeroir.UnionOperation) error {
  1069  	// TODO: if the previous instruction is const, then
  1070  	// this can be optimized. Same goes for other arithmetic instructions.
  1071  
  1072  	var instruction asm.Instruction
  1073  
  1074  	unsignedType := wazeroir.UnsignedType(o.B1)
  1075  	switch unsignedType {
  1076  	case wazeroir.UnsignedTypeI32:
  1077  		instruction = amd64.ADDL
  1078  	case wazeroir.UnsignedTypeI64:
  1079  		instruction = amd64.ADDQ
  1080  	case wazeroir.UnsignedTypeF32:
  1081  		instruction = amd64.ADDSS
  1082  	case wazeroir.UnsignedTypeF64:
  1083  		instruction = amd64.ADDSD
  1084  	}
  1085  
  1086  	x2 := c.locationStack.pop()
  1087  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1088  		return err
  1089  	}
  1090  
  1091  	x1 := c.locationStack.peek() // Note this is peek!
  1092  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1093  		return err
  1094  	}
  1095  
  1096  	// x1 += x2.
  1097  	c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)
  1098  
  1099  	// We no longer need x2 register after ADD operation here,
  1100  	// so we release it.
  1101  	c.locationStack.releaseRegister(x2)
  1102  	return nil
  1103  }
  1104  
  1105  // compileSub implements compiler.compileSub for the amd64 architecture.
  1106  func (c *amd64Compiler) compileSub(o *wazeroir.UnionOperation) error {
  1107  	// TODO: if the previous instruction is const, then
  1108  	// this can be optimized. Same goes for other arithmetic instructions.
  1109  
  1110  	var instruction asm.Instruction
  1111  	unsignedType := wazeroir.UnsignedType(o.B1)
  1112  	switch unsignedType {
  1113  	case wazeroir.UnsignedTypeI32:
  1114  		instruction = amd64.SUBL
  1115  	case wazeroir.UnsignedTypeI64:
  1116  		instruction = amd64.SUBQ
  1117  	case wazeroir.UnsignedTypeF32:
  1118  		instruction = amd64.SUBSS
  1119  	case wazeroir.UnsignedTypeF64:
  1120  		instruction = amd64.SUBSD
  1121  	}
  1122  
  1123  	x2 := c.locationStack.pop()
  1124  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1125  		return err
  1126  	}
  1127  
  1128  	x1 := c.locationStack.peek() // Note this is peek!
  1129  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1130  		return err
  1131  	}
  1132  
  1133  	// x1 -= x2.
  1134  	c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)
  1135  
  1136  	// We no longer need x2 register after ADD operation here,
  1137  	// so we release it.
  1138  	c.locationStack.releaseRegister(x2)
  1139  	return nil
  1140  }
  1141  
  1142  // compileMul implements compiler.compileMul for the amd64 architecture.
  1143  func (c *amd64Compiler) compileMul(o *wazeroir.UnionOperation) (err error) {
  1144  	unsignedType := wazeroir.UnsignedType(o.B1)
  1145  	switch unsignedType {
  1146  	case wazeroir.UnsignedTypeI32:
  1147  		err = c.compileMulForInts(true, amd64.MULL)
  1148  	case wazeroir.UnsignedTypeI64:
  1149  		err = c.compileMulForInts(false, amd64.MULQ)
  1150  	case wazeroir.UnsignedTypeF32:
  1151  		err = c.compileMulForFloats(amd64.MULSS)
  1152  	case wazeroir.UnsignedTypeF64:
  1153  		err = c.compileMulForFloats(amd64.MULSD)
  1154  	}
  1155  	return
  1156  }
  1157  
  1158  // compileMulForInts emits instructions to perform integer multiplication for
  1159  // top two values on the stack. If unfamiliar with the convention for integer
  1160  // multiplication on x86, see https://www.felixcloutier.com/x86/mul.
  1161  //
  1162  // In summary, one of the values must be on the AX register,
  1163  // and the mul instruction stores the overflow info in DX register which we don't use.
  1164  // Here, we mean "the overflow info" by 65 bit or higher part of the result for 64 bit case.
  1165  //
  1166  // So, we have to ensure that
  1167  //  1. Previously located value on DX must be saved to memory stack. That is because
  1168  //     the existing value will be overridden after the mul execution.
  1169  //  2. One of the operands (x1 or x2) must be on AX register.
  1170  //
  1171  // See https://www.felixcloutier.com/x86/mul#description for detail semantics.
  1172  func (c *amd64Compiler) compileMulForInts(is32Bit bool, mulInstruction asm.Instruction) error {
  1173  	const (
  1174  		resultRegister   = amd64.RegAX
  1175  		reservedRegister = amd64.RegDX
  1176  	)
  1177  
  1178  	x2 := c.locationStack.pop()
  1179  	x1 := c.locationStack.pop()
  1180  
  1181  	var valueOnAX *runtimeValueLocation
  1182  	if x1.register == resultRegister {
  1183  		valueOnAX = x1
  1184  	} else if x2.register == resultRegister {
  1185  		valueOnAX = x2
  1186  	} else {
  1187  		valueOnAX = x2
  1188  		// This case we  move x2 to AX register.
  1189  		c.onValueReleaseRegisterToStack(resultRegister)
  1190  		if x2.onConditionalRegister() {
  1191  			c.compileMoveConditionalToGeneralPurposeRegister(x2, resultRegister)
  1192  		} else if x2.onStack() {
  1193  			x2.setRegister(resultRegister)
  1194  			c.compileLoadValueOnStackToRegister(x2)
  1195  			c.locationStack.markRegisterUsed(resultRegister)
  1196  		} else {
  1197  			var inst asm.Instruction
  1198  			if is32Bit {
  1199  				inst = amd64.MOVL
  1200  			} else {
  1201  				inst = amd64.MOVQ
  1202  			}
  1203  			c.assembler.CompileRegisterToRegister(inst, x2.register, resultRegister)
  1204  
  1205  			// We no longer uses the prev register of x2.
  1206  			c.locationStack.releaseRegister(x2)
  1207  			x2.setRegister(resultRegister)
  1208  			c.locationStack.markRegisterUsed(resultRegister)
  1209  		}
  1210  	}
  1211  
  1212  	// We have to make sure that at this point the operands must be on registers.
  1213  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1214  		return err
  1215  	}
  1216  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1217  		return err
  1218  	}
  1219  
  1220  	// We have to save the existing value on DX.
  1221  	// If the DX register is used by either x1 or x2, we don't need to
  1222  	// save the value because it is consumed by mul anyway.
  1223  	if x1.register != reservedRegister && x2.register != reservedRegister {
  1224  		c.onValueReleaseRegisterToStack(reservedRegister)
  1225  	}
  1226  
  1227  	// Now ready to emit the mul instruction.
  1228  	if x1 == valueOnAX {
  1229  		c.assembler.CompileRegisterToNone(mulInstruction, x2.register)
  1230  	} else {
  1231  		c.assembler.CompileRegisterToNone(mulInstruction, x1.register)
  1232  	}
  1233  
  1234  	c.locationStack.markRegisterUnused(x2.register)
  1235  	c.locationStack.markRegisterUnused(x1.register)
  1236  
  1237  	// Now we have the result in the AX register,
  1238  	// so we record it.
  1239  	c.pushRuntimeValueLocationOnRegister(resultRegister, x1.valueType)
  1240  	return nil
  1241  }
  1242  
  1243  func (c *amd64Compiler) compileMulForFloats(instruction asm.Instruction) error {
  1244  	x2 := c.locationStack.pop()
  1245  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1246  		return err
  1247  	}
  1248  
  1249  	x1 := c.locationStack.peek() // Note this is peek!
  1250  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1251  		return err
  1252  	}
  1253  
  1254  	// x1 *= x2.
  1255  	c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)
  1256  
  1257  	// We no longer need x2 register after MUL operation here,
  1258  	// so we release it.
  1259  	c.locationStack.releaseRegister(x2)
  1260  	return nil
  1261  }
  1262  
  1263  // compileClz implements compiler.compileClz for the amd64 architecture.
  1264  func (c *amd64Compiler) compileClz(o *wazeroir.UnionOperation) error {
  1265  	target := c.locationStack.pop()
  1266  	if err := c.compileEnsureOnRegister(target); err != nil {
  1267  		return err
  1268  	}
  1269  
  1270  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1271  	if c.cpuFeatures.HasExtra(platform.CpuExtraFeatureABM) {
  1272  		if unsignedInt == wazeroir.UnsignedInt32 {
  1273  			c.assembler.CompileRegisterToRegister(amd64.LZCNTL, target.register, target.register)
  1274  		} else {
  1275  			c.assembler.CompileRegisterToRegister(amd64.LZCNTQ, target.register, target.register)
  1276  		}
  1277  	} else {
  1278  		// On processors that do not support LZCNT, we combine BSR (calculating
  1279  		// most significant set bit) with XOR. This logic is described in
  1280  		// "Replace Raw Assembly Code with Builtin Intrinsics" section in:
  1281  		// https://developer.apple.com/documentation/apple-silicon/addressing-architectural-differences-in-your-macos-code.
  1282  
  1283  		// First, we have to check if the target is non-zero as BSR is undefined
  1284  		// on zero. See https://www.felixcloutier.com/x86/bsr.
  1285  		c.assembler.CompileRegisterToRegister(amd64.TESTQ, target.register, target.register)
  1286  		jmpIfNonZero := c.assembler.CompileJump(amd64.JNE)
  1287  
  1288  		// If the value is zero, we just push the const value.
  1289  		if unsignedInt == wazeroir.UnsignedInt32 {
  1290  			c.assembler.CompileConstToRegister(amd64.MOVL, int64(32), target.register)
  1291  		} else {
  1292  			c.assembler.CompileConstToRegister(amd64.MOVL, int64(64), target.register)
  1293  		}
  1294  
  1295  		// Emit the jmp instruction to jump to the position right after
  1296  		// the non-zero case.
  1297  		jmpAtEndOfZero := c.assembler.CompileJump(amd64.JMP)
  1298  
  1299  		// Start emitting non-zero case.
  1300  		c.assembler.SetJumpTargetOnNext(jmpIfNonZero)
  1301  		// First, we calculate the most significant set bit.
  1302  		if unsignedInt == wazeroir.UnsignedInt32 {
  1303  			c.assembler.CompileRegisterToRegister(amd64.BSRL, target.register, target.register)
  1304  		} else {
  1305  			c.assembler.CompileRegisterToRegister(amd64.BSRQ, target.register, target.register)
  1306  		}
  1307  
  1308  		// Now we XOR the value with the bit length minus one.
  1309  		if unsignedInt == wazeroir.UnsignedInt32 {
  1310  			c.assembler.CompileConstToRegister(amd64.XORL, 31, target.register)
  1311  		} else {
  1312  			c.assembler.CompileConstToRegister(amd64.XORQ, 63, target.register)
  1313  		}
  1314  
  1315  		// Finally the end jump instruction of zero case must target towards
  1316  		// the next instruction.
  1317  		c.assembler.SetJumpTargetOnNext(jmpAtEndOfZero)
  1318  	}
  1319  
  1320  	// We reused the same register of target for the result.
  1321  	c.locationStack.markRegisterUnused(target.register)
  1322  	c.pushRuntimeValueLocationOnRegister(target.register, target.valueType)
  1323  	return nil
  1324  }
  1325  
  1326  // compileCtz implements compiler.compileCtz for the amd64 architecture.
  1327  func (c *amd64Compiler) compileCtz(o *wazeroir.UnionOperation) error {
  1328  	target := c.locationStack.pop()
  1329  	if err := c.compileEnsureOnRegister(target); err != nil {
  1330  		return err
  1331  	}
  1332  
  1333  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1334  	if c.cpuFeatures.HasExtra(platform.CpuExtraFeatureABM) {
  1335  		if unsignedInt == wazeroir.UnsignedInt32 {
  1336  			c.assembler.CompileRegisterToRegister(amd64.TZCNTL, target.register, target.register)
  1337  		} else {
  1338  			c.assembler.CompileRegisterToRegister(amd64.TZCNTQ, target.register, target.register)
  1339  		}
  1340  	} else {
  1341  		// On processors that do not support TZCNT, the BSF instruction is
  1342  		// executed instead. The key difference between TZCNT and BSF
  1343  		// instruction is that if source operand is zero, the content of
  1344  		// destination operand is undefined.
  1345  		// https://www.felixcloutier.com/x86/tzcnt.html
  1346  
  1347  		// First we compare the target with zero.
  1348  		c.assembler.CompileRegisterToRegister(amd64.TESTQ, target.register, target.register)
  1349  		jmpIfNonZero := c.assembler.CompileJump(amd64.JNE)
  1350  
  1351  		// If the value is zero, we just push the const value.
  1352  		if unsignedInt == wazeroir.UnsignedInt32 {
  1353  			c.assembler.CompileConstToRegister(amd64.MOVL, int64(32), target.register)
  1354  		} else {
  1355  			c.assembler.CompileConstToRegister(amd64.MOVL, int64(64), target.register)
  1356  		}
  1357  
  1358  		// Emit the jmp instruction to jump to the position right after
  1359  		// the non-zero case.
  1360  		jmpAtEndOfZero := c.assembler.CompileJump(amd64.JMP)
  1361  
  1362  		// Otherwise, emit the TZCNT.
  1363  		c.assembler.SetJumpTargetOnNext(jmpIfNonZero)
  1364  		if unsignedInt == wazeroir.UnsignedInt32 {
  1365  			c.assembler.CompileRegisterToRegister(amd64.TZCNTL, target.register, target.register)
  1366  		} else {
  1367  			c.assembler.CompileRegisterToRegister(amd64.TZCNTQ, target.register, target.register)
  1368  		}
  1369  
  1370  		// Finally the end jump instruction of zero case must target towards
  1371  		// the next instruction.
  1372  		c.assembler.SetJumpTargetOnNext(jmpAtEndOfZero)
  1373  	}
  1374  
  1375  	// We reused the same register of target for the result.
  1376  	c.locationStack.markRegisterUnused(target.register)
  1377  	c.pushRuntimeValueLocationOnRegister(target.register, target.valueType)
  1378  	return nil
  1379  }
  1380  
  1381  // compilePopcnt implements compiler.compilePopcnt for the amd64 architecture.
  1382  func (c *amd64Compiler) compilePopcnt(o *wazeroir.UnionOperation) error {
  1383  	target := c.locationStack.pop()
  1384  	if err := c.compileEnsureOnRegister(target); err != nil {
  1385  		return err
  1386  	}
  1387  
  1388  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1389  	if unsignedInt == wazeroir.UnsignedInt32 {
  1390  		c.assembler.CompileRegisterToRegister(amd64.POPCNTL, target.register, target.register)
  1391  	} else {
  1392  		c.assembler.CompileRegisterToRegister(amd64.POPCNTQ, target.register, target.register)
  1393  	}
  1394  
  1395  	// We reused the same register of target for the result.
  1396  	c.locationStack.markRegisterUnused(target.register)
  1397  	c.pushRuntimeValueLocationOnRegister(target.register, target.valueType)
  1398  	return nil
  1399  }
  1400  
  1401  // compileDiv implements compiler.compileDiv for the amd64 architecture.
  1402  func (c *amd64Compiler) compileDiv(o *wazeroir.UnionOperation) (err error) {
  1403  	signedType := wazeroir.SignedType(o.B1)
  1404  	switch signedType {
  1405  	case wazeroir.SignedTypeUint32:
  1406  		err = c.compileDivForInts(true, false)
  1407  	case wazeroir.SignedTypeUint64:
  1408  		err = c.compileDivForInts(false, false)
  1409  	case wazeroir.SignedTypeInt32:
  1410  		err = c.compileDivForInts(true, true)
  1411  	case wazeroir.SignedTypeInt64:
  1412  		err = c.compileDivForInts(false, true)
  1413  	case wazeroir.SignedTypeFloat32:
  1414  		err = c.compileDivForFloats(true)
  1415  	case wazeroir.SignedTypeFloat64:
  1416  		err = c.compileDivForFloats(false)
  1417  	}
  1418  	return
  1419  }
  1420  
  1421  // compileDivForInts emits the instructions to perform division on the top
  1422  // two values of integer type on the stack and puts the quotient of the result
  1423  // onto the stack. For example, stack [..., 10, 3] results in [..., 3] where
  1424  // the remainder is discarded.
  1425  func (c *amd64Compiler) compileDivForInts(is32Bit bool, signed bool) error {
  1426  	if err := c.performDivisionOnInts(false, is32Bit, signed); err != nil {
  1427  		return err
  1428  	}
  1429  	// Now we have the quotient of the division result in the AX register,
  1430  	// so we record it.
  1431  	if is32Bit {
  1432  		c.pushRuntimeValueLocationOnRegister(amd64.RegAX, runtimeValueTypeI32)
  1433  	} else {
  1434  		c.pushRuntimeValueLocationOnRegister(amd64.RegAX, runtimeValueTypeI64)
  1435  	}
  1436  	return nil
  1437  }
  1438  
  1439  // compileRem implements compiler.compileRem for the amd64 architecture.
  1440  func (c *amd64Compiler) compileRem(o *wazeroir.UnionOperation) (err error) {
  1441  	var vt runtimeValueType
  1442  	signedInt := wazeroir.SignedInt(o.B1)
  1443  	switch signedInt {
  1444  	case wazeroir.SignedInt32:
  1445  		err = c.performDivisionOnInts(true, true, true)
  1446  		vt = runtimeValueTypeI32
  1447  	case wazeroir.SignedInt64:
  1448  		err = c.performDivisionOnInts(true, false, true)
  1449  		vt = runtimeValueTypeI64
  1450  	case wazeroir.SignedUint32:
  1451  		err = c.performDivisionOnInts(true, true, false)
  1452  		vt = runtimeValueTypeI32
  1453  	case wazeroir.SignedUint64:
  1454  		err = c.performDivisionOnInts(true, false, false)
  1455  		vt = runtimeValueTypeI64
  1456  	}
  1457  	if err != nil {
  1458  		return err
  1459  	}
  1460  
  1461  	// Now we have the remainder of the division result in the DX register,
  1462  	// so we record it.
  1463  	c.pushRuntimeValueLocationOnRegister(amd64.RegDX, vt)
  1464  	return
  1465  }
  1466  
  1467  // performDivisionOnInts emits the instructions to do divisions on top two integers on the stack
  1468  // via DIV (unsigned div) and IDIV (signed div) instructions.
  1469  // See the following explanation of these instructions' semantics from https://www.lri.fr/~filliatr/ens/compil/x86-64.pdf
  1470  //
  1471  // >> Division requires special arrangements: idiv (signed) and div (unsigned) operate on a 2n-byte dividend and
  1472  // >> an n-byte divisor to produce an n-byte quotient and n-byte remainder. The dividend always lives in a fixed pair of
  1473  // >> registers (%edx and %eax for the 32-bit case; %rdx and %rax for the 64-bit case); the divisor is specified as the
  1474  // >> source operand in the instruction. The quotient goes in %eax (resp. %rax); the remainder in %edx (resp. %rdx). For
  1475  // >> signed division, the cltd (resp. ctqo) instruction is used to prepare %edx (resp. %rdx) with the sign extension of
  1476  // >> %eax (resp. %rax). For example, if a,b, c are memory locations holding quad words, then we could set c = a/b
  1477  // >> using the sequence: movq a(%rip), %rax; ctqo; idivq b(%rip); movq %rax, c(%rip).
  1478  //
  1479  // tl;dr is that the division result is placed in AX and DX registers after instructions emitted by this function
  1480  // where AX holds the quotient while DX the remainder of the division result.
  1481  func (c *amd64Compiler) performDivisionOnInts(isRem, is32Bit, signed bool) error {
  1482  	const (
  1483  		quotientRegister  = amd64.RegAX
  1484  		remainderRegister = amd64.RegDX
  1485  	)
  1486  
  1487  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  1488  		return err
  1489  	}
  1490  
  1491  	// Ensures that previous values on these registers are saved to memory.
  1492  	c.onValueReleaseRegisterToStack(quotientRegister)
  1493  	c.onValueReleaseRegisterToStack(remainderRegister)
  1494  
  1495  	// In order to ensure x2 is placed on a temporary register for x2 value other than AX and DX,
  1496  	// we mark them as used here.
  1497  	c.locationStack.markRegisterUsed(quotientRegister)
  1498  	c.locationStack.markRegisterUsed(remainderRegister)
  1499  
  1500  	// Ensure that x2 is placed on a register which is not either AX or DX.
  1501  	x2 := c.locationStack.pop()
  1502  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1503  		return err
  1504  	}
  1505  
  1506  	// Now we successfully place x2 on a temp register, so we no longer need to
  1507  	// mark these registers used.
  1508  	c.locationStack.markRegisterUnused(quotientRegister)
  1509  	c.locationStack.markRegisterUnused(remainderRegister)
  1510  
  1511  	// Check if the x2 equals zero.
  1512  	if is32Bit {
  1513  		c.assembler.CompileRegisterToRegister(amd64.TESTL, x2.register, x2.register)
  1514  	} else {
  1515  		c.assembler.CompileRegisterToRegister(amd64.TESTQ, x2.register, x2.register)
  1516  	}
  1517  
  1518  	// Skipped if the divisor is nonzero.
  1519  	c.compileMaybeExitFromNativeCode(amd64.JNE, nativeCallStatusIntegerDivisionByZero)
  1520  
  1521  	// next, we ensure that x1 is placed on AX.
  1522  	x1 := c.locationStack.pop()
  1523  	if x1.onRegister() && x1.register != quotientRegister {
  1524  		// Move x1 to quotientRegister.
  1525  		if is32Bit {
  1526  			c.assembler.CompileRegisterToRegister(amd64.MOVL, x1.register, quotientRegister)
  1527  		} else {
  1528  			c.assembler.CompileRegisterToRegister(amd64.MOVQ, x1.register, quotientRegister)
  1529  		}
  1530  		c.locationStack.markRegisterUnused(x1.register)
  1531  		x1.setRegister(quotientRegister)
  1532  	} else if x1.onStack() {
  1533  		x1.setRegister(quotientRegister)
  1534  		c.compileLoadValueOnStackToRegister(x1)
  1535  	}
  1536  
  1537  	// Note: at this point, x1 is placed on AX, x2 is on a register which is not AX or DX.
  1538  
  1539  	isSignedRem := isRem && signed
  1540  	isSignedDiv := !isRem && signed
  1541  	var signedRemMinusOneDivisorJmp asm.Node
  1542  	if isSignedRem {
  1543  		// If this is for getting remainder of signed division,
  1544  		// we have to treat the special case where the divisor equals -1.
  1545  		// For example, if this is 32-bit case, the result of (-2^31) / -1 equals (quotient=2^31, remainder=0)
  1546  		// where quotient doesn't fit in the 32-bit range whose maximum is 2^31-1.
  1547  		// x86 in this case cause floating point exception, but according to the Wasm spec
  1548  		// if the divisor equals -1, the result must be zero (not undefined!) as opposed to be "undefined"
  1549  		// for divisions on (-2^31) / -1 where we do not need to emit the special branches.
  1550  		// For detail, please refer to https://stackoverflow.com/questions/56303282/why-idiv-with-1-causes-floating-point-exception
  1551  
  1552  		// First we store zero into the remainder result register (DX) and compare the divisor with -1.
  1553  		if is32Bit {
  1554  			c.assembler.CompileRegisterToRegister(amd64.XORL, remainderRegister, remainderRegister)
  1555  			c.assembler.CompileRegisterToConst(amd64.CMPL, x2.register, -1)
  1556  		} else {
  1557  			c.assembler.CompileRegisterToRegister(amd64.XORQ, remainderRegister, remainderRegister)
  1558  			c.assembler.CompileRegisterToConst(amd64.CMPQ, x2.register, -1)
  1559  		}
  1560  
  1561  		// If it equals minus one, we skip the normal case.
  1562  		signedRemMinusOneDivisorJmp = c.assembler.CompileJump(amd64.JEQ)
  1563  	} else if isSignedDiv {
  1564  		// For signed division, we have to have branches for "math.MinInt{32,64} / -1"
  1565  		// case which results in the floating point exception via division error as
  1566  		// the resulting value exceeds the maximum of signed int.
  1567  
  1568  		// First we compare the division with -1.
  1569  		if is32Bit {
  1570  			c.assembler.CompileRegisterToConst(amd64.CMPL, x2.register, -1)
  1571  		} else {
  1572  			c.assembler.CompileRegisterToConst(amd64.CMPQ, x2.register, -1)
  1573  		}
  1574  
  1575  		// If it doesn't equal minus one, we jump to the normal case.
  1576  		nonMinusOneDivisorJmp := c.assembler.CompileJump(amd64.JNE)
  1577  
  1578  		// next we check if the quotient is the most negative value for the signed integer.
  1579  		// That means whether or not we try to do (math.MinInt32 / -1) or (math.MinInt64 / -1) respectively.
  1580  		if is32Bit {
  1581  			if err := c.assembler.CompileRegisterToStaticConst(amd64.CMPL, x1.register, c.minimum32BitSignedInt); err != nil {
  1582  				return err
  1583  			}
  1584  		} else {
  1585  			if err := c.assembler.CompileRegisterToStaticConst(amd64.CMPQ, x1.register, c.minimum64BitSignedInt); err != nil {
  1586  				return err
  1587  			}
  1588  		}
  1589  
  1590  		// Trap if we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1),
  1591  		// as that is the overflow in division as the result becomes 2^31 which is larger than
  1592  		// the maximum of signed 32-bit int (2^31-1).
  1593  		c.compileMaybeExitFromNativeCode(amd64.JNE, nativeCallStatusIntegerOverflow)
  1594  		// Set the normal case's jump target.
  1595  		c.assembler.SetJumpTargetOnNext(nonMinusOneDivisorJmp)
  1596  	}
  1597  
  1598  	// Now ready to emit the div instruction.
  1599  	// Since the div instructions takes 2n byte dividend placed in DX:AX registers...
  1600  	// * signed case - we need to sign-extend the dividend into DX register via CDQ (32 bit) or CQO (64 bit).
  1601  	// * unsigned case - we need to zero DX register via "XOR DX DX"
  1602  	if is32Bit && signed {
  1603  		// Emit sign-extension to have 64 bit dividend over DX and AX registers.
  1604  		c.assembler.CompileStandAlone(amd64.CDQ)
  1605  		c.assembler.CompileRegisterToNone(amd64.IDIVL, x2.register)
  1606  	} else if is32Bit && !signed {
  1607  		// Zeros DX register to have 64 bit dividend over DX and AX registers.
  1608  		c.assembler.CompileRegisterToRegister(amd64.XORQ, amd64.RegDX, amd64.RegDX)
  1609  		c.assembler.CompileRegisterToNone(amd64.DIVL, x2.register)
  1610  	} else if !is32Bit && signed {
  1611  		// Emits sign-extension to have 128 bit dividend over DX and AX registers.
  1612  		c.assembler.CompileStandAlone(amd64.CQO)
  1613  		c.assembler.CompileRegisterToNone(amd64.IDIVQ, x2.register)
  1614  	} else if !is32Bit && !signed {
  1615  		// Zeros DX register to have 128 bit dividend over DX and AX registers.
  1616  		c.assembler.CompileRegisterToRegister(amd64.XORQ, amd64.RegDX, amd64.RegDX)
  1617  		c.assembler.CompileRegisterToNone(amd64.DIVQ, x2.register)
  1618  	}
  1619  
  1620  	// If this is signed rem instruction, we must set the jump target of
  1621  	// the exit jump from division -1 case towards the next instruction.
  1622  	if signedRemMinusOneDivisorJmp != nil {
  1623  		c.assembler.SetJumpTargetOnNext(signedRemMinusOneDivisorJmp)
  1624  	}
  1625  
  1626  	// We mark them as unused so that we can push one of them onto the location stack at call sites.
  1627  	c.locationStack.markRegisterUnused(remainderRegister)
  1628  	c.locationStack.markRegisterUnused(quotientRegister)
  1629  	c.locationStack.markRegisterUnused(x2.register)
  1630  	return nil
  1631  }
  1632  
  1633  // compileDivForFloats emits the instructions to perform division
  1634  // on the top two values of float type on the stack, placing the result back onto the stack.
  1635  // For example, stack [..., 1.0, 4.0] results in [..., 0.25].
  1636  func (c *amd64Compiler) compileDivForFloats(is32Bit bool) error {
  1637  	if is32Bit {
  1638  		return c.compileSimpleBinaryOp(amd64.DIVSS)
  1639  	} else {
  1640  		return c.compileSimpleBinaryOp(amd64.DIVSD)
  1641  	}
  1642  }
  1643  
  1644  // compileAnd implements compiler.compileAnd for the amd64 architecture.
  1645  func (c *amd64Compiler) compileAnd(o *wazeroir.UnionOperation) (err error) {
  1646  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1647  	switch unsignedInt {
  1648  	case wazeroir.UnsignedInt32:
  1649  		err = c.compileSimpleBinaryOp(amd64.ANDL)
  1650  	case wazeroir.UnsignedInt64:
  1651  		err = c.compileSimpleBinaryOp(amd64.ANDQ)
  1652  	}
  1653  	return
  1654  }
  1655  
  1656  // compileOr implements compiler.compileOr for the amd64 architecture.
  1657  func (c *amd64Compiler) compileOr(o *wazeroir.UnionOperation) (err error) {
  1658  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1659  	switch unsignedInt {
  1660  	case wazeroir.UnsignedInt32:
  1661  		err = c.compileSimpleBinaryOp(amd64.ORL)
  1662  	case wazeroir.UnsignedInt64:
  1663  		err = c.compileSimpleBinaryOp(amd64.ORQ)
  1664  	}
  1665  	return
  1666  }
  1667  
  1668  // compileXor implements compiler.compileXor for the amd64 architecture.
  1669  func (c *amd64Compiler) compileXor(o *wazeroir.UnionOperation) (err error) {
  1670  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1671  	switch unsignedInt {
  1672  	case wazeroir.UnsignedInt32:
  1673  		err = c.compileSimpleBinaryOp(amd64.XORL)
  1674  	case wazeroir.UnsignedInt64:
  1675  		err = c.compileSimpleBinaryOp(amd64.XORQ)
  1676  	}
  1677  	return
  1678  }
  1679  
  1680  // compileSimpleBinaryOp emits instructions to pop two values from the stack
  1681  // and perform the given instruction on these two values and push the result
  1682  // onto the stack.
  1683  func (c *amd64Compiler) compileSimpleBinaryOp(instruction asm.Instruction) error {
  1684  	x2 := c.locationStack.pop()
  1685  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1686  		return err
  1687  	}
  1688  
  1689  	x1 := c.locationStack.pop()
  1690  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1691  		return err
  1692  	}
  1693  
  1694  	c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)
  1695  
  1696  	// We consumed x2 register after the operation here,
  1697  	// so we release it.
  1698  	c.locationStack.releaseRegister(x2)
  1699  
  1700  	// We already stored the result in the register used by x1
  1701  	// so we record it.
  1702  	c.locationStack.markRegisterUnused(x1.register)
  1703  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1704  	return nil
  1705  }
  1706  
  1707  // compileShl implements compiler.compileShl for the amd64 architecture.
  1708  func (c *amd64Compiler) compileShl(o *wazeroir.UnionOperation) (err error) {
  1709  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1710  	switch unsignedInt {
  1711  	case wazeroir.UnsignedInt32:
  1712  		err = c.compileShiftOp(amd64.SHLL, false)
  1713  	case wazeroir.UnsignedInt64:
  1714  		err = c.compileShiftOp(amd64.SHLQ, true)
  1715  	}
  1716  	return
  1717  }
  1718  
  1719  // compileShr implements compiler.compileShr for the amd64 architecture.
  1720  func (c *amd64Compiler) compileShr(o *wazeroir.UnionOperation) (err error) {
  1721  	signedInt := wazeroir.SignedInt(o.B1)
  1722  	switch signedInt {
  1723  	case wazeroir.SignedInt32:
  1724  		err = c.compileShiftOp(amd64.SARL, true)
  1725  	case wazeroir.SignedInt64:
  1726  		err = c.compileShiftOp(amd64.SARQ, false)
  1727  	case wazeroir.SignedUint32:
  1728  		err = c.compileShiftOp(amd64.SHRL, true)
  1729  	case wazeroir.SignedUint64:
  1730  		err = c.compileShiftOp(amd64.SHRQ, false)
  1731  	}
  1732  	return
  1733  }
  1734  
  1735  // compileRotl implements compiler.compileRotl for the amd64 architecture.
  1736  func (c *amd64Compiler) compileRotl(o *wazeroir.UnionOperation) (err error) {
  1737  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1738  	switch unsignedInt {
  1739  	case wazeroir.UnsignedInt32:
  1740  		err = c.compileShiftOp(amd64.ROLL, true)
  1741  	case wazeroir.UnsignedInt64:
  1742  		err = c.compileShiftOp(amd64.ROLQ, false)
  1743  	}
  1744  	return
  1745  }
  1746  
  1747  // compileRotr implements compiler.compileRotr for the amd64 architecture.
  1748  func (c *amd64Compiler) compileRotr(o *wazeroir.UnionOperation) (err error) {
  1749  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1750  	switch unsignedInt {
  1751  	case wazeroir.UnsignedInt32:
  1752  		err = c.compileShiftOp(amd64.RORL, true)
  1753  	case wazeroir.UnsignedInt64:
  1754  		err = c.compileShiftOp(amd64.RORQ, false)
  1755  	}
  1756  	return
  1757  }
  1758  
  1759  // compileShiftOp adds instructions for shift operations (SHR, SHL, ROTR, ROTL)
  1760  // where we have to place the second value (shift counts) on the CX register.
  1761  func (c *amd64Compiler) compileShiftOp(instruction asm.Instruction, is32Bit bool) error {
  1762  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  1763  		return err
  1764  	}
  1765  
  1766  	x2 := c.locationStack.pop()
  1767  
  1768  	// Ensures that x2 (holding shift counts) is placed on the CX register.
  1769  	const shiftCountRegister = amd64.RegCX
  1770  	if (x2.onRegister() && x2.register != shiftCountRegister) || x2.onStack() {
  1771  		// If another value lives on the CX register, we release it to the stack.
  1772  		c.onValueReleaseRegisterToStack(shiftCountRegister)
  1773  
  1774  		if x2.onRegister() {
  1775  			x2r := x2.register
  1776  			// If x2 lives on a register, we move the value to CX.
  1777  			if is32Bit {
  1778  				c.assembler.CompileRegisterToRegister(amd64.MOVL, x2r, shiftCountRegister)
  1779  			} else {
  1780  				c.assembler.CompileRegisterToRegister(amd64.MOVQ, x2r, shiftCountRegister)
  1781  			}
  1782  			// We no longer place any value on the original register, so we record it.
  1783  			c.locationStack.markRegisterUnused(x2r)
  1784  		} else {
  1785  			// If it is on stack, we just move the memory allocated value to the CX register.
  1786  			x2.setRegister(shiftCountRegister)
  1787  			c.compileLoadValueOnStackToRegister(x2)
  1788  		}
  1789  		c.locationStack.markRegisterUsed(shiftCountRegister)
  1790  	}
  1791  
  1792  	x1 := c.locationStack.peek() // Note this is peek!
  1793  	x1r := x1.register
  1794  
  1795  	if x1.onRegister() {
  1796  		c.assembler.CompileRegisterToRegister(instruction, shiftCountRegister, x1r)
  1797  	} else {
  1798  		// Shift target can be placed on a memory location.
  1799  		// Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range.
  1800  		c.assembler.CompileRegisterToMemory(instruction, shiftCountRegister, amd64ReservedRegisterForStackBasePointerAddress, int64(x1.stackPointer)*8)
  1801  	}
  1802  
  1803  	// We consumed x2 register after the operation here,
  1804  	// so we release it.
  1805  	c.locationStack.markRegisterUnused(shiftCountRegister)
  1806  	return nil
  1807  }
  1808  
  1809  // compileAbs implements compiler.compileAbs for the amd64 architecture.
  1810  //
  1811  // See the following discussions for how we could take the abs of floats on x86 assembly.
  1812  // https://stackoverflow.com/questions/32408665/fastest-way-to-compute-absolute-value-using-sse/32422471#32422471
  1813  // https://stackoverflow.com/questions/44630015/how-would-fabsdouble-be-implemented-on-x86-is-it-an-expensive-operation
  1814  func (c *amd64Compiler) compileAbs(o *wazeroir.UnionOperation) (err error) {
  1815  	target := c.locationStack.peek() // Note this is peek!
  1816  	if err = c.compileEnsureOnRegister(target); err != nil {
  1817  		return err
  1818  	}
  1819  
  1820  	// First shift left by one to clear the sign bit, and then shift right by one.
  1821  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  1822  		c.assembler.CompileConstToRegister(amd64.PSLLD, 1, target.register)
  1823  		c.assembler.CompileConstToRegister(amd64.PSRLD, 1, target.register)
  1824  	} else {
  1825  		c.assembler.CompileConstToRegister(amd64.PSLLQ, 1, target.register)
  1826  		c.assembler.CompileConstToRegister(amd64.PSRLQ, 1, target.register)
  1827  	}
  1828  	return nil
  1829  }
  1830  
  1831  // compileNeg implements compiler.compileNeg for the amd64 architecture.
  1832  func (c *amd64Compiler) compileNeg(o *wazeroir.UnionOperation) (err error) {
  1833  	target := c.locationStack.peek() // Note this is peek!
  1834  	if err := c.compileEnsureOnRegister(target); err != nil {
  1835  		return err
  1836  	}
  1837  
  1838  	tmpReg, err := c.allocateRegister(registerTypeVector)
  1839  	if err != nil {
  1840  		return err
  1841  	}
  1842  
  1843  	// First we move the sign-bit mask (placed in memory) to the tmp register,
  1844  	// since we cannot take XOR directly with float reg and const.
  1845  	// And then negate the value by XOR it with the sign-bit mask.
  1846  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  1847  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.float32SignBitMask, tmpReg)
  1848  		if err != nil {
  1849  			return err
  1850  		}
  1851  		c.assembler.CompileRegisterToRegister(amd64.XORPS, tmpReg, target.register)
  1852  	} else {
  1853  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.float64SignBitMask, tmpReg)
  1854  		if err != nil {
  1855  			return err
  1856  		}
  1857  		c.assembler.CompileRegisterToRegister(amd64.XORPD, tmpReg, target.register)
  1858  	}
  1859  	return nil
  1860  }
  1861  
  1862  // compileCeil implements compiler.compileCeil for the amd64 architecture.
  1863  func (c *amd64Compiler) compileCeil(o *wazeroir.UnionOperation) (err error) {
  1864  	// Internally, ceil can be performed via ROUND instruction with 0x02 mode.
  1865  	// See https://android.googlesource.com/platform/bionic/+/882b8af/libm/x86_64/ceilf.S for example.
  1866  	return c.compileRoundInstruction(wazeroir.Float(o.B1) == wazeroir.Float32, 0x02)
  1867  }
  1868  
  1869  // compileFloor implements compiler.compileFloor for the amd64 architecture.
  1870  func (c *amd64Compiler) compileFloor(o *wazeroir.UnionOperation) (err error) {
  1871  	// Internally, floor can be performed via ROUND instruction with 0x01 mode.
  1872  	// See https://android.googlesource.com/platform/bionic/+/882b8af/libm/x86_64/floorf.S for example.
  1873  	return c.compileRoundInstruction(wazeroir.Float(o.B1) == wazeroir.Float32, 0x01)
  1874  }
  1875  
  1876  // compileTrunc implements compiler.compileTrunc for the amd64 architecture.
  1877  func (c *amd64Compiler) compileTrunc(o *wazeroir.UnionOperation) error {
  1878  	// Internally, trunc can be performed via ROUND instruction with 0x03 mode.
  1879  	// See https://android.googlesource.com/platform/bionic/+/882b8af/libm/x86_64/truncf.S for example.
  1880  	return c.compileRoundInstruction(wazeroir.Float(o.B1) == wazeroir.Float32, 0x03)
  1881  }
  1882  
  1883  // compileNearest implements compiler.compileNearest for the amd64 architecture.
  1884  func (c *amd64Compiler) compileNearest(o *wazeroir.UnionOperation) error {
  1885  	// Nearest can be performed via ROUND instruction with 0x00 mode.
  1886  	return c.compileRoundInstruction(wazeroir.Float(o.B1) == wazeroir.Float32, 0x00)
  1887  }
  1888  
  1889  func (c *amd64Compiler) compileRoundInstruction(is32Bit bool, mode int64) error {
  1890  	target := c.locationStack.peek() // Note this is peek!
  1891  	if err := c.compileEnsureOnRegister(target); err != nil {
  1892  		return err
  1893  	}
  1894  
  1895  	if is32Bit {
  1896  		c.assembler.CompileRegisterToRegisterWithArg(amd64.ROUNDSS, target.register, target.register, byte(mode))
  1897  	} else {
  1898  		c.assembler.CompileRegisterToRegisterWithArg(amd64.ROUNDSD, target.register, target.register, byte(mode))
  1899  	}
  1900  	return nil
  1901  }
  1902  
  1903  // compileMin implements compiler.compileMin for the amd64 architecture.
  1904  func (c *amd64Compiler) compileMin(o *wazeroir.UnionOperation) error {
  1905  	is32Bit := wazeroir.Float(o.B1) == wazeroir.Float32
  1906  	if is32Bit {
  1907  		return c.compileMinOrMax(is32Bit, true, amd64.MINSS)
  1908  	} else {
  1909  		return c.compileMinOrMax(is32Bit, true, amd64.MINSD)
  1910  	}
  1911  }
  1912  
  1913  // compileMax implements compiler.compileMax for the amd64 architecture.
  1914  func (c *amd64Compiler) compileMax(o *wazeroir.UnionOperation) error {
  1915  	is32Bit := wazeroir.Float(o.B1) == wazeroir.Float32
  1916  	if is32Bit {
  1917  		return c.compileMinOrMax(is32Bit, false, amd64.MAXSS)
  1918  	} else {
  1919  		return c.compileMinOrMax(is32Bit, false, amd64.MAXSD)
  1920  	}
  1921  }
  1922  
  1923  // emitMinOrMax adds instructions to pop two values from the stack, and push back either minimum or
  1924  // minimum of these two values onto the stack according to the minOrMaxInstruction argument.
  1925  // minOrMaxInstruction must be one of MAXSS, MAXSD, MINSS or MINSD.
  1926  // Note: These native min/max instructions are almost compatible with min/max in the Wasm specification,
  1927  // but it is slightly different with respect to the NaN handling.
  1928  // Native min/max instructions return non-NaN value if exactly one of target values
  1929  // is NaN. For example native_{min,max}(5.0, NaN) returns always 5.0, not NaN.
  1930  // However, WebAssembly specifies that min/max must always return NaN if one of values is NaN.
  1931  // Therefore, in this function, we have to add conditional jumps to check if one of values is NaN before
  1932  // the native min/max, which is why we cannot simply emit a native min/max instruction here.
  1933  //
  1934  // For the semantics, see wazeroir.Min and wazeroir.Max for detail.
  1935  func (c *amd64Compiler) compileMinOrMax(is32Bit, isMin bool, minOrMaxInstruction asm.Instruction) error {
  1936  	x2 := c.locationStack.pop()
  1937  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1938  		return err
  1939  	}
  1940  	x1 := c.locationStack.pop()
  1941  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1942  		return err
  1943  	}
  1944  
  1945  	// Check if this is (either x1 or x2 is NaN) or (x1 equals x2) case
  1946  	if is32Bit {
  1947  		c.assembler.CompileRegisterToRegister(amd64.UCOMISS, x2.register, x1.register)
  1948  	} else {
  1949  		c.assembler.CompileRegisterToRegister(amd64.UCOMISD, x2.register, x1.register)
  1950  	}
  1951  
  1952  	// At this point, we have the three cases of conditional flags below
  1953  	// (See https://www.felixcloutier.com/x86/ucomiss#operation for detail.)
  1954  	//
  1955  	// 1) Two values are NaN-free and different: All flags are cleared.
  1956  	// 2) Two values are NaN-free and equal: Only ZF flags is set.
  1957  	// 3) One of Two values is NaN: ZF, PF and CF flags are set.
  1958  
  1959  	// Jump instruction to handle 1) case by checking the ZF flag
  1960  	// as ZF is only set for 2) and 3) cases.
  1961  	nanFreeOrDiffJump := c.assembler.CompileJump(amd64.JNE)
  1962  
  1963  	// Start handling 2) and 3).
  1964  
  1965  	// Jump if one of two values is NaN by checking the parity flag (PF).
  1966  	includeNaNJmp := c.assembler.CompileJump(amd64.JPS)
  1967  
  1968  	// Start handling 2).
  1969  
  1970  	// Before we exit this case, we have to ensure that positive zero (or negative zero for min instruction) is
  1971  	// returned if two values are positive and negative zeros.
  1972  	var inst asm.Instruction
  1973  	switch {
  1974  	case is32Bit && isMin:
  1975  		inst = amd64.ORPS
  1976  	case !is32Bit && isMin:
  1977  		inst = amd64.ORPD
  1978  	case is32Bit && !isMin:
  1979  		inst = amd64.ANDPS
  1980  	case !is32Bit && !isMin:
  1981  		inst = amd64.ANDPD
  1982  	}
  1983  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1984  
  1985  	sameExitJmp := c.assembler.CompileJump(amd64.JMP)
  1986  
  1987  	// start handling 3).
  1988  	c.assembler.SetJumpTargetOnNext(includeNaNJmp)
  1989  
  1990  	// We emit the ADD instruction to produce the NaN in x1.
  1991  	if is32Bit {
  1992  		c.assembler.CompileRegisterToRegister(amd64.ADDSS, x2.register, x1.register)
  1993  	} else {
  1994  		c.assembler.CompileRegisterToRegister(amd64.ADDSD, x2.register, x1.register)
  1995  	}
  1996  
  1997  	// Exit from the NaN case branch.
  1998  	nanExitJmp := c.assembler.CompileJump(amd64.JMP)
  1999  
  2000  	// Start handling 1).
  2001  	c.assembler.SetJumpTargetOnNext(nanFreeOrDiffJump)
  2002  
  2003  	// Now handle the NaN-free and different values case.
  2004  	c.assembler.CompileRegisterToRegister(minOrMaxInstruction, x2.register, x1.register)
  2005  
  2006  	// Set the jump target of 1) and 2) cases to the next instruction after 3) case.
  2007  	c.assembler.SetJumpTargetOnNext(nanExitJmp)
  2008  	c.assembler.SetJumpTargetOnNext(sameExitJmp)
  2009  
  2010  	// Record that we consumed the x2 and placed the minOrMax result in the x1's register.
  2011  	c.locationStack.markRegisterUnused(x2.register)
  2012  	c.locationStack.markRegisterUnused(x1.register)
  2013  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  2014  	return nil
  2015  }
  2016  
  2017  // compileCopysign implements compiler.compileCopysign for the amd64 architecture.
  2018  func (c *amd64Compiler) compileCopysign(o *wazeroir.UnionOperation) error {
  2019  	is32Bit := wazeroir.Float(o.B1) == wazeroir.Float32
  2020  
  2021  	x2 := c.locationStack.pop()
  2022  	if err := c.compileEnsureOnRegister(x2); err != nil {
  2023  		return err
  2024  	}
  2025  	x1 := c.locationStack.pop()
  2026  	if err := c.compileEnsureOnRegister(x1); err != nil {
  2027  		return err
  2028  	}
  2029  	tmpReg, err := c.allocateRegister(registerTypeVector)
  2030  	if err != nil {
  2031  		return err
  2032  	}
  2033  
  2034  	// Move the rest bit mask to the temp register.
  2035  	if is32Bit {
  2036  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.float32RestBitMask, tmpReg)
  2037  	} else {
  2038  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.float64RestBitMask, tmpReg)
  2039  	}
  2040  	if err != nil {
  2041  		return err
  2042  	}
  2043  
  2044  	// Clear the sign bit of x1 via AND with the mask.
  2045  	if is32Bit {
  2046  		c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmpReg, x1.register)
  2047  	} else {
  2048  		c.assembler.CompileRegisterToRegister(amd64.ANDPD, tmpReg, x1.register)
  2049  	}
  2050  
  2051  	// Move the sign bit mask to the temp register.
  2052  	if is32Bit {
  2053  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.float32SignBitMask, tmpReg)
  2054  	} else {
  2055  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.float64SignBitMask, tmpReg)
  2056  	}
  2057  	if err != nil {
  2058  		return err
  2059  	}
  2060  
  2061  	// Clear the non-sign bits of x2 via AND with the mask.
  2062  	if is32Bit {
  2063  		c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmpReg, x2.register)
  2064  	} else {
  2065  		c.assembler.CompileRegisterToRegister(amd64.ANDPD, tmpReg, x2.register)
  2066  	}
  2067  
  2068  	// Finally, copy the sign bit of x2 to x1.
  2069  	if is32Bit {
  2070  		c.assembler.CompileRegisterToRegister(amd64.ORPS, x2.register, x1.register)
  2071  	} else {
  2072  		c.assembler.CompileRegisterToRegister(amd64.ORPD, x2.register, x1.register)
  2073  	}
  2074  
  2075  	// Record that we consumed the x2 and placed the copysign result in the x1's register.
  2076  	c.locationStack.markRegisterUnused(x2.register)
  2077  	c.locationStack.markRegisterUnused(x1.register)
  2078  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  2079  	return nil
  2080  }
  2081  
  2082  // compileSqrt implements compiler.compileSqrt for the amd64 architecture.
  2083  func (c *amd64Compiler) compileSqrt(o *wazeroir.UnionOperation) error {
  2084  	target := c.locationStack.peek() // Note this is peek!
  2085  	if err := c.compileEnsureOnRegister(target); err != nil {
  2086  		return err
  2087  	}
  2088  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2089  		c.assembler.CompileRegisterToRegister(amd64.SQRTSS, target.register, target.register)
  2090  	} else {
  2091  		c.assembler.CompileRegisterToRegister(amd64.SQRTSD, target.register, target.register)
  2092  	}
  2093  	return nil
  2094  }
  2095  
  2096  // compileI32WrapFromI64 implements compiler.compileI32WrapFromI64 for the amd64 architecture.
  2097  func (c *amd64Compiler) compileI32WrapFromI64() error {
  2098  	target := c.locationStack.peek() // Note this is peek!
  2099  	if err := c.compileEnsureOnRegister(target); err != nil {
  2100  		return err
  2101  	}
  2102  	c.assembler.CompileRegisterToRegister(amd64.MOVL, target.register, target.register)
  2103  	target.valueType = runtimeValueTypeI32
  2104  	return nil
  2105  }
  2106  
  2107  // compileITruncFromF implements compiler.compileITruncFromF for the amd64 architecture.
  2108  //
  2109  // Note: in the following implementation, we use CVTSS2SI and CVTSD2SI to convert floats to signed integers.
  2110  // According to the Intel manual ([1],[2]), if the source float value is either +-Inf or NaN, or it exceeds representative ranges
  2111  // of target signed integer, then the instruction returns "masked" response float32SignBitMask (or float64SignBitMask for 64 bit case).
  2112  // [1] Chapter 11.5.2, SIMD Floating-Point Exception Conditions in "Vol 1, Intel® 64 and IA-32 Architectures Manual"
  2113  //
  2114  //	https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-1-manual.html
  2115  //
  2116  // [2] https://xem.github.io/minix86/manual/intel-x86-and-64-manual-vol1/o_7281d5ea06a5b67a-268.html
  2117  func (c *amd64Compiler) compileITruncFromF(o *wazeroir.UnionOperation) (err error) {
  2118  	inputType := wazeroir.Float(o.B1)
  2119  	outputType := wazeroir.SignedInt(o.B2)
  2120  	nonTrapping := o.B3
  2121  	if inputType == wazeroir.Float32 && outputType == wazeroir.SignedInt32 {
  2122  		err = c.emitSignedI32TruncFromFloat(true, nonTrapping)
  2123  	} else if inputType == wazeroir.Float32 && outputType == wazeroir.SignedInt64 {
  2124  		err = c.emitSignedI64TruncFromFloat(true, nonTrapping)
  2125  	} else if inputType == wazeroir.Float64 && outputType == wazeroir.SignedInt32 {
  2126  		err = c.emitSignedI32TruncFromFloat(false, nonTrapping)
  2127  	} else if inputType == wazeroir.Float64 && outputType == wazeroir.SignedInt64 {
  2128  		err = c.emitSignedI64TruncFromFloat(false, nonTrapping)
  2129  	} else if inputType == wazeroir.Float32 && outputType == wazeroir.SignedUint32 {
  2130  		err = c.emitUnsignedI32TruncFromFloat(true, nonTrapping)
  2131  	} else if inputType == wazeroir.Float32 && outputType == wazeroir.SignedUint64 {
  2132  		err = c.emitUnsignedI64TruncFromFloat(true, nonTrapping)
  2133  	} else if inputType == wazeroir.Float64 && outputType == wazeroir.SignedUint32 {
  2134  		err = c.emitUnsignedI32TruncFromFloat(false, nonTrapping)
  2135  	} else if inputType == wazeroir.Float64 && outputType == wazeroir.SignedUint64 {
  2136  		err = c.emitUnsignedI64TruncFromFloat(false, nonTrapping)
  2137  	}
  2138  	return
  2139  }
  2140  
  2141  // emitUnsignedI32TruncFromFloat implements compileITruncFromF when the destination type is a 32-bit unsigned integer.
  2142  func (c *amd64Compiler) emitUnsignedI32TruncFromFloat(isFloat32Bit, nonTrapping bool) error {
  2143  	source := c.locationStack.pop()
  2144  	if err := c.compileEnsureOnRegister(source); err != nil {
  2145  		return err
  2146  	}
  2147  
  2148  	result, err := c.allocateRegister(registerTypeGeneralPurpose)
  2149  	if err != nil {
  2150  		return err
  2151  	}
  2152  
  2153  	// First, we check the source float value is above or equal math.MaxInt32+1.
  2154  	if isFloat32Bit {
  2155  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.float32ForMaximumSigned32bitIntPlusOne, source.register)
  2156  	} else {
  2157  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.float64ForMaximumSigned32bitIntPlusOne, source.register)
  2158  	}
  2159  	if err != nil {
  2160  		return err
  2161  	}
  2162  
  2163  	// Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception.
  2164  	var nonTrappingNaNJump asm.Node
  2165  	if nonTrapping {
  2166  		jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set.
  2167  		// In non trapping case, NaN is casted as zero.
  2168  		// Zero out the result register by XOR itsself.
  2169  		c.assembler.CompileRegisterToRegister(amd64.XORL, result, result)
  2170  		nonTrappingNaNJump = c.assembler.CompileJump(amd64.JMP)
  2171  		c.assembler.SetJumpTargetOnNext(jmpIfNotNaN)
  2172  	} else {
  2173  		c.compileMaybeExitFromNativeCode(amd64.JPC, nativeCallStatusCodeInvalidFloatToIntConversion)
  2174  	}
  2175  
  2176  	// Jump if the source float value is above or equal math.MaxInt32+1.
  2177  	jmpAboveOrEqualMaxIn32PlusOne := c.assembler.CompileJump(amd64.JCC)
  2178  
  2179  	// next we convert the value as a signed integer.
  2180  	if isFloat32Bit {
  2181  		c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SL, source.register, result)
  2182  	} else {
  2183  		c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SL, source.register, result)
  2184  	}
  2185  
  2186  	// Then if the result is minus, it is invalid conversion from minus float (incl. -Inf).
  2187  	c.assembler.CompileRegisterToRegister(amd64.TESTL, result, result)
  2188  
  2189  	var nonTrappingMinusJump asm.Node
  2190  	if nonTrapping {
  2191  		jmpIfNotMinusOrMinusInf := c.assembler.CompileJump(amd64.JPL)
  2192  		// In non trapping case, the minus value is casted as zero.
  2193  		// Zero out the result register by XOR itsself.
  2194  		c.assembler.CompileRegisterToRegister(amd64.XORL, result, result)
  2195  		nonTrappingMinusJump = c.assembler.CompileJump(amd64.JMP)
  2196  		c.assembler.SetJumpTargetOnNext(jmpIfNotMinusOrMinusInf)
  2197  	} else {
  2198  		c.compileMaybeExitFromNativeCode(amd64.JPL, nativeCallStatusIntegerOverflow)
  2199  	}
  2200  
  2201  	// Otherwise, the values is valid.
  2202  	okJmpForLessThanMaxInt32PlusOne := c.assembler.CompileJump(amd64.JMP)
  2203  
  2204  	// Now, start handling the case where the original float value is above or equal math.MaxInt32+1.
  2205  	//
  2206  	// First, we subtract the math.MaxInt32+1 from the original value so it can fit in signed 32-bit integer.
  2207  	c.assembler.SetJumpTargetOnNext(jmpAboveOrEqualMaxIn32PlusOne)
  2208  	if isFloat32Bit {
  2209  		err = c.assembler.CompileStaticConstToRegister(amd64.SUBSS, c.float32ForMaximumSigned32bitIntPlusOne, source.register)
  2210  	} else {
  2211  		err = c.assembler.CompileStaticConstToRegister(amd64.SUBSD, c.float64ForMaximumSigned32bitIntPlusOne, source.register)
  2212  	}
  2213  	if err != nil {
  2214  		return err
  2215  	}
  2216  
  2217  	// Then, convert the subtracted value as a signed 32-bit integer.
  2218  	if isFloat32Bit {
  2219  		c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SL, source.register, result)
  2220  	} else {
  2221  		c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SL, source.register, result)
  2222  	}
  2223  
  2224  	// next, we have to check if the value is from NaN, +Inf.
  2225  	// NaN or +Inf cases result in 0x8000_0000 according to the semantics of conversion,
  2226  	// This means we check if the result int value is minus or not.
  2227  	c.assembler.CompileRegisterToRegister(amd64.TESTL, result, result)
  2228  
  2229  	// If the result is minus, the conversion is invalid (from NaN or +Inf)
  2230  	var nonTrappingAboveOrEqualMaxInt32PlusOne asm.Node
  2231  	if nonTrapping {
  2232  		jmpIfNotPlusInf := c.assembler.CompileJump(amd64.JPL)
  2233  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.maximum32BitUnsignedInt, result)
  2234  		if err != nil {
  2235  			return err
  2236  		}
  2237  		nonTrappingAboveOrEqualMaxInt32PlusOne = c.assembler.CompileJump(amd64.JMP)
  2238  		c.assembler.SetJumpTargetOnNext(jmpIfNotPlusInf)
  2239  	} else {
  2240  		c.compileMaybeExitFromNativeCode(amd64.JPL, nativeCallStatusIntegerOverflow)
  2241  	}
  2242  
  2243  	// Otherwise, we successfully converted the source float minus (math.MaxInt32+1) to int.
  2244  	// So, we retrieve the original source float value by adding the sign mask.
  2245  	if err = c.assembler.CompileStaticConstToRegister(amd64.ADDL, c.float32SignBitMask, result); err != nil {
  2246  		return err
  2247  	}
  2248  
  2249  	// We jump to the next instructions for valid cases.
  2250  	c.assembler.SetJumpTargetOnNext(okJmpForLessThanMaxInt32PlusOne)
  2251  	if nonTrapping {
  2252  		c.assembler.SetJumpTargetOnNext(nonTrappingAboveOrEqualMaxInt32PlusOne)
  2253  		c.assembler.SetJumpTargetOnNext(nonTrappingMinusJump)
  2254  		c.assembler.SetJumpTargetOnNext(nonTrappingNaNJump)
  2255  	}
  2256  
  2257  	// We consumed the source's register and placed the conversion result
  2258  	// in the result register.
  2259  	c.locationStack.markRegisterUnused(source.register)
  2260  	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
  2261  	return nil
  2262  }
  2263  
  2264  // emitUnsignedI32TruncFromFloat implements compileITruncFromF when the destination type is a 64-bit unsigned integer.
  2265  func (c *amd64Compiler) emitUnsignedI64TruncFromFloat(isFloat32Bit, nonTrapping bool) error {
  2266  	source := c.locationStack.pop()
  2267  	if err := c.compileEnsureOnRegister(source); err != nil {
  2268  		return err
  2269  	}
  2270  
  2271  	result, err := c.allocateRegister(registerTypeGeneralPurpose)
  2272  	if err != nil {
  2273  		return err
  2274  	}
  2275  
  2276  	// First, we check the source float value is above or equal math.MaxInt64+1.
  2277  	if isFloat32Bit {
  2278  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.float32ForMaximumSigned64bitIntPlusOne, source.register)
  2279  	} else {
  2280  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.float64ForMaximumSigned64bitIntPlusOne, source.register)
  2281  	}
  2282  	if err != nil {
  2283  		return err
  2284  	}
  2285  
  2286  	// Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception.
  2287  	var nonTrappingNaNJump asm.Node
  2288  	if nonTrapping {
  2289  		jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is c.not set.
  2290  		// In non trapping case, NaN is casted as zero.
  2291  		// Zero out the result register by XOR itsself.
  2292  		c.assembler.CompileRegisterToRegister(amd64.XORQ, result, result)
  2293  		nonTrappingNaNJump = c.assembler.CompileJump(amd64.JMP)
  2294  		c.assembler.SetJumpTargetOnNext(jmpIfNotNaN)
  2295  	} else {
  2296  		c.compileMaybeExitFromNativeCode(amd64.JPC, nativeCallStatusCodeInvalidFloatToIntConversion)
  2297  	}
  2298  
  2299  	// Jump if the source float values is above or equal math.MaxInt64+1.
  2300  	jmpAboveOrEqualMaxIn32PlusOne := c.assembler.CompileJump(amd64.JCC)
  2301  
  2302  	// next we convert the value as a signed integer.
  2303  	if isFloat32Bit {
  2304  		c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SQ, source.register, result)
  2305  	} else {
  2306  		c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SQ, source.register, result)
  2307  	}
  2308  
  2309  	// Then if the result is minus, it is invalid conversion from minus float (incl. -Inf).
  2310  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, result, result)
  2311  
  2312  	var nonTrappingMinusJump asm.Node
  2313  	if nonTrapping {
  2314  		jmpIfNotMinusOrMinusInf := c.assembler.CompileJump(amd64.JPL)
  2315  		// In non trapping case, the minus value is casted as zero.
  2316  		// Zero out the result register by XOR itsself.
  2317  		c.assembler.CompileRegisterToRegister(amd64.XORQ, result, result)
  2318  		nonTrappingMinusJump = c.assembler.CompileJump(amd64.JMP)
  2319  		c.assembler.SetJumpTargetOnNext(jmpIfNotMinusOrMinusInf)
  2320  	} else {
  2321  		c.compileMaybeExitFromNativeCode(amd64.JPL, nativeCallStatusIntegerOverflow)
  2322  	}
  2323  
  2324  	// Otherwise, the values is valid.
  2325  	okJmpForLessThanMaxInt64PlusOne := c.assembler.CompileJump(amd64.JMP)
  2326  
  2327  	// Now, start handling the case where the original float value is above or equal math.MaxInt64+1.
  2328  	//
  2329  	// First, we subtract the math.MaxInt64+1 from the original value so it can fit in signed 64-bit integer.
  2330  	c.assembler.SetJumpTargetOnNext(jmpAboveOrEqualMaxIn32PlusOne)
  2331  	if isFloat32Bit {
  2332  		err = c.assembler.CompileStaticConstToRegister(amd64.SUBSS, c.float32ForMaximumSigned64bitIntPlusOne, source.register)
  2333  	} else {
  2334  		err = c.assembler.CompileStaticConstToRegister(amd64.SUBSD, c.float64ForMaximumSigned64bitIntPlusOne, source.register)
  2335  	}
  2336  	if err != nil {
  2337  		return err
  2338  	}
  2339  
  2340  	// Then, convert the subtracted value as a signed 64-bit integer.
  2341  	if isFloat32Bit {
  2342  		c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SQ, source.register, result)
  2343  	} else {
  2344  		c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SQ, source.register, result)
  2345  	}
  2346  
  2347  	// next, we have to check if the value is from NaN, +Inf.
  2348  	// NaN or +Inf cases result in 0x8000_0000 according to the semantics of conversion,
  2349  	// This means we check if the result int value is minus or not.
  2350  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, result, result)
  2351  
  2352  	// If the result is minus, the conversion is invalid (from NaN or +Inf)
  2353  	var nonTrappingAboveOrEqualMaxInt64PlusOne asm.Node
  2354  	if nonTrapping {
  2355  		jmpIfNotPlusInf := c.assembler.CompileJump(amd64.JPL)
  2356  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.maximum64BitUnsignedInt, result)
  2357  		if err != nil {
  2358  			return err
  2359  		}
  2360  		nonTrappingAboveOrEqualMaxInt64PlusOne = c.assembler.CompileJump(amd64.JMP)
  2361  		c.assembler.SetJumpTargetOnNext(jmpIfNotPlusInf)
  2362  	} else {
  2363  		c.compileMaybeExitFromNativeCode(amd64.JPL, nativeCallStatusIntegerOverflow)
  2364  	}
  2365  
  2366  	// Otherwise, we successfully converted the the source float minus (math.MaxInt64+1) to int.
  2367  	// So, we retrieve the original source float value by adding the sign mask.
  2368  	if err = c.assembler.CompileStaticConstToRegister(amd64.ADDQ, c.float64SignBitMask, result); err != nil {
  2369  		return err
  2370  	}
  2371  
  2372  	// We jump to the next instructions for valid cases.
  2373  	c.assembler.SetJumpTargetOnNext(okJmpForLessThanMaxInt64PlusOne)
  2374  	if nonTrapping {
  2375  		c.assembler.SetJumpTargetOnNext(nonTrappingAboveOrEqualMaxInt64PlusOne)
  2376  		c.assembler.SetJumpTargetOnNext(nonTrappingMinusJump)
  2377  		c.assembler.SetJumpTargetOnNext(nonTrappingNaNJump)
  2378  	}
  2379  
  2380  	// We consumed the source's register and placed the conversion result
  2381  	// in the result register.
  2382  	c.locationStack.markRegisterUnused(source.register)
  2383  	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI64)
  2384  	return nil
  2385  }
  2386  
  2387  // emitSignedI32TruncFromFloat implements compileITruncFromF when the destination type is a 32-bit signed integer.
  2388  func (c *amd64Compiler) emitSignedI32TruncFromFloat(isFloat32Bit, nonTrapping bool) error {
  2389  	source := c.locationStack.pop()
  2390  	if err := c.compileEnsureOnRegister(source); err != nil {
  2391  		return err
  2392  	}
  2393  
  2394  	result, err := c.allocateRegister(registerTypeGeneralPurpose)
  2395  	if err != nil {
  2396  		return err
  2397  	}
  2398  
  2399  	// First we unconditionally convert source to integer via CVTTSS2SI (CVTTSD2SI for 64bit float).
  2400  	if isFloat32Bit {
  2401  		c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SL, source.register, result)
  2402  	} else {
  2403  		c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SL, source.register, result)
  2404  	}
  2405  
  2406  	// We compare the conversion result with the sign bit mask to check if it is either
  2407  	// 1) the source float value is either +-Inf or NaN, or it exceeds representative ranges of 32bit signed integer, or
  2408  	// 2) the source equals the minimum signed 32-bit (=-2147483648.000000) whose bit pattern is float32ForMinimumSigned32bitIntegerAddress for 32 bit float
  2409  	// 	  or float64ForMinimumSigned32bitIntegerAddress for 64bit float.
  2410  	err = c.assembler.CompileStaticConstToRegister(amd64.CMPL, c.float32SignBitMask, result)
  2411  	if err != nil {
  2412  		return err
  2413  	}
  2414  
  2415  	// Otherwise, jump to exit as the result is valid.
  2416  	okJmp := c.assembler.CompileJump(amd64.JNE)
  2417  
  2418  	// Start handling the case of 1) and 2).
  2419  	// First, check if the value is NaN.
  2420  	if isFloat32Bit {
  2421  		c.assembler.CompileRegisterToRegister(amd64.UCOMISS, source.register, source.register)
  2422  	} else {
  2423  		c.assembler.CompileRegisterToRegister(amd64.UCOMISD, source.register, source.register)
  2424  	}
  2425  
  2426  	// Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception.
  2427  	var nontrappingNanJump asm.Node
  2428  	if nonTrapping {
  2429  		jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set.
  2430  		// In non trapping case, NaN is casted as zero.
  2431  		// Zero out the result register by XOR itsself.
  2432  		c.assembler.CompileRegisterToRegister(amd64.XORL, result, result)
  2433  		nontrappingNanJump = c.assembler.CompileJump(amd64.JMP)
  2434  		c.assembler.SetJumpTargetOnNext(jmpIfNotNaN)
  2435  	} else {
  2436  		// If the value is NaN, we return the function with nativeCallStatusCodeInvalidFloatToIntConversion.
  2437  		c.compileMaybeExitFromNativeCode(amd64.JPC, nativeCallStatusCodeInvalidFloatToIntConversion)
  2438  	}
  2439  
  2440  	// Check if the value is larger than or equal the minimum 32-bit integer value,
  2441  	// meaning that the value exceeds the lower bound of 32-bit signed integer range.
  2442  	if isFloat32Bit {
  2443  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.float32ForMinimumSigned32bitInteger, source.register)
  2444  	} else {
  2445  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.float64ForMinimumSigned32bitInteger, source.register)
  2446  	}
  2447  	if err != nil {
  2448  		return err
  2449  	}
  2450  
  2451  	if !nonTrapping {
  2452  		// Trap if the value does not exceed the lower bound.
  2453  		if isFloat32Bit {
  2454  			c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusIntegerOverflow)
  2455  		} else {
  2456  			c.compileMaybeExitFromNativeCode(amd64.JHI, nativeCallStatusIntegerOverflow)
  2457  		}
  2458  
  2459  		// At this point, the value is the minimum signed 32-bit int (=-2147483648.000000) or larger than 32-bit maximum.
  2460  		// So, check if the value equals the minimum signed 32-bit int.
  2461  		if isFloat32Bit {
  2462  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.fourZeros, source.register)
  2463  		} else {
  2464  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.eightZeros, source.register)
  2465  		}
  2466  		if err != nil {
  2467  			return err
  2468  		}
  2469  
  2470  		// Trap if the value is not minus (= the minimum signed 32-bit int).
  2471  		c.compileMaybeExitFromNativeCode(amd64.JCS, nativeCallStatusIntegerOverflow)
  2472  
  2473  		// We jump to the next instructions for valid cases.
  2474  		c.assembler.SetJumpTargetOnNext(okJmp)
  2475  	} else {
  2476  		// Jump if the value does not exceed the lower bound.
  2477  		var jmpIfNotExceedsLowerBound asm.Node
  2478  		if isFloat32Bit {
  2479  			jmpIfNotExceedsLowerBound = c.assembler.CompileJump(amd64.JCC)
  2480  		} else {
  2481  			jmpIfNotExceedsLowerBound = c.assembler.CompileJump(amd64.JHI)
  2482  		}
  2483  
  2484  		// If the value exceeds the lower bound, we "saturate" it to the minimum.
  2485  		if err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.minimum32BitSignedInt, result); err != nil {
  2486  			return err
  2487  		}
  2488  		nonTrappingSaturatedMinimumJump := c.assembler.CompileJump(amd64.JMP)
  2489  
  2490  		// Otherwise, the value is the minimum signed 32-bit int (=-2147483648.000000) or larger than 32-bit maximum.
  2491  		c.assembler.SetJumpTargetOnNext(jmpIfNotExceedsLowerBound)
  2492  		if isFloat32Bit {
  2493  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.fourZeros, source.register)
  2494  		} else {
  2495  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.eightZeros, source.register)
  2496  		}
  2497  		if err != nil {
  2498  			return err
  2499  		}
  2500  		jmpIfMinimumSignedInt := c.assembler.CompileJump(amd64.JCS) // jump if the value is minus (= the minimum signed 32-bit int).
  2501  
  2502  		// If the value exceeds signed 32-bit maximum, we saturate it to the maximum.
  2503  		if err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.maximum32BitSignedInt, result); err != nil {
  2504  			return err
  2505  		}
  2506  
  2507  		c.assembler.SetJumpTargetOnNext(okJmp)
  2508  		c.assembler.SetJumpTargetOnNext(nontrappingNanJump)
  2509  		c.assembler.SetJumpTargetOnNext(nonTrappingSaturatedMinimumJump)
  2510  		c.assembler.SetJumpTargetOnNext(jmpIfMinimumSignedInt)
  2511  	}
  2512  
  2513  	// We consumed the source's register and placed the conversion result
  2514  	// in the result register.
  2515  	c.locationStack.markRegisterUnused(source.register)
  2516  	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
  2517  	return nil
  2518  }
  2519  
  2520  // emitSignedI64TruncFromFloat implements compileITruncFromF when the destination type is a 64-bit signed integer.
  2521  func (c *amd64Compiler) emitSignedI64TruncFromFloat(isFloat32Bit, nonTrapping bool) error {
  2522  	source := c.locationStack.pop()
  2523  	if err := c.compileEnsureOnRegister(source); err != nil {
  2524  		return err
  2525  	}
  2526  
  2527  	result, err := c.allocateRegister(registerTypeGeneralPurpose)
  2528  	if err != nil {
  2529  		return err
  2530  	}
  2531  
  2532  	// First we unconditionally convert source to integer via CVTTSS2SI (CVTTSD2SI for 64bit float).
  2533  	if isFloat32Bit {
  2534  		c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SQ, source.register, result)
  2535  	} else {
  2536  		c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SQ, source.register, result)
  2537  	}
  2538  
  2539  	// We compare the conversion result with the sign bit mask to check if it is either
  2540  	// 1) the source float value is either +-Inf or NaN, or it exceeds representative ranges of 32bit signed integer, or
  2541  	// 2) the source equals the minimum signed 32-bit (=-9223372036854775808.0) whose bit pattern is float32ForMinimumSigned64bitIntegerAddress for 32 bit float
  2542  	// 	  or float64ForMinimumSigned64bitIntegerAddress for 64bit float.
  2543  	err = c.assembler.CompileStaticConstToRegister(amd64.CMPQ, c.float64SignBitMask, result)
  2544  	if err != nil {
  2545  		return err
  2546  	}
  2547  
  2548  	// Otherwise, we simply jump to exit as the result is valid.
  2549  	okJmp := c.assembler.CompileJump(amd64.JNE)
  2550  
  2551  	// Start handling the case of 1) and 2).
  2552  	// First, check if the value is NaN.
  2553  	if isFloat32Bit {
  2554  		c.assembler.CompileRegisterToRegister(amd64.UCOMISS, source.register, source.register)
  2555  	} else {
  2556  		c.assembler.CompileRegisterToRegister(amd64.UCOMISD, source.register, source.register)
  2557  	}
  2558  
  2559  	// Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception.
  2560  	var nontrappingNanJump asm.Node
  2561  	if nonTrapping {
  2562  		jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set.
  2563  		// In non trapping case, NaN is casted as zero.
  2564  		// Zero out the result register by XOR itsself.
  2565  		c.assembler.CompileRegisterToRegister(amd64.XORQ, result, result)
  2566  		nontrappingNanJump = c.assembler.CompileJump(amd64.JMP)
  2567  		c.assembler.SetJumpTargetOnNext(jmpIfNotNaN)
  2568  	} else {
  2569  		c.compileMaybeExitFromNativeCode(amd64.JPC, nativeCallStatusCodeInvalidFloatToIntConversion)
  2570  	}
  2571  
  2572  	// Check if the value is larger than or equal the minimum 64-bit integer value,
  2573  	// meaning that the value exceeds the lower bound of 64-bit signed integer range.
  2574  	if isFloat32Bit {
  2575  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.float32ForMinimumSigned64bitInteger, source.register)
  2576  	} else {
  2577  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.float64ForMinimumSigned64bitInteger, source.register)
  2578  	}
  2579  	if err != nil {
  2580  		return err
  2581  	}
  2582  
  2583  	if !nonTrapping {
  2584  		// Jump if the value is -Inf.
  2585  		c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusIntegerOverflow)
  2586  
  2587  		// At this point, the value is the minimum signed 64-bit int (=-9223372036854775808.0) or larger than 64-bit maximum.
  2588  		// So, check if the value equals the minimum signed 64-bit int.
  2589  		if isFloat32Bit {
  2590  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.fourZeros, source.register)
  2591  		} else {
  2592  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.eightZeros, source.register)
  2593  		}
  2594  		if err != nil {
  2595  			return err
  2596  		}
  2597  
  2598  		// Trap if the value is not minus (= the minimum signed 64-bit int).
  2599  		c.compileMaybeExitFromNativeCode(amd64.JCS, nativeCallStatusIntegerOverflow)
  2600  
  2601  		// We jump to the next instructions for valid cases.
  2602  		c.assembler.SetJumpTargetOnNext(okJmp)
  2603  	} else {
  2604  		// Jump if the value is not -Inf.
  2605  		jmpIfNotExceedsLowerBound := c.assembler.CompileJump(amd64.JCC)
  2606  
  2607  		// If the value exceeds the lower bound, we "saturate" it to the minimum.
  2608  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.minimum64BitSignedInt, result)
  2609  		if err != nil {
  2610  			return err
  2611  		}
  2612  
  2613  		nonTrappingSaturatedMinimumJump := c.assembler.CompileJump(amd64.JMP)
  2614  
  2615  		// Otherwise, the value is the minimum signed 64-bit int (=-9223372036854775808.0) or larger than 64-bit maximum.
  2616  		// So, check if the value equals the minimum signed 64-bit int.
  2617  		c.assembler.SetJumpTargetOnNext(jmpIfNotExceedsLowerBound)
  2618  		if isFloat32Bit {
  2619  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.fourZeros, source.register)
  2620  		} else {
  2621  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.eightZeros, source.register)
  2622  		}
  2623  		if err != nil {
  2624  			return err
  2625  		}
  2626  
  2627  		jmpIfMinimumSignedInt := c.assembler.CompileJump(amd64.JCS) // jump if the value is minus (= the minimum signed 64-bit int).
  2628  
  2629  		// If the value exceeds signed 64-bit maximum, we saturate it to the maximum.
  2630  		if err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.maximum64BitSignedInt, result); err != nil {
  2631  			return err
  2632  		}
  2633  
  2634  		c.assembler.SetJumpTargetOnNext(okJmp)
  2635  		c.assembler.SetJumpTargetOnNext(jmpIfMinimumSignedInt)
  2636  		c.assembler.SetJumpTargetOnNext(nonTrappingSaturatedMinimumJump)
  2637  		c.assembler.SetJumpTargetOnNext(nontrappingNanJump)
  2638  	}
  2639  
  2640  	// We consumed the source's register and placed the conversion result
  2641  	// in the result register.
  2642  	c.locationStack.markRegisterUnused(source.register)
  2643  	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI64)
  2644  	return nil
  2645  }
  2646  
  2647  // compileFConvertFromI implements compiler.compileFConvertFromI for the amd64 architecture.
  2648  func (c *amd64Compiler) compileFConvertFromI(o *wazeroir.UnionOperation) (err error) {
  2649  	inputType := wazeroir.SignedInt(o.B1)
  2650  	outputType := wazeroir.Float(o.B2)
  2651  	if outputType == wazeroir.Float32 && inputType == wazeroir.SignedInt32 {
  2652  		err = c.compileSimpleConversion(amd64.CVTSL2SS, registerTypeVector, runtimeValueTypeF32) // = CVTSI2SS for 32bit int
  2653  	} else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedInt64 {
  2654  		err = c.compileSimpleConversion(amd64.CVTSQ2SS, registerTypeVector, runtimeValueTypeF32) // = CVTSI2SS for 64bit int
  2655  	} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedInt32 {
  2656  		err = c.compileSimpleConversion(amd64.CVTSL2SD, registerTypeVector, runtimeValueTypeF64) // = CVTSI2SD for 32bit int
  2657  	} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedInt64 {
  2658  		err = c.compileSimpleConversion(amd64.CVTSQ2SD, registerTypeVector, runtimeValueTypeF64) // = CVTSI2SD for 64bit int
  2659  	} else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedUint32 {
  2660  		// See the following link for why we use 64bit conversion for unsigned 32bit integer sources:
  2661  		// https://stackoverflow.com/questions/41495498/fpu-operations-generated-by-gcc-during-casting-integer-to-float.
  2662  		//
  2663  		// Here's the summary:
  2664  		// >> CVTSI2SS is indeed designed for converting a signed integer to a scalar single-precision float,
  2665  		// >> not an unsigned integer like you have here. So what gives? Well, a 64-bit processor has 64-bit wide
  2666  		// >> registers available, so the unsigned 32-bit input values can be stored as signed 64-bit intermediate values,
  2667  		// >> which allows CVTSI2SS to be used after all.
  2668  		err = c.compileSimpleConversion(amd64.CVTSQ2SS, registerTypeVector, runtimeValueTypeF32) // = CVTSI2SS for 64bit int.
  2669  	} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedUint32 {
  2670  		// For the same reason above, we use 64bit conversion for unsigned 32bit.
  2671  		err = c.compileSimpleConversion(amd64.CVTSQ2SD, registerTypeVector, runtimeValueTypeF64) // = CVTSI2SD for 64bit int.
  2672  	} else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedUint64 {
  2673  		err = c.emitUnsignedInt64ToFloatConversion(true)
  2674  	} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedUint64 {
  2675  		err = c.emitUnsignedInt64ToFloatConversion(false)
  2676  	}
  2677  	return
  2678  }
  2679  
  2680  // emitUnsignedInt64ToFloatConversion is handling the case of unsigned 64-bit integer
  2681  // in compileFConvertFromI.
  2682  func (c *amd64Compiler) emitUnsignedInt64ToFloatConversion(isFloat32bit bool) error {
  2683  	// The logic here is exactly the same as GCC emits for the following code:
  2684  	//
  2685  	// float convert(int num) {
  2686  	//     float foo;
  2687  	//     uint64_t ptr1 = 100;
  2688  	//     foo = (float)(ptr1);
  2689  	//     return foo;
  2690  	// }
  2691  	//
  2692  	// which is compiled by GCC as
  2693  	//
  2694  	// convert:
  2695  	// 	   push    rbp
  2696  	// 	   mov     rbp, rsp
  2697  	// 	   mov     DWORD PTR [rbp-20], edi
  2698  	// 	   mov     DWORD PTR [rbp-4], 100
  2699  	// 	   mov     eax, DWORD PTR [rbp-4]
  2700  	// 	   test    rax, rax
  2701  	// 	   js      .handle_sign_bit_case
  2702  	// 	   cvtsi2ss        xmm0, rax
  2703  	// 	   jmp     .exit
  2704  	// .handle_sign_bit_case:
  2705  	// 	   mov     rdx, rax
  2706  	// 	   shr     rdx
  2707  	// 	   and     eax, 1
  2708  	// 	   or      rdx, rax
  2709  	// 	   cvtsi2ss        xmm0, rdx
  2710  	// 	   addsd   xmm0, xmm0
  2711  	// .exit: ...
  2712  	//
  2713  	// tl;dr is that we have a branch depending on whether or not sign bit is set.
  2714  
  2715  	origin := c.locationStack.pop()
  2716  	if err := c.compileEnsureOnRegister(origin); err != nil {
  2717  		return err
  2718  	}
  2719  
  2720  	dest, err := c.allocateRegister(registerTypeVector)
  2721  	if err != nil {
  2722  		return err
  2723  	}
  2724  
  2725  	c.locationStack.markRegisterUsed(dest)
  2726  
  2727  	tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  2728  	if err != nil {
  2729  		return err
  2730  	}
  2731  
  2732  	// Check if the most significant bit (sign bit) is set.
  2733  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, origin.register, origin.register)
  2734  
  2735  	// Jump if the sign bit is set.
  2736  	jmpIfSignbitSet := c.assembler.CompileJump(amd64.JMI)
  2737  
  2738  	// Otherwise, we could fit the unsigned int into float32.
  2739  	// So, we convert it to float32 and emit jump instruction to exit from this branch.
  2740  	if isFloat32bit {
  2741  		c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SS, origin.register, dest)
  2742  	} else {
  2743  		c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SD, origin.register, dest)
  2744  	}
  2745  	exitFromSignbitUnSet := c.assembler.CompileJump(amd64.JMP)
  2746  
  2747  	// Now handling the case where sign-bit is set.
  2748  	// We emit the following sequences:
  2749  	// 	   mov     tmpReg, origin
  2750  	// 	   shr     tmpReg, 1
  2751  	// 	   and     origin, 1
  2752  	// 	   or      tmpReg, origin
  2753  	// 	   cvtsi2ss        xmm0, tmpReg
  2754  	// 	   addsd   xmm0, xmm0
  2755  
  2756  	c.assembler.SetJumpTargetOnNext(jmpIfSignbitSet)
  2757  	c.assembler.CompileRegisterToRegister(amd64.MOVQ, origin.register, tmpReg)
  2758  	c.assembler.CompileConstToRegister(amd64.SHRQ, 1, tmpReg)
  2759  	c.assembler.CompileConstToRegister(amd64.ANDQ, 1, origin.register)
  2760  	c.assembler.CompileRegisterToRegister(amd64.ORQ, origin.register, tmpReg)
  2761  	if isFloat32bit {
  2762  		c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SS, tmpReg, dest)
  2763  	} else {
  2764  		c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SD, tmpReg, dest)
  2765  	}
  2766  	if isFloat32bit {
  2767  		c.assembler.CompileRegisterToRegister(amd64.ADDSS, dest, dest)
  2768  	} else {
  2769  		c.assembler.CompileRegisterToRegister(amd64.ADDSD, dest, dest)
  2770  	}
  2771  
  2772  	// Now, we finished the sign-bit set branch.
  2773  	// We have to make the exit jump target of sign-bit unset branch
  2774  	// towards the next instruction.
  2775  	c.assembler.SetJumpTargetOnNext(exitFromSignbitUnSet)
  2776  
  2777  	// We consumed the origin's register and placed the conversion result
  2778  	// in the dest register.
  2779  	c.locationStack.markRegisterUnused(origin.register)
  2780  	if isFloat32bit {
  2781  		c.pushRuntimeValueLocationOnRegister(dest, runtimeValueTypeF32)
  2782  	} else {
  2783  		c.pushRuntimeValueLocationOnRegister(dest, runtimeValueTypeF64)
  2784  	}
  2785  	return nil
  2786  }
  2787  
  2788  // compileSimpleConversion pops a value type from the stack, and applies the
  2789  // given instruction on it, and push the result onto a register of the given type.
  2790  func (c *amd64Compiler) compileSimpleConversion(convInstruction asm.Instruction,
  2791  	destinationRegisterType registerType, destinationValueType runtimeValueType,
  2792  ) error {
  2793  	origin := c.locationStack.pop()
  2794  	if err := c.compileEnsureOnRegister(origin); err != nil {
  2795  		return err
  2796  	}
  2797  
  2798  	dest, err := c.allocateRegister(destinationRegisterType)
  2799  	if err != nil {
  2800  		return err
  2801  	}
  2802  
  2803  	c.assembler.CompileRegisterToRegister(convInstruction, origin.register, dest)
  2804  
  2805  	c.locationStack.markRegisterUnused(origin.register)
  2806  	c.pushRuntimeValueLocationOnRegister(dest, destinationValueType)
  2807  	return nil
  2808  }
  2809  
  2810  // compileF32DemoteFromF64 implements compiler.compileF32DemoteFromF64 for the amd64 architecture.
  2811  func (c *amd64Compiler) compileF32DemoteFromF64() error {
  2812  	target := c.locationStack.peek() // Note this is peek!
  2813  	if err := c.compileEnsureOnRegister(target); err != nil {
  2814  		return err
  2815  	}
  2816  
  2817  	c.assembler.CompileRegisterToRegister(amd64.CVTSD2SS, target.register, target.register)
  2818  	target.valueType = runtimeValueTypeF32
  2819  	return nil
  2820  }
  2821  
  2822  // compileF64PromoteFromF32 implements compiler.compileF64PromoteFromF32 for the amd64 architecture.
  2823  func (c *amd64Compiler) compileF64PromoteFromF32() error {
  2824  	target := c.locationStack.peek() // Note this is peek!
  2825  	if err := c.compileEnsureOnRegister(target); err != nil {
  2826  		return err
  2827  	}
  2828  
  2829  	c.assembler.CompileRegisterToRegister(amd64.CVTSS2SD, target.register, target.register)
  2830  	target.valueType = runtimeValueTypeF64
  2831  	return nil
  2832  }
  2833  
  2834  // compileI32ReinterpretFromF32 implements compiler.compileI32ReinterpretFromF32 for the amd64 architecture.
  2835  func (c *amd64Compiler) compileI32ReinterpretFromF32() error {
  2836  	if peek := c.locationStack.peek(); peek.onStack() {
  2837  		// If the value is on the stack, this is no-op as there is nothing to do for converting type.
  2838  		peek.valueType = runtimeValueTypeI32
  2839  		return nil
  2840  	}
  2841  	return c.compileSimpleConversion(amd64.MOVL, registerTypeGeneralPurpose, runtimeValueTypeI32)
  2842  }
  2843  
  2844  // compileI64ReinterpretFromF64 implements compiler.compileI64ReinterpretFromF64 for the amd64 architecture.
  2845  func (c *amd64Compiler) compileI64ReinterpretFromF64() error {
  2846  	if peek := c.locationStack.peek(); peek.onStack() {
  2847  		// If the value is on the stack, this is no-op as there is nothing to do for converting type.
  2848  		peek.valueType = runtimeValueTypeI64
  2849  		return nil
  2850  	}
  2851  	return c.compileSimpleConversion(amd64.MOVQ, registerTypeGeneralPurpose, runtimeValueTypeI64)
  2852  }
  2853  
  2854  // compileF32ReinterpretFromI32 implements compiler.compileF32ReinterpretFromI32 for the amd64 architecture.
  2855  func (c *amd64Compiler) compileF32ReinterpretFromI32() error {
  2856  	if peek := c.locationStack.peek(); peek.onStack() {
  2857  		// If the value is on the stack, this is no-op as there is nothing to do for converting type.
  2858  		peek.valueType = runtimeValueTypeF32
  2859  		return nil
  2860  	}
  2861  	return c.compileSimpleConversion(amd64.MOVL, registerTypeVector, runtimeValueTypeF32)
  2862  }
  2863  
  2864  // compileF64ReinterpretFromI64 implements compiler.compileF64ReinterpretFromI64 for the amd64 architecture.
  2865  func (c *amd64Compiler) compileF64ReinterpretFromI64() error {
  2866  	if peek := c.locationStack.peek(); peek.onStack() {
  2867  		// If the value is on the stack, this is no-op as there is nothing to do for converting type.
  2868  		peek.valueType = runtimeValueTypeF64
  2869  		return nil
  2870  	}
  2871  	return c.compileSimpleConversion(amd64.MOVQ, registerTypeVector, runtimeValueTypeF64)
  2872  }
  2873  
  2874  // compileExtend implements compiler.compileExtend for the amd64 architecture.
  2875  func (c *amd64Compiler) compileExtend(o *wazeroir.UnionOperation) error {
  2876  	var inst asm.Instruction
  2877  	signed := o.B1 != 0
  2878  	if signed {
  2879  		inst = amd64.MOVLQSX // = MOVSXD https://www.felixcloutier.com/x86/movsx:movsxd
  2880  	} else {
  2881  		inst = amd64.MOVL
  2882  	}
  2883  	return c.compileExtendImpl(inst, runtimeValueTypeI64)
  2884  }
  2885  
  2886  // compileSignExtend32From8 implements compiler.compileSignExtend32From8 for the amd64 architecture.
  2887  func (c *amd64Compiler) compileSignExtend32From8() error {
  2888  	return c.compileExtendImpl(amd64.MOVBLSX, runtimeValueTypeI32)
  2889  }
  2890  
  2891  // compileSignExtend32From16 implements compiler.compileSignExtend32From16 for the amd64 architecture.
  2892  func (c *amd64Compiler) compileSignExtend32From16() error {
  2893  	return c.compileExtendImpl(amd64.MOVWLSX, runtimeValueTypeI32)
  2894  }
  2895  
  2896  // compileSignExtend64From8 implements compiler.compileSignExtend64From8 for the amd64 architecture.
  2897  func (c *amd64Compiler) compileSignExtend64From8() error {
  2898  	return c.compileExtendImpl(amd64.MOVBQSX, runtimeValueTypeI64)
  2899  }
  2900  
  2901  // compileSignExtend64From16 implements compiler.compileSignExtend64From16 for the amd64 architecture.
  2902  func (c *amd64Compiler) compileSignExtend64From16() error {
  2903  	return c.compileExtendImpl(amd64.MOVWQSX, runtimeValueTypeI64)
  2904  }
  2905  
  2906  // compileSignExtend64From32 implements compiler.compileSignExtend64From32 for the amd64 architecture.
  2907  func (c *amd64Compiler) compileSignExtend64From32() error {
  2908  	return c.compileExtendImpl(amd64.MOVLQSX, runtimeValueTypeI64)
  2909  }
  2910  
  2911  func (c *amd64Compiler) compileExtendImpl(inst asm.Instruction, destinationType runtimeValueType) error {
  2912  	target := c.locationStack.peek() // Note this is peek!
  2913  	if err := c.compileEnsureOnRegister(target); err != nil {
  2914  		return err
  2915  	}
  2916  
  2917  	c.assembler.CompileRegisterToRegister(inst, target.register, target.register)
  2918  	target.valueType = destinationType
  2919  	return nil
  2920  }
  2921  
  2922  // compileEq implements compiler.compileEq for the amd64 architecture.
  2923  func (c *amd64Compiler) compileEq(o *wazeroir.UnionOperation) error {
  2924  	return c.compileEqOrNe(wazeroir.UnsignedType(o.B1), true)
  2925  }
  2926  
  2927  // compileNe implements compiler.compileNe for the amd64 architecture.
  2928  func (c *amd64Compiler) compileNe(o *wazeroir.UnionOperation) error {
  2929  	return c.compileEqOrNe(wazeroir.UnsignedType(o.B1), false)
  2930  }
  2931  
  2932  func (c *amd64Compiler) compileEqOrNe(t wazeroir.UnsignedType, shouldEqual bool) (err error) {
  2933  	x2 := c.locationStack.pop()
  2934  	if err := c.compileEnsureOnRegister(x2); err != nil {
  2935  		return err
  2936  	}
  2937  
  2938  	x1 := c.locationStack.pop()
  2939  	if err := c.compileEnsureOnRegister(x1); err != nil {
  2940  		return err
  2941  	}
  2942  
  2943  	x1r, x2r := x1.register, x2.register
  2944  
  2945  	// x1 and x2 are temporary registers only used for the cmp operation. Release them.
  2946  	c.locationStack.releaseRegister(x1)
  2947  	c.locationStack.releaseRegister(x2)
  2948  
  2949  	switch t {
  2950  	case wazeroir.UnsignedTypeI32:
  2951  		err = c.compileEqOrNeForInts(x1r, x2r, amd64.CMPL, shouldEqual)
  2952  	case wazeroir.UnsignedTypeI64:
  2953  		err = c.compileEqOrNeForInts(x1r, x2r, amd64.CMPQ, shouldEqual)
  2954  	case wazeroir.UnsignedTypeF32:
  2955  		err = c.compileEqOrNeForFloats(x1r, x2r, amd64.UCOMISS, shouldEqual)
  2956  	case wazeroir.UnsignedTypeF64:
  2957  		err = c.compileEqOrNeForFloats(x1r, x2r, amd64.UCOMISD, shouldEqual)
  2958  	}
  2959  	if err != nil {
  2960  		return
  2961  	}
  2962  	return
  2963  }
  2964  
  2965  func (c *amd64Compiler) compileEqOrNeForInts(x1Reg, x2Reg asm.Register, cmpInstruction asm.Instruction,
  2966  	shouldEqual bool,
  2967  ) error {
  2968  	c.assembler.CompileRegisterToRegister(cmpInstruction, x2Reg, x1Reg)
  2969  
  2970  	// Record that the result is on the conditional register.
  2971  	var condReg asm.ConditionalRegisterState
  2972  	if shouldEqual {
  2973  		condReg = amd64.ConditionalRegisterStateE
  2974  	} else {
  2975  		condReg = amd64.ConditionalRegisterStateNE
  2976  	}
  2977  	loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(condReg)
  2978  	loc.valueType = runtimeValueTypeI32
  2979  	return nil
  2980  }
  2981  
  2982  // For float EQ and NE, we have to take NaN values into account.
  2983  // Notably, Wasm specification states that if one of targets is NaN,
  2984  // the result must be zero for EQ or one for NE.
  2985  func (c *amd64Compiler) compileEqOrNeForFloats(x1Reg, x2Reg asm.Register, cmpInstruction asm.Instruction, shouldEqual bool) error {
  2986  	// Before we allocate the result, we have to reserve two int registers.
  2987  	nanFragReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  2988  	if err != nil {
  2989  		return err
  2990  	}
  2991  	c.locationStack.markRegisterUsed(nanFragReg)
  2992  	cmpResultReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  2993  	if err != nil {
  2994  		return err
  2995  	}
  2996  
  2997  	// Then, execute the comparison.
  2998  	c.assembler.CompileRegisterToRegister(cmpInstruction, x2Reg, x1Reg)
  2999  
  3000  	// First, we get the parity flag which indicates whether one of values was NaN.
  3001  	if shouldEqual {
  3002  		// Set 1 if two values are NOT NaN.
  3003  		c.assembler.CompileNoneToRegister(amd64.SETPC, nanFragReg)
  3004  	} else {
  3005  		// Set 1 if one of values is NaN.
  3006  		c.assembler.CompileNoneToRegister(amd64.SETPS, nanFragReg)
  3007  	}
  3008  
  3009  	// next, we get the usual comparison flag.
  3010  	if shouldEqual {
  3011  		// Set 1 if equal.
  3012  		c.assembler.CompileNoneToRegister(amd64.SETEQ, cmpResultReg)
  3013  	} else {
  3014  		// Set 1 if not equal.
  3015  		c.assembler.CompileNoneToRegister(amd64.SETNE, cmpResultReg)
  3016  	}
  3017  
  3018  	// Do "and" or "or" operations on these two flags to get the actual result.
  3019  	if shouldEqual {
  3020  		c.assembler.CompileRegisterToRegister(amd64.ANDL, nanFragReg, cmpResultReg)
  3021  	} else {
  3022  		c.assembler.CompileRegisterToRegister(amd64.ORL, nanFragReg, cmpResultReg)
  3023  	}
  3024  
  3025  	// Clear the unnecessary bits by zero extending the first byte.
  3026  	// This is necessary the upper bits (5 to 32 bits) of SET* instruction result is undefined.
  3027  	c.assembler.CompileRegisterToRegister(amd64.MOVBLZX, cmpResultReg, cmpResultReg)
  3028  
  3029  	// Now we have the result in cmpResultReg register, so we record it.
  3030  	c.pushRuntimeValueLocationOnRegister(cmpResultReg, runtimeValueTypeI32)
  3031  	// Also, we no longer need nanFragRegister.
  3032  	c.locationStack.markRegisterUnused(nanFragReg)
  3033  	return nil
  3034  }
  3035  
  3036  // compileEqz implements compiler.compileEqz for the amd64 architecture.
  3037  func (c *amd64Compiler) compileEqz(o *wazeroir.UnionOperation) (err error) {
  3038  	v := c.locationStack.pop()
  3039  	if err = c.compileEnsureOnRegister(v); err != nil {
  3040  		return err
  3041  	}
  3042  
  3043  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  3044  	switch unsignedInt {
  3045  	case wazeroir.UnsignedInt32:
  3046  		err = c.assembler.CompileStaticConstToRegister(amd64.CMPL, c.fourZeros, v.register)
  3047  	case wazeroir.UnsignedInt64:
  3048  		err = c.assembler.CompileStaticConstToRegister(amd64.CMPQ, c.eightZeros, v.register)
  3049  	}
  3050  	if err != nil {
  3051  		return err
  3052  	}
  3053  
  3054  	// v is consumed by the cmp operation so release it.
  3055  	c.locationStack.releaseRegister(v)
  3056  
  3057  	// Finally, record that the result is on the conditional register.
  3058  	loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(amd64.ConditionalRegisterStateE)
  3059  	loc.valueType = runtimeValueTypeI32
  3060  	return nil
  3061  }
  3062  
  3063  // compileLt implements compiler.compileLt for the amd64 architecture.
  3064  func (c *amd64Compiler) compileLt(o *wazeroir.UnionOperation) error {
  3065  	x2 := c.locationStack.pop()
  3066  	if err := c.compileEnsureOnRegister(x2); err != nil {
  3067  		return err
  3068  	}
  3069  
  3070  	x1 := c.locationStack.pop()
  3071  	if err := c.compileEnsureOnRegister(x1); err != nil {
  3072  		return err
  3073  	}
  3074  
  3075  	// Emit the compare instruction.
  3076  	var resultConditionState asm.ConditionalRegisterState
  3077  	var inst asm.Instruction
  3078  	signedType := wazeroir.SignedType(o.B1)
  3079  	switch signedType {
  3080  	case wazeroir.SignedTypeInt32:
  3081  		resultConditionState = amd64.ConditionalRegisterStateL
  3082  		inst = amd64.CMPL
  3083  	case wazeroir.SignedTypeUint32:
  3084  		resultConditionState = amd64.ConditionalRegisterStateB
  3085  		inst = amd64.CMPL
  3086  	case wazeroir.SignedTypeInt64:
  3087  		inst = amd64.CMPQ
  3088  		resultConditionState = amd64.ConditionalRegisterStateL
  3089  	case wazeroir.SignedTypeUint64:
  3090  		resultConditionState = amd64.ConditionalRegisterStateB
  3091  		inst = amd64.CMPQ
  3092  	case wazeroir.SignedTypeFloat32:
  3093  		resultConditionState = amd64.ConditionalRegisterStateA
  3094  		inst = amd64.COMISS
  3095  	case wazeroir.SignedTypeFloat64:
  3096  		resultConditionState = amd64.ConditionalRegisterStateA
  3097  		inst = amd64.COMISD
  3098  	}
  3099  	c.assembler.CompileRegisterToRegister(inst, x1.register, x2.register)
  3100  
  3101  	// x1 and x2 are temporary registers only used for the cmp operation. Release them.
  3102  	c.locationStack.releaseRegister(x1)
  3103  	c.locationStack.releaseRegister(x2)
  3104  
  3105  	// Finally, record that the result is on the conditional register.
  3106  	loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState)
  3107  	loc.valueType = runtimeValueTypeI32
  3108  	return nil
  3109  }
  3110  
  3111  // compileGt implements compiler.compileGt for the amd64 architecture.
  3112  func (c *amd64Compiler) compileGt(o *wazeroir.UnionOperation) error {
  3113  	x2 := c.locationStack.pop()
  3114  	if err := c.compileEnsureOnRegister(x2); err != nil {
  3115  		return err
  3116  	}
  3117  
  3118  	x1 := c.locationStack.pop()
  3119  	if err := c.compileEnsureOnRegister(x1); err != nil {
  3120  		return err
  3121  	}
  3122  
  3123  	// Emit the compare instruction.
  3124  	var resultConditionState asm.ConditionalRegisterState
  3125  	signedType := wazeroir.SignedType(o.B1)
  3126  	switch signedType {
  3127  	case wazeroir.SignedTypeInt32:
  3128  		resultConditionState = amd64.ConditionalRegisterStateG
  3129  		c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register)
  3130  	case wazeroir.SignedTypeUint32:
  3131  		c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register)
  3132  		resultConditionState = amd64.ConditionalRegisterStateA
  3133  	case wazeroir.SignedTypeInt64:
  3134  		c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register)
  3135  		resultConditionState = amd64.ConditionalRegisterStateG
  3136  	case wazeroir.SignedTypeUint64:
  3137  		c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register)
  3138  		resultConditionState = amd64.ConditionalRegisterStateA
  3139  	case wazeroir.SignedTypeFloat32:
  3140  		c.assembler.CompileRegisterToRegister(amd64.UCOMISS, x2.register, x1.register)
  3141  		resultConditionState = amd64.ConditionalRegisterStateA
  3142  	case wazeroir.SignedTypeFloat64:
  3143  		c.assembler.CompileRegisterToRegister(amd64.UCOMISD, x2.register, x1.register)
  3144  		resultConditionState = amd64.ConditionalRegisterStateA
  3145  	}
  3146  
  3147  	// x1 and x2 are temporary registers only used for the cmp operation. Release them.
  3148  	c.locationStack.releaseRegister(x1)
  3149  	c.locationStack.releaseRegister(x2)
  3150  
  3151  	// Finally, record that the result is on the conditional register.
  3152  	loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState)
  3153  	loc.valueType = runtimeValueTypeI32
  3154  	return nil
  3155  }
  3156  
  3157  // compileLe implements compiler.compileLe for the amd64 architecture.
  3158  func (c *amd64Compiler) compileLe(o *wazeroir.UnionOperation) error {
  3159  	x2 := c.locationStack.pop()
  3160  	if err := c.compileEnsureOnRegister(x2); err != nil {
  3161  		return err
  3162  	}
  3163  
  3164  	x1 := c.locationStack.pop()
  3165  	if err := c.compileEnsureOnRegister(x1); err != nil {
  3166  		return err
  3167  	}
  3168  
  3169  	// Emit the compare instruction.
  3170  	var inst asm.Instruction
  3171  	var resultConditionState asm.ConditionalRegisterState
  3172  	signedType := wazeroir.SignedType(o.B1)
  3173  	switch signedType {
  3174  	case wazeroir.SignedTypeInt32:
  3175  		resultConditionState = amd64.ConditionalRegisterStateLE
  3176  		inst = amd64.CMPL
  3177  	case wazeroir.SignedTypeUint32:
  3178  		resultConditionState = amd64.ConditionalRegisterStateBE
  3179  		inst = amd64.CMPL
  3180  	case wazeroir.SignedTypeInt64:
  3181  		resultConditionState = amd64.ConditionalRegisterStateLE
  3182  		inst = amd64.CMPQ
  3183  	case wazeroir.SignedTypeUint64:
  3184  		resultConditionState = amd64.ConditionalRegisterStateBE
  3185  		inst = amd64.CMPQ
  3186  	case wazeroir.SignedTypeFloat32:
  3187  		resultConditionState = amd64.ConditionalRegisterStateAE
  3188  		inst = amd64.UCOMISS
  3189  	case wazeroir.SignedTypeFloat64:
  3190  		resultConditionState = amd64.ConditionalRegisterStateAE
  3191  		inst = amd64.UCOMISD
  3192  	}
  3193  	c.assembler.CompileRegisterToRegister(inst, x1.register, x2.register)
  3194  
  3195  	// x1 and x2 are temporary registers only used for the cmp operation. Release them.
  3196  	c.locationStack.releaseRegister(x1)
  3197  	c.locationStack.releaseRegister(x2)
  3198  
  3199  	// Finally, record that the result is on the conditional register.
  3200  	loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState)
  3201  	loc.valueType = runtimeValueTypeI32
  3202  	return nil
  3203  }
  3204  
  3205  // compileGe implements compiler.compileGe for the amd64 architecture.
  3206  func (c *amd64Compiler) compileGe(o *wazeroir.UnionOperation) error {
  3207  	x2 := c.locationStack.pop()
  3208  	if err := c.compileEnsureOnRegister(x2); err != nil {
  3209  		return err
  3210  	}
  3211  
  3212  	x1 := c.locationStack.pop()
  3213  	if err := c.compileEnsureOnRegister(x1); err != nil {
  3214  		return err
  3215  	}
  3216  
  3217  	// Emit the compare instruction.
  3218  	var resultConditionState asm.ConditionalRegisterState
  3219  	signedType := wazeroir.SignedType(o.B1)
  3220  	switch signedType {
  3221  	case wazeroir.SignedTypeInt32:
  3222  		c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register)
  3223  		resultConditionState = amd64.ConditionalRegisterStateGE
  3224  	case wazeroir.SignedTypeUint32:
  3225  		c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register)
  3226  		resultConditionState = amd64.ConditionalRegisterStateAE
  3227  	case wazeroir.SignedTypeInt64:
  3228  		c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register)
  3229  		resultConditionState = amd64.ConditionalRegisterStateGE
  3230  	case wazeroir.SignedTypeUint64:
  3231  		c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register)
  3232  		resultConditionState = amd64.ConditionalRegisterStateAE
  3233  	case wazeroir.SignedTypeFloat32:
  3234  		c.assembler.CompileRegisterToRegister(amd64.COMISS, x2.register, x1.register)
  3235  		resultConditionState = amd64.ConditionalRegisterStateAE
  3236  	case wazeroir.SignedTypeFloat64:
  3237  		c.assembler.CompileRegisterToRegister(amd64.COMISD, x2.register, x1.register)
  3238  		resultConditionState = amd64.ConditionalRegisterStateAE
  3239  	}
  3240  
  3241  	// x1 and x2 are temporary registers only used for the cmp operation. Release them.
  3242  	c.locationStack.releaseRegister(x1)
  3243  	c.locationStack.releaseRegister(x2)
  3244  
  3245  	// Finally, record that the result is on the conditional register.
  3246  	loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState)
  3247  	loc.valueType = runtimeValueTypeI32
  3248  	return nil
  3249  }
  3250  
  3251  // compileLoad implements compiler.compileLoad for the amd64 architecture.
  3252  func (c *amd64Compiler) compileLoad(o *wazeroir.UnionOperation) error {
  3253  	var (
  3254  		isIntType         bool
  3255  		movInst           asm.Instruction
  3256  		targetSizeInBytes int64
  3257  		vt                runtimeValueType
  3258  	)
  3259  
  3260  	unsignedType := wazeroir.UnsignedType(o.B1)
  3261  	offset := uint32(o.U2)
  3262  
  3263  	switch unsignedType {
  3264  	case wazeroir.UnsignedTypeI32:
  3265  		isIntType = true
  3266  		movInst = amd64.MOVL
  3267  		targetSizeInBytes = 32 / 8
  3268  		vt = runtimeValueTypeI32
  3269  	case wazeroir.UnsignedTypeI64:
  3270  		isIntType = true
  3271  		movInst = amd64.MOVQ
  3272  		targetSizeInBytes = 64 / 8
  3273  		vt = runtimeValueTypeI64
  3274  	case wazeroir.UnsignedTypeF32:
  3275  		isIntType = false
  3276  		movInst = amd64.MOVL
  3277  		targetSizeInBytes = 32 / 8
  3278  		vt = runtimeValueTypeF32
  3279  	case wazeroir.UnsignedTypeF64:
  3280  		isIntType = false
  3281  		movInst = amd64.MOVQ
  3282  		targetSizeInBytes = 64 / 8
  3283  		vt = runtimeValueTypeF64
  3284  	}
  3285  
  3286  	reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
  3287  	if err != nil {
  3288  		return err
  3289  	}
  3290  
  3291  	if isIntType {
  3292  		// For integer types, read the corresponding bytes from the offset to the memory
  3293  		// and store the value to the int register.
  3294  		c.assembler.CompileMemoryWithIndexToRegister(movInst,
  3295  			// we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
  3296  			amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  3297  			reg)
  3298  		c.pushRuntimeValueLocationOnRegister(reg, vt)
  3299  	} else {
  3300  		// For float types, we read the value to the float register.
  3301  		floatReg, err := c.allocateRegister(registerTypeVector)
  3302  		if err != nil {
  3303  			return err
  3304  		}
  3305  		c.assembler.CompileMemoryWithIndexToRegister(movInst,
  3306  			// we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
  3307  			amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  3308  			floatReg)
  3309  		c.pushRuntimeValueLocationOnRegister(floatReg, vt)
  3310  		// We no longer need the int register so mark it unused.
  3311  		c.locationStack.markRegisterUnused(reg)
  3312  	}
  3313  	return nil
  3314  }
  3315  
  3316  // compileLoad8 implements compiler.compileLoad8 for the amd64 architecture.
  3317  func (c *amd64Compiler) compileLoad8(o *wazeroir.UnionOperation) error {
  3318  	const targetSizeInBytes = 1
  3319  	offset := uint32(o.U2)
  3320  	reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
  3321  	if err != nil {
  3322  		return err
  3323  	}
  3324  
  3325  	// Then move a byte at the offset to the register.
  3326  	// Note that Load8 is only for integer types.
  3327  	var inst asm.Instruction
  3328  	var vt runtimeValueType
  3329  	signedInt := wazeroir.SignedInt(o.B1)
  3330  	switch signedInt {
  3331  	case wazeroir.SignedInt32:
  3332  		inst = amd64.MOVBLSX
  3333  		vt = runtimeValueTypeI32
  3334  	case wazeroir.SignedUint32:
  3335  		inst = amd64.MOVBLZX
  3336  		vt = runtimeValueTypeI32
  3337  	case wazeroir.SignedInt64:
  3338  		inst = amd64.MOVBQSX
  3339  		vt = runtimeValueTypeI64
  3340  	case wazeroir.SignedUint64:
  3341  		inst = amd64.MOVBQZX
  3342  		vt = runtimeValueTypeI64
  3343  	}
  3344  
  3345  	c.assembler.CompileMemoryWithIndexToRegister(inst,
  3346  		// we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
  3347  		amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  3348  		reg)
  3349  
  3350  	c.pushRuntimeValueLocationOnRegister(reg, vt)
  3351  	return nil
  3352  }
  3353  
  3354  // compileLoad16 implements compiler.compileLoad16 for the amd64 architecture.
  3355  func (c *amd64Compiler) compileLoad16(o *wazeroir.UnionOperation) error {
  3356  	const targetSizeInBytes = 16 / 8
  3357  	offset := uint32(o.U2)
  3358  	reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
  3359  	if err != nil {
  3360  		return err
  3361  	}
  3362  
  3363  	// Then move 2 bytes at the offset to the register.
  3364  	// Note that Load16 is only for integer types.
  3365  	var inst asm.Instruction
  3366  	var vt runtimeValueType
  3367  	signedInt := wazeroir.SignedInt(o.B1)
  3368  	switch signedInt {
  3369  	case wazeroir.SignedInt32:
  3370  		inst = amd64.MOVWLSX
  3371  		vt = runtimeValueTypeI32
  3372  	case wazeroir.SignedInt64:
  3373  		inst = amd64.MOVWQSX
  3374  		vt = runtimeValueTypeI64
  3375  	case wazeroir.SignedUint32:
  3376  		inst = amd64.MOVWLZX
  3377  		vt = runtimeValueTypeI32
  3378  	case wazeroir.SignedUint64:
  3379  		inst = amd64.MOVWQZX
  3380  		vt = runtimeValueTypeI64
  3381  	}
  3382  
  3383  	c.assembler.CompileMemoryWithIndexToRegister(inst,
  3384  		// we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
  3385  		amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  3386  		reg)
  3387  
  3388  	c.pushRuntimeValueLocationOnRegister(reg, vt)
  3389  	return nil
  3390  }
  3391  
  3392  // compileLoad32 implements compiler.compileLoad32 for the amd64 architecture.
  3393  func (c *amd64Compiler) compileLoad32(o *wazeroir.UnionOperation) error {
  3394  	const targetSizeInBytes = 32 / 8
  3395  	offset := uint32(o.U2)
  3396  	reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
  3397  	if err != nil {
  3398  		return err
  3399  	}
  3400  
  3401  	// Then move 4 bytes at the offset to the register.
  3402  	var inst asm.Instruction
  3403  	signed := o.B1 == 1
  3404  	if signed {
  3405  		inst = amd64.MOVLQSX
  3406  	} else {
  3407  		inst = amd64.MOVLQZX
  3408  	}
  3409  	c.assembler.CompileMemoryWithIndexToRegister(inst,
  3410  		// We access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
  3411  		amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  3412  		reg)
  3413  	c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI64)
  3414  	return nil
  3415  }
  3416  
  3417  // compileMemoryAccessCeilSetup pops the top value from the stack (called "base"), stores "base + offsetArg + targetSizeInBytes"
  3418  // into a register, and returns the stored register. We call the result "ceil" because we access the memory
  3419  // as memory.Buffer[ceil-targetSizeInBytes: ceil].
  3420  //
  3421  // Note: this also emits the instructions to check the out-of-bounds memory access.
  3422  // In other words, if the ceil exceeds the memory size, the code exits with nativeCallStatusCodeMemoryOutOfBounds status.
  3423  func (c *amd64Compiler) compileMemoryAccessCeilSetup(offsetArg uint32, targetSizeInBytes int64) (asm.Register, error) {
  3424  	base := c.locationStack.pop()
  3425  	if err := c.compileEnsureOnRegister(base); err != nil {
  3426  		return asm.NilRegister, err
  3427  	}
  3428  
  3429  	result := base.register
  3430  	if offsetConst := int64(offsetArg) + targetSizeInBytes; offsetConst <= math.MaxInt32 {
  3431  		c.assembler.CompileConstToRegister(amd64.ADDQ, offsetConst, result)
  3432  	} else if offsetConst <= math.MaxUint32 {
  3433  		// Note: in practice, this branch rarely happens as in this case, the wasm binary know that
  3434  		// memory has more than 1 GBi or at least tries to access above 1 GBi memory region.
  3435  		//
  3436  		// This case, we cannot directly add the offset to a register by ADDQ(const) instruction.
  3437  		// That is because the imm32 const is sign-extended to 64-bit in ADDQ(const), and we end up
  3438  		// making offsetConst as the negative number, which is wrong.
  3439  		tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3440  		if err != nil {
  3441  			return asm.NilRegister, err
  3442  		}
  3443  		c.assembler.CompileConstToRegister(amd64.MOVL, int64(uint32(offsetConst)), tmp)
  3444  		c.assembler.CompileRegisterToRegister(amd64.ADDQ, tmp, result)
  3445  	} else {
  3446  		// If the offset const is too large, we exit with nativeCallStatusCodeMemoryOutOfBounds.
  3447  		c.compileExitFromNativeCode(nativeCallStatusCodeMemoryOutOfBounds)
  3448  		return result, nil
  3449  	}
  3450  
  3451  	// Now we compare the value with the memory length which is held by callEngine.
  3452  	if err := c.compileCompareWithMemorySliceLen(result); err != nil {
  3453  		return asm.NilRegister, err
  3454  	}
  3455  
  3456  	// Trap if the value is out-of-bounds of memory length.
  3457  	c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusCodeMemoryOutOfBounds)
  3458  
  3459  	c.locationStack.markRegisterUnused(result)
  3460  	return result, nil
  3461  }
  3462  
  3463  // compileStore implements compiler.compileStore for the amd64 architecture.
  3464  func (c *amd64Compiler) compileStore(o *wazeroir.UnionOperation) error {
  3465  	var movInst asm.Instruction
  3466  	var targetSizeInByte int64
  3467  	unsignedType := wazeroir.UnsignedType(o.B1)
  3468  	offset := uint32(o.U2)
  3469  	switch unsignedType {
  3470  	case wazeroir.UnsignedTypeI32, wazeroir.UnsignedTypeF32:
  3471  		movInst = amd64.MOVL
  3472  		targetSizeInByte = 32 / 8
  3473  	case wazeroir.UnsignedTypeI64, wazeroir.UnsignedTypeF64:
  3474  		movInst = amd64.MOVQ
  3475  		targetSizeInByte = 64 / 8
  3476  	}
  3477  	return c.compileStoreImpl(offset, movInst, targetSizeInByte)
  3478  }
  3479  
  3480  // compileStore8 implements compiler.compileStore8 for the amd64 architecture.
  3481  func (c *amd64Compiler) compileStore8(o *wazeroir.UnionOperation) error {
  3482  	return c.compileStoreImpl(uint32(o.U2), amd64.MOVB, 1)
  3483  }
  3484  
  3485  // compileStore32 implements compiler.compileStore32 for the amd64 architecture.
  3486  func (c *amd64Compiler) compileStore16(o *wazeroir.UnionOperation) error {
  3487  	return c.compileStoreImpl(uint32(o.U2), amd64.MOVW, 16/8)
  3488  }
  3489  
  3490  // compileStore32 implements compiler.compileStore32 for the amd64 architecture.
  3491  func (c *amd64Compiler) compileStore32(o *wazeroir.UnionOperation) error {
  3492  	return c.compileStoreImpl(uint32(o.U2), amd64.MOVL, 32/8)
  3493  }
  3494  
  3495  func (c *amd64Compiler) compileStoreImpl(offsetConst uint32, inst asm.Instruction, targetSizeInBytes int64) error {
  3496  	val := c.locationStack.pop()
  3497  	if err := c.compileEnsureOnRegister(val); err != nil {
  3498  		return err
  3499  	}
  3500  
  3501  	reg, err := c.compileMemoryAccessCeilSetup(offsetConst, targetSizeInBytes)
  3502  	if err != nil {
  3503  		return err
  3504  	}
  3505  
  3506  	c.assembler.CompileRegisterToMemoryWithIndex(
  3507  		inst, val.register,
  3508  		amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  3509  	)
  3510  
  3511  	// We no longer need both the value and base registers.
  3512  	c.locationStack.releaseRegister(val)
  3513  	c.locationStack.markRegisterUnused(reg)
  3514  	return nil
  3515  }
  3516  
  3517  // compileMemoryGrow implements compiler.compileMemoryGrow for the amd64 architecture.
  3518  func (c *amd64Compiler) compileMemoryGrow() error {
  3519  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3520  		return err
  3521  	}
  3522  
  3523  	if err := c.compileCallBuiltinFunction(builtinFunctionIndexMemoryGrow); err != nil {
  3524  		return err
  3525  	}
  3526  
  3527  	// After the function call, we have to initialize the stack base pointer and memory reserved registers.
  3528  	c.compileReservedStackBasePointerInitialization()
  3529  	c.compileReservedMemoryPointerInitialization()
  3530  	return nil
  3531  }
  3532  
  3533  // compileMemorySize implements compiler.compileMemorySize for the amd64 architecture.
  3534  func (c *amd64Compiler) compileMemorySize() error {
  3535  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3536  		return err
  3537  	}
  3538  
  3539  	reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  3540  	if err != nil {
  3541  		return err
  3542  	}
  3543  	loc := c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI32)
  3544  
  3545  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  3546  		amd64ReservedRegisterForCallEngine, callEngineModuleContextMemoryInstanceOffset,
  3547  		loc.register)
  3548  
  3549  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, loc.register, memoryInstanceBufferLenOffset, loc.register)
  3550  
  3551  	// WebAssembly's memory.size returns the page size (65536) of memory region.
  3552  	// That is equivalent to divide the len of memory slice by 65536 and
  3553  	// that can be calculated as SHR by 16 bits as 65536 = 2^16.
  3554  	c.assembler.CompileConstToRegister(amd64.SHRQ, wasm.MemoryPageSizeInBits, loc.register)
  3555  	return nil
  3556  }
  3557  
  3558  // compileMemoryInit implements compiler.compileMemoryInit for the amd64 architecture.
  3559  func (c *amd64Compiler) compileMemoryInit(o *wazeroir.UnionOperation) error {
  3560  	dataIndex := uint32(o.U1)
  3561  	return c.compileInitImpl(false, dataIndex, 0)
  3562  }
  3563  
  3564  // compileInitImpl implements compileTableInit and compileMemoryInit.
  3565  //
  3566  // TODO: the compiled code in this function should be reused and compile at once as
  3567  // the code is independent of any module.
  3568  func (c *amd64Compiler) compileInitImpl(isTable bool, index, tableIndex uint32) error {
  3569  	outOfBoundsErrorStatus := nativeCallStatusCodeMemoryOutOfBounds
  3570  	if isTable {
  3571  		outOfBoundsErrorStatus = nativeCallStatusCodeInvalidTableAccess
  3572  	}
  3573  
  3574  	copySize := c.locationStack.pop()
  3575  	if err := c.compileEnsureOnRegister(copySize); err != nil {
  3576  		return err
  3577  	}
  3578  
  3579  	sourceOffset := c.locationStack.pop()
  3580  	if err := c.compileEnsureOnRegister(sourceOffset); err != nil {
  3581  		return err
  3582  	}
  3583  
  3584  	destinationOffset := c.locationStack.pop()
  3585  	if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
  3586  		return err
  3587  	}
  3588  
  3589  	instanceAddr, err := c.allocateRegister(registerTypeGeneralPurpose)
  3590  	if err != nil {
  3591  		return err
  3592  	}
  3593  	c.locationStack.markRegisterUsed(instanceAddr)
  3594  	if isTable {
  3595  		c.compileLoadElemInstanceAddress(index, instanceAddr)
  3596  	} else {
  3597  		c.compileLoadDataInstanceAddress(index, instanceAddr)
  3598  	}
  3599  
  3600  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3601  	if err != nil {
  3602  		return err
  3603  	}
  3604  	c.locationStack.markRegisterUsed(tmp)
  3605  
  3606  	// sourceOffset += size.
  3607  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, sourceOffset.register)
  3608  	// destinationOffset += size.
  3609  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)
  3610  
  3611  	// Check instance bounds and if exceeds the length, exit with out of bounds error.
  3612  	c.assembler.CompileMemoryToRegister(amd64.CMPQ,
  3613  		instanceAddr, 8, // DataInstance and Element instance holds the length is stored at offset 8.
  3614  		sourceOffset.register)
  3615  	c.compileMaybeExitFromNativeCode(amd64.JCC, outOfBoundsErrorStatus)
  3616  
  3617  	// Check destination bounds and if exceeds the length, exit with out of bounds error.
  3618  	if isTable {
  3619  		// Load the target table's address.
  3620  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
  3621  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(tableIndex*8), tmp)
  3622  		// Compare length.
  3623  		c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, destinationOffset.register)
  3624  	} else {
  3625  		if err := c.compileCompareWithMemorySliceLen(destinationOffset.register); err != nil {
  3626  			return err
  3627  		}
  3628  	}
  3629  
  3630  	c.compileMaybeExitFromNativeCode(amd64.JCC, outOfBoundsErrorStatus)
  3631  
  3632  	// Otherwise, ready to copy the value from source to destination.
  3633  	//
  3634  	// If the copy size equal zero, we skip the entire instructions below.
  3635  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, copySize.register, copySize.register)
  3636  	skipJump := c.assembler.CompileJump(amd64.JEQ)
  3637  
  3638  	var scale int16
  3639  	var memToReg, regToMem asm.Instruction
  3640  	if isTable {
  3641  		// Each element is of type uintptr; 2^3 = 1 << pointerSizeLog2.
  3642  		c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, sourceOffset.register)
  3643  		c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, destinationOffset.register)
  3644  		// destinationOffset += table buffer's absolute address.
  3645  		c.assembler.CompileMemoryToRegister(amd64.ADDQ,
  3646  			tmp, tableInstanceTableOffset, destinationOffset.register)
  3647  		// sourceOffset += data buffer's absolute address.
  3648  		c.assembler.CompileMemoryToRegister(amd64.ADDQ,
  3649  			instanceAddr, 0, sourceOffset.register)
  3650  
  3651  		// For tables, we move 8 bytes at once.
  3652  		memToReg = amd64.MOVQ
  3653  		regToMem = memToReg
  3654  		scale = 8
  3655  	} else {
  3656  		// destinationOffset += memory buffer's absolute address.
  3657  		c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, destinationOffset.register)
  3658  
  3659  		// sourceOffset += data buffer's absolute address.
  3660  		c.assembler.CompileMemoryToRegister(amd64.ADDQ, instanceAddr, 0, sourceOffset.register)
  3661  
  3662  		// Move one byte at once.
  3663  		memToReg = amd64.MOVBQZX
  3664  		regToMem = amd64.MOVB
  3665  		scale = 1
  3666  	}
  3667  
  3668  	// Negate the counter.
  3669  	c.assembler.CompileNoneToRegister(amd64.NEGQ, copySize.register)
  3670  
  3671  	beginCopyLoop := c.assembler.CompileStandAlone(amd64.NOP)
  3672  
  3673  	c.assembler.CompileMemoryWithIndexToRegister(memToReg,
  3674  		sourceOffset.register, 0, copySize.register, scale,
  3675  		tmp)
  3676  	// [destinationOffset + (size.register)] = tmp.
  3677  	c.assembler.CompileRegisterToMemoryWithIndex(regToMem,
  3678  		tmp,
  3679  		destinationOffset.register, 0, copySize.register, scale,
  3680  	)
  3681  
  3682  	// size += 1
  3683  	c.assembler.CompileNoneToRegister(amd64.INCQ, copySize.register)
  3684  	c.assembler.CompileJump(amd64.JMI).AssignJumpTarget(beginCopyLoop)
  3685  
  3686  	c.locationStack.markRegisterUnused(copySize.register, sourceOffset.register,
  3687  		destinationOffset.register, instanceAddr, tmp)
  3688  	c.assembler.SetJumpTargetOnNext(skipJump)
  3689  	return nil
  3690  }
  3691  
  3692  // compileDataDrop implements compiler.compileDataDrop for the amd64 architecture.
  3693  func (c *amd64Compiler) compileDataDrop(o *wazeroir.UnionOperation) error {
  3694  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3695  		return err
  3696  	}
  3697  
  3698  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3699  	if err != nil {
  3700  		return err
  3701  	}
  3702  
  3703  	dataIndex := uint32(o.U1)
  3704  	c.compileLoadDataInstanceAddress(dataIndex, tmp)
  3705  
  3706  	// Clears the content of DataInstance[o.DataIndex] (== []byte type).
  3707  	c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 0)
  3708  	c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 8)
  3709  	c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 16)
  3710  	return nil
  3711  }
  3712  
  3713  func (c *amd64Compiler) compileLoadDataInstanceAddress(dataIndex uint32, dst asm.Register) {
  3714  	// dst = dataIndex * dataInstanceStructSize.
  3715  	c.assembler.CompileConstToRegister(amd64.MOVQ, int64(dataIndex)*dataInstanceStructSize, dst)
  3716  
  3717  	// dst = &moduleInstance.DataInstances[0] + dst
  3718  	//     = &moduleInstance.DataInstances[0] + dataIndex*dataInstanceStructSize
  3719  	//     = &moduleInstance.DataInstances[dataIndex]
  3720  	c.assembler.CompileMemoryToRegister(amd64.ADDQ,
  3721  		amd64ReservedRegisterForCallEngine, callEngineModuleContextDataInstancesElement0AddressOffset,
  3722  		dst,
  3723  	)
  3724  }
  3725  
  3726  // compileCopyLoopImpl implements a REP MOVSQ memory copy for the given range with support for both directions.
  3727  func (c *amd64Compiler) compileCopyLoopImpl(destinationOffset, sourceOffset, copySize *runtimeValueLocation, backwards bool, bwOffset uint8) {
  3728  	// skip if nothing to copy
  3729  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, copySize.register, copySize.register)
  3730  	emptyEightGroupsJump := c.assembler.CompileJump(amd64.JEQ)
  3731  
  3732  	// Prepare registers for swaps. There will never be more than 3 XCHGs in total.
  3733  	restoreCrossing := c.compilePreventCrossedTargetRegisters(
  3734  		[]*runtimeValueLocation{destinationOffset, sourceOffset, copySize},
  3735  		[]asm.Register{amd64.RegDI, amd64.RegSI, amd64.RegCX})
  3736  
  3737  	// Prepare registers for REP MOVSQ: copy from rsi to rdi, rcx times.
  3738  	c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
  3739  	c.compileMaybeSwapRegisters(sourceOffset.register, amd64.RegSI)
  3740  	c.compileMaybeSwapRegisters(copySize.register, amd64.RegCX)
  3741  
  3742  	// Point on first byte of first quadword to copy.
  3743  	if backwards {
  3744  		c.assembler.CompileConstToRegister(amd64.ADDQ, -int64(bwOffset), amd64.RegDI)
  3745  		c.assembler.CompileConstToRegister(amd64.ADDQ, -int64(bwOffset), amd64.RegSI)
  3746  		// Set REP prefix direction backwards.
  3747  		c.assembler.CompileStandAlone(amd64.STD)
  3748  	}
  3749  
  3750  	c.assembler.CompileStandAlone(amd64.REPMOVSQ)
  3751  
  3752  	if backwards {
  3753  		// Reset direction.
  3754  		c.assembler.CompileStandAlone(amd64.CLD)
  3755  	}
  3756  
  3757  	// Restore registers.
  3758  	c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
  3759  	c.compileMaybeSwapRegisters(sourceOffset.register, amd64.RegSI)
  3760  	c.compileMaybeSwapRegisters(copySize.register, amd64.RegCX)
  3761  	restoreCrossing()
  3762  
  3763  	c.assembler.SetJumpTargetOnNext(emptyEightGroupsJump)
  3764  	c.assembler.CompileStandAlone(amd64.NOP)
  3765  }
  3766  
  3767  // compileMemoryCopyLoopImpl is used for directly copying after bounds/direction check.
  3768  func (c *amd64Compiler) compileMemoryCopyLoopImpl(destinationOffset, sourceOffset, copySize *runtimeValueLocation, tmp asm.Register, backwards bool) {
  3769  	// Point on first byte to be copied depending on direction.
  3770  	if backwards {
  3771  		c.assembler.CompileNoneToRegister(amd64.DECQ, sourceOffset.register)
  3772  		c.assembler.CompileNoneToRegister(amd64.DECQ, destinationOffset.register)
  3773  	} else {
  3774  		c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, sourceOffset.register)
  3775  		c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, destinationOffset.register)
  3776  	}
  3777  
  3778  	// destinationOffset += memory buffer's absolute address.
  3779  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, destinationOffset.register)
  3780  	// sourceOffset += memory buffer's absolute address.
  3781  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, sourceOffset.register)
  3782  
  3783  	// Copy copySize % 8 bytes in loop to allow copying in 8 byte groups afterward.
  3784  	beginLoop := c.assembler.CompileStandAlone(amd64.NOP)
  3785  
  3786  	// Check copySize % 8 == 0.
  3787  	c.assembler.CompileConstToRegister(amd64.TESTQ, 7, copySize.register)
  3788  	breakLoop := c.assembler.CompileJump(amd64.JEQ)
  3789  
  3790  	c.assembler.CompileMemoryToRegister(amd64.MOVBQZX, sourceOffset.register, 0, tmp)
  3791  	c.assembler.CompileRegisterToMemory(amd64.MOVB, tmp, destinationOffset.register, 0)
  3792  
  3793  	if backwards {
  3794  		c.assembler.CompileNoneToRegister(amd64.DECQ, sourceOffset.register)
  3795  		c.assembler.CompileNoneToRegister(amd64.DECQ, destinationOffset.register)
  3796  	} else {
  3797  		c.assembler.CompileNoneToRegister(amd64.INCQ, sourceOffset.register)
  3798  		c.assembler.CompileNoneToRegister(amd64.INCQ, destinationOffset.register)
  3799  	}
  3800  
  3801  	c.assembler.CompileNoneToRegister(amd64.DECQ, copySize.register)
  3802  	c.assembler.CompileJump(amd64.JMP).AssignJumpTarget(beginLoop)
  3803  	c.assembler.SetJumpTargetOnNext(breakLoop)
  3804  
  3805  	// compileCopyLoopImpl counts in groups of 8 bytes, so we have to divide the copySize by 8.
  3806  	c.assembler.CompileConstToRegister(amd64.SHRQ, 3, copySize.register)
  3807  
  3808  	c.compileCopyLoopImpl(destinationOffset, sourceOffset, copySize, backwards, 7)
  3809  }
  3810  
  3811  // compileMemoryCopy implements compiler.compileMemoryCopy for the amd64 architecture.
  3812  //
  3813  // This uses efficient `REP MOVSQ` instructions to copy in quadword (8 bytes) batches. The remaining bytes
  3814  // are copied with a simple `MOV` loop. It uses backward copying for overlapped segments.
  3815  func (c *amd64Compiler) compileMemoryCopy() error {
  3816  	copySize := c.locationStack.pop()
  3817  	if err := c.compileEnsureOnRegister(copySize); err != nil {
  3818  		return err
  3819  	}
  3820  
  3821  	sourceOffset := c.locationStack.pop()
  3822  	if err := c.compileEnsureOnRegister(sourceOffset); err != nil {
  3823  		return err
  3824  	}
  3825  
  3826  	destinationOffset := c.locationStack.pop()
  3827  	if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
  3828  		return err
  3829  	}
  3830  
  3831  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3832  	if err != nil {
  3833  		return err
  3834  	}
  3835  	c.locationStack.markRegisterUsed(tmp)
  3836  
  3837  	// sourceOffset += size.
  3838  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, sourceOffset.register)
  3839  	// destinationOffset += size.
  3840  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)
  3841  	// tmp = max(sourceOffset, destinationOffset).
  3842  	c.assembler.CompileRegisterToRegister(amd64.CMPQ, sourceOffset.register, destinationOffset.register)
  3843  	c.assembler.CompileRegisterToRegister(amd64.MOVQ, sourceOffset.register, tmp)
  3844  	c.assembler.CompileRegisterToRegister(amd64.CMOVQCS, destinationOffset.register, tmp)
  3845  
  3846  	// Check source bounds and if exceeds the length, exit with out of bounds error.
  3847  	if err := c.compileCompareWithMemorySliceLen(sourceOffset.register); err != nil {
  3848  		return err
  3849  	}
  3850  	c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusCodeMemoryOutOfBounds)
  3851  
  3852  	// Check destination bounds and if exceeds the length, exit with out of bounds error.
  3853  	if err := c.compileCompareWithMemorySliceLen(destinationOffset.register); err != nil {
  3854  		return err
  3855  	}
  3856  	c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusCodeMemoryOutOfBounds)
  3857  
  3858  	// Skip zero size.
  3859  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, copySize.register, copySize.register)
  3860  	skipJump := c.assembler.CompileJump(amd64.JEQ)
  3861  
  3862  	// If dest < source, we can copy forwards
  3863  	c.assembler.CompileRegisterToRegister(amd64.CMPQ, destinationOffset.register, sourceOffset.register)
  3864  	destLowerThanSourceJump := c.assembler.CompileJump(amd64.JLS)
  3865  
  3866  	// If source + size < dest, we can copy forwards
  3867  	c.assembler.CompileRegisterToRegister(amd64.MOVQ, destinationOffset.register, tmp)
  3868  	c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, tmp)
  3869  	c.assembler.CompileRegisterToRegister(amd64.CMPQ, sourceOffset.register, tmp)
  3870  	sourceBoundLowerThanDestJump := c.assembler.CompileJump(amd64.JLS)
  3871  
  3872  	// Copy backwards.
  3873  	c.compileMemoryCopyLoopImpl(destinationOffset, sourceOffset, copySize, tmp, true)
  3874  	endJump := c.assembler.CompileJump(amd64.JMP)
  3875  
  3876  	// Copy forwards.
  3877  	c.assembler.SetJumpTargetOnNext(destLowerThanSourceJump)
  3878  	c.assembler.SetJumpTargetOnNext(sourceBoundLowerThanDestJump)
  3879  	c.compileMemoryCopyLoopImpl(destinationOffset, sourceOffset, copySize, tmp, false)
  3880  
  3881  	c.locationStack.markRegisterUnused(copySize.register, sourceOffset.register,
  3882  		destinationOffset.register, tmp)
  3883  	c.assembler.SetJumpTargetOnNext(skipJump)
  3884  	c.assembler.SetJumpTargetOnNext(endJump)
  3885  
  3886  	return nil
  3887  }
  3888  
  3889  // compileFillLoopImpl implements a REP STOSQ fill loop.
  3890  func (c *amd64Compiler) compileFillLoopImpl(destinationOffset, value, fillSize *runtimeValueLocation, tmp asm.Register, replicateByte bool) {
  3891  	// Skip if nothing to fill.
  3892  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, fillSize.register, fillSize.register)
  3893  	emptyEightGroupsJump := c.assembler.CompileJump(amd64.JEQ)
  3894  
  3895  	if replicateByte {
  3896  		// Truncate value.register to a single byte
  3897  		c.assembler.CompileConstToRegister(amd64.ANDQ, 0xff, value.register)
  3898  		// Replicate single byte onto full 8-byte register.
  3899  		c.assembler.CompileConstToRegister(amd64.MOVQ, 0x0101010101010101, tmp)
  3900  		c.assembler.CompileRegisterToRegister(amd64.IMULQ, tmp, value.register)
  3901  	}
  3902  
  3903  	// Prepare registers for swaps. There will never be more than 3 XCHGs in total.
  3904  	restoreCrossing := c.compilePreventCrossedTargetRegisters(
  3905  		[]*runtimeValueLocation{destinationOffset, value, fillSize},
  3906  		[]asm.Register{amd64.RegDI, amd64.RegAX, amd64.RegCX})
  3907  
  3908  	// Prepare registers for REP STOSQ: fill at [rdi] with rax, rcx times.
  3909  	c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
  3910  	c.compileMaybeSwapRegisters(value.register, amd64.RegAX)
  3911  	c.compileMaybeSwapRegisters(fillSize.register, amd64.RegCX)
  3912  
  3913  	c.assembler.CompileStandAlone(amd64.REPSTOSQ)
  3914  
  3915  	// Restore registers.
  3916  	c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
  3917  	c.compileMaybeSwapRegisters(value.register, amd64.RegAX)
  3918  	c.compileMaybeSwapRegisters(fillSize.register, amd64.RegCX)
  3919  	restoreCrossing()
  3920  
  3921  	c.assembler.SetJumpTargetOnNext(emptyEightGroupsJump)
  3922  }
  3923  
  3924  // compileMemoryFill implements compiler.compileMemoryFill for the amd64 architecture.
  3925  //
  3926  // This function uses efficient `REP STOSQ` instructions to copy in quadword (8 bytes) batches
  3927  // if the size if above 15 bytes. For smaller sizes, a simple MOVB copy loop is the best
  3928  // option.
  3929  //
  3930  // TODO: the compiled code in this function should be reused and compile at once as
  3931  // the code is independent of any module.
  3932  func (c *amd64Compiler) compileFillImpl(isTable bool, tableIndex uint32) error {
  3933  	copySize := c.locationStack.pop()
  3934  	if err := c.compileEnsureOnRegister(copySize); err != nil {
  3935  		return err
  3936  	}
  3937  
  3938  	value := c.locationStack.pop()
  3939  	if err := c.compileEnsureOnRegister(value); err != nil {
  3940  		return err
  3941  	}
  3942  
  3943  	destinationOffset := c.locationStack.pop()
  3944  	if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
  3945  		return err
  3946  	}
  3947  
  3948  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3949  	if err != nil {
  3950  		return err
  3951  	}
  3952  	c.locationStack.markRegisterUsed(tmp)
  3953  
  3954  	// destinationOffset += size.
  3955  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)
  3956  
  3957  	// Check destination bounds and if exceeds the length, exit with out of bounds error.
  3958  	if isTable {
  3959  		// tmp = &tables[0]
  3960  		c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  3961  			amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3962  			tmp)
  3963  
  3964  		// tmp = [tmp + TableIndex*8]
  3965  		//     = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
  3966  		//     = [&tables[TableIndex]] = tables[TableIndex].
  3967  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(tableIndex)*8, tmp)
  3968  
  3969  		c.assembler.CompileMemoryToRegister(amd64.CMPQ,
  3970  			tmp, tableInstanceTableLenOffset,
  3971  			destinationOffset.register)
  3972  	} else {
  3973  		if err := c.compileCompareWithMemorySliceLen(destinationOffset.register); err != nil {
  3974  			return err
  3975  		}
  3976  	}
  3977  	if isTable {
  3978  		c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusCodeInvalidTableAccess)
  3979  	} else {
  3980  		c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusCodeMemoryOutOfBounds)
  3981  	}
  3982  
  3983  	// Otherwise, ready to copy the value from source to destination.
  3984  	//
  3985  	// If the copy size equal zero, we skip the entire instructions below.
  3986  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, copySize.register, copySize.register)
  3987  	skipJump := c.assembler.CompileJump(amd64.JEQ)
  3988  
  3989  	// destinationOffset -= size.
  3990  	c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, destinationOffset.register)
  3991  
  3992  	if isTable {
  3993  		// Each element is of type uintptr; 2^3 = 1 << pointerSizeLog2.
  3994  		c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, destinationOffset.register)
  3995  		// destinationOffset += table buffer's absolute address.
  3996  		c.assembler.CompileMemoryToRegister(amd64.ADDQ, tmp, tableInstanceTableOffset, destinationOffset.register)
  3997  
  3998  	} else {
  3999  		// destinationOffset += memory buffer's absolute address.
  4000  		c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, destinationOffset.register)
  4001  
  4002  		// Copy first % 16 bytes with simple MOVB instruction.
  4003  		beginCopyLoop := c.assembler.CompileStandAlone(amd64.NOP)
  4004  		c.assembler.CompileConstToRegister(amd64.TESTQ, 15, copySize.register)
  4005  		breakLoop := c.assembler.CompileJump(amd64.JEQ)
  4006  
  4007  		c.assembler.CompileRegisterToMemory(amd64.MOVB, value.register, destinationOffset.register, 0)
  4008  
  4009  		c.assembler.CompileNoneToRegister(amd64.INCQ, destinationOffset.register)
  4010  		c.assembler.CompileNoneToRegister(amd64.DECQ, copySize.register)
  4011  		c.assembler.CompileJump(amd64.JMP).AssignJumpTarget(beginCopyLoop)
  4012  
  4013  		c.assembler.SetJumpTargetOnNext(breakLoop)
  4014  		// compileFillLoopImpl counts in groups of 8 bytes, so we have to divide the copySize by 8.
  4015  		c.assembler.CompileConstToRegister(amd64.SHRQ, 3, copySize.register)
  4016  	}
  4017  
  4018  	c.compileFillLoopImpl(destinationOffset, value, copySize, tmp, !isTable)
  4019  
  4020  	c.locationStack.markRegisterUnused(copySize.register, value.register,
  4021  		destinationOffset.register, tmp)
  4022  	c.assembler.SetJumpTargetOnNext(skipJump)
  4023  	return nil
  4024  }
  4025  
  4026  // compileMemoryFill implements compiler.compileMemoryFill for the amd64 architecture.
  4027  //
  4028  // TODO: the compiled code in this function should be reused and compile at once as
  4029  // the code is independent of any module.
  4030  func (c *amd64Compiler) compileMemoryFill() error {
  4031  	return c.compileFillImpl(false, 0)
  4032  }
  4033  
  4034  // compileTableInit implements compiler.compileTableInit for the amd64 architecture.
  4035  func (c *amd64Compiler) compileTableInit(o *wazeroir.UnionOperation) error {
  4036  	elemIndex := uint32(o.U1)
  4037  	tableIndex := uint32(o.U2)
  4038  	return c.compileInitImpl(true, elemIndex, tableIndex)
  4039  }
  4040  
  4041  // compileTableCopyLoopImpl is used for directly copying after bounds/direction check.
  4042  func (c *amd64Compiler) compileTableCopyLoopImpl(srcTableIndex, dstTableIndex uint32, destinationOffset, sourceOffset, copySize *runtimeValueLocation, tmp asm.Register, backwards bool) {
  4043  	// Point on first byte to be copied.
  4044  	if !backwards {
  4045  		c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, sourceOffset.register)
  4046  		c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, destinationOffset.register)
  4047  	}
  4048  
  4049  	// Each element is of type uintptr; 2^3 = 1 << pointerSizeLog2.
  4050  	c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, sourceOffset.register)
  4051  	c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, destinationOffset.register)
  4052  	// destinationOffset += table buffer's absolute address.
  4053  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
  4054  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(dstTableIndex*8), tmp)
  4055  	c.assembler.CompileMemoryToRegister(amd64.ADDQ, tmp, tableInstanceTableOffset, destinationOffset.register)
  4056  	// sourceOffset += table buffer's absolute address.
  4057  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
  4058  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(srcTableIndex*8), tmp)
  4059  	c.assembler.CompileMemoryToRegister(amd64.ADDQ, tmp, tableInstanceTableOffset, sourceOffset.register)
  4060  
  4061  	c.compileCopyLoopImpl(destinationOffset, sourceOffset, copySize, backwards, 8)
  4062  }
  4063  
  4064  // compileTableCopy implements compiler.compileTableCopy for the amd64 architecture.
  4065  //
  4066  // It uses efficient `REP MOVSB` instructions for optimized copying. It uses backward copying for
  4067  // overlapped segments.
  4068  func (c *amd64Compiler) compileTableCopy(o *wazeroir.UnionOperation) error {
  4069  	copySize := c.locationStack.pop()
  4070  	if err := c.compileEnsureOnRegister(copySize); err != nil {
  4071  		return err
  4072  	}
  4073  
  4074  	sourceOffset := c.locationStack.pop()
  4075  	if err := c.compileEnsureOnRegister(sourceOffset); err != nil {
  4076  		return err
  4077  	}
  4078  
  4079  	destinationOffset := c.locationStack.pop()
  4080  	if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
  4081  		return err
  4082  	}
  4083  
  4084  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  4085  	if err != nil {
  4086  		return err
  4087  	}
  4088  
  4089  	// sourceOffset += size.
  4090  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, sourceOffset.register)
  4091  	// destinationOffset += size.
  4092  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)
  4093  
  4094  	srcTableIndex := uint32(o.U1)
  4095  	dstTableIndex := uint32(o.U2)
  4096  
  4097  	// Check source bounds and if exceeds the length, exit with out of bounds error.
  4098  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
  4099  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(srcTableIndex*8), tmp)
  4100  	c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, sourceOffset.register)
  4101  	c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusCodeInvalidTableAccess)
  4102  
  4103  	// Check destination bounds and if exceeds the length, exit with out of bounds error.
  4104  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
  4105  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(dstTableIndex*8), tmp)
  4106  	c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, destinationOffset.register)
  4107  	c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusCodeInvalidTableAccess)
  4108  
  4109  	// Skip zero size.
  4110  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, copySize.register, copySize.register)
  4111  	skipJump := c.assembler.CompileJump(amd64.JEQ)
  4112  
  4113  	// If dest < source, we can copy forwards.
  4114  	c.assembler.CompileRegisterToRegister(amd64.CMPQ, destinationOffset.register, sourceOffset.register)
  4115  	destLowerThanSourceJump := c.assembler.CompileJump(amd64.JLS)
  4116  
  4117  	// If source + size < dest, we can copy forwards.
  4118  	c.assembler.CompileRegisterToRegister(amd64.MOVQ, destinationOffset.register, tmp)
  4119  	c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, tmp)
  4120  	c.assembler.CompileRegisterToRegister(amd64.CMPQ, sourceOffset.register, tmp)
  4121  	sourceBoundLowerThanDestJump := c.assembler.CompileJump(amd64.JLS)
  4122  
  4123  	// Copy backwards.
  4124  	c.compileTableCopyLoopImpl(srcTableIndex, dstTableIndex, destinationOffset, sourceOffset, copySize, tmp, true)
  4125  	endJump := c.assembler.CompileJump(amd64.JMP)
  4126  
  4127  	// Copy forwards.
  4128  	c.assembler.SetJumpTargetOnNext(destLowerThanSourceJump)
  4129  	c.assembler.SetJumpTargetOnNext(sourceBoundLowerThanDestJump)
  4130  	c.compileTableCopyLoopImpl(srcTableIndex, dstTableIndex, destinationOffset, sourceOffset, copySize, tmp, false)
  4131  
  4132  	c.locationStack.markRegisterUnused(copySize.register, sourceOffset.register,
  4133  		destinationOffset.register, tmp)
  4134  	c.assembler.SetJumpTargetOnNext(skipJump)
  4135  	c.assembler.SetJumpTargetOnNext(endJump)
  4136  	return nil
  4137  }
  4138  
  4139  // compileElemDrop implements compiler.compileElemDrop for the amd64 architecture.
  4140  func (c *amd64Compiler) compileElemDrop(o *wazeroir.UnionOperation) error {
  4141  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4142  		return err
  4143  	}
  4144  
  4145  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  4146  	if err != nil {
  4147  		return err
  4148  	}
  4149  
  4150  	elemIndex := uint32(o.U1)
  4151  	c.compileLoadElemInstanceAddress(elemIndex, tmp)
  4152  
  4153  	// Clears the content of ElementInstances[o.ElemIndex].References (== []uintptr{} type).
  4154  	c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 0)
  4155  	c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 8)
  4156  	c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 16)
  4157  	return nil
  4158  }
  4159  
  4160  func (c *amd64Compiler) compileLoadElemInstanceAddress(elemIndex uint32, dst asm.Register) {
  4161  	// dst = elemIndex * elementInstanceStructSize
  4162  	c.assembler.CompileConstToRegister(amd64.MOVQ, int64(elemIndex)*elementInstanceStructSize, dst)
  4163  
  4164  	// dst = &moduleInstance.ElementInstances[0] + dst
  4165  	//     = &moduleInstance.ElementInstances[0] + elemIndex*elementInstanceStructSize
  4166  	//     = &moduleInstance.ElementInstances[elemIndex]
  4167  	c.assembler.CompileMemoryToRegister(amd64.ADDQ,
  4168  		amd64ReservedRegisterForCallEngine, callEngineModuleContextElementInstancesElement0AddressOffset,
  4169  		dst,
  4170  	)
  4171  }
  4172  
  4173  // compileTableGet implements compiler.compileTableGet for the amd64 architecture.
  4174  func (c *amd64Compiler) compileTableGet(o *wazeroir.UnionOperation) error {
  4175  	ref, err := c.allocateRegister(registerTypeGeneralPurpose)
  4176  	if err != nil {
  4177  		return err
  4178  	}
  4179  
  4180  	c.locationStack.markRegisterUsed(ref)
  4181  
  4182  	offset := c.locationStack.pop()
  4183  	if err := c.compileEnsureOnRegister(offset); err != nil {
  4184  		return err
  4185  	}
  4186  
  4187  	// ref = &tables[0]
  4188  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4189  		amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  4190  		ref)
  4191  
  4192  	// ref = [ref + TableIndex*8]
  4193  	//     = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
  4194  	//     = [&tables[TableIndex]] = tables[TableIndex].
  4195  	tableIndex := int64(o.U1)
  4196  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, ref, tableIndex*8, ref)
  4197  
  4198  	// Out of bounds check.
  4199  	c.assembler.CompileMemoryToRegister(amd64.CMPQ, ref, tableInstanceTableLenOffset, offset.register)
  4200  	c.compileMaybeExitFromNativeCode(amd64.JHI, nativeCallStatusCodeInvalidTableAccess)
  4201  
  4202  	// ref = [&tables[TableIndex] + tableInstanceTableOffset] = &tables[TableIndex].References[0]
  4203  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, ref, tableInstanceTableOffset, ref)
  4204  
  4205  	// ref = [ref + 0 + offset.register * 8]
  4206  	//     = [&tables[TableIndex].References[0] + sizeOf(uintptr) * offset]
  4207  	//     = [&tables[TableIndex].References[offset]]
  4208  	//     = tables[TableIndex].References[offset]
  4209  	c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVQ, ref,
  4210  		0, offset.register, 8, ref,
  4211  	)
  4212  
  4213  	c.locationStack.markRegisterUnused(offset.register)
  4214  	c.pushRuntimeValueLocationOnRegister(ref, runtimeValueTypeI64) // table elements are opaque 64-bit at runtime.
  4215  	return nil
  4216  }
  4217  
  4218  // compileTableSet implements compiler.compileTableSet for the amd64 architecture.
  4219  func (c *amd64Compiler) compileTableSet(o *wazeroir.UnionOperation) error {
  4220  	ref := c.locationStack.pop()
  4221  	if err := c.compileEnsureOnRegister(ref); err != nil {
  4222  		return err
  4223  	}
  4224  
  4225  	offset := c.locationStack.pop()
  4226  	if err := c.compileEnsureOnRegister(offset); err != nil {
  4227  		return err
  4228  	}
  4229  
  4230  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  4231  	if err != nil {
  4232  		return err
  4233  	}
  4234  
  4235  	// tmp = &tables[0]
  4236  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4237  		amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  4238  		tmp)
  4239  
  4240  	// ref = [ref + TableIndex*8]
  4241  	//     = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
  4242  	//     = [&tables[TableIndex]] = tables[TableIndex].
  4243  	tableIndex := int64(o.U1)
  4244  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, tableIndex*8, tmp)
  4245  
  4246  	// Out of bounds check.
  4247  	c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, offset.register)
  4248  	c.compileMaybeExitFromNativeCode(amd64.JHI, nativeCallStatusCodeInvalidTableAccess)
  4249  
  4250  	// tmp = [&tables[TableIndex] + tableInstanceTableOffset] = &tables[TableIndex].References[0]
  4251  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, tableInstanceTableOffset, tmp)
  4252  
  4253  	// [tmp + 0 + offset.register * 8] = ref
  4254  	// [&tables[TableIndex].References[0] + sizeOf(uintptr) * offset] = ref
  4255  	// [&tables[TableIndex].References[offset]] = ref
  4256  	// tables[TableIndex].References[offset] = ref
  4257  	c.assembler.CompileRegisterToMemoryWithIndex(amd64.MOVQ,
  4258  		ref.register,
  4259  		tmp, 0, offset.register, 8)
  4260  
  4261  	c.locationStack.markRegisterUnused(offset.register, ref.register)
  4262  	return nil
  4263  }
  4264  
  4265  // compileTableGrow implements compiler.compileTableGrow for the amd64 architecture.
  4266  func (c *amd64Compiler) compileTableGrow(o *wazeroir.UnionOperation) error {
  4267  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4268  		return err
  4269  	}
  4270  
  4271  	// Pushes the table index.
  4272  	tableIndex := uint32(o.U1)
  4273  	if err := c.compileConstI32Impl(tableIndex); err != nil {
  4274  		return err
  4275  	}
  4276  
  4277  	// Table grow cannot be done in assembly just like memory grow as it involves with allocation in Go.
  4278  	// Therefore, call out to the built function for this purpose.
  4279  	if err := c.compileCallBuiltinFunction(builtinFunctionIndexTableGrow); err != nil {
  4280  		return err
  4281  	}
  4282  
  4283  	// TableGrow consumes three values (table index, number of items, initial value).
  4284  	for i := 0; i < 3; i++ {
  4285  		c.locationStack.pop()
  4286  	}
  4287  
  4288  	// Then, the previous length was pushed as the result.
  4289  	loc := c.locationStack.pushRuntimeValueLocationOnStack()
  4290  	loc.valueType = runtimeValueTypeI32
  4291  
  4292  	// After return, we re-initialize reserved registers just like preamble of functions.
  4293  	c.compileReservedStackBasePointerInitialization()
  4294  	c.compileReservedMemoryPointerInitialization()
  4295  	return nil
  4296  }
  4297  
  4298  // compileTableSize implements compiler.compileTableSize for the amd64 architecture.
  4299  func (c *amd64Compiler) compileTableSize(o *wazeroir.UnionOperation) error {
  4300  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4301  		return err
  4302  	}
  4303  
  4304  	result, err := c.allocateRegister(registerTypeGeneralPurpose)
  4305  	if err != nil {
  4306  		return err
  4307  	}
  4308  
  4309  	// result = &tables[0]
  4310  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4311  		amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  4312  		result)
  4313  
  4314  	// result = [result + TableIndex*8]
  4315  	//        = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
  4316  	//        = [&tables[TableIndex]] = tables[TableIndex].
  4317  	tableIndex := int64(o.U1)
  4318  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, result, tableIndex*8, result)
  4319  
  4320  	// result = [result + tableInstanceTableLenOffset]
  4321  	//        = [tables[TableIndex] + tableInstanceTableLenOffset]
  4322  	//        = len(tables[TableIndex])
  4323  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, result, tableInstanceTableLenOffset, result)
  4324  
  4325  	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
  4326  	return nil
  4327  }
  4328  
  4329  // compileTableFill implements compiler.compileTableFill for the amd64 architecture.
  4330  func (c *amd64Compiler) compileTableFill(o *wazeroir.UnionOperation) error {
  4331  	tableIndex := uint32(o.U1)
  4332  	return c.compileFillImpl(true, tableIndex)
  4333  }
  4334  
  4335  // compileRefFunc implements compiler.compileRefFunc for the amd64 architecture.
  4336  func (c *amd64Compiler) compileRefFunc(o *wazeroir.UnionOperation) error {
  4337  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4338  		return err
  4339  	}
  4340  
  4341  	ref, err := c.allocateRegister(registerTypeGeneralPurpose)
  4342  	if err != nil {
  4343  		return err
  4344  	}
  4345  
  4346  	functionIndex := int64(o.U1)
  4347  	c.assembler.CompileConstToRegister(amd64.MOVQ, functionIndex*functionSize, ref)
  4348  
  4349  	// ref = [amd64ReservedRegisterForCallEngine + callEngineModuleContextFunctionsElement0AddressOffset + int64(o.FunctionIndex)*functionSize]
  4350  	//     = &moduleEngine.functions[index]
  4351  	c.assembler.CompileMemoryToRegister(
  4352  		amd64.ADDQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextFunctionsElement0AddressOffset,
  4353  		ref,
  4354  	)
  4355  
  4356  	c.pushRuntimeValueLocationOnRegister(ref, runtimeValueTypeI64)
  4357  	return nil
  4358  }
  4359  
  4360  // compileConstI32 implements compiler.compileConstI32 for the amd64 architecture.
  4361  func (c *amd64Compiler) compileConstI32(o *wazeroir.UnionOperation) error {
  4362  	return c.compileConstI32Impl(uint32(o.U1))
  4363  }
  4364  
  4365  func (c *amd64Compiler) compileConstI32Impl(v uint32) error {
  4366  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4367  		return err
  4368  	}
  4369  
  4370  	reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  4371  	if err != nil {
  4372  		return err
  4373  	}
  4374  	c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI32)
  4375  	c.assembler.CompileConstToRegister(amd64.MOVL, int64(v), reg)
  4376  	return nil
  4377  }
  4378  
  4379  // compileConstI64 implements compiler.compileConstI64 for the amd64 architecture.
  4380  func (c *amd64Compiler) compileConstI64(o *wazeroir.UnionOperation) error {
  4381  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4382  		return err
  4383  	}
  4384  
  4385  	reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  4386  	if err != nil {
  4387  		return err
  4388  	}
  4389  	c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI64)
  4390  
  4391  	c.assembler.CompileConstToRegister(amd64.MOVQ, int64(o.U1), reg)
  4392  	return nil
  4393  }
  4394  
  4395  // compileConstF32 implements compiler.compileConstF32 for the amd64 architecture.
  4396  func (c *amd64Compiler) compileConstF32(o *wazeroir.UnionOperation) error {
  4397  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4398  		return err
  4399  	}
  4400  
  4401  	reg, err := c.allocateRegister(registerTypeVector)
  4402  	if err != nil {
  4403  		return err
  4404  	}
  4405  	c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeF32)
  4406  
  4407  	// We cannot directly load the value from memory to float regs,
  4408  	// so we move it to int reg temporarily.
  4409  	tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  4410  	if err != nil {
  4411  		return err
  4412  	}
  4413  
  4414  	c.assembler.CompileConstToRegister(amd64.MOVL, int64(o.U1) /*math.Float32bits(o.Value)*/, tmpReg)
  4415  	c.assembler.CompileRegisterToRegister(amd64.MOVL, tmpReg, reg)
  4416  	return nil
  4417  }
  4418  
  4419  // compileConstF64 implements compiler.compileConstF64 for the amd64 architecture.
  4420  func (c *amd64Compiler) compileConstF64(o *wazeroir.UnionOperation) error {
  4421  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4422  		return err
  4423  	}
  4424  
  4425  	reg, err := c.allocateRegister(registerTypeVector)
  4426  	if err != nil {
  4427  		return err
  4428  	}
  4429  	c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeF64)
  4430  
  4431  	// We cannot directly load the value from memory to float regs,
  4432  	// so we move it to int reg temporarily.
  4433  	tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  4434  	if err != nil {
  4435  		return err
  4436  	}
  4437  
  4438  	c.assembler.CompileConstToRegister(amd64.MOVQ, int64(o.U1) /* math.Float64bits(o.Value) */, tmpReg)
  4439  	c.assembler.CompileRegisterToRegister(amd64.MOVQ, tmpReg, reg)
  4440  	return nil
  4441  }
  4442  
  4443  func (c *amd64Compiler) compileAtomicLoad(o *wazeroir.UnionOperation) error {
  4444  	var (
  4445  		inst              asm.Instruction
  4446  		targetSizeInBytes int64
  4447  		vt                runtimeValueType
  4448  	)
  4449  
  4450  	unsignedType := wazeroir.UnsignedType(o.B1)
  4451  	offset := uint32(o.U2)
  4452  
  4453  	switch unsignedType {
  4454  	case wazeroir.UnsignedTypeI32:
  4455  		inst = amd64.MOVL
  4456  		targetSizeInBytes = 32 / 8
  4457  		vt = runtimeValueTypeI32
  4458  	case wazeroir.UnsignedTypeI64:
  4459  		inst = amd64.MOVQ
  4460  		targetSizeInBytes = 64 / 8
  4461  		vt = runtimeValueTypeI64
  4462  	}
  4463  
  4464  	return c.compileAtomicLoadImpl(inst, offset, targetSizeInBytes, vt)
  4465  }
  4466  
  4467  func (c *amd64Compiler) compileAtomicLoad8(o *wazeroir.UnionOperation) error {
  4468  	var (
  4469  		inst asm.Instruction
  4470  		vt   runtimeValueType
  4471  	)
  4472  
  4473  	unsignedType := wazeroir.UnsignedType(o.B1)
  4474  	offset := uint32(o.U2)
  4475  
  4476  	switch unsignedType {
  4477  	case wazeroir.UnsignedTypeI32:
  4478  		inst = amd64.MOVBLZX
  4479  		vt = runtimeValueTypeI32
  4480  	case wazeroir.UnsignedTypeI64:
  4481  		inst = amd64.MOVBQZX
  4482  		vt = runtimeValueTypeI64
  4483  	}
  4484  
  4485  	return c.compileAtomicLoadImpl(inst, offset, 1, vt)
  4486  }
  4487  
  4488  func (c *amd64Compiler) compileAtomicLoad16(o *wazeroir.UnionOperation) error {
  4489  	var (
  4490  		inst asm.Instruction
  4491  		vt   runtimeValueType
  4492  	)
  4493  
  4494  	unsignedType := wazeroir.UnsignedType(o.B1)
  4495  	offset := uint32(o.U2)
  4496  
  4497  	switch unsignedType {
  4498  	case wazeroir.UnsignedTypeI32:
  4499  		inst = amd64.MOVWLZX
  4500  		vt = runtimeValueTypeI32
  4501  	case wazeroir.UnsignedTypeI64:
  4502  		inst = amd64.MOVWQZX
  4503  		vt = runtimeValueTypeI64
  4504  	}
  4505  
  4506  	return c.compileAtomicLoadImpl(inst, offset, 16/8, vt)
  4507  }
  4508  
  4509  func (c *amd64Compiler) compileAtomicLoadImpl(
  4510  	inst asm.Instruction, offset uint32, targetSizeInBytes int64, resultRuntimeValueType runtimeValueType,
  4511  ) error {
  4512  	reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
  4513  	if err != nil {
  4514  		return err
  4515  	}
  4516  
  4517  	c.compileMemoryAlignmentCheck(reg, targetSizeInBytes)
  4518  
  4519  	c.assembler.CompileMemoryWithIndexToRegister(inst,
  4520  		// we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
  4521  		amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  4522  		reg)
  4523  	c.pushRuntimeValueLocationOnRegister(reg, resultRuntimeValueType)
  4524  
  4525  	return nil
  4526  }
  4527  
  4528  func (c *amd64Compiler) compileAtomicStore(o *wazeroir.UnionOperation) error {
  4529  	var inst asm.Instruction
  4530  	var targetSizeInByte int64
  4531  	unsignedType := wazeroir.UnsignedType(o.B1)
  4532  	offset := uint32(o.U2)
  4533  	switch unsignedType {
  4534  	case wazeroir.UnsignedTypeI32:
  4535  		inst = amd64.XCHGL
  4536  		targetSizeInByte = 32 / 8
  4537  	case wazeroir.UnsignedTypeI64:
  4538  		inst = amd64.XCHGQ
  4539  		targetSizeInByte = 64 / 8
  4540  	}
  4541  	return c.compileAtomicStoreImpl(inst, offset, targetSizeInByte)
  4542  }
  4543  
  4544  func (c *amd64Compiler) compileAtomicStore8(o *wazeroir.UnionOperation) error {
  4545  	return c.compileAtomicStoreImpl(amd64.XCHGB, uint32(o.U2), 1)
  4546  }
  4547  
  4548  func (c *amd64Compiler) compileAtomicStore16(o *wazeroir.UnionOperation) error {
  4549  	return c.compileAtomicStoreImpl(amd64.XCHGW, uint32(o.U2), 16/8)
  4550  }
  4551  
  4552  func (c *amd64Compiler) compileAtomicStoreImpl(
  4553  	inst asm.Instruction, offset uint32, targetSizeInBytes int64,
  4554  ) error {
  4555  	val := c.locationStack.pop()
  4556  	if err := c.compileEnsureOnRegister(val); err != nil {
  4557  		return err
  4558  	}
  4559  
  4560  	reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
  4561  	if err != nil {
  4562  		return err
  4563  	}
  4564  
  4565  	c.compileMemoryAlignmentCheck(reg, targetSizeInBytes)
  4566  
  4567  	c.assembler.CompileRegisterToMemoryWithIndex(
  4568  		inst, val.register,
  4569  		amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  4570  	)
  4571  
  4572  	// We no longer need both the value and base registers.
  4573  	c.locationStack.releaseRegister(val)
  4574  	c.locationStack.markRegisterUnused(reg)
  4575  	return nil
  4576  }
  4577  
  4578  func (c *amd64Compiler) compileAtomicRMW(o *wazeroir.UnionOperation) error {
  4579  	var (
  4580  		inst              asm.Instruction
  4581  		targetSizeInBytes int64
  4582  		vt                runtimeValueType
  4583  	)
  4584  
  4585  	unsignedType := wazeroir.UnsignedType(o.B1)
  4586  	op := wazeroir.AtomicArithmeticOp(o.B2)
  4587  	offset := uint32(o.U2)
  4588  
  4589  	switch unsignedType {
  4590  	case wazeroir.UnsignedTypeI32:
  4591  		targetSizeInBytes = 32 / 8
  4592  		vt = runtimeValueTypeI32
  4593  		switch op {
  4594  		case wazeroir.AtomicArithmeticOpAdd:
  4595  			return c.compileAtomicAddImpl(amd64.XADDL, offset, false, targetSizeInBytes, vt)
  4596  		case wazeroir.AtomicArithmeticOpSub:
  4597  			return c.compileAtomicAddImpl(amd64.XADDL, offset, true, targetSizeInBytes, vt)
  4598  		case wazeroir.AtomicArithmeticOpAnd:
  4599  			inst = amd64.ANDL
  4600  		case wazeroir.AtomicArithmeticOpOr:
  4601  			inst = amd64.ORL
  4602  		case wazeroir.AtomicArithmeticOpXor:
  4603  			inst = amd64.XORL
  4604  		case wazeroir.AtomicArithmeticOpNop:
  4605  			return c.compileAtomicXchgImpl(amd64.XCHGL, offset, targetSizeInBytes, vt)
  4606  		}
  4607  	case wazeroir.UnsignedTypeI64:
  4608  		targetSizeInBytes = 64 / 8
  4609  		vt = runtimeValueTypeI64
  4610  		switch op {
  4611  		case wazeroir.AtomicArithmeticOpAdd:
  4612  			return c.compileAtomicAddImpl(amd64.XADDQ, offset, false, targetSizeInBytes, vt)
  4613  		case wazeroir.AtomicArithmeticOpSub:
  4614  			return c.compileAtomicAddImpl(amd64.XADDQ, offset, true, targetSizeInBytes, vt)
  4615  		case wazeroir.AtomicArithmeticOpAnd:
  4616  			inst = amd64.ANDQ
  4617  		case wazeroir.AtomicArithmeticOpOr:
  4618  			inst = amd64.ORQ
  4619  		case wazeroir.AtomicArithmeticOpXor:
  4620  			inst = amd64.XORQ
  4621  		case wazeroir.AtomicArithmeticOpNop:
  4622  			return c.compileAtomicXchgImpl(amd64.XCHGQ, offset, targetSizeInBytes, vt)
  4623  		}
  4624  	}
  4625  
  4626  	return c.compileAtomicRMWCASLoopImpl(inst, offset, targetSizeInBytes, vt)
  4627  }
  4628  
  4629  func (c *amd64Compiler) compileAtomicRMW8(o *wazeroir.UnionOperation) error {
  4630  	var (
  4631  		inst asm.Instruction
  4632  		vt   runtimeValueType
  4633  	)
  4634  
  4635  	unsignedType := wazeroir.UnsignedType(o.B1)
  4636  	op := wazeroir.AtomicArithmeticOp(o.B2)
  4637  	offset := uint32(o.U2)
  4638  
  4639  	switch unsignedType {
  4640  	case wazeroir.UnsignedTypeI32:
  4641  		vt = runtimeValueTypeI32
  4642  	case wazeroir.UnsignedTypeI64:
  4643  		vt = runtimeValueTypeI64
  4644  	}
  4645  
  4646  	switch op {
  4647  	case wazeroir.AtomicArithmeticOpAdd:
  4648  		return c.compileAtomicAddImpl(amd64.XADDB, offset, false, 1, vt)
  4649  	case wazeroir.AtomicArithmeticOpSub:
  4650  		return c.compileAtomicAddImpl(amd64.XADDB, offset, true, 1, vt)
  4651  	case wazeroir.AtomicArithmeticOpAnd:
  4652  		inst = amd64.ANDL
  4653  	case wazeroir.AtomicArithmeticOpOr:
  4654  		inst = amd64.ORL
  4655  	case wazeroir.AtomicArithmeticOpXor:
  4656  		inst = amd64.XORL
  4657  	case wazeroir.AtomicArithmeticOpNop:
  4658  		return c.compileAtomicXchgImpl(amd64.XCHGB, offset, 1, vt)
  4659  	}
  4660  
  4661  	return c.compileAtomicRMWCASLoopImpl(inst, offset, 1, vt)
  4662  }
  4663  
  4664  func (c *amd64Compiler) compileAtomicRMW16(o *wazeroir.UnionOperation) error {
  4665  	var (
  4666  		inst asm.Instruction
  4667  		vt   runtimeValueType
  4668  	)
  4669  
  4670  	unsignedType := wazeroir.UnsignedType(o.B1)
  4671  	op := wazeroir.AtomicArithmeticOp(o.B2)
  4672  	offset := uint32(o.U2)
  4673  
  4674  	switch unsignedType {
  4675  	case wazeroir.UnsignedTypeI32:
  4676  		vt = runtimeValueTypeI32
  4677  	case wazeroir.UnsignedTypeI64:
  4678  		vt = runtimeValueTypeI64
  4679  	}
  4680  
  4681  	switch op {
  4682  	case wazeroir.AtomicArithmeticOpAdd:
  4683  		return c.compileAtomicAddImpl(amd64.XADDW, offset, false, 16/8, vt)
  4684  	case wazeroir.AtomicArithmeticOpSub:
  4685  		return c.compileAtomicAddImpl(amd64.XADDW, offset, true, 16/8, vt)
  4686  	case wazeroir.AtomicArithmeticOpAnd:
  4687  		inst = amd64.ANDL
  4688  	case wazeroir.AtomicArithmeticOpOr:
  4689  		inst = amd64.ORL
  4690  	case wazeroir.AtomicArithmeticOpXor:
  4691  		inst = amd64.XORL
  4692  	case wazeroir.AtomicArithmeticOpNop:
  4693  		return c.compileAtomicXchgImpl(amd64.XCHGW, offset, 16/8, vt)
  4694  	}
  4695  
  4696  	return c.compileAtomicRMWCASLoopImpl(inst, offset, 16/8, vt)
  4697  }
  4698  
  4699  func (c *amd64Compiler) compileAtomicAddImpl(inst asm.Instruction, offsetConst uint32, negateArg bool, targetSizeInBytes int64, resultRuntimeValueType runtimeValueType) error {
  4700  	val := c.locationStack.pop()
  4701  	if err := c.compileEnsureOnRegister(val); err != nil {
  4702  		return err
  4703  	}
  4704  
  4705  	if negateArg {
  4706  		var negArg asm.Instruction
  4707  		switch targetSizeInBytes {
  4708  		case 1:
  4709  			negArg = amd64.NEGB
  4710  		case 2:
  4711  			negArg = amd64.NEGW
  4712  		case 4:
  4713  			negArg = amd64.NEGL
  4714  		case 8:
  4715  			negArg = amd64.NEGQ
  4716  		}
  4717  		c.assembler.CompileNoneToRegister(negArg, val.register)
  4718  	}
  4719  
  4720  	reg, err := c.compileMemoryAccessCeilSetup(offsetConst, targetSizeInBytes)
  4721  	if err != nil {
  4722  		return err
  4723  	}
  4724  
  4725  	c.compileMemoryAlignmentCheck(reg, targetSizeInBytes)
  4726  
  4727  	c.assembler.CompileRegisterToMemoryWithIndexAndLock(
  4728  		inst, val.register,
  4729  		amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  4730  	)
  4731  
  4732  	if targetSizeInBytes < 4 {
  4733  		mask := (1 << (8 * targetSizeInBytes)) - 1
  4734  		c.assembler.CompileConstToRegister(amd64.ANDQ, int64(mask), val.register)
  4735  	}
  4736  
  4737  	c.locationStack.markRegisterUnused(reg)
  4738  	c.locationStack.pushRuntimeValueLocationOnRegister(val.register, resultRuntimeValueType)
  4739  
  4740  	return nil
  4741  }
  4742  
  4743  func (c *amd64Compiler) compileAtomicXchgImpl(inst asm.Instruction, offsetConst uint32, targetSizeInBytes int64, resultRuntimeValueType runtimeValueType) error {
  4744  	val := c.locationStack.pop()
  4745  	if err := c.compileEnsureOnRegister(val); err != nil {
  4746  		return err
  4747  	}
  4748  
  4749  	reg, err := c.compileMemoryAccessCeilSetup(offsetConst, targetSizeInBytes)
  4750  	if err != nil {
  4751  		return err
  4752  	}
  4753  
  4754  	c.compileMemoryAlignmentCheck(reg, targetSizeInBytes)
  4755  
  4756  	c.assembler.CompileRegisterToMemoryWithIndex(
  4757  		inst, val.register,
  4758  		amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  4759  	)
  4760  
  4761  	if targetSizeInBytes < 4 {
  4762  		mask := (1 << (8 * targetSizeInBytes)) - 1
  4763  		c.assembler.CompileConstToRegister(amd64.ANDQ, int64(mask), val.register)
  4764  	}
  4765  
  4766  	c.locationStack.markRegisterUnused(reg)
  4767  	c.locationStack.pushRuntimeValueLocationOnRegister(val.register, resultRuntimeValueType)
  4768  
  4769  	return nil
  4770  }
  4771  
  4772  func (c *amd64Compiler) compileAtomicRMWCASLoopImpl(rmwInst asm.Instruction,
  4773  	offsetConst uint32, targetSizeInBytes int64, resultRuntimeValueType runtimeValueType,
  4774  ) error {
  4775  	const resultRegister = amd64.RegAX
  4776  
  4777  	var copyInst asm.Instruction
  4778  	var loadInst asm.Instruction
  4779  	var cmpXchgInst asm.Instruction
  4780  
  4781  	switch targetSizeInBytes {
  4782  	case 8:
  4783  		copyInst = amd64.MOVQ
  4784  		loadInst = amd64.MOVQ
  4785  		cmpXchgInst = amd64.CMPXCHGQ
  4786  	case 4:
  4787  		copyInst = amd64.MOVL
  4788  		loadInst = amd64.MOVL
  4789  		cmpXchgInst = amd64.CMPXCHGL
  4790  	case 2:
  4791  		copyInst = amd64.MOVL
  4792  		loadInst = amd64.MOVWLZX
  4793  		cmpXchgInst = amd64.CMPXCHGW
  4794  	case 1:
  4795  		copyInst = amd64.MOVL
  4796  		loadInst = amd64.MOVBLZX
  4797  		cmpXchgInst = amd64.CMPXCHGB
  4798  	}
  4799  
  4800  	c.onValueReleaseRegisterToStack(resultRegister)
  4801  	c.locationStack.markRegisterUsed(resultRegister)
  4802  
  4803  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  4804  	if err != nil {
  4805  		return err
  4806  	}
  4807  	c.locationStack.markRegisterUsed(tmp)
  4808  
  4809  	val := c.locationStack.pop()
  4810  	if err := c.compileEnsureOnRegister(val); err != nil {
  4811  		return err
  4812  	}
  4813  
  4814  	reg, err := c.compileMemoryAccessCeilSetup(offsetConst, targetSizeInBytes)
  4815  	if err != nil {
  4816  		return err
  4817  	}
  4818  
  4819  	c.compileMemoryAlignmentCheck(reg, targetSizeInBytes)
  4820  
  4821  	if targetSizeInBytes < 32 {
  4822  		mask := (1 << (8 * targetSizeInBytes)) - 1
  4823  		c.assembler.CompileConstToRegister(amd64.ANDQ, int64(mask), val.register)
  4824  	}
  4825  
  4826  	beginLoop := c.assembler.CompileStandAlone(amd64.NOP)
  4827  	c.assembler.CompileRegisterToRegister(copyInst, val.register, tmp)
  4828  	c.assembler.CompileMemoryWithIndexToRegister(
  4829  		loadInst, amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1, resultRegister)
  4830  	if targetSizeInBytes < 32 {
  4831  		mask := (1 << (8 * targetSizeInBytes)) - 1
  4832  		c.assembler.CompileConstToRegister(amd64.ANDQ, int64(mask), resultRegister)
  4833  	}
  4834  	c.assembler.CompileRegisterToRegister(rmwInst, resultRegister, tmp)
  4835  	c.assembler.CompileRegisterToMemoryWithIndexAndLock(
  4836  		cmpXchgInst, tmp,
  4837  		amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  4838  	)
  4839  	c.assembler.CompileJump(amd64.JNE).AssignJumpTarget(beginLoop)
  4840  
  4841  	if targetSizeInBytes < 32 {
  4842  		mask := (1 << (8 * targetSizeInBytes)) - 1
  4843  		c.assembler.CompileConstToRegister(amd64.ANDQ, int64(mask), resultRegister)
  4844  	}
  4845  
  4846  	c.locationStack.markRegisterUnused(reg)
  4847  	c.locationStack.markRegisterUnused(tmp)
  4848  	c.locationStack.markRegisterUnused(val.register)
  4849  	c.locationStack.pushRuntimeValueLocationOnRegister(resultRegister, resultRuntimeValueType)
  4850  
  4851  	return nil
  4852  }
  4853  
  4854  func (c *amd64Compiler) compileAtomicRMWCmpxchg(o *wazeroir.UnionOperation) error {
  4855  	var (
  4856  		casInst           asm.Instruction
  4857  		targetSizeInBytes int64
  4858  		vt                runtimeValueType
  4859  	)
  4860  
  4861  	unsignedType := wazeroir.UnsignedType(o.B1)
  4862  	offset := uint32(o.U2)
  4863  
  4864  	switch unsignedType {
  4865  	case wazeroir.UnsignedTypeI32:
  4866  		casInst = amd64.CMPXCHGL
  4867  		targetSizeInBytes = 32 / 8
  4868  		vt = runtimeValueTypeI32
  4869  	case wazeroir.UnsignedTypeI64:
  4870  		casInst = amd64.CMPXCHGQ
  4871  		targetSizeInBytes = 64 / 8
  4872  		vt = runtimeValueTypeI64
  4873  	}
  4874  	return c.compileAtomicRMWCmpxchgImpl(casInst, offset, targetSizeInBytes, vt)
  4875  }
  4876  
  4877  func (c *amd64Compiler) compileAtomicRMW8Cmpxchg(o *wazeroir.UnionOperation) error {
  4878  	var vt runtimeValueType
  4879  
  4880  	unsignedType := wazeroir.UnsignedType(o.B1)
  4881  	offset := uint32(o.U2)
  4882  
  4883  	switch unsignedType {
  4884  	case wazeroir.UnsignedTypeI32:
  4885  		vt = runtimeValueTypeI32
  4886  	case wazeroir.UnsignedTypeI64:
  4887  		vt = runtimeValueTypeI64
  4888  	}
  4889  	return c.compileAtomicRMWCmpxchgImpl(amd64.CMPXCHGB, offset, 1, vt)
  4890  }
  4891  
  4892  func (c *amd64Compiler) compileAtomicRMW16Cmpxchg(o *wazeroir.UnionOperation) error {
  4893  	var vt runtimeValueType
  4894  
  4895  	unsignedType := wazeroir.UnsignedType(o.B1)
  4896  	offset := uint32(o.U2)
  4897  
  4898  	switch unsignedType {
  4899  	case wazeroir.UnsignedTypeI32:
  4900  		vt = runtimeValueTypeI32
  4901  	case wazeroir.UnsignedTypeI64:
  4902  		vt = runtimeValueTypeI64
  4903  	}
  4904  	return c.compileAtomicRMWCmpxchgImpl(amd64.CMPXCHGW, offset, 16/8, vt)
  4905  }
  4906  
  4907  func (c *amd64Compiler) compileAtomicRMWCmpxchgImpl(inst asm.Instruction, offsetArg uint32, targetSizeInBytes int64, resultRuntimeValueType runtimeValueType) error {
  4908  	const resultRegister = amd64.RegAX
  4909  
  4910  	repl := c.locationStack.pop()
  4911  	exp := c.locationStack.pop()
  4912  
  4913  	// expected value must be in accumulator register, which will also hold the loaded result.
  4914  	if exp.register != resultRegister {
  4915  		c.onValueReleaseRegisterToStack(resultRegister)
  4916  		if exp.onConditionalRegister() {
  4917  			c.compileMoveConditionalToGeneralPurposeRegister(exp, resultRegister)
  4918  		} else if exp.onStack() {
  4919  			exp.setRegister(resultRegister)
  4920  			c.compileLoadValueOnStackToRegister(exp)
  4921  			c.locationStack.markRegisterUnused(resultRegister)
  4922  		} else {
  4923  			c.assembler.CompileRegisterToRegister(amd64.MOVQ, exp.register, resultRegister)
  4924  			c.locationStack.releaseRegister(exp)
  4925  			exp.setRegister(resultRegister)
  4926  			c.locationStack.markRegisterUsed(resultRegister)
  4927  		}
  4928  	}
  4929  
  4930  	if err := c.compileEnsureOnRegister(repl); err != nil {
  4931  		return err
  4932  	}
  4933  
  4934  	reg, err := c.compileMemoryAccessCeilSetup(offsetArg, targetSizeInBytes)
  4935  	if err != nil {
  4936  		return err
  4937  	}
  4938  
  4939  	c.compileMemoryAlignmentCheck(reg, targetSizeInBytes)
  4940  
  4941  	c.assembler.CompileRegisterToMemoryWithIndexAndLock(
  4942  		inst, repl.register,
  4943  		amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  4944  	)
  4945  
  4946  	if targetSizeInBytes < 4 {
  4947  		mask := (1 << (8 * targetSizeInBytes)) - 1
  4948  		c.assembler.CompileConstToRegister(amd64.ANDQ, int64(mask), resultRegister)
  4949  	}
  4950  
  4951  	c.locationStack.markRegisterUnused(reg)
  4952  	c.locationStack.markRegisterUnused(repl.register)
  4953  	c.locationStack.pushRuntimeValueLocationOnRegister(resultRegister, resultRuntimeValueType)
  4954  
  4955  	return nil
  4956  }
  4957  
  4958  func (c *amd64Compiler) compileAtomicMemoryWait(o *wazeroir.UnionOperation) error {
  4959  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4960  		return err
  4961  	}
  4962  
  4963  	var (
  4964  		vt                runtimeValueType
  4965  		targetSizeInBytes int64
  4966  		waitFunc          wasm.Index
  4967  	)
  4968  
  4969  	unsignedType := wazeroir.UnsignedType(o.B1)
  4970  	offset := uint32(o.U2)
  4971  
  4972  	switch unsignedType {
  4973  	case wazeroir.UnsignedTypeI32:
  4974  		vt = runtimeValueTypeI32
  4975  		targetSizeInBytes = 32 / 8
  4976  		waitFunc = builtinFunctionMemoryWait32
  4977  	case wazeroir.UnsignedTypeI64:
  4978  		vt = runtimeValueTypeI64
  4979  		targetSizeInBytes = 64 / 8
  4980  		waitFunc = builtinFunctionMemoryWait64
  4981  	}
  4982  
  4983  	timeout := c.locationStack.pop()
  4984  	if err := c.compileEnsureOnRegister(timeout); err != nil {
  4985  		return err
  4986  	}
  4987  	exp := c.locationStack.pop()
  4988  	if err := c.compileEnsureOnRegister(exp); err != nil {
  4989  		return err
  4990  	}
  4991  
  4992  	reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
  4993  	if err != nil {
  4994  		return err
  4995  	}
  4996  	c.locationStack.markRegisterUsed(reg)
  4997  	c.compileMemoryAlignmentCheck(reg, targetSizeInBytes)
  4998  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, reg)
  4999  	c.assembler.CompileConstToRegister(amd64.ADDQ, -targetSizeInBytes, reg)
  5000  
  5001  	// Push address, values, and timeout back to read in Go
  5002  	c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI64)
  5003  	c.pushRuntimeValueLocationOnRegister(exp.register, vt)
  5004  	c.pushRuntimeValueLocationOnRegister(timeout.register, runtimeValueTypeI64)
  5005  	if err := c.compileCallBuiltinFunction(waitFunc); err != nil {
  5006  		return err
  5007  	}
  5008  	// Address, values and timeout consumed in Go
  5009  	c.locationStack.pop()
  5010  	c.locationStack.pop()
  5011  	c.locationStack.pop()
  5012  
  5013  	// Then, the result was pushed.
  5014  	v := c.locationStack.pushRuntimeValueLocationOnStack()
  5015  	v.valueType = runtimeValueTypeI32
  5016  
  5017  	c.locationStack.markRegisterUnused(reg)
  5018  	c.locationStack.releaseRegister(exp)
  5019  	c.locationStack.releaseRegister(timeout)
  5020  
  5021  	// After return, we re-initialize reserved registers just like preamble of functions.
  5022  	c.compileReservedStackBasePointerInitialization()
  5023  	c.compileReservedMemoryPointerInitialization()
  5024  
  5025  	return nil
  5026  }
  5027  
  5028  func (c *amd64Compiler) compileAtomicMemoryNotify(o *wazeroir.UnionOperation) error {
  5029  	offset := uint32(o.U2)
  5030  
  5031  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  5032  		return err
  5033  	}
  5034  
  5035  	count := c.locationStack.pop()
  5036  	if err := c.compileEnsureOnRegister(count); err != nil {
  5037  		return err
  5038  	}
  5039  
  5040  	reg, err := c.compileMemoryAccessCeilSetup(offset, 4)
  5041  	if err != nil {
  5042  		return err
  5043  	}
  5044  	c.compileMemoryAlignmentCheck(reg, 4)
  5045  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, reg)
  5046  	c.assembler.CompileConstToRegister(amd64.ADDQ, -4, reg)
  5047  
  5048  	// Push address and count back to read in Go
  5049  	c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI64)
  5050  	c.pushRuntimeValueLocationOnRegister(count.register, runtimeValueTypeI32)
  5051  	if err := c.compileCallBuiltinFunction(builtinFunctionMemoryNotify); err != nil {
  5052  		return err
  5053  	}
  5054  
  5055  	// Address and count consumed by Go
  5056  	c.locationStack.pop()
  5057  	c.locationStack.pop()
  5058  
  5059  	// Then, the result was pushed.
  5060  	v := c.locationStack.pushRuntimeValueLocationOnStack()
  5061  	v.valueType = runtimeValueTypeI32
  5062  
  5063  	// After return, we re-initialize reserved registers just like preamble of functions.
  5064  	c.compileReservedStackBasePointerInitialization()
  5065  	c.compileReservedMemoryPointerInitialization()
  5066  	return nil
  5067  }
  5068  
  5069  func (c *amd64Compiler) compileAtomicFence(_ *wazeroir.UnionOperation) error {
  5070  	c.assembler.CompileStandAlone(amd64.MFENCE)
  5071  	return nil
  5072  }
  5073  
  5074  func (c *amd64Compiler) compileMemoryAlignmentCheck(baseRegister asm.Register, targetSizeInBytes int64) {
  5075  	if targetSizeInBytes == 1 {
  5076  		return // No alignment restrictions when accessing a byte
  5077  	}
  5078  	var checkBits asm.ConstantValue
  5079  	switch targetSizeInBytes {
  5080  	case 2:
  5081  		checkBits = 0b1
  5082  	case 4:
  5083  		checkBits = 0b11
  5084  	case 8:
  5085  		checkBits = 0b111
  5086  	}
  5087  	c.assembler.CompileConstToRegister(amd64.TESTQ, checkBits, baseRegister)
  5088  	aligned := c.assembler.CompileJump(amd64.JEQ)
  5089  
  5090  	c.compileExitFromNativeCode(nativeCallStatusUnalignedAtomic)
  5091  	c.assembler.SetJumpTargetOnNext(aligned)
  5092  }
  5093  
  5094  // compileLoadValueOnStackToRegister implements compiler.compileLoadValueOnStackToRegister for amd64.
  5095  func (c *amd64Compiler) compileLoadValueOnStackToRegister(loc *runtimeValueLocation) {
  5096  	var inst asm.Instruction
  5097  	switch loc.valueType {
  5098  	case runtimeValueTypeV128Lo:
  5099  		inst = amd64.MOVDQU
  5100  	case runtimeValueTypeV128Hi:
  5101  		panic("BUG: V128Hi must be be loaded to a register along with V128Lo")
  5102  	case runtimeValueTypeI32, runtimeValueTypeF32:
  5103  		inst = amd64.MOVL
  5104  	case runtimeValueTypeI64, runtimeValueTypeF64:
  5105  		inst = amd64.MOVQ
  5106  	default:
  5107  		panic("BUG: unknown runtime value type")
  5108  	}
  5109  
  5110  	// Copy the value from the stack.
  5111  	c.assembler.CompileMemoryToRegister(inst,
  5112  		// Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range.
  5113  		amd64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8,
  5114  		loc.register)
  5115  
  5116  	if loc.valueType == runtimeValueTypeV128Lo {
  5117  		// Higher 64-bits are loaded as well ^^.
  5118  		hi := &c.locationStack.stack[loc.stackPointer+1]
  5119  		hi.setRegister(loc.register)
  5120  	}
  5121  }
  5122  
  5123  // maybeCompileMoveTopConditionalToGeneralPurposeRegister moves the top value on the stack
  5124  // if the value is located on a conditional register.
  5125  //
  5126  // This is usually called at the beginning of methods on compiler interface where we possibly
  5127  // compile instructions without saving the conditional register value.
  5128  // The compileXXX functions without calling this function is saving the conditional
  5129  // value to the stack or register by invoking compileEnsureOnRegister for the top.
  5130  func (c *amd64Compiler) maybeCompileMoveTopConditionalToGeneralPurposeRegister() (err error) {
  5131  	if c.locationStack.sp > 0 {
  5132  		if loc := c.locationStack.peek(); loc.onConditionalRegister() {
  5133  			if err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc); err != nil {
  5134  				return err
  5135  			}
  5136  		}
  5137  	}
  5138  	return
  5139  }
  5140  
  5141  // loadConditionalRegisterToGeneralPurposeRegister saves the conditional register value
  5142  // to a general purpose register.
  5143  func (c *amd64Compiler) compileLoadConditionalRegisterToGeneralPurposeRegister(loc *runtimeValueLocation) error {
  5144  	reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  5145  	if err != nil {
  5146  		return err
  5147  	}
  5148  	c.compileMoveConditionalToGeneralPurposeRegister(loc, reg)
  5149  	return nil
  5150  }
  5151  
  5152  func (c *amd64Compiler) compileMoveConditionalToGeneralPurposeRegister(loc *runtimeValueLocation, reg asm.Register) {
  5153  	// Set the flag bit to the destination. See
  5154  	// - https://c9x.me/x86/html/file_module_x86_id_288.html
  5155  	// - https://github.com/golang/go/blob/master/src/cmd/internal/obj/x86/asm6.go#L1453-L1468
  5156  	// to translate conditionalRegisterState* to amd64.SET*
  5157  	var inst asm.Instruction
  5158  	switch loc.conditionalRegister {
  5159  	case amd64.ConditionalRegisterStateE:
  5160  		inst = amd64.SETEQ
  5161  	case amd64.ConditionalRegisterStateNE:
  5162  		inst = amd64.SETNE
  5163  	case amd64.ConditionalRegisterStateS:
  5164  		inst = amd64.SETMI
  5165  	case amd64.ConditionalRegisterStateNS:
  5166  		inst = amd64.SETPL
  5167  	case amd64.ConditionalRegisterStateG:
  5168  		inst = amd64.SETGT
  5169  	case amd64.ConditionalRegisterStateGE:
  5170  		inst = amd64.SETGE
  5171  	case amd64.ConditionalRegisterStateL:
  5172  		inst = amd64.SETLT
  5173  	case amd64.ConditionalRegisterStateLE:
  5174  		inst = amd64.SETLE
  5175  	case amd64.ConditionalRegisterStateA:
  5176  		inst = amd64.SETHI
  5177  	case amd64.ConditionalRegisterStateAE:
  5178  		inst = amd64.SETCC
  5179  	case amd64.ConditionalRegisterStateB:
  5180  		inst = amd64.SETCS
  5181  	case amd64.ConditionalRegisterStateBE:
  5182  		inst = amd64.SETLS
  5183  	}
  5184  
  5185  	c.assembler.CompileNoneToRegister(inst, reg)
  5186  
  5187  	// Then we reset the unnecessary bit.
  5188  	c.assembler.CompileConstToRegister(amd64.ANDQ, 0x1, reg)
  5189  
  5190  	// Mark it uses the register.
  5191  	loc.setRegister(reg)
  5192  	c.locationStack.markRegisterUsed(reg)
  5193  }
  5194  
  5195  // allocateRegister implements compiler.allocateRegister for amd64.
  5196  func (c *amd64Compiler) allocateRegister(t registerType) (reg asm.Register, err error) {
  5197  	var ok bool
  5198  	// Try to get the unused register.
  5199  	reg, ok = c.locationStack.takeFreeRegister(t)
  5200  	if ok {
  5201  		return
  5202  	}
  5203  
  5204  	// If not found, we have to steal the register.
  5205  	stealTarget, ok := c.locationStack.takeStealTargetFromUsedRegister(t)
  5206  	if !ok {
  5207  		err = fmt.Errorf("cannot steal register")
  5208  		return
  5209  	}
  5210  
  5211  	// Release the steal target register value onto stack location.
  5212  	reg = stealTarget.register
  5213  	c.compileReleaseRegisterToStack(stealTarget)
  5214  	return
  5215  }
  5216  
  5217  // callFunction adds instructions to call a function whose address equals either addr parameter or the value on indexReg.
  5218  //
  5219  // Note: this is the counterpart for returnFunction, and see the comments there as well
  5220  // to understand how the function calls are achieved.
  5221  func (c *amd64Compiler) compileCallFunctionImpl(functionAddressRegister asm.Register, functype *wasm.FunctionType) error {
  5222  	// Release all the registers as our calling convention requires the caller-save.
  5223  	if err := c.compileReleaseAllRegistersToStack(); err != nil {
  5224  		return err
  5225  	}
  5226  
  5227  	c.locationStack.markRegisterUsed(functionAddressRegister)
  5228  
  5229  	// Obtain a temporary register to be used in the followings.
  5230  	tmpRegister, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  5231  	if !found {
  5232  		// This in theory never happen as all the registers must be free except codeAddressRegister.
  5233  		return fmt.Errorf("could not find enough free registers")
  5234  	}
  5235  
  5236  	// The stack should look like:
  5237  	//
  5238  	//               reserved slots for results (if len(results) > len(args))
  5239  	//                      |     |
  5240  	//    ,arg0, ..., argN, ..., _, .returnAddress, .returnStackBasePointerInBytes, .function, ....
  5241  	//      |                       |                                                        |
  5242  	//      |             callFrame{^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^}
  5243  	//      |
  5244  	// nextStackBasePointerOffset
  5245  	//
  5246  	// where callFrame is used to return to this currently executed function.
  5247  
  5248  	nextStackBasePointerOffset := int64(c.locationStack.sp) - int64(functype.ParamNumInUint64)
  5249  
  5250  	callFrameReturnAddressLoc, callFrameStackBasePointerInBytesLoc, callFrameFunctionLoc := c.locationStack.pushCallFrame(functype)
  5251  
  5252  	// Save the current stack base pointer at callFrameStackBasePointerInBytesLoc.
  5253  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  5254  		amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
  5255  		tmpRegister)
  5256  	callFrameStackBasePointerInBytesLoc.setRegister(tmpRegister)
  5257  	c.compileReleaseRegisterToStack(callFrameStackBasePointerInBytesLoc)
  5258  
  5259  	// Set callEngine.stackContext.stackBasePointer for the next function.
  5260  	c.assembler.CompileConstToRegister(amd64.ADDQ, nextStackBasePointerOffset<<3, tmpRegister)
  5261  
  5262  	// Write the calculated value to callEngine.stackContext.stackBasePointer.
  5263  	c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister,
  5264  		amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset)
  5265  
  5266  	// Save the currently executed *function (placed at callEngine.moduleContext.fn) into callFrameFunctionLoc.
  5267  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  5268  		amd64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset,
  5269  		tmpRegister)
  5270  	callFrameFunctionLoc.setRegister(tmpRegister)
  5271  	c.compileReleaseRegisterToStack(callFrameFunctionLoc)
  5272  
  5273  	// Set callEngine.moduleContext.fn to the next *function.
  5274  	c.assembler.CompileRegisterToMemory(amd64.MOVQ, functionAddressRegister,
  5275  		amd64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset)
  5276  
  5277  	// Write the return address into callFrameReturnAddressLoc.
  5278  	c.assembler.CompileReadInstructionAddress(tmpRegister, amd64.JMP)
  5279  	callFrameReturnAddressLoc.setRegister(tmpRegister)
  5280  	c.compileReleaseRegisterToStack(callFrameReturnAddressLoc)
  5281  
  5282  	if amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister == functionAddressRegister {
  5283  		// This case we must move the value on targetFunctionAddressRegister to another register, otherwise
  5284  		// the address (jump target below) will be modified and result in segfault.
  5285  		// See #526.
  5286  		c.assembler.CompileRegisterToRegister(amd64.MOVQ, functionAddressRegister, tmpRegister)
  5287  		functionAddressRegister = tmpRegister
  5288  	}
  5289  
  5290  	// Also, we have to put the target function's *wasm.ModuleInstance into amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister.
  5291  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, functionAddressRegister, functionModuleInstanceOffset,
  5292  		amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
  5293  
  5294  	// And jump into the initial address of the target function.
  5295  	c.assembler.CompileJumpToMemory(amd64.JMP, functionAddressRegister, functionCodeInitialAddressOffset)
  5296  
  5297  	// All the registers used are temporary, so we mark them unused.
  5298  	c.locationStack.markRegisterUnused(tmpRegister, functionAddressRegister)
  5299  
  5300  	// On the function return, we have to initialize the state.
  5301  	if err := c.compileModuleContextInitialization(); err != nil {
  5302  		return err
  5303  	}
  5304  
  5305  	// Due to the change to callEngine.stackContext.stackBasePointer.
  5306  	c.compileReservedStackBasePointerInitialization()
  5307  
  5308  	// Due to the change to callEngine.moduleContext.moduleInstance as that might result in
  5309  	// the memory instance manipulation.
  5310  	c.compileReservedMemoryPointerInitialization()
  5311  
  5312  	// We consumed the function parameters, the call frame stack and reserved slots during the call.
  5313  	c.locationStack.sp = uint64(nextStackBasePointerOffset)
  5314  
  5315  	// Now the function results are pushed by the call.
  5316  	for _, t := range functype.Results {
  5317  		loc := c.locationStack.pushRuntimeValueLocationOnStack()
  5318  		switch t {
  5319  		case wasm.ValueTypeI32:
  5320  			loc.valueType = runtimeValueTypeI32
  5321  		case wasm.ValueTypeI64, wasm.ValueTypeFuncref, wasm.ValueTypeExternref:
  5322  			loc.valueType = runtimeValueTypeI64
  5323  		case wasm.ValueTypeF32:
  5324  			loc.valueType = runtimeValueTypeF32
  5325  		case wasm.ValueTypeF64:
  5326  			loc.valueType = runtimeValueTypeF64
  5327  		case wasm.ValueTypeV128:
  5328  			loc.valueType = runtimeValueTypeV128Lo
  5329  			hi := c.locationStack.pushRuntimeValueLocationOnStack()
  5330  			hi.valueType = runtimeValueTypeV128Hi
  5331  		default:
  5332  			panic("BUG: invalid type: " + wasm.ValueTypeName(t))
  5333  		}
  5334  	}
  5335  	return nil
  5336  }
  5337  
  5338  // returnFunction adds instructions to return from the current callframe back to the caller's frame.
  5339  // If this is the current one is the origin, we return to the callEngine.execWasmFunction with the Returned status.
  5340  // Otherwise, we jump into the callers' return address stored in callFrame.returnAddress while setting
  5341  // up all the necessary change on the callEngine's state.
  5342  //
  5343  // Note: this is the counterpart for callFunction, and see the comments there as well
  5344  // to understand how the function calls are achieved.
  5345  func (c *amd64Compiler) compileReturnFunction() error {
  5346  	// Release all the registers as our calling convention requires the caller-save.
  5347  	if err := c.compileReleaseAllRegistersToStack(); err != nil {
  5348  		return err
  5349  	}
  5350  
  5351  	if c.withListener {
  5352  		if err := c.compileCallBuiltinFunction(builtinFunctionIndexFunctionListenerAfter); err != nil {
  5353  			return err
  5354  		}
  5355  		// After return, we re-initialize the stack base pointer as that is used to return to the caller below.
  5356  		c.compileReservedStackBasePointerInitialization()
  5357  	}
  5358  
  5359  	// amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister holds the module instance's address
  5360  	// so mark it used so that it won't be used as a free register.
  5361  	c.locationStack.markRegisterUsed(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
  5362  	defer c.locationStack.markRegisterUnused(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
  5363  
  5364  	// Obtain a temporary register to be used in the following.
  5365  	returnAddressRegister, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  5366  	if !found {
  5367  		panic("BUG: all the registers should be free at this point: " + c.locationStack.String())
  5368  	}
  5369  
  5370  	returnAddress, callerStackBasePointerInBytes, callerFunction := c.locationStack.getCallFrameLocations(c.typ)
  5371  
  5372  	// A zero return address means return from the execution.
  5373  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  5374  		amd64ReservedRegisterForStackBasePointerAddress, int64(returnAddress.stackPointer)*8,
  5375  		returnAddressRegister,
  5376  	)
  5377  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, returnAddressRegister, returnAddressRegister)
  5378  
  5379  	c.compileMaybeExitFromNativeCode(amd64.JNE, nativeCallStatusCodeReturned)
  5380  
  5381  	// Alias for readability.
  5382  	tmpRegister := amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister
  5383  
  5384  	// First, restore the stackContext.stackBasePointerInBytesOffset from callerStackBasePointerInBytes.
  5385  	callerStackBasePointerInBytes.setRegister(tmpRegister)
  5386  	c.compileLoadValueOnStackToRegister(callerStackBasePointerInBytes)
  5387  	c.assembler.CompileRegisterToMemory(amd64.MOVQ,
  5388  		tmpRegister, amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset)
  5389  
  5390  	// Next, restore moduleContext.fn from callerFunction.
  5391  	callerFunction.setRegister(tmpRegister)
  5392  	c.compileLoadValueOnStackToRegister(callerFunction)
  5393  	c.assembler.CompileRegisterToMemory(amd64.MOVQ,
  5394  		tmpRegister, amd64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset)
  5395  
  5396  	// Also, we have to put the target function's *wasm.ModuleInstance into amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister.
  5397  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  5398  		tmpRegister, functionModuleInstanceOffset,
  5399  		amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
  5400  
  5401  	// Then, jump into the return address!
  5402  	c.assembler.CompileJumpToRegister(amd64.JMP, returnAddressRegister)
  5403  	return nil
  5404  }
  5405  
  5406  func (c *amd64Compiler) compileCallGoHostFunction() error {
  5407  	return c.compileCallGoFunction(nativeCallStatusCodeCallGoHostFunction)
  5408  }
  5409  
  5410  func (c *amd64Compiler) compileCallBuiltinFunction(index wasm.Index) error {
  5411  	// Set the functionAddress to the callEngine.exitContext functionCallAddress.
  5412  	c.assembler.CompileConstToMemory(amd64.MOVL, int64(index), amd64ReservedRegisterForCallEngine, callEngineExitContextBuiltinFunctionCallIndexOffset)
  5413  	return c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction)
  5414  }
  5415  
  5416  func (c *amd64Compiler) compileCallGoFunction(compilerStatus nativeCallStatusCode) error {
  5417  	// Release all the registers as our calling convention requires the caller-save.
  5418  	if err := c.compileReleaseAllRegistersToStack(); err != nil {
  5419  		return err
  5420  	}
  5421  
  5422  	c.compileExitFromNativeCode(compilerStatus)
  5423  	return nil
  5424  }
  5425  
  5426  // compileReleaseAllRegistersToStack add the instructions to release all the LIVE value
  5427  // in the value location stack at this point into the stack memory location.
  5428  func (c *amd64Compiler) compileReleaseAllRegistersToStack() (err error) {
  5429  	for i := uint64(0); i < c.locationStack.sp; i++ {
  5430  		if loc := &c.locationStack.stack[i]; loc.onRegister() {
  5431  			c.compileReleaseRegisterToStack(loc)
  5432  		} else if loc.onConditionalRegister() {
  5433  			if err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc); err != nil {
  5434  				return
  5435  			}
  5436  			c.compileReleaseRegisterToStack(loc)
  5437  		}
  5438  	}
  5439  	return
  5440  }
  5441  
  5442  func (c *amd64Compiler) onValueReleaseRegisterToStack(reg asm.Register) {
  5443  	for i := uint64(0); i < c.locationStack.sp; i++ {
  5444  		prevValue := &c.locationStack.stack[i]
  5445  		if prevValue.register == reg {
  5446  			c.compileReleaseRegisterToStack(prevValue)
  5447  			break
  5448  		}
  5449  	}
  5450  }
  5451  
  5452  // compileReleaseRegisterToStack implements compiler.compileReleaseRegisterToStack for amd64.
  5453  func (c *amd64Compiler) compileReleaseRegisterToStack(loc *runtimeValueLocation) {
  5454  	var inst asm.Instruction
  5455  	switch loc.valueType {
  5456  	case runtimeValueTypeV128Lo:
  5457  		inst = amd64.MOVDQU
  5458  	case runtimeValueTypeV128Hi:
  5459  		panic("BUG: V128Hi must be released to the stack along with V128Lo")
  5460  	case runtimeValueTypeI32, runtimeValueTypeF32:
  5461  		inst = amd64.MOVL
  5462  	case runtimeValueTypeI64, runtimeValueTypeF64:
  5463  		inst = amd64.MOVQ
  5464  	default:
  5465  		panic("BUG: unknown runtime value type")
  5466  	}
  5467  
  5468  	c.assembler.CompileRegisterToMemory(inst, loc.register,
  5469  		// Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range.
  5470  		amd64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)
  5471  
  5472  	// Mark the register is free.
  5473  	c.locationStack.releaseRegister(loc)
  5474  
  5475  	if loc.valueType == runtimeValueTypeV128Lo {
  5476  		// Higher 64-bits are released as well ^^.
  5477  		hi := &c.locationStack.stack[loc.stackPointer+1]
  5478  		c.locationStack.releaseRegister(hi)
  5479  	}
  5480  }
  5481  
  5482  func (c *amd64Compiler) compileMaybeExitFromNativeCode(skipCondition asm.Instruction, status nativeCallStatusCode) {
  5483  	if target := c.compiledTrapTargets[status]; target != nil {
  5484  		// We've already compiled this.
  5485  		// Invert the return condition to jump into the appropriate target.
  5486  		var returnCondition asm.Instruction
  5487  		switch skipCondition {
  5488  		case amd64.JHI:
  5489  			returnCondition = amd64.JLS
  5490  		case amd64.JLS:
  5491  			returnCondition = amd64.JHI
  5492  		case amd64.JNE:
  5493  			returnCondition = amd64.JEQ
  5494  		case amd64.JEQ:
  5495  			returnCondition = amd64.JNE
  5496  		case amd64.JCC:
  5497  			returnCondition = amd64.JCS
  5498  		case amd64.JCS:
  5499  			returnCondition = amd64.JCC
  5500  		case amd64.JPC:
  5501  			returnCondition = amd64.JPS
  5502  		case amd64.JPS:
  5503  			returnCondition = amd64.JPC
  5504  		case amd64.JPL:
  5505  			returnCondition = amd64.JMI
  5506  		case amd64.JMI:
  5507  			returnCondition = amd64.JPL
  5508  		default:
  5509  			panic("BUG: couldn't invert condition")
  5510  		}
  5511  		c.assembler.CompileJump(returnCondition).AssignJumpTarget(target)
  5512  	} else {
  5513  		skip := c.assembler.CompileJump(skipCondition)
  5514  		c.compileExitFromNativeCode(status)
  5515  		c.assembler.SetJumpTargetOnNext(skip)
  5516  	}
  5517  }
  5518  
  5519  func (c *amd64Compiler) compileExitFromNativeCode(status nativeCallStatusCode) {
  5520  	if target := c.compiledTrapTargets[status]; target != nil {
  5521  		c.assembler.CompileJump(amd64.JMP).AssignJumpTarget(target)
  5522  		return
  5523  	}
  5524  
  5525  	switch status {
  5526  	case nativeCallStatusCodeReturned:
  5527  		// Save the target for reuse.
  5528  		c.compiledTrapTargets[status] = c.compileNOP()
  5529  	case nativeCallStatusCodeCallGoHostFunction, nativeCallStatusCodeCallBuiltInFunction:
  5530  		// Read the return address, and write it to callEngine.exitContext.returnAddress.
  5531  		returnAddressReg, ok := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  5532  		if !ok {
  5533  			panic("BUG: cannot take free register")
  5534  		}
  5535  		c.assembler.CompileReadInstructionAddress(returnAddressReg, amd64.RET)
  5536  		c.assembler.CompileRegisterToMemory(amd64.MOVQ,
  5537  			returnAddressReg, amd64ReservedRegisterForCallEngine, callEngineExitContextReturnAddressOffset)
  5538  	default:
  5539  		if c.ir.IROperationSourceOffsetsInWasmBinary != nil {
  5540  			// This case, the execution traps and we want the top frame's source position in the stack trace.
  5541  			// Take RegR15 and store the instruction address onto callEngine.returnAddress.
  5542  			returnAddressReg := amd64.RegR15
  5543  			c.assembler.CompileReadInstructionAddress(returnAddressReg, amd64.MOVQ)
  5544  			c.assembler.CompileRegisterToMemory(amd64.MOVQ,
  5545  				returnAddressReg, amd64ReservedRegisterForCallEngine, callEngineExitContextReturnAddressOffset)
  5546  		} else {
  5547  			// We won't use the source position, so just save the target for reuse.
  5548  			c.compiledTrapTargets[status] = c.compileNOP()
  5549  		}
  5550  	}
  5551  
  5552  	// Write the status to callEngine.exitContext.statusCode.
  5553  	c.assembler.CompileConstToMemory(amd64.MOVB, int64(status),
  5554  		amd64ReservedRegisterForCallEngine, callEngineExitContextNativeCallStatusCodeOffset)
  5555  
  5556  	// Write back the cached SP to the actual eng.stackPointer.
  5557  	c.assembler.CompileConstToMemory(amd64.MOVQ, int64(c.locationStack.sp),
  5558  		amd64ReservedRegisterForCallEngine, callEngineStackContextStackPointerOffset)
  5559  
  5560  	c.assembler.CompileStandAlone(amd64.RET)
  5561  }
  5562  
  5563  func (c *amd64Compiler) compilePreamble() (err error) {
  5564  	// We assume all function parameters are already pushed onto the stack by
  5565  	// the caller.
  5566  	c.locationStack.init(c.typ)
  5567  
  5568  	if err := c.compileModuleContextInitialization(); err != nil {
  5569  		return err
  5570  	}
  5571  
  5572  	// Check if it's necessary to grow the value stack by using max stack pointer.
  5573  	if err = c.compileMaybeGrowStack(); err != nil {
  5574  		return err
  5575  	}
  5576  
  5577  	if c.withListener {
  5578  		if err = c.compileCallBuiltinFunction(builtinFunctionIndexFunctionListenerBefore); err != nil {
  5579  			return err
  5580  		}
  5581  	}
  5582  
  5583  	c.compileReservedStackBasePointerInitialization()
  5584  
  5585  	// Finally, we initialize the reserved memory register based on the module context.
  5586  	c.compileReservedMemoryPointerInitialization()
  5587  	return
  5588  }
  5589  
  5590  func (c *amd64Compiler) compileReservedStackBasePointerInitialization() {
  5591  	// First, make reservedRegisterForStackBasePointer point to the beginning of the slice backing array.
  5592  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  5593  		amd64ReservedRegisterForCallEngine, callEngineStackContextStackElement0AddressOffset,
  5594  		amd64ReservedRegisterForStackBasePointerAddress)
  5595  
  5596  	// next we move the base pointer (callEngine.stackBasePointer) to the tmp register.
  5597  	c.assembler.CompileMemoryToRegister(amd64.ADDQ,
  5598  		amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
  5599  		amd64ReservedRegisterForStackBasePointerAddress,
  5600  	)
  5601  }
  5602  
  5603  func (c *amd64Compiler) compileReservedMemoryPointerInitialization() {
  5604  	if c.ir.HasMemory || c.ir.UsesMemory {
  5605  		c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  5606  			amd64ReservedRegisterForCallEngine, callEngineModuleContextMemoryElement0AddressOffset,
  5607  			amd64ReservedRegisterForMemory,
  5608  		)
  5609  	}
  5610  }
  5611  
  5612  // compileMaybeGrowStack adds instructions to check the necessity to grow the value stack,
  5613  // and if so, make the builtin function call to do so. These instructions are called in the function's
  5614  // preamble.
  5615  func (c *amd64Compiler) compileMaybeGrowStack() error {
  5616  	tmpRegister, ok := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  5617  	if !ok {
  5618  		panic("BUG: cannot take free register")
  5619  	}
  5620  
  5621  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  5622  		amd64ReservedRegisterForCallEngine, callEngineStackContextStackLenInBytesOffset, tmpRegister)
  5623  	c.assembler.CompileMemoryToRegister(amd64.SUBQ,
  5624  		amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset, tmpRegister)
  5625  
  5626  	// If stack base pointer + max stack pointer > stackLen, we need to grow the stack.
  5627  	cmpWithStackPointerCeil := c.assembler.CompileRegisterToConst(amd64.CMPQ, tmpRegister, 0)
  5628  	c.assignStackPointerCeilNeeded = cmpWithStackPointerCeil
  5629  
  5630  	// Jump if we have no need to grow.
  5631  	jmpIfNoNeedToGrowStack := c.assembler.CompileJump(amd64.JCC)
  5632  
  5633  	// Otherwise, we have to make the builtin function call to grow the call stack.
  5634  	if err := c.compileCallBuiltinFunction(builtinFunctionIndexGrowStack); err != nil {
  5635  		return err
  5636  	}
  5637  
  5638  	c.assembler.SetJumpTargetOnNext(jmpIfNoNeedToGrowStack)
  5639  	return nil
  5640  }
  5641  
  5642  // compileModuleContextInitialization adds instructions to initialize callEngine.ModuleContext's fields based on
  5643  // callEngine.ModuleContext.ModuleInstanceAddress.
  5644  // This is called in two cases: in function preamble, and on the return from (non-Go) function calls.
  5645  func (c *amd64Compiler) compileModuleContextInitialization() error {
  5646  	// amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister holds the module instance's address
  5647  	// so mark it used so that it won't be used as a free register until the module context initialization finishes.
  5648  	c.locationStack.markRegisterUsed(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
  5649  	defer c.locationStack.markRegisterUnused(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
  5650  
  5651  	// Obtain the temporary registers to be used in the followings.
  5652  	tmpRegister, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  5653  	if !found {
  5654  		// This in theory never happen as all the registers must be free except indexReg.
  5655  		return fmt.Errorf("could not find enough free registers")
  5656  	}
  5657  	c.locationStack.markRegisterUsed(tmpRegister)
  5658  	tmpRegister2, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  5659  	if !found {
  5660  		// This in theory never happen as all the registers must be free except indexReg.
  5661  		return fmt.Errorf("could not find enough free registers")
  5662  	}
  5663  	c.locationStack.markRegisterUsed(tmpRegister2)
  5664  
  5665  	// If the module instance address stays the same, we could skip the entire code below.
  5666  	// The rationale/idea for this is that, in almost all use cases, users instantiate a single
  5667  	// Wasm binary and run the functions from it, rather than doing import/export on multiple
  5668  	// binaries. As a result, this cmp and jmp instruction sequence below must be easy for
  5669  	// x64 CPU to do branch prediction since almost 100% jump happens across function calls.
  5670  	c.assembler.CompileMemoryToRegister(amd64.CMPQ,
  5671  		amd64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
  5672  	jmpIfModuleNotChange := c.assembler.CompileJump(amd64.JEQ)
  5673  
  5674  	// If engine.ModuleContext.ModuleInstance is not equal the value on amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister,
  5675  	// we have to put the new value there.
  5676  	c.assembler.CompileRegisterToMemory(amd64.MOVQ, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister,
  5677  		amd64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset)
  5678  
  5679  	// Also, we have to update the following fields:
  5680  	// * callEngine.moduleContext.globalElement0Address
  5681  	// * callEngine.moduleContext.tableElement0Address
  5682  	// * callEngine.moduleContext.memoryInstance
  5683  	// * callEngine.moduleContext.memoryElement0Address
  5684  	// * callEngine.moduleContext.memorySliceLen
  5685  	// * callEngine.moduleContext.codesElement0Address
  5686  	// * callEngine.moduleContext.typeIDsElement0Address
  5687  	// * callEngine.moduleContext.dataInstancesElement0Address
  5688  	// * callEngine.moduleContext.elementInstancesElement0Address
  5689  
  5690  	// Update globalElement0Address.
  5691  	//
  5692  	// Note: if there's global.get or set instruction in the function, the existence of the globals
  5693  	// is ensured by function validation at module instantiation phase, and that's why it is ok to
  5694  	// skip the initialization if the module's globals slice is empty.
  5695  	if len(c.ir.Globals) > 0 {
  5696  		// Since ModuleInstance.Globals is []*globalInstance, internally
  5697  		// the address of the first item in the underlying array lies exactly on the globals offset.
  5698  		// See https://go.dev/blog/slices-intro if unfamiliar.
  5699  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceGlobalsOffset, tmpRegister)
  5700  
  5701  		c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister, amd64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset)
  5702  	}
  5703  
  5704  	// Update tableElement0Address.
  5705  	//
  5706  	// Note: if there's table instruction in the function, the existence of the table
  5707  	// is ensured by function validation at module instantiation phase, and that's
  5708  	// why it is ok to skip the initialization if the module's table doesn't exist.
  5709  	if c.ir.HasTable {
  5710  		// First, we need to read the *wasm.Table.
  5711  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceTablesOffset, tmpRegister)
  5712  
  5713  		// At this point, tmpRegister holds the address of ModuleInstance.Table.
  5714  		// So we are ready to read and put the first item's address stored in Table.Table.
  5715  		// Here we read the value into tmpRegister2.
  5716  		c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister,
  5717  			amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset)
  5718  
  5719  		// Finally, we put &ModuleInstance.TypeIDs[0] into moduleContext.typeIDsElement0Address.
  5720  		c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  5721  			amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceTypeIDsOffset, tmpRegister)
  5722  		c.assembler.CompileRegisterToMemory(amd64.MOVQ,
  5723  			tmpRegister, amd64ReservedRegisterForCallEngine, callEngineModuleContextTypeIDsElement0AddressOffset)
  5724  	}
  5725  
  5726  	// Update memoryElement0Address and memorySliceLen.
  5727  	//
  5728  	// Note: if there's memory instruction in the function, memory instance must be non-nil.
  5729  	// That is ensured by function validation at module instantiation phase, and that's
  5730  	// why it is ok to skip the initialization if the module's memory instance is nil.
  5731  	if c.ir.HasMemory {
  5732  		c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  5733  			amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceMemoryOffset,
  5734  			tmpRegister)
  5735  
  5736  		// Set memory instance.
  5737  		c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister,
  5738  			amd64ReservedRegisterForCallEngine, callEngineModuleContextMemoryInstanceOffset)
  5739  
  5740  		// Set element zero address.
  5741  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmpRegister, memoryInstanceBufferOffset, tmpRegister2)
  5742  		c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister2,
  5743  			amd64ReservedRegisterForCallEngine, callEngineModuleContextMemoryElement0AddressOffset)
  5744  	}
  5745  
  5746  	// Update moduleContext.codesElement0Address
  5747  	{
  5748  		// "tmpRegister = [moduleInstanceAddressRegister + moduleInstanceEngineOffset + interfaceDataOffset] (== *moduleEngine)"
  5749  		//
  5750  		// Go's interface is laid out on memory as two quad words as struct {tab, data uintptr}
  5751  		// where tab points to the interface table, and the latter points to the actual
  5752  		// implementation of interface. This case, we extract "data" pointer as *moduleEngine.
  5753  		// See the following references for detail:
  5754  		// * https://research.swtch.com/interfaces
  5755  		// * https://github.com/golang/go/blob/release-branch.go1.20/src/runtime/runtime2.go#L207-L210
  5756  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceEngineOffset+interfaceDataOffset, tmpRegister)
  5757  
  5758  		// "tmpRegister = [tmpRegister + moduleEnginecodesOffset] (== &moduleEngine.codes[0])"
  5759  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmpRegister, moduleEngineFunctionsOffset, tmpRegister)
  5760  
  5761  		// "callEngine.moduleContext.functionsElement0Address = tmpRegister".
  5762  		c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister, amd64ReservedRegisterForCallEngine,
  5763  			callEngineModuleContextFunctionsElement0AddressOffset)
  5764  	}
  5765  
  5766  	// Update dataInstancesElement0Address.
  5767  	if c.ir.HasDataInstances {
  5768  		// "tmpRegister = &moduleInstance.DataInstances[0]"
  5769  		c.assembler.CompileMemoryToRegister(
  5770  			amd64.MOVQ,
  5771  			amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceDataInstancesOffset,
  5772  			tmpRegister,
  5773  		)
  5774  		// "callEngine.moduleContext.dataInstancesElement0Address = tmpRegister".
  5775  		c.assembler.CompileRegisterToMemory(
  5776  			amd64.MOVQ,
  5777  			tmpRegister,
  5778  			amd64ReservedRegisterForCallEngine, callEngineModuleContextDataInstancesElement0AddressOffset,
  5779  		)
  5780  	}
  5781  
  5782  	// Update callEngine.moduleContext.elementInstancesElement0Address
  5783  	if c.ir.HasElementInstances {
  5784  		// "tmpRegister = &moduleInstance.ElementInstnaces[0]"
  5785  		c.assembler.CompileMemoryToRegister(
  5786  			amd64.MOVQ,
  5787  			amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceElementInstancesOffset,
  5788  			tmpRegister,
  5789  		)
  5790  		// "callEngine.moduleContext.dataInstancesElement0Address = tmpRegister".
  5791  		c.assembler.CompileRegisterToMemory(
  5792  			amd64.MOVQ,
  5793  			tmpRegister,
  5794  			amd64ReservedRegisterForCallEngine, callEngineModuleContextElementInstancesElement0AddressOffset,
  5795  		)
  5796  	}
  5797  
  5798  	c.locationStack.markRegisterUnused(tmpRegister, tmpRegister2)
  5799  
  5800  	// Set the jump target towards the next instruction for the case where module instance address hasn't changed.
  5801  	c.assembler.SetJumpTargetOnNext(jmpIfModuleNotChange)
  5802  	return nil
  5803  }
  5804  
  5805  // compileEnsureOnRegister ensures that the given value is located on a
  5806  // general purpose register of an appropriate type.
  5807  func (c *amd64Compiler) compileEnsureOnRegister(loc *runtimeValueLocation) (err error) {
  5808  	if loc.onStack() {
  5809  		// Allocate the register.
  5810  		reg, err := c.allocateRegister(loc.getRegisterType())
  5811  		if err != nil {
  5812  			return err
  5813  		}
  5814  
  5815  		// Mark it uses the register.
  5816  		loc.setRegister(reg)
  5817  		c.locationStack.markRegisterUsed(reg)
  5818  
  5819  		c.compileLoadValueOnStackToRegister(loc)
  5820  	} else if loc.onConditionalRegister() {
  5821  		err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc)
  5822  	}
  5823  	return
  5824  }
  5825  
  5826  // compileMaybeSwapRegisters swaps two registers if they're not equal.
  5827  func (c *amd64Compiler) compileMaybeSwapRegisters(reg1, reg2 asm.Register) {
  5828  	if reg1 != reg2 {
  5829  		c.assembler.CompileRegisterToRegister(amd64.XCHGQ, reg1, reg2)
  5830  	}
  5831  }
  5832  
  5833  func (c *amd64Compiler) compileCompareWithMemorySliceLen(addrReg asm.Register) error {
  5834  	// Obtain the temporary registers to be used in the followings.
  5835  	tmpRegister, err := c.allocateRegister(registerTypeGeneralPurpose)
  5836  	if err != nil {
  5837  		return err
  5838  	}
  5839  
  5840  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  5841  		amd64ReservedRegisterForCallEngine, callEngineModuleContextMemoryInstanceOffset,
  5842  		tmpRegister)
  5843  
  5844  	c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmpRegister, memoryInstanceBufferLenOffset, addrReg)
  5845  
  5846  	c.locationStack.markRegisterUnused(tmpRegister)
  5847  
  5848  	return nil
  5849  }
  5850  
  5851  // compilePreventCrossedTargetRegisters swaps registers in such a way, that for neither runtimeValueLocation from locs its
  5852  // corresponding register with the same index from targets is occupied by some other runtimeValueLocation from locs. It returns a
  5853  // closure to restore the original register placement.
  5854  //
  5855  // This function makes it possible to safely exchange one set of registers with another, where a register might be in both sets.
  5856  // Each register will correspond either to itself or another register not present in its own set.
  5857  //
  5858  // For example, if we have locs = [AX, BX, CX], targets = [BX, SI, AX], then it'll do two swaps
  5859  // to make locs = [BX, CX, AX].
  5860  func (c *amd64Compiler) compilePreventCrossedTargetRegisters(locs []*runtimeValueLocation, targets []asm.Register) (restore func()) {
  5861  	type swap struct{ srcIndex, dstIndex int }
  5862  	var swaps []swap
  5863  	for i := range locs {
  5864  		targetLocation := -1 // -1 means not found.
  5865  		for j := range locs {
  5866  			if locs[j].register == targets[i] {
  5867  				targetLocation = j
  5868  				break
  5869  			}
  5870  		}
  5871  		if targetLocation != -1 && targetLocation != i {
  5872  			c.compileMaybeSwapRegisters(locs[i].register, locs[targetLocation].register)
  5873  			locs[i].register, locs[targetLocation].register = locs[targetLocation].register, locs[i].register
  5874  			swaps = append(swaps, swap{i, targetLocation})
  5875  		}
  5876  	}
  5877  	return func() {
  5878  		// Restore in reverse order because a register can be moved multiple times.
  5879  		for i := len(swaps) - 1; i >= 0; i -= 1 {
  5880  			r1, r2 := swaps[i].srcIndex, swaps[i].dstIndex
  5881  			c.compileMaybeSwapRegisters(locs[r1].register, locs[r2].register)
  5882  			locs[r1].register, locs[r2].register = locs[r2].register, locs[r1].register
  5883  		}
  5884  	}
  5885  }