github.com/bananabytelabs/wazero@v0.0.0-20240105073314-54b22a776da8/internal/engine/compiler/impl_amd64.go

     1  package compiler
     2  
     3  // This file implements the compiler for amd64/x86_64 target.
     4  // Please refer to https://www.felixcloutier.com/x86/index.html
     5  // if unfamiliar with amd64 instructions used here.
     6  
     7  import (
     8  	"fmt"
     9  	"math"
    10  
    11  	"github.com/bananabytelabs/wazero/internal/asm"
    12  	"github.com/bananabytelabs/wazero/internal/asm/amd64"
    13  	"github.com/bananabytelabs/wazero/internal/platform"
    14  	"github.com/bananabytelabs/wazero/internal/u32"
    15  	"github.com/bananabytelabs/wazero/internal/u64"
    16  	"github.com/bananabytelabs/wazero/internal/wasm"
    17  	"github.com/bananabytelabs/wazero/internal/wazeroir"
    18  )
    19  
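         // Note: the _float*For* values below are the IEEE 754 bit patterns of signed-integer
         // range boundaries; for example, 0x4F00_0000 is float32(1<<31) and 0x43E0_0000_0000_0000
         // is float64(1<<63). They back the static constants created in newAmd64Compiler and are
         // used for the range checks when lowering float-to-integer truncation.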
    20  var (
    21  	_minimum32BitSignedInt                  int32  = math.MinInt32
    22  	_maximum32BitSignedInt                  int32  = math.MaxInt32
    23  	_maximum32BitUnsignedInt                uint32 = math.MaxUint32
    24  	_minimum64BitSignedInt                  int64  = math.MinInt64
    25  	_maximum64BitSignedInt                  int64  = math.MaxInt64
    26  	_maximum64BitUnsignedInt                uint64 = math.MaxUint64
    27  	_float32SignBitMask                     uint32 = 1 << 31
    28  	_float32RestBitMask                            = ^_float32SignBitMask
    29  	_float64SignBitMask                     uint64 = 1 << 63
    30  	_float64RestBitMask                            = ^_float64SignBitMask
    31  	_float32ForMinimumSigned32bitInteger           = uint32(0xCF00_0000)
    32  	_float64ForMinimumSigned32bitInteger           = uint64(0xC1E0_0000_0020_0000)
    33  	_float32ForMinimumSigned64bitInteger           = uint32(0xDF00_0000)
    34  	_float64ForMinimumSigned64bitInteger           = uint64(0xC3E0_0000_0000_0000)
    35  	_float32ForMaximumSigned32bitIntPlusOne        = uint32(0x4F00_0000)
    36  	_float64ForMaximumSigned32bitIntPlusOne        = uint64(0x41E0_0000_0000_0000)
    37  	_float32ForMaximumSigned64bitIntPlusOne        = uint32(0x5F00_0000)
    38  	_float64ForMaximumSigned64bitIntPlusOne        = uint64(0x43E0_0000_0000_0000)
    39  )
    40  
    41  var (
    42  	// amd64ReservedRegisterForCallEngine: pointer to callEngine (i.e. *callEngine as uintptr)
    43  	amd64ReservedRegisterForCallEngine = amd64.RegR13
    44  	// amd64ReservedRegisterForStackBasePointerAddress: stack base pointer's address (callEngine.stackBasePointer) in the current function call.
    45  	amd64ReservedRegisterForStackBasePointerAddress = amd64.RegR14
    46  	// amd64ReservedRegisterForMemory: pointer to the memory slice's data (i.e. &memory.Buffer[0] as uintptr).
    47  	amd64ReservedRegisterForMemory = amd64.RegR15
    48  )
    49  
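         // The lists below form the pool the register allocator hands out. The three reserved
         // registers above (R13, R14, R15), as well as RSP and RBP, are deliberately excluded.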
    50  var (
    51  	amd64UnreservedVectorRegisters = []asm.Register{ //nolint
    52  		amd64.RegX0, amd64.RegX1, amd64.RegX2, amd64.RegX3,
    53  		amd64.RegX4, amd64.RegX5, amd64.RegX6, amd64.RegX7,
    54  		amd64.RegX8, amd64.RegX9, amd64.RegX10, amd64.RegX11,
    55  		amd64.RegX12, amd64.RegX13, amd64.RegX14, amd64.RegX15,
    56  	}
     57  	// Note that we never invoke the "call" instruction,
     58  	// so we don't need to care about the calling convention.
     59  	// TODO: Maybe it is safe to just save rbp and rsp somewhere
     60  	// in Go-allocated variables, reuse these registers
     61  	// in compiled functions, and write them back before returning.
    62  	amd64UnreservedGeneralPurposeRegisters = []asm.Register{ //nolint
    63  		amd64.RegAX, amd64.RegCX, amd64.RegDX, amd64.RegBX,
    64  		amd64.RegSI, amd64.RegDI, amd64.RegR8, amd64.RegR9,
    65  		amd64.RegR10, amd64.RegR11, amd64.RegR12,
    66  	}
    67  )
    68  
     69  // amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister holds the *wasm.ModuleInstance of the
     70  // function instance to execute next. The value is set and used when making function calls
     71  // or function returns in the ModuleContextInitialization. See compileModuleContextInitialization.
    72  var amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister = amd64.RegR12
    73  
    74  func (c *amd64Compiler) String() string {
    75  	return c.locationStack.String()
    76  }
    77  
    78  // compileNOP implements compiler.compileNOP for the amd64 architecture.
    79  func (c *amd64Compiler) compileNOP() asm.Node {
    80  	return c.assembler.CompileStandAlone(amd64.NOP)
    81  }
    82  
    83  type amd64Compiler struct {
    84  	assembler   amd64.Assembler
    85  	ir          *wazeroir.CompilationResult
    86  	cpuFeatures platform.CpuFeatureFlags
     87  	// locationStack holds the state of the wazeroir virtual stack,
     88  	// and each item is either placed in a register or on the actual memory stack.
    89  	locationStack *runtimeValueLocationStack
     90  	// labels holds the per-wazeroir-label information for this function.
    91  	labels [wazeroir.LabelKindNum][]amd64LabelInfo
    92  	// stackPointerCeil is the greatest stack pointer value (from runtimeValueLocationStack) seen during compilation.
    93  	stackPointerCeil uint64
    94  	// assignStackPointerCeilNeeded holds an asm.Node whose AssignDestinationConstant must be called with the determined stack pointer ceiling.
    95  	assignStackPointerCeilNeeded asm.Node
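         	// compiledTrapTargets caches, per native call status code, the node at which the exit (trap)
         	// code for that status begins, so that multiple guards can share a single exit sequence.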
    96  	compiledTrapTargets          [nativeCallStatusModuleClosed]asm.Node
    97  	withListener                 bool
    98  	typ                          *wasm.FunctionType
     99  	// locationStackForEntrypoint is the initial location stack for all functions. To reuse the allocated stack,
    100  	// we cache it here, then reset it and assign it to .locationStack in the Init method.
   101  	locationStackForEntrypoint runtimeValueLocationStack
   102  	// frameIDMax tracks the maximum value of frame id per function.
   103  	frameIDMax int
   104  	brTableTmp []runtimeValueLocation
   105  
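         	// The following *asm.StaticConst fields hold the little-endian encodings of the
         	// package-level constants defined at the top of this file. They are allocated once in
         	// newAmd64Compiler and carried over across Init calls so that compilations can reuse them.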
   106  	fourZeros,
   107  	eightZeros,
   108  	minimum32BitSignedInt,
   109  	maximum32BitSignedInt,
   110  	maximum32BitUnsignedInt,
   111  	minimum64BitSignedInt,
   112  	maximum64BitSignedInt,
   113  	maximum64BitUnsignedInt,
   114  	float32SignBitMask,
   115  	float32RestBitMask,
   116  	float64SignBitMask,
   117  	float64RestBitMask,
   118  	float32ForMinimumSigned32bitInteger,
   119  	float64ForMinimumSigned32bitInteger,
   120  	float32ForMinimumSigned64bitInteger,
   121  	float64ForMinimumSigned64bitInteger,
   122  	float32ForMaximumSigned32bitIntPlusOne,
   123  	float64ForMaximumSigned32bitIntPlusOne,
   124  	float32ForMaximumSigned64bitIntPlusOne,
   125  	float64ForMaximumSigned64bitIntPlusOne *asm.StaticConst
   126  }
   127  
   128  func newAmd64Compiler() compiler {
   129  	c := &amd64Compiler{
   130  		assembler:                  amd64.NewAssembler(),
   131  		locationStackForEntrypoint: newRuntimeValueLocationStack(),
   132  		cpuFeatures:                platform.CpuFeatures,
   133  	}
   134  
   135  	c.fourZeros = asm.NewStaticConst([]byte{0, 0, 0, 0})
   136  	c.eightZeros = asm.NewStaticConst([]byte{0, 0, 0, 0, 0, 0, 0, 0})
   137  	c.minimum32BitSignedInt = asm.NewStaticConst(u32.LeBytes(uint32(_minimum32BitSignedInt)))
   138  	c.maximum32BitSignedInt = asm.NewStaticConst(u32.LeBytes(uint32(_maximum32BitSignedInt)))
   139  	c.maximum32BitUnsignedInt = asm.NewStaticConst(u32.LeBytes(_maximum32BitUnsignedInt))
   140  	c.minimum64BitSignedInt = asm.NewStaticConst(u64.LeBytes(uint64(_minimum64BitSignedInt)))
   141  	c.maximum64BitSignedInt = asm.NewStaticConst(u64.LeBytes(uint64(_maximum64BitSignedInt)))
   142  	c.maximum64BitUnsignedInt = asm.NewStaticConst(u64.LeBytes(_maximum64BitUnsignedInt))
   143  	c.float32SignBitMask = asm.NewStaticConst(u32.LeBytes(_float32SignBitMask))
   144  	c.float32RestBitMask = asm.NewStaticConst(u32.LeBytes(_float32RestBitMask))
   145  	c.float64SignBitMask = asm.NewStaticConst(u64.LeBytes(_float64SignBitMask))
   146  	c.float64RestBitMask = asm.NewStaticConst(u64.LeBytes(_float64RestBitMask))
   147  	c.float32ForMinimumSigned32bitInteger = asm.NewStaticConst(u32.LeBytes(_float32ForMinimumSigned32bitInteger))
   148  	c.float64ForMinimumSigned32bitInteger = asm.NewStaticConst(u64.LeBytes(_float64ForMinimumSigned32bitInteger))
   149  	c.float32ForMinimumSigned64bitInteger = asm.NewStaticConst(u32.LeBytes(_float32ForMinimumSigned64bitInteger))
   150  	c.float64ForMinimumSigned64bitInteger = asm.NewStaticConst(u64.LeBytes(_float64ForMinimumSigned64bitInteger))
   151  	c.float32ForMaximumSigned32bitIntPlusOne = asm.NewStaticConst(u32.LeBytes(_float32ForMaximumSigned32bitIntPlusOne))
   152  	c.float64ForMaximumSigned32bitIntPlusOne = asm.NewStaticConst(u64.LeBytes(_float64ForMaximumSigned32bitIntPlusOne))
   153  	c.float32ForMaximumSigned64bitIntPlusOne = asm.NewStaticConst(u32.LeBytes(_float32ForMaximumSigned64bitIntPlusOne))
   154  	c.float64ForMaximumSigned64bitIntPlusOne = asm.NewStaticConst(u64.LeBytes(_float64ForMaximumSigned64bitIntPlusOne))
   155  	return c
   156  }
   157  
   158  // Init implements compiler.Init.
   159  func (c *amd64Compiler) Init(typ *wasm.FunctionType, ir *wazeroir.CompilationResult, withListener bool) {
   160  	c.assembler.Reset()
   161  	c.locationStackForEntrypoint.reset()
   162  	c.resetLabels()
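         	// Reset the compiler to its zero value while explicitly carrying over the
         	// allocation-heavy members (assembler, labels, brTableTmp, and the cached static constants).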
   163  	*c = amd64Compiler{
   164  		ir:                                     ir,
   165  		withListener:                           withListener,
   166  		typ:                                    typ,
   167  		assembler:                              c.assembler,
   168  		cpuFeatures:                            c.cpuFeatures,
   169  		labels:                                 c.labels,
   170  		locationStackForEntrypoint:             c.locationStackForEntrypoint,
   171  		brTableTmp:                             c.brTableTmp,
   172  		fourZeros:                              c.fourZeros,
   173  		eightZeros:                             c.eightZeros,
   174  		minimum32BitSignedInt:                  c.minimum32BitSignedInt,
   175  		maximum32BitSignedInt:                  c.maximum32BitSignedInt,
   176  		maximum32BitUnsignedInt:                c.maximum32BitUnsignedInt,
   177  		minimum64BitSignedInt:                  c.minimum64BitSignedInt,
   178  		maximum64BitSignedInt:                  c.maximum64BitSignedInt,
   179  		maximum64BitUnsignedInt:                c.maximum64BitUnsignedInt,
   180  		float32SignBitMask:                     c.float32SignBitMask,
   181  		float32RestBitMask:                     c.float32RestBitMask,
   182  		float64SignBitMask:                     c.float64SignBitMask,
   183  		float64RestBitMask:                     c.float64RestBitMask,
   184  		float32ForMinimumSigned32bitInteger:    c.float32ForMinimumSigned32bitInteger,
   185  		float64ForMinimumSigned32bitInteger:    c.float64ForMinimumSigned32bitInteger,
   186  		float32ForMinimumSigned64bitInteger:    c.float32ForMinimumSigned64bitInteger,
   187  		float64ForMinimumSigned64bitInteger:    c.float64ForMinimumSigned64bitInteger,
   188  		float32ForMaximumSigned32bitIntPlusOne: c.float32ForMaximumSigned32bitIntPlusOne,
   189  		float64ForMaximumSigned32bitIntPlusOne: c.float64ForMaximumSigned32bitIntPlusOne,
   190  		float32ForMaximumSigned64bitIntPlusOne: c.float32ForMaximumSigned64bitIntPlusOne,
   191  		float64ForMaximumSigned64bitIntPlusOne: c.float64ForMaximumSigned64bitIntPlusOne,
   192  	}
   193  
   194  	// Reuses the initial location stack for the compilation of subsequent functions.
   195  	c.locationStack = &c.locationStackForEntrypoint
   196  }
   197  
    198  // resetLabels resets the existing content in amd64Compiler.labels so that
    199  // we can reuse the allocated slices and stacks in subsequent compilations.
   200  func (c *amd64Compiler) resetLabels() {
   201  	for i := range c.labels {
   202  		for j := range c.labels[i] {
   203  			if j > c.frameIDMax {
    204  				// We only need to reset up to the maximum frame id. This makes the compilation faster for large binaries.
   205  				break
   206  			}
   207  			l := &c.labels[i][j]
   208  			l.initialInstruction = nil
   209  			l.stackInitialized = false
   210  			l.initialStack.reset()
   211  		}
   212  	}
   213  }
   214  
   215  // runtimeValueLocationStack implements compilerImpl.runtimeValueLocationStack for the amd64 architecture.
   216  func (c *amd64Compiler) runtimeValueLocationStack() *runtimeValueLocationStack {
   217  	return c.locationStack
   218  }
   219  
    220  // setLocationStack sets the given runtimeValueLocationStack to the .locationStack field,
    221  // while allowing us to track runtimeValueLocationStack.stackPointerCeil across multiple stacks.
    222  // This is called when we branch into a different block.
   223  func (c *amd64Compiler) setLocationStack(newStack *runtimeValueLocationStack) {
   224  	if c.stackPointerCeil < c.locationStack.stackPointerCeil {
   225  		c.stackPointerCeil = c.locationStack.stackPointerCeil
   226  	}
   227  	c.locationStack = newStack
   228  }
   229  
   230  // pushRuntimeValueLocationOnRegister implements compiler.pushRuntimeValueLocationOnRegister for amd64.
   231  func (c *amd64Compiler) pushRuntimeValueLocationOnRegister(reg asm.Register, vt runtimeValueType) (ret *runtimeValueLocation) {
   232  	ret = c.locationStack.pushRuntimeValueLocationOnRegister(reg, vt)
   233  	c.locationStack.markRegisterUsed(reg)
   234  	return
   235  }
   236  
   237  // pushVectorRuntimeValueLocationOnRegister implements compiler.pushVectorRuntimeValueLocationOnRegister for amd64.
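         // A v128 value occupies two consecutive stack entries (lower then upper 64 bits) that share
         // the same vector register; only the lower-bits location is returned.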
   238  func (c *amd64Compiler) pushVectorRuntimeValueLocationOnRegister(reg asm.Register) (lowerBitsLocation *runtimeValueLocation) {
   239  	lowerBitsLocation = c.locationStack.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeV128Lo)
   240  	c.locationStack.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeV128Hi)
   241  	c.locationStack.markRegisterUsed(reg)
   242  	return
   243  }
   244  
   245  type amd64LabelInfo struct {
    246  	// initialInstruction is the initial instruction for this label so other blocks can jump into it.
   247  	initialInstruction asm.Node
   248  	// initialStack is the initial value location stack from which we start compiling this label.
   249  	initialStack     runtimeValueLocationStack
   250  	stackInitialized bool
   251  }
   252  
   253  func (c *amd64Compiler) label(label wazeroir.Label) *amd64LabelInfo {
   254  	kind := label.Kind()
   255  	frames := c.labels[kind]
   256  	frameID := label.FrameID()
   257  	if c.frameIDMax < frameID {
   258  		c.frameIDMax = frameID
   259  	}
    260  	// If the frameID is not allocated yet, expand the slice to fit it
    261  	// so that we can reduce allocations in subsequent compilations.
   262  	if diff := frameID - len(frames) + 1; diff > 0 {
   263  		for i := 0; i < diff; i++ {
   264  			frames = append(frames, amd64LabelInfo{initialStack: newRuntimeValueLocationStack()})
   265  		}
   266  		c.labels[kind] = frames
   267  	}
   268  	return &frames[frameID]
   269  }
   270  
   271  // compileBuiltinFunctionCheckExitCode implements compiler.compileBuiltinFunctionCheckExitCode for the amd64 architecture.
   272  func (c *amd64Compiler) compileBuiltinFunctionCheckExitCode() error {
   273  	if err := c.compileCallBuiltinFunction(builtinFunctionIndexCheckExitCode); err != nil {
   274  		return err
   275  	}
   276  
   277  	// After the function call, we have to initialize the stack base pointer and memory reserved registers.
   278  	c.compileReservedStackBasePointerInitialization()
   279  	c.compileReservedMemoryPointerInitialization()
   280  	return nil
   281  }
   282  
   283  // compileGoDefinedHostFunction constructs the entire code to enter the host function implementation,
   284  // and return to the caller.
   285  func (c *amd64Compiler) compileGoDefinedHostFunction() error {
   286  	// First we must update the location stack to reflect the number of host function inputs.
   287  	c.locationStack.init(c.typ)
   288  
   289  	if c.withListener {
   290  		if err := c.compileCallBuiltinFunction(builtinFunctionIndexFunctionListenerBefore); err != nil {
   291  			return err
   292  		}
   293  	}
   294  
    295  	// The host function needs access to the caller's module instance, and the caller's information is stored on the stack
    296  	// (as described in the doc of callEngine.stack). Here, we get the caller's *wasm.ModuleInstance via the *function on the stack,
    297  	// and save it in callEngine.exitContext.callerModuleInstance so we can pass it to the host function
    298  	// without sacrificing performance.
   299  	c.compileReservedStackBasePointerInitialization()
   300  	// Alias for readability.
   301  	tmp := amd64.RegAX
   302  	// Get the location of the callerFunction (*function) in the stack, which depends on the signature.
   303  	_, _, callerFunction := c.locationStack.getCallFrameLocations(c.typ)
   304  	// Load the value into the tmp register: tmp = &function{..}
   305  	callerFunction.setRegister(tmp)
   306  	c.compileLoadValueOnStackToRegister(callerFunction)
    307  	// tmp = *(tmp+functionModuleInstanceOffset) = &wasm.ModuleInstance{...}
   308  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, functionModuleInstanceOffset, tmp)
   309  	// Load it onto callEngine.exitContext.callerFunctionInstance.
   310  	c.assembler.CompileRegisterToMemory(amd64.MOVQ,
   311  		tmp,
   312  		amd64ReservedRegisterForCallEngine, callEngineExitContextCallerModuleInstanceOffset)
   313  	// Reset the state of callerFunction value location so that we won't mess up subsequent code generation below.
   314  	c.locationStack.releaseRegister(callerFunction)
   315  
   316  	if err := c.compileCallGoHostFunction(); err != nil {
   317  		return err
   318  	}
   319  
   320  	// Initializes the reserved stack base pointer which is used to retrieve the call frame stack.
   321  	c.compileReservedStackBasePointerInitialization()
   322  
    323  	// A Go function can change the module state in an arbitrary way, so we have to force
    324  	// the callEngine.moduleContext initialization on the function return. To do so,
    325  	// we zero out callEngine.moduleContext.moduleInstance.
   326  	c.assembler.CompileConstToMemory(amd64.MOVQ,
   327  		0, amd64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset)
   328  	return c.compileReturnFunction()
   329  }
   330  
   331  // compile implements compiler.compile for the amd64 architecture.
   332  func (c *amd64Compiler) compile(buf asm.Buffer) (stackPointerCeil uint64, err error) {
   333  	// c.stackPointerCeil tracks the stack pointer ceiling (max seen) value across all runtimeValueLocationStack(s)
   334  	// used for all labels (via setLocationStack), excluding the current one.
    335  	// Hence, we check here whether the final block's ceiling exceeds the current c.stackPointerCeil.
   336  	stackPointerCeil = c.stackPointerCeil
   337  	if stackPointerCeil < c.locationStack.stackPointerCeil {
   338  		stackPointerCeil = c.locationStack.stackPointerCeil
   339  	}
   340  
   341  	// Now that the max stack pointer is determined, we are invoking the callback.
   342  	// Note this MUST be called before Assemble() below.
   343  	c.assignStackPointerCeil(stackPointerCeil)
   344  
   345  	err = c.assembler.Assemble(buf)
   346  	return
   347  }
   348  
   349  // compileUnreachable implements compiler.compileUnreachable for the amd64 architecture.
   350  func (c *amd64Compiler) compileUnreachable() error {
   351  	c.compileExitFromNativeCode(nativeCallStatusCodeUnreachable)
   352  	return nil
   353  }
   354  
   355  // assignStackPointerCeil implements compilerImpl.assignStackPointerCeil for the amd64 architecture.
   356  func (c *amd64Compiler) assignStackPointerCeil(ceil uint64) {
   357  	if c.assignStackPointerCeilNeeded != nil {
   358  		c.assignStackPointerCeilNeeded.AssignDestinationConstant(int64(ceil) << 3)
   359  	}
   360  }
   361  
   362  // compileSet implements compiler.compileSet for the amd64 architecture.
   363  func (c *amd64Compiler) compileSet(o *wazeroir.UnionOperation) error {
   364  	depth := int(o.U1)
   365  	isTargetVector := o.B3
   366  
   367  	setTargetIndex := int(c.locationStack.sp) - 1 - depth
   368  
   369  	if isTargetVector {
   370  		_ = c.locationStack.pop() // ignore the higher 64-bits.
   371  	}
   372  	v := c.locationStack.pop()
   373  	if err := c.compileEnsureOnRegister(v); err != nil {
   374  		return err
   375  	}
   376  
   377  	targetLocation := &c.locationStack.stack[setTargetIndex]
   378  	if targetLocation.onRegister() {
   379  		// We no longer need the register previously used by the target location.
   380  		c.locationStack.markRegisterUnused(targetLocation.register)
   381  	}
   382  
   383  	reg := v.register
   384  	targetLocation.setRegister(reg)
   385  	targetLocation.valueType = v.valueType
   386  	if isTargetVector {
   387  		hi := &c.locationStack.stack[setTargetIndex+1]
   388  		hi.setRegister(reg)
   389  	}
   390  	return nil
   391  }
   392  
   393  // compileGlobalGet implements compiler.compileGlobalGet for the amd64 architecture.
   394  func (c *amd64Compiler) compileGlobalGet(o *wazeroir.UnionOperation) error {
   395  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
   396  		return err
   397  	}
   398  
   399  	intReg, err := c.allocateRegister(registerTypeGeneralPurpose)
   400  	if err != nil {
   401  		return err
   402  	}
   403  
   404  	// First, move the pointer to the global slice into the allocated register.
   405  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset, intReg)
   406  
   407  	index := o.U1
   408  
   409  	// Now, move the location of the global instance into the register.
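         	// (Globals are laid out as a []*wasm.GlobalInstance, hence the 8-byte stride per index.)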
   410  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, intReg, 8*int64(index), intReg)
   411  
    412  	// When the value is an integer, reuse the pointer register for the value. Otherwise, allocate a vector register for it.
   413  	valueReg := intReg
   414  	var vt runtimeValueType
   415  	var inst asm.Instruction
   416  	switch c.ir.Globals[index].ValType {
   417  	case wasm.ValueTypeI32:
   418  		inst = amd64.MOVL
   419  		vt = runtimeValueTypeI32
   420  	case wasm.ValueTypeI64, wasm.ValueTypeExternref, wasm.ValueTypeFuncref:
   421  		inst = amd64.MOVQ
   422  		vt = runtimeValueTypeI64
   423  	case wasm.ValueTypeF32:
   424  		inst = amd64.MOVL
   425  		vt = runtimeValueTypeF32
   426  		valueReg, err = c.allocateRegister(registerTypeVector)
   427  		if err != nil {
   428  			return err
   429  		}
   430  	case wasm.ValueTypeF64:
   431  		inst = amd64.MOVQ
   432  		vt = runtimeValueTypeF64
   433  		valueReg, err = c.allocateRegister(registerTypeVector)
   434  		if err != nil {
   435  			return err
   436  		}
   437  	case wasm.ValueTypeV128:
   438  		inst = amd64.MOVDQU
   439  		vt = runtimeValueTypeV128Lo
   440  		valueReg, err = c.allocateRegister(registerTypeVector)
   441  		if err != nil {
   442  			return err
   443  		}
   444  	default:
   445  		panic("BUG: unknown runtime value type")
   446  	}
   447  
   448  	// Using the register holding the pointer to the target instance, move its value into a register.
   449  	c.assembler.CompileMemoryToRegister(inst, intReg, globalInstanceValueOffset, valueReg)
   450  
   451  	// Record that the retrieved global value on the top of the stack is now in a register.
   452  	if vt == runtimeValueTypeV128Lo {
   453  		c.pushVectorRuntimeValueLocationOnRegister(valueReg)
   454  	} else {
   455  		c.pushRuntimeValueLocationOnRegister(valueReg, vt)
   456  	}
   457  	return nil
   458  }
   459  
   460  // compileGlobalSet implements compiler.compileGlobalSet for the amd64 architecture.
   461  func (c *amd64Compiler) compileGlobalSet(o *wazeroir.UnionOperation) error {
   462  	index := o.U1
   463  
   464  	wasmValueType := c.ir.Globals[index].ValType
   465  	isV128 := wasmValueType == wasm.ValueTypeV128
   466  
   467  	// First, move the value to set into a temporary register.
   468  	val := c.locationStack.pop()
   469  	if isV128 {
    470  		// The value popped above was the higher 64 bits; we have to use the lower 64 bits' runtimeValueLocation for allocation, etc.
   471  		val = c.locationStack.pop()
   472  	}
   473  	if err := c.compileEnsureOnRegister(val); err != nil {
   474  		return err
   475  	}
   476  
   477  	// Allocate a register to hold the memory location of the target global instance.
   478  	intReg, err := c.allocateRegister(registerTypeGeneralPurpose)
   479  	if err != nil {
   480  		return err
   481  	}
   482  
   483  	// First, move the pointer to the global slice into the allocated register.
   484  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset, intReg)
   485  
   486  	// Now, move the location of the global instance into the register.
   487  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, intReg, 8*int64(index), intReg)
   488  
   489  	// Now ready to write the value to the global instance location.
   490  	var inst asm.Instruction
   491  	if isV128 {
   492  		inst = amd64.MOVDQU
   493  	} else if wasmValueType == wasm.ValueTypeI32 || wasmValueType == wasm.ValueTypeF32 {
   494  		inst = amd64.MOVL
   495  	} else {
   496  		inst = amd64.MOVQ
   497  	}
   498  	c.assembler.CompileRegisterToMemory(inst, val.register, intReg, globalInstanceValueOffset)
   499  
   500  	// Since the value is now written to memory, release the value register.
   501  	c.locationStack.releaseRegister(val)
   502  	return nil
   503  }
   504  
   505  // compileBr implements compiler.compileBr for the amd64 architecture.
   506  func (c *amd64Compiler) compileBr(o *wazeroir.UnionOperation) error {
   507  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
   508  		return err
   509  	}
   510  	return c.branchInto(wazeroir.Label(o.U1))
   511  }
   512  
    513  // branchInto adds the instructions necessary to jump into the given branch target.
   514  func (c *amd64Compiler) branchInto(target wazeroir.Label) error {
   515  	if target.IsReturnTarget() {
   516  		return c.compileReturnFunction()
   517  	} else {
   518  		if c.ir.LabelCallers[target] > 1 {
    519  			// We can only re-use the register state when there's a single call-site.
    520  			// Release existing values on registers to the stack if there are multiple call-sites
    521  			// so that the value location state is consistent at the beginning of the label.
   522  			if err := c.compileReleaseAllRegistersToStack(); err != nil {
   523  				return err
   524  			}
   525  		}
   526  		// Set the initial stack of the target label, so we can start compiling the label
    527  		// with the appropriate value locations. Note we clone the stack here as we may
    528  		// manipulate the stack before the compiler reaches the label.
   529  		targetLabel := c.label(target)
   530  		if !targetLabel.stackInitialized {
   531  			targetLabel.initialStack.cloneFrom(*c.locationStack)
   532  			targetLabel.stackInitialized = true
   533  		}
   534  		jmp := c.assembler.CompileJump(amd64.JMP)
   535  		c.assignJumpTarget(target, jmp)
   536  	}
   537  	return nil
   538  }
   539  
   540  // compileBrIf implements compiler.compileBrIf for the amd64 architecture.
   541  func (c *amd64Compiler) compileBrIf(o *wazeroir.UnionOperation) error {
   542  	cond := c.locationStack.pop()
   543  	var jmpWithCond asm.Node
   544  	if cond.onConditionalRegister() {
   545  		var inst asm.Instruction
   546  		switch cond.conditionalRegister {
   547  		case amd64.ConditionalRegisterStateE:
   548  			inst = amd64.JEQ
   549  		case amd64.ConditionalRegisterStateNE:
   550  			inst = amd64.JNE
   551  		case amd64.ConditionalRegisterStateS:
   552  			inst = amd64.JMI
   553  		case amd64.ConditionalRegisterStateNS:
   554  			inst = amd64.JPL
   555  		case amd64.ConditionalRegisterStateG:
   556  			inst = amd64.JGT
   557  		case amd64.ConditionalRegisterStateGE:
   558  			inst = amd64.JGE
   559  		case amd64.ConditionalRegisterStateL:
   560  			inst = amd64.JLT
   561  		case amd64.ConditionalRegisterStateLE:
   562  			inst = amd64.JLE
   563  		case amd64.ConditionalRegisterStateA:
   564  			inst = amd64.JHI
   565  		case amd64.ConditionalRegisterStateAE:
   566  			inst = amd64.JCC
   567  		case amd64.ConditionalRegisterStateB:
   568  			inst = amd64.JCS
   569  		case amd64.ConditionalRegisterStateBE:
   570  			inst = amd64.JLS
   571  		}
   572  		jmpWithCond = c.assembler.CompileJump(inst)
   573  	} else {
   574  		// Usually the comparison operand for br_if is on the conditional register,
    575  		// but in some cases it is on the stack or in a general-purpose register.
   576  		// For example, the following code
   577  		// 		i64.const 1
   578  		//      local.get 1
   579  		//      i64.add
   580  		//      br_if ....
   581  		// will try to use the result of i64.add, which resides on the (virtual) stack,
   582  		// as the operand for br_if instruction.
   583  		if err := c.compileEnsureOnRegister(cond); err != nil {
   584  			return err
   585  		}
    586  		// Check if the value is not zero.
   587  		c.assembler.CompileRegisterToRegister(amd64.TESTQ, cond.register, cond.register)
   588  
    589  		// Emit the jump instruction, which jumps when the value is not zero.
   590  		jmpWithCond = c.assembler.CompileJump(amd64.JNE)
   591  		c.locationStack.markRegisterUnused(cond.register)
   592  	}
   593  
   594  	// Make sure that the next coming label is the else jump target.
   595  	thenTarget := wazeroir.Label(o.U1)
   596  	elseTarget := wazeroir.Label(o.U2)
   597  	thenToDrop := o.U3
   598  
    599  	// Here's a diagram of how we organize the instructions necessary for the br_if operation.
   600  	//
   601  	// jmp_with_cond -> jmp (.Else) -> Then operations...
   602  	//    |---------(satisfied)------------^^^
   603  	//
    604  	// Note that the .Else branch doesn't have ToDrop, as .Else in reality
    605  	// corresponds to either an If's Else block or a Br_if's else target in Wasm.
   606  
   607  	// Emit the else branch.
   608  	if elseTarget.IsReturnTarget() {
   609  		if err := c.compileReturnFunction(); err != nil {
   610  			return err
   611  		}
   612  	} else {
   613  		labelInfo := c.label(elseTarget)
   614  		if !labelInfo.stackInitialized {
   615  			labelInfo.initialStack.cloneFrom(*c.locationStack)
   616  			labelInfo.stackInitialized = true
   617  		}
   618  
   619  		elseJmp := c.assembler.CompileJump(amd64.JMP)
   620  		c.assignJumpTarget(elseTarget, elseJmp)
   621  	}
   622  
   623  	// Handle then branch.
   624  	c.assembler.SetJumpTargetOnNext(jmpWithCond)
   625  	if err := compileDropRange(c, thenToDrop); err != nil {
   626  		return err
   627  	}
   628  	if thenTarget.IsReturnTarget() {
   629  		return c.compileReturnFunction()
   630  	} else {
   631  		thenLabel := thenTarget
   632  		if c.ir.LabelCallers[thenLabel] > 1 {
    633  			// We can only re-use the register state when there's a single call-site.
    634  			// Release existing values on registers to the stack if there are multiple call-sites
    635  			// so that the value location state is consistent at the beginning of the label.
   636  			if err := c.compileReleaseAllRegistersToStack(); err != nil {
   637  				return err
   638  			}
   639  		}
   640  		// Set the initial stack of the target label, so we can start compiling the label
    641  		// with the appropriate value locations. Note we clone the stack here as we may
    642  		// manipulate the stack before the compiler reaches the label.
   643  		labelInfo := c.label(thenLabel)
   644  		if !labelInfo.stackInitialized {
   645  			labelInfo.initialStack.cloneFrom(*c.locationStack)
   646  			labelInfo.stackInitialized = true
   647  		}
   648  		thenJmp := c.assembler.CompileJump(amd64.JMP)
   649  		c.assignJumpTarget(thenLabel, thenJmp)
   650  		return nil
   651  	}
   652  }
   653  
   654  // compileBrTable implements compiler.compileBrTable for the amd64 architecture.
   655  func (c *amd64Compiler) compileBrTable(o *wazeroir.UnionOperation) error {
   656  	index := c.locationStack.pop()
   657  
   658  	// If the operation only consists of the default target, we branch into it and return early.
   659  	if len(o.Us) == 2 {
   660  		c.locationStack.releaseRegister(index)
   661  		if err := compileDropRange(c, o.Us[1]); err != nil {
   662  			return err
   663  		}
   664  		return c.branchInto(wazeroir.Label(o.Us[0]))
   665  	}
   666  
   667  	// Otherwise, we jump into the selected branch.
   668  	if err := c.compileEnsureOnRegister(index); err != nil {
   669  		return err
   670  	}
   671  
   672  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
   673  	if err != nil {
   674  		return err
   675  	}
   676  
   677  	// First, we move the length of target list into the tmp register.
   678  	c.assembler.CompileConstToRegister(amd64.MOVQ, int64(len(o.Us)/2-1), tmp)
   679  
   680  	// Then, we compare the value with the length of targets.
   681  	c.assembler.CompileRegisterToRegister(amd64.CMPL, tmp, index.register)
   682  
    683  	// If the value is larger than the length,
    684  	// we clamp the index to the length, as the spec states that
    685  	// if the index is larger than or equal to the length of the list,
    686  	// we branch into the default target.
   687  	c.assembler.CompileRegisterToRegister(amd64.CMOVQCS, tmp, index.register)
   688  
   689  	// We prepare the static data which holds the offset of
   690  	// each target's first instruction (incl. default)
    691  	// relative to the beginning of the emitted targets (i.e. L0 in the example below).
   692  	//
   693  	// For example, if we have targets=[L0, L1] and default=L_DEFAULT,
    694  	// we emit the code like this at [Emit the code for each target and the default branch] below.
   695  	//
   696  	// L0:
   697  	//  0x123001: XXXX, ...
   698  	//  .....
   699  	// L1:
   700  	//  0x123005: YYY, ...
   701  	//  .....
   702  	// L_DEFAULT:
   703  	//  0x123009: ZZZ, ...
   704  	//
   705  	// then offsetData becomes like [0x0, 0x5, 0x8].
   706  	// By using this offset list, we could jump into the label for the index by
   707  	// "jmp offsetData[index]+0x123001" and "0x123001" can be acquired by "LEA"
   708  	// instruction.
   709  	//
    710  	// Note: We store each offset as a 32-bit unsigned integer in 4 consecutive bytes. So more precisely,
   711  	// the above example's offsetData would be [0x0, 0x0, 0x0, 0x0, 0x5, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0, 0x0].
   712  	//
   713  	// Note: this is similar to how GCC implements Switch statements in C.
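         	//
         	// Roughly, the dispatch emitted below is equivalent to the following Go-like sketch
         	// (illustrative only; "addressOfL0" stands for the address obtained via
         	// CompileReadInstructionAddress):
         	//
         	//	base := addressOfL0
         	//	target := base + uintptr(offsetData[index])
         	//	goto target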
   714  	offsetData := asm.NewStaticConst(make([]byte, 4*(len(o.Us)/2)))
   715  
   716  	// Load the offsetData's address into tmp.
   717  	if err = c.assembler.CompileStaticConstToRegister(amd64.LEAQ, offsetData, tmp); err != nil {
   718  		return err
   719  	}
   720  
    721  	// Now we have the address of the first byte of offsetData in the tmp register.
    722  	// So the target offset's first byte is at tmp+index*4, as we store
    723  	// each offset as a 4-byte (32-bit) integer.
   724  	// Here, we store the offset into the index.register.
   725  	c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVL, tmp, 0, index.register, 4, index.register)
   726  
   727  	// Now we read the address of the beginning of the jump table.
   728  	// In the above example, this corresponds to reading the address of 0x123001.
   729  	c.assembler.CompileReadInstructionAddress(tmp, amd64.JMP)
   730  
    731  	// Now we have the address of L0 in the tmp register, and the offset to the target label in index.register.
    732  	// So we can achieve the br_table jump by adding them and jumping into the resulting address.
   733  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, index.register, tmp)
   734  
   735  	c.assembler.CompileJumpToRegister(amd64.JMP, tmp)
   736  
   737  	// We no longer need the index's register, so mark it unused.
   738  	c.locationStack.markRegisterUnused(index.register)
   739  
    740  	// [Emit the code for each target and the default branch]
   741  	labelInitialInstructions := make([]asm.Node, len(o.Us)/2)
   742  
    743  	// Since we might end up having a different stack state in each branch,
   744  	// we need to save the initial stack state here, and use the same initial state
   745  	// for each iteration.
   746  	initialLocationStack := c.getSavedTemporaryLocationStack()
   747  
   748  	for i := range labelInitialInstructions {
   749  		// Emit the initial instruction of each target.
   750  		// We use NOP as we don't yet know the next instruction in each label.
    751  		// The assembler will optimize out this NOP during code generation, so this is harmless.
   752  		labelInitialInstructions[i] = c.assembler.CompileStandAlone(amd64.NOP)
   753  
   754  		targetLabel := wazeroir.Label(o.Us[i*2])
   755  		targetToDrop := o.Us[i*2+1]
   756  		if err = compileDropRange(c, targetToDrop); err != nil {
   757  			return err
   758  		}
   759  		if err = c.branchInto(targetLabel); err != nil {
   760  			return err
   761  		}
   762  		// After the iteration, reset the stack's state with initialLocationStack.
   763  		c.locationStack.cloneFrom(initialLocationStack)
   764  	}
   765  
   766  	c.assembler.BuildJumpTable(offsetData, labelInitialInstructions)
   767  	return nil
   768  }
   769  
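         // getSavedTemporaryLocationStack returns a copy of the current location stack whose
         // backing slice is c.brTableTmp, so that compileBrTable can restore the same initial
         // state before compiling each target without allocating a new stack per target.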
   770  func (c *amd64Compiler) getSavedTemporaryLocationStack() runtimeValueLocationStack {
   771  	initialLocationStack := *c.locationStack // Take copy!
    772  	// Use c.brTableTmp for the underlying stack so that we can reduce allocations.
   773  	if diff := int(initialLocationStack.sp) - len(c.brTableTmp); diff > 0 {
   774  		c.brTableTmp = append(c.brTableTmp, make([]runtimeValueLocation, diff)...)
   775  	}
   776  	copy(c.brTableTmp, initialLocationStack.stack[:initialLocationStack.sp])
   777  	initialLocationStack.stack = c.brTableTmp
   778  	return initialLocationStack
   779  }
   780  
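         // assignJumpTarget makes the given jump instruction branch to the initial instruction of the
         // given label, allocating a placeholder NOP as that instruction if the label has not been
         // compiled yet.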
   781  func (c *amd64Compiler) assignJumpTarget(label wazeroir.Label, jmpInstruction asm.Node) {
   782  	jmpTargetLabel := c.label(label)
   783  	targetInst := jmpTargetLabel.initialInstruction
   784  	if targetInst == nil {
    785  		// If the label isn't compiled yet, allocate the NOP node, and set it as the initial instruction.
   786  		targetInst = c.assembler.AllocateNOP()
   787  		jmpTargetLabel.initialInstruction = targetInst
   788  	}
   789  	jmpInstruction.AssignJumpTarget(targetInst)
   790  }
   791  
   792  // compileLabel implements compiler.compileLabel for the amd64 architecture.
   793  func (c *amd64Compiler) compileLabel(o *wazeroir.UnionOperation) (skipLabel bool) {
   794  	label := wazeroir.Label(o.U1)
   795  	labelInfo := c.label(label)
   796  
   797  	// If initialStack is not set, that means this label has never been reached.
   798  	if !labelInfo.stackInitialized {
   799  		skipLabel = true
   800  		return
   801  	}
   802  
    803  	// We use a NOP as the beginning of the instructions in a label.
    804  	if labelBegin := labelInfo.initialInstruction; labelBegin == nil {
    805  		// Not allocated yet: emit the NOP here. It should eventually
    806  		// be optimized out by the assembler.
   807  		labelInfo.initialInstruction = c.assembler.CompileStandAlone(amd64.NOP)
   808  	} else {
   809  		c.assembler.Add(labelBegin)
   810  	}
   811  
   812  	// Set the initial stack.
   813  	c.setLocationStack(&labelInfo.initialStack)
   814  	return
   815  }
   816  
   817  // compileCall implements compiler.compileCall for the amd64 architecture.
   818  func (c *amd64Compiler) compileCall(o *wazeroir.UnionOperation) error {
   819  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
   820  		return err
   821  	}
   822  
   823  	functionIndex := o.U1
   824  
   825  	target := c.ir.Functions[functionIndex]
   826  	targetType := &c.ir.Types[target]
   827  
   828  	targetAddressRegister, err := c.allocateRegister(registerTypeGeneralPurpose)
   829  	if err != nil {
   830  		return err
   831  	}
   832  
    833  	// First, move the byte offset of the target within callEngine.functions (= functionIndex * functionSize) into the target register.
   834  	c.assembler.CompileConstToRegister(amd64.MOVQ, int64(functionIndex)*functionSize, targetAddressRegister)
   835  
   836  	// Next, we add the address of the first item of callEngine.functions slice (= &callEngine.functions[0])
   837  	// to the target register.
   838  	c.assembler.CompileMemoryToRegister(amd64.ADDQ, amd64ReservedRegisterForCallEngine,
   839  		callEngineModuleContextFunctionsElement0AddressOffset, targetAddressRegister)
   840  
   841  	if err := c.compileCallFunctionImpl(targetAddressRegister, targetType); err != nil {
   842  		return err
   843  	}
   844  	return nil
   845  }
   846  
   847  // compileCallIndirect implements compiler.compileCallIndirect for the amd64 architecture.
   848  func (c *amd64Compiler) compileCallIndirect(o *wazeroir.UnionOperation) error {
   849  	offset := c.locationStack.pop()
   850  	if err := c.compileEnsureOnRegister(offset); err != nil {
    851  		return err
   852  	}
   853  	typeIndex := o.U1
   854  	tableIndex := o.U2
   855  
   856  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
   857  	if err != nil {
   858  		return err
   859  	}
   860  	c.locationStack.markRegisterUsed(tmp)
   861  
   862  	tmp2, err := c.allocateRegister(registerTypeGeneralPurpose)
   863  	if err != nil {
   864  		return err
   865  	}
   866  	c.locationStack.markRegisterUsed(tmp2)
   867  
   868  	// Load the address of the target table: tmp = &module.Tables[0]
   869  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
    870  	// tmp = &module.Tables[0] + tableIndex*8 = &module.Tables[0] + sizeOf(*TableInstance)*tableIndex = module.Tables[tableIndex].
   871  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(tableIndex*8), tmp)
   872  
    873  	// Then, we need to trap if the offset is out of bounds of the table.
   874  	c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, offset.register)
   875  	c.compileMaybeExitFromNativeCode(amd64.JHI, nativeCallStatusCodeInvalidTableAccess)
   876  
    877  	// Next, we check if the target's type matches the operation's type.
    878  	// In order to get the table element's address, we have to multiply the offset
    879  	// by 8, as the table is a Go []uintptr
    880  	// and the size of uintptr equals 8 bytes (== 2^3).
   881  	c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, offset.register)
   882  
    883  	// Add the address of the table's first element (TableInstance.Table[0]) to the offset.
   884  	c.assembler.CompileMemoryToRegister(amd64.ADDQ,
   885  		tmp, tableInstanceTableOffset, offset.register)
   886  
   887  	// "offset = (*offset) (== table[offset]  == *code type)"
   888  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, offset.register, 0, offset.register)
   889  
    890  	// At this point offset.register holds the address of the target *function (as uintptr) at wasm.Table[offset].
   891  	//
   892  	// Check if the value of table[offset] equals zero, meaning that the target is uninitialized.
   893  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, offset.register, offset.register)
   894  
   895  	// Skipped if the target is initialized.
   896  	c.compileMaybeExitFromNativeCode(amd64.JNE, nativeCallStatusCodeInvalidTableAccess)
   897  
   898  	// Next, we need to check the type matches, i.e. table[offset].source.TypeID == targetFunctionType's typeID.
   899  	//
   900  	// "tmp2 = [&moduleInstance.TypeIDs[0] + index * 4] (== moduleInstance.TypeIDs[index])"
   901  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
   902  		amd64ReservedRegisterForCallEngine, callEngineModuleContextTypeIDsElement0AddressOffset,
   903  		tmp2)
   904  	c.assembler.CompileMemoryToRegister(amd64.MOVL, tmp2, int64(typeIndex)*4, tmp2)
   905  
   906  	// Skipped if the type matches.
   907  	c.assembler.CompileMemoryToRegister(amd64.CMPL, offset.register, functionTypeIDOffset, tmp2)
   908  	c.compileMaybeExitFromNativeCode(amd64.JEQ, nativeCallStatusCodeTypeMismatchOnIndirectCall)
   909  	targetFunctionType := &c.ir.Types[typeIndex]
   910  	if err = c.compileCallFunctionImpl(offset.register, targetFunctionType); err != nil {
    911  		return err
   912  	}
   913  
    914  	// The offset register should be marked as unused as we consumed it in the function call.
   915  	c.locationStack.markRegisterUnused(offset.register, tmp, tmp2)
   916  	return nil
   917  }
   918  
   919  // compileDrop implements compiler.compileDrop for the amd64 architecture.
   920  func (c *amd64Compiler) compileDrop(o *wazeroir.UnionOperation) error {
   921  	return compileDropRange(c, o.U1)
   922  }
   923  
   924  // compileSelectV128Impl implements compileSelect for vector values.
   925  func (c *amd64Compiler) compileSelectV128Impl(selectorReg asm.Register) error {
   926  	x2 := c.locationStack.popV128()
   927  	if err := c.compileEnsureOnRegister(x2); err != nil {
   928  		return err
   929  	}
   930  
   931  	x1 := c.locationStack.popV128()
   932  	if err := c.compileEnsureOnRegister(x1); err != nil {
   933  		return err
   934  	}
   935  
   936  	// Compare the conditional value with zero.
   937  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, selectorReg, selectorReg)
   938  
    939  	// Emit the conditional jump, taken if the selector value is not zero.
   940  	jmpIfNotZero := c.assembler.CompileJump(amd64.JNE)
   941  
   942  	// In this branch, we select the value of x2, so we move the value into x1.register so that
   943  	// we can have the result in x1.register regardless of the selection.
   944  	c.assembler.CompileRegisterToRegister(amd64.MOVDQU, x2.register, x1.register)
   945  
    946  	// Otherwise (the selector is not zero), nothing needs to change; the conditional jump above lands here.
   947  	c.assembler.SetJumpTargetOnNext(jmpIfNotZero)
   948  
   949  	// As noted, the result exists in x1.register regardless of the selector.
   950  	c.pushVectorRuntimeValueLocationOnRegister(x1.register)
   951  	// Plus, x2.register is no longer used.
   952  	c.locationStack.markRegisterUnused(x2.register)
   953  	c.locationStack.markRegisterUnused(selectorReg)
   954  	return nil
   955  }
   956  
   957  // compileSelect implements compiler.compileSelect for the amd64 architecture.
   958  //
   959  // The emitted native code depends on whether the values are on
    960  // physical registers, the memory stack, or maybe a conditional register.
   961  func (c *amd64Compiler) compileSelect(o *wazeroir.UnionOperation) error {
   962  	cv := c.locationStack.pop()
   963  	if err := c.compileEnsureOnRegister(cv); err != nil {
   964  		return err
   965  	}
   966  
   967  	isTargetVector := o.B3
   968  	if isTargetVector {
   969  		return c.compileSelectV128Impl(cv.register)
   970  	}
   971  
   972  	x2 := c.locationStack.pop()
    973  	// We do not consume x1 here, but modify the value according to
    974  	// the conditional value "cv" above.
   975  	peekedX1 := c.locationStack.peek()
   976  
   977  	// Compare the conditional value with zero.
   978  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, cv.register, cv.register)
   979  
    980  	// Now we can use cv.register as a temporary location.
   981  	// We alias it here for readability.
   982  	tmpRegister := cv.register
   983  
    984  	// Emit the conditional jump, taken if the condition value is not zero.
   985  	jmpIfNotZero := c.assembler.CompileJump(amd64.JNE)
   986  
    987  	// If the condition value is zero, we must place the value of x2 into the location of x1.
   988  
   989  	// First we copy the value of x2 to the temporary register if x2 is not currently on a register.
   990  	if x2.onStack() {
   991  		x2.register = tmpRegister
   992  		c.compileLoadValueOnStackToRegister(x2)
   993  	}
   994  
   995  	//
   996  	// At this point x2's value is always on a register.
   997  	//
   998  
    999  	// Then move the value in x2's register into x1's location (either a register or the stack).
  1000  	if peekedX1.onRegister() {
  1001  		c.assembler.CompileRegisterToRegister(amd64.MOVQ, x2.register, peekedX1.register)
  1002  	} else {
  1003  		peekedX1.register = x2.register
  1004  		c.compileReleaseRegisterToStack(peekedX1) // Note inside we mark the register unused!
  1005  	}
  1006  
   1007  	// Otherwise (the condition is not zero), nothing needs to change; the conditional jump above lands here.
  1008  	c.assembler.SetJumpTargetOnNext(jmpIfNotZero)
  1009  
   1010  	// In any case, we don't need x2 and cv anymore!
  1011  	c.locationStack.releaseRegister(x2)
  1012  	c.locationStack.releaseRegister(cv)
  1013  	return nil
  1014  }
  1015  
  1016  // compilePick implements compiler.compilePick for the amd64 architecture.
  1017  func (c *amd64Compiler) compilePick(o *wazeroir.UnionOperation) error {
  1018  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  1019  		return err
  1020  	}
  1021  	depth := o.U1
  1022  	isTargetVector := o.B3
  1023  
  1024  	// TODO: if we track the type of values on the stack,
  1025  	// we could optimize the instruction according to the bit size of the value.
  1026  	// For now, we just move the entire register i.e. as a quad word (8 bytes).
  1027  	pickTarget := &c.locationStack.stack[c.locationStack.sp-1-uint64(depth)]
  1028  	reg, err := c.allocateRegister(pickTarget.getRegisterType())
  1029  	if err != nil {
  1030  		return err
  1031  	}
  1032  
  1033  	if pickTarget.onRegister() {
  1034  		var inst asm.Instruction
  1035  		if isTargetVector {
  1036  			inst = amd64.MOVDQU
  1037  		} else if pickTarget.valueType == runtimeValueTypeI32 { // amd64 cannot copy single-precisions between registers.
  1038  			inst = amd64.MOVL
  1039  		} else {
  1040  			inst = amd64.MOVQ
  1041  		}
  1042  		c.assembler.CompileRegisterToRegister(inst, pickTarget.register, reg)
  1043  	} else if pickTarget.onStack() {
  1044  		// Copy the value from the stack.
  1045  		var inst asm.Instruction
  1046  		if isTargetVector {
  1047  			inst = amd64.MOVDQU
  1048  		} else if pickTarget.valueType == runtimeValueTypeI32 || pickTarget.valueType == runtimeValueTypeF32 {
  1049  			inst = amd64.MOVL
  1050  		} else {
  1051  			inst = amd64.MOVQ
  1052  		}
  1053  		// Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range.
  1054  		c.assembler.CompileMemoryToRegister(inst, amd64ReservedRegisterForStackBasePointerAddress,
  1055  			int64(pickTarget.stackPointer)*8, reg)
  1056  	}
  1057  	// Now we already placed the picked value on the register,
  1058  	// so push the location onto the stack.
  1059  	if isTargetVector {
  1060  		c.pushVectorRuntimeValueLocationOnRegister(reg)
  1061  	} else {
  1062  		c.pushRuntimeValueLocationOnRegister(reg, pickTarget.valueType)
  1063  	}
  1064  	return nil
  1065  }
  1066  
  1067  // compileAdd implements compiler.compileAdd for the amd64 architecture.
  1068  func (c *amd64Compiler) compileAdd(o *wazeroir.UnionOperation) error {
  1069  	// TODO: if the previous instruction is const, then
  1070  	// this can be optimized. Same goes for other arithmetic instructions.
  1071  
  1072  	var instruction asm.Instruction
  1073  
  1074  	unsignedType := wazeroir.UnsignedType(o.B1)
  1075  	switch unsignedType {
  1076  	case wazeroir.UnsignedTypeI32:
  1077  		instruction = amd64.ADDL
  1078  	case wazeroir.UnsignedTypeI64:
  1079  		instruction = amd64.ADDQ
  1080  	case wazeroir.UnsignedTypeF32:
  1081  		instruction = amd64.ADDSS
  1082  	case wazeroir.UnsignedTypeF64:
  1083  		instruction = amd64.ADDSD
  1084  	}
  1085  
  1086  	x2 := c.locationStack.pop()
  1087  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1088  		return err
  1089  	}
  1090  
  1091  	x1 := c.locationStack.peek() // Note this is peek!
  1092  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1093  		return err
  1094  	}
  1095  
  1096  	// x1 += x2.
  1097  	c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)
  1098  
  1099  	// We no longer need x2 register after ADD operation here,
  1100  	// so we release it.
  1101  	c.locationStack.releaseRegister(x2)
  1102  	return nil
  1103  }
  1104  
  1105  // compileSub implements compiler.compileSub for the amd64 architecture.
  1106  func (c *amd64Compiler) compileSub(o *wazeroir.UnionOperation) error {
  1107  	// TODO: if the previous instruction is const, then
  1108  	// this can be optimized. Same goes for other arithmetic instructions.
  1109  
  1110  	var instruction asm.Instruction
  1111  	unsignedType := wazeroir.UnsignedType(o.B1)
  1112  	switch unsignedType {
  1113  	case wazeroir.UnsignedTypeI32:
  1114  		instruction = amd64.SUBL
  1115  	case wazeroir.UnsignedTypeI64:
  1116  		instruction = amd64.SUBQ
  1117  	case wazeroir.UnsignedTypeF32:
  1118  		instruction = amd64.SUBSS
  1119  	case wazeroir.UnsignedTypeF64:
  1120  		instruction = amd64.SUBSD
  1121  	}
  1122  
  1123  	x2 := c.locationStack.pop()
  1124  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1125  		return err
  1126  	}
  1127  
  1128  	x1 := c.locationStack.peek() // Note this is peek!
  1129  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1130  		return err
  1131  	}
  1132  
  1133  	// x1 -= x2.
  1134  	c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)
  1135  
   1136  	// We no longer need the x2 register after the SUB operation here,
  1137  	// so we release it.
  1138  	c.locationStack.releaseRegister(x2)
  1139  	return nil
  1140  }
  1141  
  1142  // compileMul implements compiler.compileMul for the amd64 architecture.
  1143  func (c *amd64Compiler) compileMul(o *wazeroir.UnionOperation) (err error) {
  1144  	unsignedType := wazeroir.UnsignedType(o.B1)
  1145  	switch unsignedType {
  1146  	case wazeroir.UnsignedTypeI32:
  1147  		err = c.compileMulForInts(true, amd64.MULL)
  1148  	case wazeroir.UnsignedTypeI64:
  1149  		err = c.compileMulForInts(false, amd64.MULQ)
  1150  	case wazeroir.UnsignedTypeF32:
  1151  		err = c.compileMulForFloats(amd64.MULSS)
  1152  	case wazeroir.UnsignedTypeF64:
  1153  		err = c.compileMulForFloats(amd64.MULSD)
  1154  	}
  1155  	return
  1156  }
  1157  
  1158  // compileMulForInts emits instructions to perform integer multiplication for
   1159  // the top two values on the stack. If unfamiliar with the convention for integer
  1160  // multiplication on x86, see https://www.felixcloutier.com/x86/mul.
  1161  //
  1162  // In summary, one of the values must be on the AX register,
   1163  // and the mul instruction stores the overflow info in the DX register, which we don't use.
   1164  // By "the overflow info" we mean the upper half of the result (bit 65 and above in the 64-bit case).
  1165  //
  1166  // So, we have to ensure that
   1167  //  1. The value previously located on DX must be saved to the memory stack. That is because
   1168  //     the existing value will be overwritten by the mul execution.
   1169  //  2. One of the operands (x1 or x2) must be on the AX register.
  1170  //
  1171  // See https://www.felixcloutier.com/x86/mul#description for detail semantics.
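         // For example, in the 64-bit case MULQ computes RDX:RAX = RAX * operand; we keep only the
         // lower half in RAX and discard the upper half clobbered into RDX.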
  1172  func (c *amd64Compiler) compileMulForInts(is32Bit bool, mulInstruction asm.Instruction) error {
  1173  	const (
  1174  		resultRegister   = amd64.RegAX
  1175  		reservedRegister = amd64.RegDX
  1176  	)
  1177  
  1178  	x2 := c.locationStack.pop()
  1179  	x1 := c.locationStack.pop()
  1180  
  1181  	var valueOnAX *runtimeValueLocation
  1182  	if x1.register == resultRegister {
  1183  		valueOnAX = x1
  1184  	} else if x2.register == resultRegister {
  1185  		valueOnAX = x2
  1186  	} else {
  1187  		valueOnAX = x2
   1188  		// In this case, we move x2 to the AX register.
  1189  		c.onValueReleaseRegisterToStack(resultRegister)
  1190  		if x2.onConditionalRegister() {
  1191  			c.compileMoveConditionalToGeneralPurposeRegister(x2, resultRegister)
  1192  		} else if x2.onStack() {
  1193  			x2.setRegister(resultRegister)
  1194  			c.compileLoadValueOnStackToRegister(x2)
  1195  			c.locationStack.markRegisterUsed(resultRegister)
  1196  		} else {
  1197  			var inst asm.Instruction
  1198  			if is32Bit {
  1199  				inst = amd64.MOVL
  1200  			} else {
  1201  				inst = amd64.MOVQ
  1202  			}
  1203  			c.assembler.CompileRegisterToRegister(inst, x2.register, resultRegister)
  1204  
  1205  			// We no longer use the previous register of x2.
  1206  			c.locationStack.releaseRegister(x2)
  1207  			x2.setRegister(resultRegister)
  1208  			c.locationStack.markRegisterUsed(resultRegister)
  1209  		}
  1210  	}
  1211  
  1212  	// We have to make sure that at this point both operands are on registers.
  1213  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1214  		return err
  1215  	}
  1216  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1217  		return err
  1218  	}
  1219  
  1220  	// We have to save the existing value on DX.
  1221  	// If the DX register is used by either x1 or x2, we don't need to
  1222  	// save the value because it is consumed by mul anyway.
  1223  	if x1.register != reservedRegister && x2.register != reservedRegister {
  1224  		c.onValueReleaseRegisterToStack(reservedRegister)
  1225  	}
  1226  
  1227  	// Now ready to emit the mul instruction.
  1228  	if x1 == valueOnAX {
  1229  		c.assembler.CompileRegisterToNone(mulInstruction, x2.register)
  1230  	} else {
  1231  		c.assembler.CompileRegisterToNone(mulInstruction, x1.register)
  1232  	}
  1233  
  1234  	c.locationStack.markRegisterUnused(x2.register)
  1235  	c.locationStack.markRegisterUnused(x1.register)
  1236  
  1237  	// Now we have the result in the AX register,
  1238  	// so we record it.
  1239  	c.pushRuntimeValueLocationOnRegister(resultRegister, x1.valueType)
  1240  	return nil
  1241  }
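
        // Illustrative sketch (not part of the compiler): a pure-Go model of the MUL
        // convention described above, assuming the math/bits package. The helper name
        // mulModel64 is made up for illustration: the product's low half lands in AX,
        // while the high half ("the overflow info") lands in DX and is discarded here.
        //
        //	func mulModel64(ax, src uint64) (newAX, newDX uint64) {
        //		hi, lo := bits.Mul64(ax, src)
        //		return lo, hi // Only the low half (AX) is pushed back onto the value stack.
        //	}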
  1242  
  1243  func (c *amd64Compiler) compileMulForFloats(instruction asm.Instruction) error {
  1244  	x2 := c.locationStack.pop()
  1245  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1246  		return err
  1247  	}
  1248  
  1249  	x1 := c.locationStack.peek() // Note this is peek!
  1250  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1251  		return err
  1252  	}
  1253  
  1254  	// x1 *= x2.
  1255  	c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)
  1256  
  1257  	// We no longer need x2 register after MUL operation here,
  1258  	// so we release it.
  1259  	c.locationStack.releaseRegister(x2)
  1260  	return nil
  1261  }
  1262  
  1263  // compileClz implements compiler.compileClz for the amd64 architecture.
  1264  func (c *amd64Compiler) compileClz(o *wazeroir.UnionOperation) error {
  1265  	target := c.locationStack.pop()
  1266  	if err := c.compileEnsureOnRegister(target); err != nil {
  1267  		return err
  1268  	}
  1269  
  1270  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1271  	if c.cpuFeatures.HasExtra(platform.CpuExtraFeatureABM) {
  1272  		if unsignedInt == wazeroir.UnsignedInt32 {
  1273  			c.assembler.CompileRegisterToRegister(amd64.LZCNTL, target.register, target.register)
  1274  		} else {
  1275  			c.assembler.CompileRegisterToRegister(amd64.LZCNTQ, target.register, target.register)
  1276  		}
  1277  	} else {
  1278  		// On processors that do not support LZCNT, we combine BSR (calculating
  1279  		// most significant set bit) with XOR. This logic is described in
  1280  		// "Replace Raw Assembly Code with Builtin Intrinsics" section in:
  1281  		// https://developer.apple.com/documentation/apple-silicon/addressing-architectural-differences-in-your-macos-code.
  1282  
  1283  		// First, we have to check if the target is non-zero as BSR is undefined
  1284  		// on zero. See https://www.felixcloutier.com/x86/bsr.
  1285  		c.assembler.CompileRegisterToRegister(amd64.TESTQ, target.register, target.register)
  1286  		jmpIfNonZero := c.assembler.CompileJump(amd64.JNE)
  1287  
  1288  		// If the value is zero, we just push the const value.
  1289  		if unsignedInt == wazeroir.UnsignedInt32 {
  1290  			c.assembler.CompileConstToRegister(amd64.MOVL, int64(32), target.register)
  1291  		} else {
  1292  			c.assembler.CompileConstToRegister(amd64.MOVL, int64(64), target.register)
  1293  		}
  1294  
  1295  		// Emit the jmp instruction to jump to the position right after
  1296  		// the non-zero case.
  1297  		jmpAtEndOfZero := c.assembler.CompileJump(amd64.JMP)
  1298  
  1299  		// Start emitting non-zero case.
  1300  		c.assembler.SetJumpTargetOnNext(jmpIfNonZero)
  1301  		// First, we calculate the most significant set bit.
  1302  		if unsignedInt == wazeroir.UnsignedInt32 {
  1303  			c.assembler.CompileRegisterToRegister(amd64.BSRL, target.register, target.register)
  1304  		} else {
  1305  			c.assembler.CompileRegisterToRegister(amd64.BSRQ, target.register, target.register)
  1306  		}
  1307  
  1308  		// Now we XOR the value with the bit length minus one.
  1309  		if unsignedInt == wazeroir.UnsignedInt32 {
  1310  			c.assembler.CompileConstToRegister(amd64.XORL, 31, target.register)
  1311  		} else {
  1312  			c.assembler.CompileConstToRegister(amd64.XORQ, 63, target.register)
  1313  		}
  1314  
  1315  		// Finally the end jump instruction of zero case must target towards
  1316  		// the next instruction.
  1317  		c.assembler.SetJumpTargetOnNext(jmpAtEndOfZero)
  1318  	}
  1319  
  1320  	// We reused the same register of target for the result.
  1321  	c.locationStack.markRegisterUnused(target.register)
  1322  	c.pushRuntimeValueLocationOnRegister(target.register, target.valueType)
  1323  	return nil
  1324  }
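
        // Illustrative sketch (not part of the compiler): a pure-Go model of the
        // BSR+XOR fallback above, assuming math/bits, where bsr(x) for nonzero x is the
        // index of the most significant set bit (bits.Len32(x)-1). The helper name
        // clzModel32 is made up for illustration.
        //
        //	func clzModel32(x uint32) uint32 {
        //		if x == 0 {
        //			return 32 // The zero case is branched around since BSR is undefined on zero.
        //		}
        //		bsr := uint32(bits.Len32(x) - 1)
        //		return bsr ^ 31 // Equivalent to 31 - bsr for bsr in [0, 31].
        //	}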
  1325  
  1326  // compileCtz implements compiler.compileCtz for the amd64 architecture.
  1327  func (c *amd64Compiler) compileCtz(o *wazeroir.UnionOperation) error {
  1328  	target := c.locationStack.pop()
  1329  	if err := c.compileEnsureOnRegister(target); err != nil {
  1330  		return err
  1331  	}
  1332  
  1333  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1334  	if c.cpuFeatures.HasExtra(platform.CpuExtraFeatureABM) {
  1335  		if unsignedInt == wazeroir.UnsignedInt32 {
  1336  			c.assembler.CompileRegisterToRegister(amd64.TZCNTL, target.register, target.register)
  1337  		} else {
  1338  			c.assembler.CompileRegisterToRegister(amd64.TZCNTQ, target.register, target.register)
  1339  		}
  1340  	} else {
  1341  		// On processors that do not support TZCNT, the BSF instruction is
  1342  		// executed instead. The key difference between the TZCNT and BSF
  1343  		// instructions is that, if the source operand is zero, BSF leaves the
  1344  		// destination operand undefined, whereas TZCNT returns the operand size.
  1345  		// https://www.felixcloutier.com/x86/tzcnt.html
  1346  
  1347  		// First we compare the target with zero.
  1348  		c.assembler.CompileRegisterToRegister(amd64.TESTQ, target.register, target.register)
  1349  		jmpIfNonZero := c.assembler.CompileJump(amd64.JNE)
  1350  
  1351  		// If the value is zero, we just push the const value.
  1352  		if unsignedInt == wazeroir.UnsignedInt32 {
  1353  			c.assembler.CompileConstToRegister(amd64.MOVL, int64(32), target.register)
  1354  		} else {
  1355  			c.assembler.CompileConstToRegister(amd64.MOVL, int64(64), target.register)
  1356  		}
  1357  
  1358  		// Emit the jmp instruction to jump to the position right after
  1359  		// the non-zero case.
  1360  		jmpAtEndOfZero := c.assembler.CompileJump(amd64.JMP)
  1361  
  1362  		// Otherwise, emit the TZCNT.
  1363  		c.assembler.SetJumpTargetOnNext(jmpIfNonZero)
  1364  		if unsignedInt == wazeroir.UnsignedInt32 {
  1365  			c.assembler.CompileRegisterToRegister(amd64.TZCNTL, target.register, target.register)
  1366  		} else {
  1367  			c.assembler.CompileRegisterToRegister(amd64.TZCNTQ, target.register, target.register)
  1368  		}
  1369  
  1370  		// Finally the end jump instruction of zero case must target towards
  1371  		// the next instruction.
  1372  		c.assembler.SetJumpTargetOnNext(jmpAtEndOfZero)
  1373  	}
  1374  
  1375  	// We reused the same register of target for the result.
  1376  	c.locationStack.markRegisterUnused(target.register)
  1377  	c.pushRuntimeValueLocationOnRegister(target.register, target.valueType)
  1378  	return nil
  1379  }
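
        // Illustrative sketch (not part of the compiler): a pure-Go model of the
        // zero-guarded path above, assuming math/bits. For nonzero inputs TZCNT and
        // BSF agree, so only the zero input needs the explicit constant. The helper
        // name ctzModel32 is made up for illustration.
        //
        //	func ctzModel32(x uint32) uint32 {
        //		if x == 0 {
        //			return 32 // BSF leaves the destination undefined on zero, hence the branch.
        //		}
        //		return uint32(bits.TrailingZeros32(x))
        //	}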
  1380  
  1381  // compilePopcnt implements compiler.compilePopcnt for the amd64 architecture.
  1382  func (c *amd64Compiler) compilePopcnt(o *wazeroir.UnionOperation) error {
  1383  	target := c.locationStack.pop()
  1384  	if err := c.compileEnsureOnRegister(target); err != nil {
  1385  		return err
  1386  	}
  1387  
  1388  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1389  	if unsignedInt == wazeroir.UnsignedInt32 {
  1390  		c.assembler.CompileRegisterToRegister(amd64.POPCNTL, target.register, target.register)
  1391  	} else {
  1392  		c.assembler.CompileRegisterToRegister(amd64.POPCNTQ, target.register, target.register)
  1393  	}
  1394  
  1395  	// We reused the same register of target for the result.
  1396  	c.locationStack.markRegisterUnused(target.register)
  1397  	c.pushRuntimeValueLocationOnRegister(target.register, target.valueType)
  1398  	return nil
  1399  }
  1400  
  1401  // compileDiv implements compiler.compileDiv for the amd64 architecture.
  1402  func (c *amd64Compiler) compileDiv(o *wazeroir.UnionOperation) (err error) {
  1403  	signedType := wazeroir.SignedType(o.B1)
  1404  	switch signedType {
  1405  	case wazeroir.SignedTypeUint32:
  1406  		err = c.compileDivForInts(true, false)
  1407  	case wazeroir.SignedTypeUint64:
  1408  		err = c.compileDivForInts(false, false)
  1409  	case wazeroir.SignedTypeInt32:
  1410  		err = c.compileDivForInts(true, true)
  1411  	case wazeroir.SignedTypeInt64:
  1412  		err = c.compileDivForInts(false, true)
  1413  	case wazeroir.SignedTypeFloat32:
  1414  		err = c.compileDivForFloats(true)
  1415  	case wazeroir.SignedTypeFloat64:
  1416  		err = c.compileDivForFloats(false)
  1417  	}
  1418  	return
  1419  }
  1420  
  1421  // compileDivForInts emits the instructions to perform division on the top
  1422  // two values of integer type on the stack and puts the quotient of the result
  1423  // onto the stack. For example, stack [..., 10, 3] results in [..., 3] where
  1424  // the remainder is discarded.
  1425  func (c *amd64Compiler) compileDivForInts(is32Bit bool, signed bool) error {
  1426  	if err := c.performDivisionOnInts(false, is32Bit, signed); err != nil {
  1427  		return err
  1428  	}
  1429  	// Now we have the quotient of the division result in the AX register,
  1430  	// so we record it.
  1431  	if is32Bit {
  1432  		c.pushRuntimeValueLocationOnRegister(amd64.RegAX, runtimeValueTypeI32)
  1433  	} else {
  1434  		c.pushRuntimeValueLocationOnRegister(amd64.RegAX, runtimeValueTypeI64)
  1435  	}
  1436  	return nil
  1437  }
  1438  
  1439  // compileRem implements compiler.compileRem for the amd64 architecture.
  1440  func (c *amd64Compiler) compileRem(o *wazeroir.UnionOperation) (err error) {
  1441  	var vt runtimeValueType
  1442  	signedInt := wazeroir.SignedInt(o.B1)
  1443  	switch signedInt {
  1444  	case wazeroir.SignedInt32:
  1445  		err = c.performDivisionOnInts(true, true, true)
  1446  		vt = runtimeValueTypeI32
  1447  	case wazeroir.SignedInt64:
  1448  		err = c.performDivisionOnInts(true, false, true)
  1449  		vt = runtimeValueTypeI64
  1450  	case wazeroir.SignedUint32:
  1451  		err = c.performDivisionOnInts(true, true, false)
  1452  		vt = runtimeValueTypeI32
  1453  	case wazeroir.SignedUint64:
  1454  		err = c.performDivisionOnInts(true, false, false)
  1455  		vt = runtimeValueTypeI64
  1456  	}
  1457  	if err != nil {
  1458  		return err
  1459  	}
  1460  
  1461  	// Now we have the remainder of the division result in the DX register,
  1462  	// so we record it.
  1463  	c.pushRuntimeValueLocationOnRegister(amd64.RegDX, vt)
  1464  	return
  1465  }
  1466  
  1467  // performDivisionOnInts emits the instructions to do divisions on top two integers on the stack
  1468  // via DIV (unsigned div) and IDIV (signed div) instructions.
  1469  // See the following explanation of these instructions' semantics from https://www.lri.fr/~filliatr/ens/compil/x86-64.pdf
  1470  //
  1471  // >> Division requires special arrangements: idiv (signed) and div (unsigned) operate on a 2n-byte dividend and
  1472  // >> an n-byte divisor to produce an n-byte quotient and n-byte remainder. The dividend always lives in a fixed pair of
  1473  // >> registers (%edx and %eax for the 32-bit case; %rdx and %rax for the 64-bit case); the divisor is specified as the
  1474  // >> source operand in the instruction. The quotient goes in %eax (resp. %rax); the remainder in %edx (resp. %rdx). For
  1475  // >> signed division, the cltd (resp. ctqo) instruction is used to prepare %edx (resp. %rdx) with the sign extension of
  1476  // >> %eax (resp. %rax). For example, if a,b, c are memory locations holding quad words, then we could set c = a/b
  1477  // >> using the sequence: movq a(%rip), %rax; ctqo; idivq b(%rip); movq %rax, c(%rip).
  1478  //
  1479  // tl;dr is that the division result is placed in AX and DX registers after instructions emitted by this function
  1480  // where AX holds the quotient while DX the remainder of the division result.
  1481  func (c *amd64Compiler) performDivisionOnInts(isRem, is32Bit, signed bool) error {
  1482  	const (
  1483  		quotientRegister  = amd64.RegAX
  1484  		remainderRegister = amd64.RegDX
  1485  	)
  1486  
  1487  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  1488  		return err
  1489  	}
  1490  
  1491  	// Ensures that previous values on these registers are saved to memory.
  1492  	c.onValueReleaseRegisterToStack(quotientRegister)
  1493  	c.onValueReleaseRegisterToStack(remainderRegister)
  1494  
  1495  	// In order to ensure that x2 is placed on a temporary register other than AX and DX,
  1496  	// we mark both of them as used here.
  1497  	c.locationStack.markRegisterUsed(quotientRegister)
  1498  	c.locationStack.markRegisterUsed(remainderRegister)
  1499  
  1500  	// Ensure that x2 is placed on a register which is not either AX or DX.
  1501  	x2 := c.locationStack.pop()
  1502  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1503  		return err
  1504  	}
  1505  
  1506  	// Now we successfully place x2 on a temp register, so we no longer need to
  1507  	// mark these registers used.
  1508  	c.locationStack.markRegisterUnused(quotientRegister)
  1509  	c.locationStack.markRegisterUnused(remainderRegister)
  1510  
  1511  	// Check if the x2 equals zero.
  1512  	if is32Bit {
  1513  		c.assembler.CompileRegisterToRegister(amd64.TESTL, x2.register, x2.register)
  1514  	} else {
  1515  		c.assembler.CompileRegisterToRegister(amd64.TESTQ, x2.register, x2.register)
  1516  	}
  1517  
  1518  	// Skipped if the divisor is nonzero.
  1519  	c.compileMaybeExitFromNativeCode(amd64.JNE, nativeCallStatusIntegerDivisionByZero)
  1520  
  1521  	// next, we ensure that x1 is placed on AX.
  1522  	x1 := c.locationStack.pop()
  1523  	if x1.onRegister() && x1.register != quotientRegister {
  1524  		// Move x1 to quotientRegister.
  1525  		if is32Bit {
  1526  			c.assembler.CompileRegisterToRegister(amd64.MOVL, x1.register, quotientRegister)
  1527  		} else {
  1528  			c.assembler.CompileRegisterToRegister(amd64.MOVQ, x1.register, quotientRegister)
  1529  		}
  1530  		c.locationStack.markRegisterUnused(x1.register)
  1531  		x1.setRegister(quotientRegister)
  1532  	} else if x1.onStack() {
  1533  		x1.setRegister(quotientRegister)
  1534  		c.compileLoadValueOnStackToRegister(x1)
  1535  	}
  1536  
  1537  	// Note: at this point, x1 is placed on AX, x2 is on a register which is not AX or DX.
  1538  
  1539  	isSignedRem := isRem && signed
  1540  	isSignedDiv := !isRem && signed
  1541  	var signedRemMinusOneDivisorJmp asm.Node
  1542  	if isSignedRem {
  1543  		// If this is for getting the remainder of a signed division,
  1544  		// we have to treat the special case where the divisor equals -1.
  1545  		// For example, in the 32-bit case, the result of (-2^31) / -1 is (quotient=2^31, remainder=0),
  1546  		// where the quotient doesn't fit in the 32-bit range whose maximum is 2^31-1.
  1547  		// x86 raises a floating point exception in this case, but according to the Wasm spec,
  1548  		// if the divisor equals -1, the remainder must be zero (not undefined!), so we have to
  1549  		// emit the special branches below instead of letting the hardware division trap.
  1550  		// For detail, please refer to https://stackoverflow.com/questions/56303282/why-idiv-with-1-causes-floating-point-exception
  1551  
  1552  		// First we store zero into the remainder result register (DX) and compare the divisor with -1.
  1553  		if is32Bit {
  1554  			c.assembler.CompileRegisterToRegister(amd64.XORL, remainderRegister, remainderRegister)
  1555  			c.assembler.CompileRegisterToConst(amd64.CMPL, x2.register, -1)
  1556  		} else {
  1557  			c.assembler.CompileRegisterToRegister(amd64.XORQ, remainderRegister, remainderRegister)
  1558  			c.assembler.CompileRegisterToConst(amd64.CMPQ, x2.register, -1)
  1559  		}
  1560  
  1561  		// If it equals minus one, we skip the normal case.
  1562  		signedRemMinusOneDivisorJmp = c.assembler.CompileJump(amd64.JEQ)
  1563  	} else if isSignedDiv {
  1564  		// For signed division, we have to emit branches for the "math.MinInt{32,64} / -1"
  1565  		// case, which raises a floating point exception (division error) because
  1566  		// the resulting value exceeds the maximum of the signed integer type.
  1567  
  1568  		// First we compare the division with -1.
  1569  		if is32Bit {
  1570  			c.assembler.CompileRegisterToConst(amd64.CMPL, x2.register, -1)
  1571  		} else {
  1572  			c.assembler.CompileRegisterToConst(amd64.CMPQ, x2.register, -1)
  1573  		}
  1574  
  1575  		// If it doesn't equal minus one, we jump to the normal case.
  1576  		nonMinusOneDivisorJmp := c.assembler.CompileJump(amd64.JNE)
  1577  
  1578  		// Next, we check whether the dividend is the most negative value for the signed integer type,
  1579  		// i.e. whether we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1) respectively.
  1580  		if is32Bit {
  1581  			if err := c.assembler.CompileRegisterToStaticConst(amd64.CMPL, x1.register, c.minimum32BitSignedInt); err != nil {
  1582  				return err
  1583  			}
  1584  		} else {
  1585  			if err := c.assembler.CompileRegisterToStaticConst(amd64.CMPQ, x1.register, c.minimum64BitSignedInt); err != nil {
  1586  				return err
  1587  			}
  1588  		}
  1589  
  1590  		// Trap if we are trying to do (math.MinInt32 / -1) or (math.MinInt64 / -1),
  1591  		// as that overflows the division: the result becomes 2^31 (or 2^63), which is larger than
  1592  		// the maximum signed 32-bit (or 64-bit) integer.
  1593  		c.compileMaybeExitFromNativeCode(amd64.JNE, nativeCallStatusIntegerOverflow)
  1594  		// Set the normal case's jump target.
  1595  		c.assembler.SetJumpTargetOnNext(nonMinusOneDivisorJmp)
  1596  	}
  1597  
  1598  	// Now ready to emit the div instruction.
  1599  	// Since the div instruction takes a 2n-byte dividend placed in the DX:AX registers...
  1600  	// * signed case - we need to sign-extend the dividend into DX register via CDQ (32 bit) or CQO (64 bit).
  1601  	// * unsigned case - we need to zero DX register via "XOR DX DX"
  1602  	if is32Bit && signed {
  1603  		// Emit sign-extension to have 64 bit dividend over DX and AX registers.
  1604  		c.assembler.CompileStandAlone(amd64.CDQ)
  1605  		c.assembler.CompileRegisterToNone(amd64.IDIVL, x2.register)
  1606  	} else if is32Bit && !signed {
  1607  		// Zeros DX register to have 64 bit dividend over DX and AX registers.
  1608  		c.assembler.CompileRegisterToRegister(amd64.XORQ, amd64.RegDX, amd64.RegDX)
  1609  		c.assembler.CompileRegisterToNone(amd64.DIVL, x2.register)
  1610  	} else if !is32Bit && signed {
  1611  		// Emits sign-extension to have 128 bit dividend over DX and AX registers.
  1612  		c.assembler.CompileStandAlone(amd64.CQO)
  1613  		c.assembler.CompileRegisterToNone(amd64.IDIVQ, x2.register)
  1614  	} else if !is32Bit && !signed {
  1615  		// Zeros DX register to have 128 bit dividend over DX and AX registers.
  1616  		c.assembler.CompileRegisterToRegister(amd64.XORQ, amd64.RegDX, amd64.RegDX)
  1617  		c.assembler.CompileRegisterToNone(amd64.DIVQ, x2.register)
  1618  	}
  1619  
  1620  	// If this is signed rem instruction, we must set the jump target of
  1621  	// the exit jump from division -1 case towards the next instruction.
  1622  	if signedRemMinusOneDivisorJmp != nil {
  1623  		c.assembler.SetJumpTargetOnNext(signedRemMinusOneDivisorJmp)
  1624  	}
  1625  
  1626  	// We mark them as unused so that we can push one of them onto the location stack at call sites.
  1627  	c.locationStack.markRegisterUnused(remainderRegister)
  1628  	c.locationStack.markRegisterUnused(quotientRegister)
  1629  	c.locationStack.markRegisterUnused(x2.register)
  1630  	return nil
  1631  }
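
        // Illustrative sketch (not part of the compiler): a pure-Go model of the
        // branches emitted above for the 32-bit signed case, following the Wasm
        // semantics this function enforces (trap on a zero divisor, trap on
        // math.MinInt32 / -1 for division, and a zero remainder for a -1 divisor).
        // The helper name divRemS32Model is made up for illustration.
        //
        //	func divRemS32Model(isRem bool, x1, x2 int32) (result int32, trap bool) {
        //		if x2 == 0 {
        //			return 0, true // nativeCallStatusIntegerDivisionByZero
        //		}
        //		if x2 == -1 {
        //			if isRem {
        //				return 0, false // Signed rem with a -1 divisor is defined to be zero.
        //			}
        //			if x1 == math.MinInt32 {
        //				return 0, true // nativeCallStatusIntegerOverflow: the quotient 2^31 does not fit.
        //			}
        //		}
        //		if isRem {
        //			return x1 % x2, false
        //		}
        //		return x1 / x2, false
        //	}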
  1632  
  1633  // compileDivForFloats emits the instructions to perform division
  1634  // on the top two values of float type on the stack, placing the result back onto the stack.
  1635  // For example, stack [..., 1.0, 4.0] results in [..., 0.25].
  1636  func (c *amd64Compiler) compileDivForFloats(is32Bit bool) error {
  1637  	if is32Bit {
  1638  		return c.compileSimpleBinaryOp(amd64.DIVSS)
  1639  	} else {
  1640  		return c.compileSimpleBinaryOp(amd64.DIVSD)
  1641  	}
  1642  }
  1643  
  1644  // compileAnd implements compiler.compileAnd for the amd64 architecture.
  1645  func (c *amd64Compiler) compileAnd(o *wazeroir.UnionOperation) (err error) {
  1646  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1647  	switch unsignedInt {
  1648  	case wazeroir.UnsignedInt32:
  1649  		err = c.compileSimpleBinaryOp(amd64.ANDL)
  1650  	case wazeroir.UnsignedInt64:
  1651  		err = c.compileSimpleBinaryOp(amd64.ANDQ)
  1652  	}
  1653  	return
  1654  }
  1655  
  1656  // compileOr implements compiler.compileOr for the amd64 architecture.
  1657  func (c *amd64Compiler) compileOr(o *wazeroir.UnionOperation) (err error) {
  1658  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1659  	switch unsignedInt {
  1660  	case wazeroir.UnsignedInt32:
  1661  		err = c.compileSimpleBinaryOp(amd64.ORL)
  1662  	case wazeroir.UnsignedInt64:
  1663  		err = c.compileSimpleBinaryOp(amd64.ORQ)
  1664  	}
  1665  	return
  1666  }
  1667  
  1668  // compileXor implements compiler.compileXor for the amd64 architecture.
  1669  func (c *amd64Compiler) compileXor(o *wazeroir.UnionOperation) (err error) {
  1670  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1671  	switch unsignedInt {
  1672  	case wazeroir.UnsignedInt32:
  1673  		err = c.compileSimpleBinaryOp(amd64.XORL)
  1674  	case wazeroir.UnsignedInt64:
  1675  		err = c.compileSimpleBinaryOp(amd64.XORQ)
  1676  	}
  1677  	return
  1678  }
  1679  
  1680  // compileSimpleBinaryOp emits instructions to pop two values from the stack,
  1681  // perform the given instruction on these two values, and push the result
  1682  // back onto the stack.
  1683  func (c *amd64Compiler) compileSimpleBinaryOp(instruction asm.Instruction) error {
  1684  	x2 := c.locationStack.pop()
  1685  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1686  		return err
  1687  	}
  1688  
  1689  	x1 := c.locationStack.pop()
  1690  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1691  		return err
  1692  	}
  1693  
  1694  	c.assembler.CompileRegisterToRegister(instruction, x2.register, x1.register)
  1695  
  1696  	// We consumed x2 register after the operation here,
  1697  	// so we release it.
  1698  	c.locationStack.releaseRegister(x2)
  1699  
  1700  	// We already stored the result in the register used by x1
  1701  	// so we record it.
  1702  	c.locationStack.markRegisterUnused(x1.register)
  1703  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  1704  	return nil
  1705  }
  1706  
  1707  // compileShl implements compiler.compileShl for the amd64 architecture.
  1708  func (c *amd64Compiler) compileShl(o *wazeroir.UnionOperation) (err error) {
  1709  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1710  	switch unsignedInt {
  1711  	case wazeroir.UnsignedInt32:
  1712  		err = c.compileShiftOp(amd64.SHLL, false)
  1713  	case wazeroir.UnsignedInt64:
  1714  		err = c.compileShiftOp(amd64.SHLQ, true)
  1715  	}
  1716  	return
  1717  }
  1718  
  1719  // compileShr implements compiler.compileShr for the amd64 architecture.
  1720  func (c *amd64Compiler) compileShr(o *wazeroir.UnionOperation) (err error) {
  1721  	signedInt := wazeroir.SignedInt(o.B1)
  1722  	switch signedInt {
  1723  	case wazeroir.SignedInt32:
  1724  		err = c.compileShiftOp(amd64.SARL, true)
  1725  	case wazeroir.SignedInt64:
  1726  		err = c.compileShiftOp(amd64.SARQ, false)
  1727  	case wazeroir.SignedUint32:
  1728  		err = c.compileShiftOp(amd64.SHRL, true)
  1729  	case wazeroir.SignedUint64:
  1730  		err = c.compileShiftOp(amd64.SHRQ, false)
  1731  	}
  1732  	return
  1733  }
  1734  
  1735  // compileRotl implements compiler.compileRotl for the amd64 architecture.
  1736  func (c *amd64Compiler) compileRotl(o *wazeroir.UnionOperation) (err error) {
  1737  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1738  	switch unsignedInt {
  1739  	case wazeroir.UnsignedInt32:
  1740  		err = c.compileShiftOp(amd64.ROLL, true)
  1741  	case wazeroir.UnsignedInt64:
  1742  		err = c.compileShiftOp(amd64.ROLQ, false)
  1743  	}
  1744  	return
  1745  }
  1746  
  1747  // compileRotr implements compiler.compileRotr for the amd64 architecture.
  1748  func (c *amd64Compiler) compileRotr(o *wazeroir.UnionOperation) (err error) {
  1749  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  1750  	switch unsignedInt {
  1751  	case wazeroir.UnsignedInt32:
  1752  		err = c.compileShiftOp(amd64.RORL, true)
  1753  	case wazeroir.UnsignedInt64:
  1754  		err = c.compileShiftOp(amd64.RORQ, false)
  1755  	}
  1756  	return
  1757  }
  1758  
  1759  // compileShiftOp adds instructions for shift operations (SHR, SHL, ROTR, ROTL)
  1760  // where we have to place the second value (shift counts) on the CX register.
  1761  func (c *amd64Compiler) compileShiftOp(instruction asm.Instruction, is32Bit bool) error {
  1762  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  1763  		return err
  1764  	}
  1765  
  1766  	x2 := c.locationStack.pop()
  1767  
  1768  	// Ensures that x2 (holding shift counts) is placed on the CX register.
  1769  	const shiftCountRegister = amd64.RegCX
  1770  	if (x2.onRegister() && x2.register != shiftCountRegister) || x2.onStack() {
  1771  		// If another value lives on the CX register, we release it to the stack.
  1772  		c.onValueReleaseRegisterToStack(shiftCountRegister)
  1773  
  1774  		if x2.onRegister() {
  1775  			x2r := x2.register
  1776  			// If x2 lives on a register, we move the value to CX.
  1777  			if is32Bit {
  1778  				c.assembler.CompileRegisterToRegister(amd64.MOVL, x2r, shiftCountRegister)
  1779  			} else {
  1780  				c.assembler.CompileRegisterToRegister(amd64.MOVQ, x2r, shiftCountRegister)
  1781  			}
  1782  			// We no longer place any value on the original register, so we record it.
  1783  			c.locationStack.markRegisterUnused(x2r)
  1784  		} else {
  1785  			// If it is on stack, we just move the memory allocated value to the CX register.
  1786  			x2.setRegister(shiftCountRegister)
  1787  			c.compileLoadValueOnStackToRegister(x2)
  1788  		}
  1789  		c.locationStack.markRegisterUsed(shiftCountRegister)
  1790  	}
  1791  
  1792  	x1 := c.locationStack.peek() // Note this is peek!
  1793  	x1r := x1.register
  1794  
  1795  	if x1.onRegister() {
  1796  		c.assembler.CompileRegisterToRegister(instruction, shiftCountRegister, x1r)
  1797  	} else {
  1798  		// Shift target can be placed on a memory location.
  1799  		// Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range.
  1800  		c.assembler.CompileRegisterToMemory(instruction, shiftCountRegister, amd64ReservedRegisterForStackBasePointerAddress, int64(x1.stackPointer)*8)
  1801  	}
  1802  
  1803  	// We consumed x2 register after the operation here,
  1804  	// so we release it.
  1805  	c.locationStack.markRegisterUnused(shiftCountRegister)
  1806  	return nil
  1807  }
  1808  
  1809  // compileAbs implements compiler.compileAbs for the amd64 architecture.
  1810  //
  1811  // See the following discussions for how we could take the abs of floats on x86 assembly.
  1812  // https://stackoverflow.com/questions/32408665/fastest-way-to-compute-absolute-value-using-sse/32422471#32422471
  1813  // https://stackoverflow.com/questions/44630015/how-would-fabsdouble-be-implemented-on-x86-is-it-an-expensive-operation
  1814  func (c *amd64Compiler) compileAbs(o *wazeroir.UnionOperation) (err error) {
  1815  	target := c.locationStack.peek() // Note this is peek!
  1816  	if err = c.compileEnsureOnRegister(target); err != nil {
  1817  		return err
  1818  	}
  1819  
  1820  	// First shift left by one to clear the sign bit, and then shift right by one.
  1821  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  1822  		c.assembler.CompileConstToRegister(amd64.PSLLD, 1, target.register)
  1823  		c.assembler.CompileConstToRegister(amd64.PSRLD, 1, target.register)
  1824  	} else {
  1825  		c.assembler.CompileConstToRegister(amd64.PSLLQ, 1, target.register)
  1826  		c.assembler.CompileConstToRegister(amd64.PSRLQ, 1, target.register)
  1827  	}
  1828  	return nil
  1829  }
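
        // Illustrative sketch (not part of the compiler): a pure-Go model of the
        // shift-left-then-shift-right trick above, which clears only the sign bit of
        // the float's bit pattern. The helper name absModel32 is made up for illustration.
        //
        //	func absModel32(f float32) float32 {
        //		b := math.Float32bits(f)
        //		return math.Float32frombits((b << 1) >> 1) // PSLLD by 1 then PSRLD by 1.
        //	}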
  1830  
  1831  // compileNeg implements compiler.compileNeg for the amd64 architecture.
  1832  func (c *amd64Compiler) compileNeg(o *wazeroir.UnionOperation) (err error) {
  1833  	target := c.locationStack.peek() // Note this is peek!
  1834  	if err := c.compileEnsureOnRegister(target); err != nil {
  1835  		return err
  1836  	}
  1837  
  1838  	tmpReg, err := c.allocateRegister(registerTypeVector)
  1839  	if err != nil {
  1840  		return err
  1841  	}
  1842  
  1843  	// First we move the sign-bit mask (placed in memory) to the tmp register,
  1844  	// since we cannot XOR a float register directly with a constant.
  1845  	// Then we negate the value by XORing it with the sign-bit mask.
  1846  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  1847  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.float32SignBitMask, tmpReg)
  1848  		if err != nil {
  1849  			return err
  1850  		}
  1851  		c.assembler.CompileRegisterToRegister(amd64.XORPS, tmpReg, target.register)
  1852  	} else {
  1853  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.float64SignBitMask, tmpReg)
  1854  		if err != nil {
  1855  			return err
  1856  		}
  1857  		c.assembler.CompileRegisterToRegister(amd64.XORPD, tmpReg, target.register)
  1858  	}
  1859  	return nil
  1860  }
  1861  
  1862  // compileCeil implements compiler.compileCeil for the amd64 architecture.
  1863  func (c *amd64Compiler) compileCeil(o *wazeroir.UnionOperation) (err error) {
  1864  	// Internally, ceil can be performed via ROUND instruction with 0x02 mode.
  1865  	// See https://android.googlesource.com/platform/bionic/+/882b8af/libm/x86_64/ceilf.S for example.
  1866  	return c.compileRoundInstruction(wazeroir.Float(o.B1) == wazeroir.Float32, 0x02)
  1867  }
  1868  
  1869  // compileFloor implements compiler.compileFloor for the amd64 architecture.
  1870  func (c *amd64Compiler) compileFloor(o *wazeroir.UnionOperation) (err error) {
  1871  	// Internally, floor can be performed via ROUND instruction with 0x01 mode.
  1872  	// See https://android.googlesource.com/platform/bionic/+/882b8af/libm/x86_64/floorf.S for example.
  1873  	return c.compileRoundInstruction(wazeroir.Float(o.B1) == wazeroir.Float32, 0x01)
  1874  }
  1875  
  1876  // compileTrunc implements compiler.compileTrunc for the amd64 architecture.
  1877  func (c *amd64Compiler) compileTrunc(o *wazeroir.UnionOperation) error {
  1878  	// Internally, trunc can be performed via ROUND instruction with 0x03 mode.
  1879  	// See https://android.googlesource.com/platform/bionic/+/882b8af/libm/x86_64/truncf.S for example.
  1880  	return c.compileRoundInstruction(wazeroir.Float(o.B1) == wazeroir.Float32, 0x03)
  1881  }
  1882  
  1883  // compileNearest implements compiler.compileNearest for the amd64 architecture.
  1884  func (c *amd64Compiler) compileNearest(o *wazeroir.UnionOperation) error {
  1885  	// Nearest can be performed via ROUND instruction with 0x00 mode.
  1886  	return c.compileRoundInstruction(wazeroir.Float(o.B1) == wazeroir.Float32, 0x00)
  1887  }
  1888  
  1889  func (c *amd64Compiler) compileRoundInstruction(is32Bit bool, mode int64) error {
  1890  	target := c.locationStack.peek() // Note this is peek!
  1891  	if err := c.compileEnsureOnRegister(target); err != nil {
  1892  		return err
  1893  	}
  1894  
  1895  	if is32Bit {
  1896  		c.assembler.CompileRegisterToRegisterWithArg(amd64.ROUNDSS, target.register, target.register, byte(mode))
  1897  	} else {
  1898  		c.assembler.CompileRegisterToRegisterWithArg(amd64.ROUNDSD, target.register, target.register, byte(mode))
  1899  	}
  1900  	return nil
  1901  }
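
        // Illustrative sketch (not part of the compiler): the immediate modes used
        // above map onto the standard math package as follows (0x00 round-to-nearest-even,
        // 0x01 floor, 0x02 ceil, 0x03 trunc). The helper name roundModel64 is made up
        // for illustration.
        //
        //	func roundModel64(f float64, mode byte) float64 {
        //		switch mode {
        //		case 0x00:
        //			return math.RoundToEven(f) // compileNearest
        //		case 0x01:
        //			return math.Floor(f) // compileFloor
        //		case 0x02:
        //			return math.Ceil(f) // compileCeil
        //		default: // 0x03
        //			return math.Trunc(f) // compileTrunc
        //		}
        //	}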
  1902  
  1903  // compileMin implements compiler.compileMin for the amd64 architecture.
  1904  func (c *amd64Compiler) compileMin(o *wazeroir.UnionOperation) error {
  1905  	is32Bit := wazeroir.Float(o.B1) == wazeroir.Float32
  1906  	if is32Bit {
  1907  		return c.compileMinOrMax(is32Bit, true, amd64.MINSS)
  1908  	} else {
  1909  		return c.compileMinOrMax(is32Bit, true, amd64.MINSD)
  1910  	}
  1911  }
  1912  
  1913  // compileMax implements compiler.compileMax for the amd64 architecture.
  1914  func (c *amd64Compiler) compileMax(o *wazeroir.UnionOperation) error {
  1915  	is32Bit := wazeroir.Float(o.B1) == wazeroir.Float32
  1916  	if is32Bit {
  1917  		return c.compileMinOrMax(is32Bit, false, amd64.MAXSS)
  1918  	} else {
  1919  		return c.compileMinOrMax(is32Bit, false, amd64.MAXSD)
  1920  	}
  1921  }
  1922  
  1923  // compileMinOrMax adds instructions to pop two values from the stack, and push back either the minimum or
  1924  // the maximum of these two values onto the stack according to the minOrMaxInstruction argument.
  1925  // minOrMaxInstruction must be one of MAXSS, MAXSD, MINSS or MINSD.
  1926  // Note: These native min/max instructions are almost compatible with min/max in the Wasm specification,
  1927  // but it is slightly different with respect to the NaN handling.
  1928  // Native min/max instructions return a non-NaN value if exactly one of the target values
  1929  // is NaN. For example, native_{min,max}(5.0, NaN) always returns 5.0, not NaN.
  1930  // However, WebAssembly specifies that min/max must always return NaN if either of the values is NaN.
  1931  // Therefore, in this function, we have to add conditional jumps to check if one of the values is NaN before
  1932  // the native min/max, which is why we cannot simply emit a native min/max instruction here.
  1933  //
  1934  // For the semantics, see wazeroir.Min and wazeroir.Max for detail.
  1935  func (c *amd64Compiler) compileMinOrMax(is32Bit, isMin bool, minOrMaxInstruction asm.Instruction) error {
  1936  	x2 := c.locationStack.pop()
  1937  	if err := c.compileEnsureOnRegister(x2); err != nil {
  1938  		return err
  1939  	}
  1940  	x1 := c.locationStack.pop()
  1941  	if err := c.compileEnsureOnRegister(x1); err != nil {
  1942  		return err
  1943  	}
  1944  
  1945  	// Check if this is (either x1 or x2 is NaN) or (x1 equals x2) case
  1946  	if is32Bit {
  1947  		c.assembler.CompileRegisterToRegister(amd64.UCOMISS, x2.register, x1.register)
  1948  	} else {
  1949  		c.assembler.CompileRegisterToRegister(amd64.UCOMISD, x2.register, x1.register)
  1950  	}
  1951  
  1952  	// At this point, we have the three cases of conditional flags below
  1953  	// (See https://www.felixcloutier.com/x86/ucomiss#operation for detail.)
  1954  	//
  1955  	// 1) Two values are NaN-free and different: All flags are cleared.
  1956  	// 2) Two values are NaN-free and equal: Only ZF flags is set.
  1957  	// 3) One of Two values is NaN: ZF, PF and CF flags are set.
  1958  
  1959  	// Jump instruction to handle 1) case by checking the ZF flag
  1960  	// as ZF is only set for 2) and 3) cases.
  1961  	nanFreeOrDiffJump := c.assembler.CompileJump(amd64.JNE)
  1962  
  1963  	// Start handling 2) and 3).
  1964  
  1965  	// Jump if one of two values is NaN by checking the parity flag (PF).
  1966  	includeNaNJmp := c.assembler.CompileJump(amd64.JPS)
  1967  
  1968  	// Start handling 2).
  1969  
  1970  	// Before we exit this case, we have to ensure that positive zero (or negative zero for min instruction) is
  1971  	// returned if two values are positive and negative zeros.
  1972  	var inst asm.Instruction
  1973  	switch {
  1974  	case is32Bit && isMin:
  1975  		inst = amd64.ORPS
  1976  	case !is32Bit && isMin:
  1977  		inst = amd64.ORPD
  1978  	case is32Bit && !isMin:
  1979  		inst = amd64.ANDPS
  1980  	case !is32Bit && !isMin:
  1981  		inst = amd64.ANDPD
  1982  	}
  1983  	c.assembler.CompileRegisterToRegister(inst, x2.register, x1.register)
  1984  
  1985  	sameExitJmp := c.assembler.CompileJump(amd64.JMP)
  1986  
  1987  	// start handling 3).
  1988  	c.assembler.SetJumpTargetOnNext(includeNaNJmp)
  1989  
  1990  	// We emit the ADD instruction to produce the NaN in x1.
  1991  	if is32Bit {
  1992  		c.assembler.CompileRegisterToRegister(amd64.ADDSS, x2.register, x1.register)
  1993  	} else {
  1994  		c.assembler.CompileRegisterToRegister(amd64.ADDSD, x2.register, x1.register)
  1995  	}
  1996  
  1997  	// Exit from the NaN case branch.
  1998  	nanExitJmp := c.assembler.CompileJump(amd64.JMP)
  1999  
  2000  	// Start handling 1).
  2001  	c.assembler.SetJumpTargetOnNext(nanFreeOrDiffJump)
  2002  
  2003  	// Now handle the NaN-free and different values case.
  2004  	c.assembler.CompileRegisterToRegister(minOrMaxInstruction, x2.register, x1.register)
  2005  
  2006  	// Set the jump target of 1) and 2) cases to the next instruction after 3) case.
  2007  	c.assembler.SetJumpTargetOnNext(nanExitJmp)
  2008  	c.assembler.SetJumpTargetOnNext(sameExitJmp)
  2009  
  2010  	// Record that we consumed the x2 and placed the minOrMax result in the x1's register.
  2011  	c.locationStack.markRegisterUnused(x2.register)
  2012  	c.locationStack.markRegisterUnused(x1.register)
  2013  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  2014  	return nil
  2015  }
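
        // Illustrative sketch (not part of the compiler): a pure-Go model of the three
        // cases handled above, shown for min on 32-bit floats and following the Wasm
        // semantics described in the comment (NaN propagates; min(+0, -0) is -0 via the
        // ORPS of the two bit patterns). The helper name minModel32 is made up for illustration.
        //
        //	func minModel32(x1, x2 float32) float32 {
        //		switch {
        //		case x1 != x1 || x2 != x2: // Case 3): at least one NaN; the ADDSS result is NaN too.
        //			return x1 + x2
        //		case x1 == x2: // Case 2): equal values, including +0 vs -0; OR the bit patterns.
        //			return math.Float32frombits(math.Float32bits(x1) | math.Float32bits(x2))
        //		default: // Case 1): NaN-free and different; the native MINSS result is used as-is.
        //			return float32(math.Min(float64(x1), float64(x2)))
        //		}
        //	}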
  2016  
  2017  // compileCopysign implements compiler.compileCopysign for the amd64 architecture.
  2018  func (c *amd64Compiler) compileCopysign(o *wazeroir.UnionOperation) error {
  2019  	is32Bit := wazeroir.Float(o.B1) == wazeroir.Float32
  2020  
  2021  	x2 := c.locationStack.pop()
  2022  	if err := c.compileEnsureOnRegister(x2); err != nil {
  2023  		return err
  2024  	}
  2025  	x1 := c.locationStack.pop()
  2026  	if err := c.compileEnsureOnRegister(x1); err != nil {
  2027  		return err
  2028  	}
  2029  	tmpReg, err := c.allocateRegister(registerTypeVector)
  2030  	if err != nil {
  2031  		return err
  2032  	}
  2033  
  2034  	// Move the rest bit mask to the temp register.
  2035  	if is32Bit {
  2036  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.float32RestBitMask, tmpReg)
  2037  	} else {
  2038  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.float64RestBitMask, tmpReg)
  2039  	}
  2040  	if err != nil {
  2041  		return err
  2042  	}
  2043  
  2044  	// Clear the sign bit of x1 via AND with the mask.
  2045  	if is32Bit {
  2046  		c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmpReg, x1.register)
  2047  	} else {
  2048  		c.assembler.CompileRegisterToRegister(amd64.ANDPD, tmpReg, x1.register)
  2049  	}
  2050  
  2051  	// Move the sign bit mask to the temp register.
  2052  	if is32Bit {
  2053  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.float32SignBitMask, tmpReg)
  2054  	} else {
  2055  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.float64SignBitMask, tmpReg)
  2056  	}
  2057  	if err != nil {
  2058  		return err
  2059  	}
  2060  
  2061  	// Clear the non-sign bits of x2 via AND with the mask.
  2062  	if is32Bit {
  2063  		c.assembler.CompileRegisterToRegister(amd64.ANDPS, tmpReg, x2.register)
  2064  	} else {
  2065  		c.assembler.CompileRegisterToRegister(amd64.ANDPD, tmpReg, x2.register)
  2066  	}
  2067  
  2068  	// Finally, copy the sign bit of x2 to x1.
  2069  	if is32Bit {
  2070  		c.assembler.CompileRegisterToRegister(amd64.ORPS, x2.register, x1.register)
  2071  	} else {
  2072  		c.assembler.CompileRegisterToRegister(amd64.ORPD, x2.register, x1.register)
  2073  	}
  2074  
  2075  	// Record that we consumed the x2 and placed the copysign result in the x1's register.
  2076  	c.locationStack.markRegisterUnused(x2.register)
  2077  	c.locationStack.markRegisterUnused(x1.register)
  2078  	c.pushRuntimeValueLocationOnRegister(x1.register, x1.valueType)
  2079  	return nil
  2080  }
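
        // Illustrative sketch (not part of the compiler): a pure-Go model of the mask
        // sequence above for the 32-bit case, using the same sign/rest bit masks defined
        // at the top of this file. The helper name copysignModel32 is made up for illustration.
        //
        //	func copysignModel32(x1, x2 float32) float32 {
        //		const signMask uint32 = 1 << 31
        //		rest := math.Float32bits(x1) &^ signMask // ANDPS with the rest bit mask clears x1's sign.
        //		sign := math.Float32bits(x2) & signMask  // ANDPS with the sign bit mask keeps only x2's sign.
        //		return math.Float32frombits(rest | sign) // ORPS combines them.
        //	}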
  2081  
  2082  // compileSqrt implements compiler.compileSqrt for the amd64 architecture.
  2083  func (c *amd64Compiler) compileSqrt(o *wazeroir.UnionOperation) error {
  2084  	target := c.locationStack.peek() // Note this is peek!
  2085  	if err := c.compileEnsureOnRegister(target); err != nil {
  2086  		return err
  2087  	}
  2088  	if wazeroir.Float(o.B1) == wazeroir.Float32 {
  2089  		c.assembler.CompileRegisterToRegister(amd64.SQRTSS, target.register, target.register)
  2090  	} else {
  2091  		c.assembler.CompileRegisterToRegister(amd64.SQRTSD, target.register, target.register)
  2092  	}
  2093  	return nil
  2094  }
  2095  
  2096  // compileI32WrapFromI64 implements compiler.compileI32WrapFromI64 for the amd64 architecture.
  2097  func (c *amd64Compiler) compileI32WrapFromI64() error {
  2098  	target := c.locationStack.peek() // Note this is peek!
  2099  	if err := c.compileEnsureOnRegister(target); err != nil {
  2100  		return err
  2101  	}
  2102  	c.assembler.CompileRegisterToRegister(amd64.MOVL, target.register, target.register)
  2103  	target.valueType = runtimeValueTypeI32
  2104  	return nil
  2105  }
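
        // Illustrative sketch (not part of the compiler): MOVL between 32-bit registers
        // zero-extends into the upper half, which is exactly Wasm's i32.wrap_i64. A
        // pure-Go equivalent (wrapModel is a made-up name):
        //
        //	func wrapModel(x uint64) uint32 { return uint32(x) }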
  2106  
  2107  // compileITruncFromF implements compiler.compileITruncFromF for the amd64 architecture.
  2108  //
  2109  // Note: in the following implementation, we use CVTTSS2SI and CVTTSD2SI to convert floats to signed integers.
  2110  // According to the Intel manual ([1],[2]), if the source float value is either +-Inf or NaN, or it exceeds the representable range
  2111  // of the target signed integer, then the instruction returns the "masked" response float32SignBitMask (or float64SignBitMask for the 64-bit case).
  2112  // [1] Chapter 11.5.2, SIMD Floating-Point Exception Conditions in "Vol 1, Intel® 64 and IA-32 Architectures Manual"
  2113  //
  2114  //	https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-1-manual.html
  2115  //
  2116  // [2] https://xem.github.io/minix86/manual/intel-x86-and-64-manual-vol1/o_7281d5ea06a5b67a-268.html
  2117  func (c *amd64Compiler) compileITruncFromF(o *wazeroir.UnionOperation) (err error) {
  2118  	inputType := wazeroir.Float(o.B1)
  2119  	outputType := wazeroir.SignedInt(o.B2)
  2120  	nonTrapping := o.B3
  2121  	if inputType == wazeroir.Float32 && outputType == wazeroir.SignedInt32 {
  2122  		err = c.emitSignedI32TruncFromFloat(true, nonTrapping)
  2123  	} else if inputType == wazeroir.Float32 && outputType == wazeroir.SignedInt64 {
  2124  		err = c.emitSignedI64TruncFromFloat(true, nonTrapping)
  2125  	} else if inputType == wazeroir.Float64 && outputType == wazeroir.SignedInt32 {
  2126  		err = c.emitSignedI32TruncFromFloat(false, nonTrapping)
  2127  	} else if inputType == wazeroir.Float64 && outputType == wazeroir.SignedInt64 {
  2128  		err = c.emitSignedI64TruncFromFloat(false, nonTrapping)
  2129  	} else if inputType == wazeroir.Float32 && outputType == wazeroir.SignedUint32 {
  2130  		err = c.emitUnsignedI32TruncFromFloat(true, nonTrapping)
  2131  	} else if inputType == wazeroir.Float32 && outputType == wazeroir.SignedUint64 {
  2132  		err = c.emitUnsignedI64TruncFromFloat(true, nonTrapping)
  2133  	} else if inputType == wazeroir.Float64 && outputType == wazeroir.SignedUint32 {
  2134  		err = c.emitUnsignedI32TruncFromFloat(false, nonTrapping)
  2135  	} else if inputType == wazeroir.Float64 && outputType == wazeroir.SignedUint64 {
  2136  		err = c.emitUnsignedI64TruncFromFloat(false, nonTrapping)
  2137  	}
  2138  	return
  2139  }
  2140  
  2141  // emitUnsignedI32TruncFromFloat implements compileITruncFromF when the destination type is a 32-bit unsigned integer.
  2142  func (c *amd64Compiler) emitUnsignedI32TruncFromFloat(isFloat32Bit, nonTrapping bool) error {
  2143  	source := c.locationStack.pop()
  2144  	if err := c.compileEnsureOnRegister(source); err != nil {
  2145  		return err
  2146  	}
  2147  
  2148  	result, err := c.allocateRegister(registerTypeGeneralPurpose)
  2149  	if err != nil {
  2150  		return err
  2151  	}
  2152  
  2153  	// First, we check whether the source float value is above or equal to math.MaxInt32+1.
  2154  	if isFloat32Bit {
  2155  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.float32ForMaximumSigned32bitIntPlusOne, source.register)
  2156  	} else {
  2157  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.float64ForMaximumSigned32bitIntPlusOne, source.register)
  2158  	}
  2159  	if err != nil {
  2160  		return err
  2161  	}
  2162  
  2163  	// Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception.
  2164  	var nonTrappingNaNJump asm.Node
  2165  	if nonTrapping {
  2166  		jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set.
  2167  		// In the non-trapping case, NaN is cast to zero.
  2168  		// Zero out the result register by XORing it with itself.
  2169  		c.assembler.CompileRegisterToRegister(amd64.XORL, result, result)
  2170  		nonTrappingNaNJump = c.assembler.CompileJump(amd64.JMP)
  2171  		c.assembler.SetJumpTargetOnNext(jmpIfNotNaN)
  2172  	} else {
  2173  		c.compileMaybeExitFromNativeCode(amd64.JPC, nativeCallStatusCodeInvalidFloatToIntConversion)
  2174  	}
  2175  
  2176  	// Jump if the source float value is above or equal to math.MaxInt32+1.
  2177  	jmpAboveOrEqualMaxInt32PlusOne := c.assembler.CompileJump(amd64.JCC)
  2178  
  2179  	// next we convert the value as a signed integer.
  2180  	if isFloat32Bit {
  2181  		c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SL, source.register, result)
  2182  	} else {
  2183  		c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SL, source.register, result)
  2184  	}
  2185  
  2186  	// Then if the result is minus, it is invalid conversion from minus float (incl. -Inf).
  2187  	c.assembler.CompileRegisterToRegister(amd64.TESTL, result, result)
  2188  
  2189  	var nonTrappingMinusJump asm.Node
  2190  	if nonTrapping {
  2191  		jmpIfNotMinusOrMinusInf := c.assembler.CompileJump(amd64.JPL)
  2192  		// In the non-trapping case, a negative value is cast to zero.
  2193  		// Zero out the result register by XORing it with itself.
  2194  		c.assembler.CompileRegisterToRegister(amd64.XORL, result, result)
  2195  		nonTrappingMinusJump = c.assembler.CompileJump(amd64.JMP)
  2196  		c.assembler.SetJumpTargetOnNext(jmpIfNotMinusOrMinusInf)
  2197  	} else {
  2198  		c.compileMaybeExitFromNativeCode(amd64.JPL, nativeCallStatusIntegerOverflow)
  2199  	}
  2200  
  2201  	// Otherwise, the value is valid.
  2202  	okJmpForLessThanMaxInt32PlusOne := c.assembler.CompileJump(amd64.JMP)
  2203  
  2204  	// Now, start handling the case where the original float value is above or equal math.MaxInt32+1.
  2205  	//
  2206  	// First, we subtract the math.MaxInt32+1 from the original value so it can fit in signed 32-bit integer.
  2207  	c.assembler.SetJumpTargetOnNext(jmpAboveOrEqualMaxInt32PlusOne)
  2208  	if isFloat32Bit {
  2209  		err = c.assembler.CompileStaticConstToRegister(amd64.SUBSS, c.float32ForMaximumSigned32bitIntPlusOne, source.register)
  2210  	} else {
  2211  		err = c.assembler.CompileStaticConstToRegister(amd64.SUBSD, c.float64ForMaximumSigned32bitIntPlusOne, source.register)
  2212  	}
  2213  	if err != nil {
  2214  		return err
  2215  	}
  2216  
  2217  	// Then, convert the subtracted value as a signed 32-bit integer.
  2218  	if isFloat32Bit {
  2219  		c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SL, source.register, result)
  2220  	} else {
  2221  		c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SL, source.register, result)
  2222  	}
  2223  
  2224  	// Next, we have to check if the value came from NaN or +Inf.
  2225  	// NaN or +Inf cases result in 0x8000_0000 according to the semantics of the conversion,
  2226  	// so we check whether the resulting integer value is negative or not.
  2227  	c.assembler.CompileRegisterToRegister(amd64.TESTL, result, result)
  2228  
  2229  	// If the result is minus, the conversion is invalid (from NaN or +Inf)
  2230  	var nonTrappingAboveOrEqualMaxInt32PlusOne asm.Node
  2231  	if nonTrapping {
  2232  		jmpIfNotPlusInf := c.assembler.CompileJump(amd64.JPL)
  2233  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.maximum32BitUnsignedInt, result)
  2234  		if err != nil {
  2235  			return err
  2236  		}
  2237  		nonTrappingAboveOrEqualMaxInt32PlusOne = c.assembler.CompileJump(amd64.JMP)
  2238  		c.assembler.SetJumpTargetOnNext(jmpIfNotPlusInf)
  2239  	} else {
  2240  		c.compileMaybeExitFromNativeCode(amd64.JPL, nativeCallStatusIntegerOverflow)
  2241  	}
  2242  
  2243  	// Otherwise, we successfully converted the source float minus (math.MaxInt32+1) to int.
  2244  	// So, we recover the original value by adding back (math.MaxInt32+1), i.e. the sign bit mask, to the result.
  2245  	if err = c.assembler.CompileStaticConstToRegister(amd64.ADDL, c.float32SignBitMask, result); err != nil {
  2246  		return err
  2247  	}
  2248  
  2249  	// We jump to the next instructions for valid cases.
  2250  	c.assembler.SetJumpTargetOnNext(okJmpForLessThanMaxInt32PlusOne)
  2251  	if nonTrapping {
  2252  		c.assembler.SetJumpTargetOnNext(nonTrappingAboveOrEqualMaxInt32PlusOne)
  2253  		c.assembler.SetJumpTargetOnNext(nonTrappingMinusJump)
  2254  		c.assembler.SetJumpTargetOnNext(nonTrappingNaNJump)
  2255  	}
  2256  
  2257  	// We consumed the source's register and placed the conversion result
  2258  	// in the result register.
  2259  	c.locationStack.markRegisterUnused(source.register)
  2260  	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
  2261  	return nil
  2262  }
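
        // Illustrative sketch (not part of the compiler): a pure-Go model of the
        // trapping path above for a 64-bit float source. Values at or above 2^31 are
        // handled by subtracting 2^31, truncating as a signed integer, and adding the
        // sign mask back. The helper name truncU32Model is made up for illustration;
        // false stands for an exit with the corresponding native call status.
        //
        //	func truncU32Model(f float64) (uint32, bool) {
        //		if f != f {
        //			return 0, false // NaN: nativeCallStatusCodeInvalidFloatToIntConversion.
        //		}
        //		const maxInt32PlusOne = 2147483648.0
        //		if f >= maxInt32PlusOne {
        //			if f >= 2*maxInt32PlusOne {
        //				return 0, false // Too large (incl. +Inf): nativeCallStatusIntegerOverflow.
        //			}
        //			return uint32(int32(f-maxInt32PlusOne)) + 0x8000_0000, true
        //		}
        //		if f <= -1 {
        //			return 0, false // Negative beyond -1 (incl. -Inf): nativeCallStatusIntegerOverflow.
        //		}
        //		return uint32(int32(f)), true // Values in (-1, 2^31) truncate directly.
        //	}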
  2263  
  2264  // emitUnsignedI64TruncFromFloat implements compileITruncFromF when the destination type is a 64-bit unsigned integer.
  2265  func (c *amd64Compiler) emitUnsignedI64TruncFromFloat(isFloat32Bit, nonTrapping bool) error {
  2266  	source := c.locationStack.pop()
  2267  	if err := c.compileEnsureOnRegister(source); err != nil {
  2268  		return err
  2269  	}
  2270  
  2271  	result, err := c.allocateRegister(registerTypeGeneralPurpose)
  2272  	if err != nil {
  2273  		return err
  2274  	}
  2275  
  2276  	// First, we check whether the source float value is above or equal to math.MaxInt64+1.
  2277  	if isFloat32Bit {
  2278  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.float32ForMaximumSigned64bitIntPlusOne, source.register)
  2279  	} else {
  2280  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.float64ForMaximumSigned64bitIntPlusOne, source.register)
  2281  	}
  2282  	if err != nil {
  2283  		return err
  2284  	}
  2285  
  2286  	// Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception.
  2287  	var nonTrappingNaNJump asm.Node
  2288  	if nonTrapping {
  2289  		jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set.
  2290  		// In the non-trapping case, NaN is cast to zero.
  2291  		// Zero out the result register by XORing it with itself.
  2292  		c.assembler.CompileRegisterToRegister(amd64.XORQ, result, result)
  2293  		nonTrappingNaNJump = c.assembler.CompileJump(amd64.JMP)
  2294  		c.assembler.SetJumpTargetOnNext(jmpIfNotNaN)
  2295  	} else {
  2296  		c.compileMaybeExitFromNativeCode(amd64.JPC, nativeCallStatusCodeInvalidFloatToIntConversion)
  2297  	}
  2298  
  2299  	// Jump if the source float value is above or equal to math.MaxInt64+1.
  2300  	jmpAboveOrEqualMaxInt64PlusOne := c.assembler.CompileJump(amd64.JCC)
  2301  
  2302  	// next we convert the value as a signed integer.
  2303  	if isFloat32Bit {
  2304  		c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SQ, source.register, result)
  2305  	} else {
  2306  		c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SQ, source.register, result)
  2307  	}
  2308  
  2309  	// Then if the result is minus, it is invalid conversion from minus float (incl. -Inf).
  2310  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, result, result)
  2311  
  2312  	var nonTrappingMinusJump asm.Node
  2313  	if nonTrapping {
  2314  		jmpIfNotMinusOrMinusInf := c.assembler.CompileJump(amd64.JPL)
  2315  		// In the non-trapping case, a negative value is cast to zero.
  2316  		// Zero out the result register by XORing it with itself.
  2317  		c.assembler.CompileRegisterToRegister(amd64.XORQ, result, result)
  2318  		nonTrappingMinusJump = c.assembler.CompileJump(amd64.JMP)
  2319  		c.assembler.SetJumpTargetOnNext(jmpIfNotMinusOrMinusInf)
  2320  	} else {
  2321  		c.compileMaybeExitFromNativeCode(amd64.JPL, nativeCallStatusIntegerOverflow)
  2322  	}
  2323  
  2324  	// Otherwise, the value is valid.
  2325  	okJmpForLessThanMaxInt64PlusOne := c.assembler.CompileJump(amd64.JMP)
  2326  
  2327  	// Now, start handling the case where the original float value is above or equal math.MaxInt64+1.
  2328  	//
  2329  	// First, we subtract the math.MaxInt64+1 from the original value so it can fit in signed 64-bit integer.
  2330  	c.assembler.SetJumpTargetOnNext(jmpAboveOrEqualMaxInt64PlusOne)
  2331  	if isFloat32Bit {
  2332  		err = c.assembler.CompileStaticConstToRegister(amd64.SUBSS, c.float32ForMaximumSigned64bitIntPlusOne, source.register)
  2333  	} else {
  2334  		err = c.assembler.CompileStaticConstToRegister(amd64.SUBSD, c.float64ForMaximumSigned64bitIntPlusOne, source.register)
  2335  	}
  2336  	if err != nil {
  2337  		return err
  2338  	}
  2339  
  2340  	// Then, convert the subtracted value as a signed 64-bit integer.
  2341  	if isFloat32Bit {
  2342  		c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SQ, source.register, result)
  2343  	} else {
  2344  		c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SQ, source.register, result)
  2345  	}
  2346  
  2347  	// Next, we have to check if the value came from NaN or +Inf.
  2348  	// NaN or +Inf cases result in 0x8000_0000_0000_0000 according to the semantics of the conversion,
  2349  	// so we check whether the resulting integer value is negative or not.
  2350  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, result, result)
  2351  
  2352  	// If the result is minus, the conversion is invalid (from NaN or +Inf)
  2353  	var nonTrappingAboveOrEqualMaxInt64PlusOne asm.Node
  2354  	if nonTrapping {
  2355  		jmpIfNotPlusInf := c.assembler.CompileJump(amd64.JPL)
  2356  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.maximum64BitUnsignedInt, result)
  2357  		if err != nil {
  2358  			return err
  2359  		}
  2360  		nonTrappingAboveOrEqualMaxInt64PlusOne = c.assembler.CompileJump(amd64.JMP)
  2361  		c.assembler.SetJumpTargetOnNext(jmpIfNotPlusInf)
  2362  	} else {
  2363  		c.compileMaybeExitFromNativeCode(amd64.JPL, nativeCallStatusIntegerOverflow)
  2364  	}
  2365  
  2366  	// Otherwise, we successfully converted the source float minus (math.MaxInt64+1) to int.
  2367  	// So, we recover the original value by adding back (math.MaxInt64+1), i.e. the sign bit mask, to the result.
  2368  	if err = c.assembler.CompileStaticConstToRegister(amd64.ADDQ, c.float64SignBitMask, result); err != nil {
  2369  		return err
  2370  	}
  2371  
  2372  	// We jump to the next instructions for valid cases.
  2373  	c.assembler.SetJumpTargetOnNext(okJmpForLessThanMaxInt64PlusOne)
  2374  	if nonTrapping {
  2375  		c.assembler.SetJumpTargetOnNext(nonTrappingAboveOrEqualMaxInt64PlusOne)
  2376  		c.assembler.SetJumpTargetOnNext(nonTrappingMinusJump)
  2377  		c.assembler.SetJumpTargetOnNext(nonTrappingNaNJump)
  2378  	}
  2379  
  2380  	// We consumed the source's register and placed the conversion result
  2381  	// in the result register.
  2382  	c.locationStack.markRegisterUnused(source.register)
  2383  	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI64)
  2384  	return nil
  2385  }
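
// For reference, the Wasm-level behavior the unsigned 64-bit truncation above implements can be
// sketched in plain Go roughly as follows. This is illustrative only and not part of the compiler;
// panic stands in for the real trap/exit mechanism.
//
//	func i64TruncF64U(v float64, nonTrapping bool) uint64 {
//		if v != v { // NaN
//			if nonTrapping {
//				return 0
//			}
//			panic(nativeCallStatusCodeInvalidFloatToIntConversion)
//		}
//		if v <= -1 || v >= 18446744073709551616.0 { // truncation falls outside the uint64 range (2^64).
//			if nonTrapping {
//				if v < 0 {
//					return 0
//				}
//				return math.MaxUint64
//			}
//			panic(nativeCallStatusIntegerOverflow)
//		}
//		return uint64(v)
//	}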
  2386  
  2387  // emitSignedI32TruncFromFloat implements compileITruncFromF when the destination type is a 32-bit signed integer.
  2388  func (c *amd64Compiler) emitSignedI32TruncFromFloat(isFloat32Bit, nonTrapping bool) error {
  2389  	source := c.locationStack.pop()
  2390  	if err := c.compileEnsureOnRegister(source); err != nil {
  2391  		return err
  2392  	}
  2393  
  2394  	result, err := c.allocateRegister(registerTypeGeneralPurpose)
  2395  	if err != nil {
  2396  		return err
  2397  	}
  2398  
  2399  	// First we unconditionally convert source to integer via CVTTSS2SI (CVTTSD2SI for 64bit float).
  2400  	if isFloat32Bit {
  2401  		c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SL, source.register, result)
  2402  	} else {
  2403  		c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SL, source.register, result)
  2404  	}
  2405  
  2406  	// We compare the conversion result with the sign bit mask to check whether either
  2407  	// 1) the source float value is +-Inf or NaN, or exceeds the representable range of 32-bit signed integers, or
  2408  	// 2) the source equals the minimum signed 32-bit integer (=-2147483648.000000) whose bit pattern is float32ForMinimumSigned32bitInteger for 32-bit floats
  2409  	// 	  or float64ForMinimumSigned32bitInteger for 64-bit floats.
  2410  	err = c.assembler.CompileStaticConstToRegister(amd64.CMPL, c.float32SignBitMask, result)
  2411  	if err != nil {
  2412  		return err
  2413  	}
  2414  
  2415  	// Otherwise, jump to exit as the result is valid.
  2416  	okJmp := c.assembler.CompileJump(amd64.JNE)
  2417  
  2418  	// Start handling the case of 1) and 2).
  2419  	// First, check if the value is NaN.
  2420  	if isFloat32Bit {
  2421  		c.assembler.CompileRegisterToRegister(amd64.UCOMISS, source.register, source.register)
  2422  	} else {
  2423  		c.assembler.CompileRegisterToRegister(amd64.UCOMISD, source.register, source.register)
  2424  	}
  2425  
  2426  	// Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception.
  2427  	var nontrappingNanJump asm.Node
  2428  	if nonTrapping {
  2429  		jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set.
  2430  		// In the non-trapping case, NaN is converted to zero.
  2431  		// Zero out the result register by XORing it with itself.
  2432  		c.assembler.CompileRegisterToRegister(amd64.XORL, result, result)
  2433  		nontrappingNanJump = c.assembler.CompileJump(amd64.JMP)
  2434  		c.assembler.SetJumpTargetOnNext(jmpIfNotNaN)
  2435  	} else {
  2436  		// If the value is NaN, we exit the function with nativeCallStatusCodeInvalidFloatToIntConversion.
  2437  		c.compileMaybeExitFromNativeCode(amd64.JPC, nativeCallStatusCodeInvalidFloatToIntConversion)
  2438  	}
  2439  
  2440  	// Compare the value against the minimum 32-bit signed integer value to see whether
  2441  	// it exceeds the lower bound of the 32-bit signed integer range.
  2442  	if isFloat32Bit {
  2443  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.float32ForMinimumSigned32bitInteger, source.register)
  2444  	} else {
  2445  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.float64ForMinimumSigned32bitInteger, source.register)
  2446  	}
  2447  	if err != nil {
  2448  		return err
  2449  	}
  2450  
  2451  	if !nonTrapping {
  2452  		// Trap if the value exceeds the lower bound.
  2453  		if isFloat32Bit {
  2454  			c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusIntegerOverflow)
  2455  		} else {
  2456  			c.compileMaybeExitFromNativeCode(amd64.JHI, nativeCallStatusIntegerOverflow)
  2457  		}
  2458  
  2459  		// At this point, the value is the minimum signed 32-bit int (=-2147483648.000000) or larger than 32-bit maximum.
  2460  		// So, check if the value equals the minimum signed 32-bit int.
  2461  		if isFloat32Bit {
  2462  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.fourZeros, source.register)
  2463  		} else {
  2464  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.eightZeros, source.register)
  2465  		}
  2466  		if err != nil {
  2467  			return err
  2468  		}
  2469  
  2470  		// Trap if the value is not negative (i.e. not the minimum signed 32-bit int).
  2471  		c.compileMaybeExitFromNativeCode(amd64.JCS, nativeCallStatusIntegerOverflow)
  2472  
  2473  		// We jump to the next instructions for valid cases.
  2474  		c.assembler.SetJumpTargetOnNext(okJmp)
  2475  	} else {
  2476  		// Jump if the value does not exceed the lower bound.
  2477  		var jmpIfNotExceedsLowerBound asm.Node
  2478  		if isFloat32Bit {
  2479  			jmpIfNotExceedsLowerBound = c.assembler.CompileJump(amd64.JCC)
  2480  		} else {
  2481  			jmpIfNotExceedsLowerBound = c.assembler.CompileJump(amd64.JHI)
  2482  		}
  2483  
  2484  		// If the value exceeds the lower bound, we "saturate" it to the minimum.
  2485  		if err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.minimum32BitSignedInt, result); err != nil {
  2486  			return err
  2487  		}
  2488  		nonTrappingSaturatedMinimumJump := c.assembler.CompileJump(amd64.JMP)
  2489  
  2490  		// Otherwise, the value is the minimum signed 32-bit int (=-2147483648.000000) or larger than 32-bit maximum.
  2491  		c.assembler.SetJumpTargetOnNext(jmpIfNotExceedsLowerBound)
  2492  		if isFloat32Bit {
  2493  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.fourZeros, source.register)
  2494  		} else {
  2495  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.eightZeros, source.register)
  2496  		}
  2497  		if err != nil {
  2498  			return err
  2499  		}
  2500  		jmpIfMinimumSignedInt := c.assembler.CompileJump(amd64.JCS) // jump if the value is minus (= the minimum signed 32-bit int).
  2501  
  2502  		// If the value exceeds signed 32-bit maximum, we saturate it to the maximum.
  2503  		if err = c.assembler.CompileStaticConstToRegister(amd64.MOVL, c.maximum32BitSignedInt, result); err != nil {
  2504  			return err
  2505  		}
  2506  
  2507  		c.assembler.SetJumpTargetOnNext(okJmp)
  2508  		c.assembler.SetJumpTargetOnNext(nontrappingNanJump)
  2509  		c.assembler.SetJumpTargetOnNext(nonTrappingSaturatedMinimumJump)
  2510  		c.assembler.SetJumpTargetOnNext(jmpIfMinimumSignedInt)
  2511  	}
  2512  
  2513  	// We consumed the source's register and placed the conversion result
  2514  	// in the result register.
  2515  	c.locationStack.markRegisterUnused(source.register)
  2516  	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
  2517  	return nil
  2518  }
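
// Likewise, the signed 32-bit truncation above corresponds roughly to the following Go sketch
// (shown for the float64 source; the float32 case is analogous). Illustrative only and not part
// of the compiler; panic stands in for the real trap/exit mechanism.
//
//	func i32TruncF64S(v float64, nonTrapping bool) int32 {
//		if v != v { // NaN
//			if nonTrapping {
//				return 0
//			}
//			panic(nativeCallStatusCodeInvalidFloatToIntConversion)
//		}
//		t := math.Trunc(v)
//		if t < -2147483648.0 || t > 2147483647.0 { // truncation falls outside the int32 range.
//			if nonTrapping {
//				if t < 0 {
//					return math.MinInt32
//				}
//				return math.MaxInt32
//			}
//			panic(nativeCallStatusIntegerOverflow)
//		}
//		return int32(t)
//	}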
  2519  
  2520  // emitSignedI64TruncFromFloat implements compileITruncFromF when the destination type is a 64-bit signed integer.
  2521  func (c *amd64Compiler) emitSignedI64TruncFromFloat(isFloat32Bit, nonTrapping bool) error {
  2522  	source := c.locationStack.pop()
  2523  	if err := c.compileEnsureOnRegister(source); err != nil {
  2524  		return err
  2525  	}
  2526  
  2527  	result, err := c.allocateRegister(registerTypeGeneralPurpose)
  2528  	if err != nil {
  2529  		return err
  2530  	}
  2531  
  2532  	// First we unconditionally convert source to integer via CVTTSS2SI (CVTTSD2SI for 64bit float).
  2533  	if isFloat32Bit {
  2534  		c.assembler.CompileRegisterToRegister(amd64.CVTTSS2SQ, source.register, result)
  2535  	} else {
  2536  		c.assembler.CompileRegisterToRegister(amd64.CVTTSD2SQ, source.register, result)
  2537  	}
  2538  
  2539  	// We compare the conversion result with the sign bit mask to check whether either
  2540  	// 1) the source float value is +-Inf or NaN, or exceeds the representable range of 64-bit signed integers, or
  2541  	// 2) the source equals the minimum signed 64-bit integer (=-9223372036854775808.0) whose bit pattern is float32ForMinimumSigned64bitInteger for 32-bit floats
  2542  	// 	  or float64ForMinimumSigned64bitInteger for 64-bit floats.
  2543  	err = c.assembler.CompileStaticConstToRegister(amd64.CMPQ, c.float64SignBitMask, result)
  2544  	if err != nil {
  2545  		return err
  2546  	}
  2547  
  2548  	// Otherwise, we simply jump to exit as the result is valid.
  2549  	okJmp := c.assembler.CompileJump(amd64.JNE)
  2550  
  2551  	// Start handling the case of 1) and 2).
  2552  	// First, check if the value is NaN.
  2553  	if isFloat32Bit {
  2554  		c.assembler.CompileRegisterToRegister(amd64.UCOMISS, source.register, source.register)
  2555  	} else {
  2556  		c.assembler.CompileRegisterToRegister(amd64.UCOMISD, source.register, source.register)
  2557  	}
  2558  
  2559  	// Check the parity flag (set when the value is NaN), and if it is set, we should raise an exception.
  2560  	var nontrappingNanJump asm.Node
  2561  	if nonTrapping {
  2562  		jmpIfNotNaN := c.assembler.CompileJump(amd64.JPC) // jump if parity is not set.
  2563  		// In the non-trapping case, NaN is converted to zero.
  2564  		// Zero out the result register by XORing it with itself.
  2565  		c.assembler.CompileRegisterToRegister(amd64.XORQ, result, result)
  2566  		nontrappingNanJump = c.assembler.CompileJump(amd64.JMP)
  2567  		c.assembler.SetJumpTargetOnNext(jmpIfNotNaN)
  2568  	} else {
  2569  		c.compileMaybeExitFromNativeCode(amd64.JPC, nativeCallStatusCodeInvalidFloatToIntConversion)
  2570  	}
  2571  
  2572  	// Compare the value against the minimum 64-bit signed integer value to see whether
  2573  	// it exceeds the lower bound of the 64-bit signed integer range.
  2574  	if isFloat32Bit {
  2575  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.float32ForMinimumSigned64bitInteger, source.register)
  2576  	} else {
  2577  		err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.float64ForMinimumSigned64bitInteger, source.register)
  2578  	}
  2579  	if err != nil {
  2580  		return err
  2581  	}
  2582  
  2583  	if !nonTrapping {
  2584  		// Trap if the value exceeds the lower bound (e.g. the value is -Inf).
  2585  		c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusIntegerOverflow)
  2586  
  2587  		// At this point, the value is the minimum signed 64-bit int (=-9223372036854775808.0) or larger than 64-bit maximum.
  2588  		// So, check if the value equals the minimum signed 64-bit int.
  2589  		if isFloat32Bit {
  2590  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.fourZeros, source.register)
  2591  		} else {
  2592  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.eightZeros, source.register)
  2593  		}
  2594  		if err != nil {
  2595  			return err
  2596  		}
  2597  
  2598  		// Trap if the value is not negative (i.e. not the minimum signed 64-bit int).
  2599  		c.compileMaybeExitFromNativeCode(amd64.JCS, nativeCallStatusIntegerOverflow)
  2600  
  2601  		// We jump to the next instructions for valid cases.
  2602  		c.assembler.SetJumpTargetOnNext(okJmp)
  2603  	} else {
  2604  		// Jump if the value does not exceed the lower bound.
  2605  		jmpIfNotExceedsLowerBound := c.assembler.CompileJump(amd64.JCC)
  2606  
  2607  		// If the value exceeds the lower bound, we "saturate" it to the minimum.
  2608  		err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.minimum64BitSignedInt, result)
  2609  		if err != nil {
  2610  			return err
  2611  		}
  2612  
  2613  		nonTrappingSaturatedMinimumJump := c.assembler.CompileJump(amd64.JMP)
  2614  
  2615  		// Otherwise, the value is the minimum signed 64-bit int (=-9223372036854775808.0) or larger than 64-bit maximum.
  2616  		// So, check if the value equals the minimum signed 64-bit int.
  2617  		c.assembler.SetJumpTargetOnNext(jmpIfNotExceedsLowerBound)
  2618  		if isFloat32Bit {
  2619  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISS, c.fourZeros, source.register)
  2620  		} else {
  2621  			err = c.assembler.CompileStaticConstToRegister(amd64.UCOMISD, c.eightZeros, source.register)
  2622  		}
  2623  		if err != nil {
  2624  			return err
  2625  		}
  2626  
  2627  		jmpIfMinimumSignedInt := c.assembler.CompileJump(amd64.JCS) // jump if the value is minus (= the minimum signed 64-bit int).
  2628  
  2629  		// If the value exceeds signed 64-bit maximum, we saturate it to the maximum.
  2630  		if err = c.assembler.CompileStaticConstToRegister(amd64.MOVQ, c.maximum64BitSignedInt, result); err != nil {
  2631  			return err
  2632  		}
  2633  
  2634  		c.assembler.SetJumpTargetOnNext(okJmp)
  2635  		c.assembler.SetJumpTargetOnNext(jmpIfMinimumSignedInt)
  2636  		c.assembler.SetJumpTargetOnNext(nonTrappingSaturatedMinimumJump)
  2637  		c.assembler.SetJumpTargetOnNext(nontrappingNanJump)
  2638  	}
  2639  
  2640  	// We consumed the source's register and placed the conversion result
  2641  	// in the result register.
  2642  	c.locationStack.markRegisterUnused(source.register)
  2643  	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI64)
  2644  	return nil
  2645  }
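
// The fast-path check in the function above (and the 32-bit sentinel 0x8000_0000 in its i32
// counterpart) exploits how the CVTTS*2SI family reports failure. A rough sketch, assuming a
// hypothetical cvttsd2si helper that mirrors the hardware instruction (not a real function here):
//
//	raw := cvttsd2si(v) // returns the "integer indefinite" value for NaN or out-of-range inputs.
//	if uint64(raw) != 0x8000_0000_0000_0000 {
//		return raw // ordinary in-range result: nothing more to check.
//	}
//	// Slow path: v is NaN, outside the int64 range, or exactly -2^63, which is the only valid
//	// input that also produces the indefinite bit pattern; the UCOMIS* checks above tell these apart.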
  2646  
  2647  // compileFConvertFromI implements compiler.compileFConvertFromI for the amd64 architecture.
  2648  func (c *amd64Compiler) compileFConvertFromI(o *wazeroir.UnionOperation) (err error) {
  2649  	inputType := wazeroir.SignedInt(o.B1)
  2650  	outputType := wazeroir.Float(o.B2)
  2651  	if outputType == wazeroir.Float32 && inputType == wazeroir.SignedInt32 {
  2652  		err = c.compileSimpleConversion(amd64.CVTSL2SS, registerTypeVector, runtimeValueTypeF32) // = CVTSI2SS for 32bit int
  2653  	} else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedInt64 {
  2654  		err = c.compileSimpleConversion(amd64.CVTSQ2SS, registerTypeVector, runtimeValueTypeF32) // = CVTSI2SS for 64bit int
  2655  	} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedInt32 {
  2656  		err = c.compileSimpleConversion(amd64.CVTSL2SD, registerTypeVector, runtimeValueTypeF64) // = CVTSI2SD for 32bit int
  2657  	} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedInt64 {
  2658  		err = c.compileSimpleConversion(amd64.CVTSQ2SD, registerTypeVector, runtimeValueTypeF64) // = CVTSI2SD for 64bit int
  2659  	} else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedUint32 {
  2660  		// See the following link for why we use 64bit conversion for unsigned 32bit integer sources:
  2661  		// https://stackoverflow.com/questions/41495498/fpu-operations-generated-by-gcc-during-casting-integer-to-float.
  2662  		//
  2663  		// Here's the summary:
  2664  		// >> CVTSI2SS is indeed designed for converting a signed integer to a scalar single-precision float,
  2665  		// >> not an unsigned integer like you have here. So what gives? Well, a 64-bit processor has 64-bit wide
  2666  		// >> registers available, so the unsigned 32-bit input values can be stored as signed 64-bit intermediate values,
  2667  		// >> which allows CVTSI2SS to be used after all.
  2668  		err = c.compileSimpleConversion(amd64.CVTSQ2SS, registerTypeVector, runtimeValueTypeF32) // = CVTSI2SS for 64bit int.
  2669  	} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedUint32 {
  2670  		// For the same reason above, we use 64bit conversion for unsigned 32bit.
  2671  		err = c.compileSimpleConversion(amd64.CVTSQ2SD, registerTypeVector, runtimeValueTypeF64) // = CVTSI2SD for 64bit int.
  2672  	} else if outputType == wazeroir.Float32 && inputType == wazeroir.SignedUint64 {
  2673  		err = c.emitUnsignedInt64ToFloatConversion(true)
  2674  	} else if outputType == wazeroir.Float64 && inputType == wazeroir.SignedUint64 {
  2675  		err = c.emitUnsignedInt64ToFloatConversion(false)
  2676  	}
  2677  	return
  2678  }
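
// As a plain-Go illustration of the unsigned 32-bit cases above (not compiler code): widening the
// operand to 64 bits first makes the ordinary signed conversion safe, because the zero-extended
// value is always non-negative.
//
//	func u32ToF32(v uint32) float32 {
//		return float32(int64(v)) // int64(v) zero-extends, so the signed 64-bit conversion (CVTSI2SS) is exact in sign.
//	}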
  2679  
  2680  // emitUnsignedInt64ToFloatConversion handles the conversion from an unsigned 64-bit integer
  2681  // in compileFConvertFromI.
  2682  func (c *amd64Compiler) emitUnsignedInt64ToFloatConversion(isFloat32bit bool) error {
  2683  	// The logic here is exactly the same as what GCC emits for the following code:
  2684  	//
  2685  	// float convert(int num) {
  2686  	//     float foo;
  2687  	//     uint64_t ptr1 = 100;
  2688  	//     foo = (float)(ptr1);
  2689  	//     return foo;
  2690  	// }
  2691  	//
  2692  	// which is compiled by GCC as
  2693  	//
  2694  	// convert:
  2695  	// 	   push    rbp
  2696  	// 	   mov     rbp, rsp
  2697  	// 	   mov     DWORD PTR [rbp-20], edi
  2698  	// 	   mov     DWORD PTR [rbp-4], 100
  2699  	// 	   mov     eax, DWORD PTR [rbp-4]
  2700  	// 	   test    rax, rax
  2701  	// 	   js      .handle_sign_bit_case
  2702  	// 	   cvtsi2ss        xmm0, rax
  2703  	// 	   jmp     .exit
  2704  	// .handle_sign_bit_case:
  2705  	// 	   mov     rdx, rax
  2706  	// 	   shr     rdx
  2707  	// 	   and     eax, 1
  2708  	// 	   or      rdx, rax
  2709  	// 	   cvtsi2ss        xmm0, rdx
  2710  	// 	   addsd   xmm0, xmm0
  2711  	// .exit: ...
  2712  	//
  2713  	// tl;dr: we have a branch depending on whether or not the sign bit is set.
  2714  
  2715  	origin := c.locationStack.pop()
  2716  	if err := c.compileEnsureOnRegister(origin); err != nil {
  2717  		return err
  2718  	}
  2719  
  2720  	dest, err := c.allocateRegister(registerTypeVector)
  2721  	if err != nil {
  2722  		return err
  2723  	}
  2724  
  2725  	c.locationStack.markRegisterUsed(dest)
  2726  
  2727  	tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  2728  	if err != nil {
  2729  		return err
  2730  	}
  2731  
  2732  	// Check if the most significant bit (sign bit) is set.
  2733  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, origin.register, origin.register)
  2734  
  2735  	// Jump if the sign bit is set.
  2736  	jmpIfSignbitSet := c.assembler.CompileJump(amd64.JMI)
  2737  
  2738  	// Otherwise, the unsigned int fits in the signed 64-bit range,
  2739  	// so we convert it directly and emit a jump instruction to exit from this branch.
  2740  	if isFloat32bit {
  2741  		c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SS, origin.register, dest)
  2742  	} else {
  2743  		c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SD, origin.register, dest)
  2744  	}
  2745  	exitFromSignbitUnSet := c.assembler.CompileJump(amd64.JMP)
  2746  
  2747  	// Now handling the case where sign-bit is set.
  2748  	// We emit the following sequences:
  2749  	// 	   mov     tmpReg, origin
  2750  	// 	   shr     tmpReg, 1
  2751  	// 	   and     origin, 1
  2752  	// 	   or      tmpReg, origin
  2753  	// 	   cvtsi2ss        xmm0, tmpReg
  2754  	// 	   addsd   xmm0, xmm0
  2755  
  2756  	c.assembler.SetJumpTargetOnNext(jmpIfSignbitSet)
  2757  	c.assembler.CompileRegisterToRegister(amd64.MOVQ, origin.register, tmpReg)
  2758  	c.assembler.CompileConstToRegister(amd64.SHRQ, 1, tmpReg)
  2759  	c.assembler.CompileConstToRegister(amd64.ANDQ, 1, origin.register)
  2760  	c.assembler.CompileRegisterToRegister(amd64.ORQ, origin.register, tmpReg)
  2761  	if isFloat32bit {
  2762  		c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SS, tmpReg, dest)
  2763  	} else {
  2764  		c.assembler.CompileRegisterToRegister(amd64.CVTSQ2SD, tmpReg, dest)
  2765  	}
  2766  	if isFloat32bit {
  2767  		c.assembler.CompileRegisterToRegister(amd64.ADDSS, dest, dest)
  2768  	} else {
  2769  		c.assembler.CompileRegisterToRegister(amd64.ADDSD, dest, dest)
  2770  	}
  2771  
  2772  	// Now we have finished the sign-bit-set branch.
  2773  	// We have to point the exit jump of the sign-bit-unset branch
  2774  	// at the next instruction.
  2775  	c.assembler.SetJumpTargetOnNext(exitFromSignbitUnSet)
  2776  
  2777  	// We consumed the origin's register and placed the conversion result
  2778  	// in the dest register.
  2779  	c.locationStack.markRegisterUnused(origin.register)
  2780  	if isFloat32bit {
  2781  		c.pushRuntimeValueLocationOnRegister(dest, runtimeValueTypeF32)
  2782  	} else {
  2783  		c.pushRuntimeValueLocationOnRegister(dest, runtimeValueTypeF64)
  2784  	}
  2785  	return nil
  2786  }
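
// The sign-bit-set branch above is the classic "halve, convert, double" trick. A Go-level sketch
// of the whole conversion (illustrative only, not part of the compiler; the float64 case is the
// same with float64 in place of float32):
//
//	func u64ToF32(v uint64) float32 {
//		if int64(v) >= 0 { // sign bit clear: v fits in int64, so a plain signed conversion is enough.
//			return float32(int64(v))
//		}
//		// Halve the value but keep the lowest bit so the rounding is not biased,
//		// convert the now-positive value, then double the result.
//		half := (v >> 1) | (v & 1)
//		f := float32(int64(half))
//		return f + f
//	}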
  2787  
  2788  // compileSimpleConversion pops a value from the stack, applies the given
  2789  // instruction to it, and pushes the result onto a register of the given type.
  2790  func (c *amd64Compiler) compileSimpleConversion(convInstruction asm.Instruction,
  2791  	destinationRegisterType registerType, destinationValueType runtimeValueType,
  2792  ) error {
  2793  	origin := c.locationStack.pop()
  2794  	if err := c.compileEnsureOnRegister(origin); err != nil {
  2795  		return err
  2796  	}
  2797  
  2798  	dest, err := c.allocateRegister(destinationRegisterType)
  2799  	if err != nil {
  2800  		return err
  2801  	}
  2802  
  2803  	c.assembler.CompileRegisterToRegister(convInstruction, origin.register, dest)
  2804  
  2805  	c.locationStack.markRegisterUnused(origin.register)
  2806  	c.pushRuntimeValueLocationOnRegister(dest, destinationValueType)
  2807  	return nil
  2808  }
  2809  
  2810  // compileF32DemoteFromF64 implements compiler.compileF32DemoteFromF64 for the amd64 architecture.
  2811  func (c *amd64Compiler) compileF32DemoteFromF64() error {
  2812  	target := c.locationStack.peek() // Note this is peek!
  2813  	if err := c.compileEnsureOnRegister(target); err != nil {
  2814  		return err
  2815  	}
  2816  
  2817  	c.assembler.CompileRegisterToRegister(amd64.CVTSD2SS, target.register, target.register)
  2818  	target.valueType = runtimeValueTypeF32
  2819  	return nil
  2820  }
  2821  
  2822  // compileF64PromoteFromF32 implements compiler.compileF64PromoteFromF32 for the amd64 architecture.
  2823  func (c *amd64Compiler) compileF64PromoteFromF32() error {
  2824  	target := c.locationStack.peek() // Note this is peek!
  2825  	if err := c.compileEnsureOnRegister(target); err != nil {
  2826  		return err
  2827  	}
  2828  
  2829  	c.assembler.CompileRegisterToRegister(amd64.CVTSS2SD, target.register, target.register)
  2830  	target.valueType = runtimeValueTypeF64
  2831  	return nil
  2832  }
  2833  
  2834  // compileI32ReinterpretFromF32 implements compiler.compileI32ReinterpretFromF32 for the amd64 architecture.
  2835  func (c *amd64Compiler) compileI32ReinterpretFromF32() error {
  2836  	if peek := c.locationStack.peek(); peek.onStack() {
  2837  		// If the value is on the stack, this is a no-op as there is nothing to do to convert the type.
  2838  		peek.valueType = runtimeValueTypeI32
  2839  		return nil
  2840  	}
  2841  	return c.compileSimpleConversion(amd64.MOVL, registerTypeGeneralPurpose, runtimeValueTypeI32)
  2842  }
  2843  
  2844  // compileI64ReinterpretFromF64 implements compiler.compileI64ReinterpretFromF64 for the amd64 architecture.
  2845  func (c *amd64Compiler) compileI64ReinterpretFromF64() error {
  2846  	if peek := c.locationStack.peek(); peek.onStack() {
  2847  		// If the value is on the stack, this is a no-op as there is nothing to do to convert the type.
  2848  		peek.valueType = runtimeValueTypeI64
  2849  		return nil
  2850  	}
  2851  	return c.compileSimpleConversion(amd64.MOVQ, registerTypeGeneralPurpose, runtimeValueTypeI64)
  2852  }
  2853  
  2854  // compileF32ReinterpretFromI32 implements compiler.compileF32ReinterpretFromI32 for the amd64 architecture.
  2855  func (c *amd64Compiler) compileF32ReinterpretFromI32() error {
  2856  	if peek := c.locationStack.peek(); peek.onStack() {
  2857  		// If the value is on the stack, this is a no-op as there is nothing to do to convert the type.
  2858  		peek.valueType = runtimeValueTypeF32
  2859  		return nil
  2860  	}
  2861  	return c.compileSimpleConversion(amd64.MOVL, registerTypeVector, runtimeValueTypeF32)
  2862  }
  2863  
  2864  // compileF64ReinterpretFromI64 implements compiler.compileF64ReinterpretFromI64 for the amd64 architecture.
  2865  func (c *amd64Compiler) compileF64ReinterpretFromI64() error {
  2866  	if peek := c.locationStack.peek(); peek.onStack() {
  2867  		// If the value is on the stack, this is a no-op as there is nothing to do to convert the type.
  2868  		peek.valueType = runtimeValueTypeF64
  2869  		return nil
  2870  	}
  2871  	return c.compileSimpleConversion(amd64.MOVQ, registerTypeVector, runtimeValueTypeF64)
  2872  }
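
// Reinterpretation never changes the bits, only how they are typed; in plain Go it is just
// math.Float64bits / math.Float64frombits (and their 32-bit equivalents), e.g.:
//
//	bits := math.Float64bits(f)      // i64.reinterpret_f64: f's bit pattern viewed as an integer.
//	f2 := math.Float64frombits(bits) // f64.reinterpret_i64: the same bits viewed as a float again.
//
// This is why a bare MOVQ/MOVL between a vector and a general purpose register, or a pure
// type re-tag when the value already lives on the stack, is sufficient here.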
  2873  
  2874  // compileExtend implements compiler.compileExtend for the amd64 architecture.
  2875  func (c *amd64Compiler) compileExtend(o *wazeroir.UnionOperation) error {
  2876  	var inst asm.Instruction
  2877  	signed := o.B1 != 0
  2878  	if signed {
  2879  		inst = amd64.MOVLQSX // = MOVSXD https://www.felixcloutier.com/x86/movsx:movsxd
  2880  	} else {
  2881  		inst = amd64.MOVL
  2882  	}
  2883  	return c.compileExtendImpl(inst, runtimeValueTypeI64)
  2884  }
  2885  
  2886  // compileSignExtend32From8 implements compiler.compileSignExtend32From8 for the amd64 architecture.
  2887  func (c *amd64Compiler) compileSignExtend32From8() error {
  2888  	return c.compileExtendImpl(amd64.MOVBLSX, runtimeValueTypeI32)
  2889  }
  2890  
  2891  // compileSignExtend32From16 implements compiler.compileSignExtend32From16 for the amd64 architecture.
  2892  func (c *amd64Compiler) compileSignExtend32From16() error {
  2893  	return c.compileExtendImpl(amd64.MOVWLSX, runtimeValueTypeI32)
  2894  }
  2895  
  2896  // compileSignExtend64From8 implements compiler.compileSignExtend64From8 for the amd64 architecture.
  2897  func (c *amd64Compiler) compileSignExtend64From8() error {
  2898  	return c.compileExtendImpl(amd64.MOVBQSX, runtimeValueTypeI64)
  2899  }
  2900  
  2901  // compileSignExtend64From16 implements compiler.compileSignExtend64From16 for the amd64 architecture.
  2902  func (c *amd64Compiler) compileSignExtend64From16() error {
  2903  	return c.compileExtendImpl(amd64.MOVWQSX, runtimeValueTypeI64)
  2904  }
  2905  
  2906  // compileSignExtend64From32 implements compiler.compileSignExtend64From32 for the amd64 architecture.
  2907  func (c *amd64Compiler) compileSignExtend64From32() error {
  2908  	return c.compileExtendImpl(amd64.MOVLQSX, runtimeValueTypeI64)
  2909  }
  2910  
  2911  func (c *amd64Compiler) compileExtendImpl(inst asm.Instruction, destinationType runtimeValueType) error {
  2912  	target := c.locationStack.peek() // Note this is peek!
  2913  	if err := c.compileEnsureOnRegister(target); err != nil {
  2914  		return err
  2915  	}
  2916  
  2917  	c.assembler.CompileRegisterToRegister(inst, target.register, target.register)
  2918  	target.valueType = destinationType
  2919  	return nil
  2920  }
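
// In plain Go the extensions above correspond to simple integer conversions, e.g. (illustrative
// only, not part of the compiler):
//
//	func extend32To64(v uint32, signed bool) uint64 {
//		if signed {
//			return uint64(int64(int32(v))) // MOVLQSX: replicate bit 31 into the upper half.
//		}
//		return uint64(v) // MOVL: writing the low 32 bits zero-clears the upper half.
//	}
//
//	func signExtend64From8(v uint64) uint64 {
//		return uint64(int64(int8(uint8(v)))) // MOVBQSX
//	}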
  2921  
  2922  // compileEq implements compiler.compileEq for the amd64 architecture.
  2923  func (c *amd64Compiler) compileEq(o *wazeroir.UnionOperation) error {
  2924  	return c.compileEqOrNe(wazeroir.UnsignedType(o.B1), true)
  2925  }
  2926  
  2927  // compileNe implements compiler.compileNe for the amd64 architecture.
  2928  func (c *amd64Compiler) compileNe(o *wazeroir.UnionOperation) error {
  2929  	return c.compileEqOrNe(wazeroir.UnsignedType(o.B1), false)
  2930  }
  2931  
  2932  func (c *amd64Compiler) compileEqOrNe(t wazeroir.UnsignedType, shouldEqual bool) (err error) {
  2933  	x2 := c.locationStack.pop()
  2934  	if err := c.compileEnsureOnRegister(x2); err != nil {
  2935  		return err
  2936  	}
  2937  
  2938  	x1 := c.locationStack.pop()
  2939  	if err := c.compileEnsureOnRegister(x1); err != nil {
  2940  		return err
  2941  	}
  2942  
  2943  	x1r, x2r := x1.register, x2.register
  2944  
  2945  	// x1 and x2 are temporary registers only used for the cmp operation. Release them.
  2946  	c.locationStack.releaseRegister(x1)
  2947  	c.locationStack.releaseRegister(x2)
  2948  
  2949  	switch t {
  2950  	case wazeroir.UnsignedTypeI32:
  2951  		err = c.compileEqOrNeForInts(x1r, x2r, amd64.CMPL, shouldEqual)
  2952  	case wazeroir.UnsignedTypeI64:
  2953  		err = c.compileEqOrNeForInts(x1r, x2r, amd64.CMPQ, shouldEqual)
  2954  	case wazeroir.UnsignedTypeF32:
  2955  		err = c.compileEqOrNeForFloats(x1r, x2r, amd64.UCOMISS, shouldEqual)
  2956  	case wazeroir.UnsignedTypeF64:
  2957  		err = c.compileEqOrNeForFloats(x1r, x2r, amd64.UCOMISD, shouldEqual)
  2958  	}
  2959  	if err != nil {
  2960  		return
  2961  	}
  2962  	return
  2963  }
  2964  
  2965  func (c *amd64Compiler) compileEqOrNeForInts(x1Reg, x2Reg asm.Register, cmpInstruction asm.Instruction,
  2966  	shouldEqual bool,
  2967  ) error {
  2968  	c.assembler.CompileRegisterToRegister(cmpInstruction, x2Reg, x1Reg)
  2969  
  2970  	// Record that the result is on the conditional register.
  2971  	var condReg asm.ConditionalRegisterState
  2972  	if shouldEqual {
  2973  		condReg = amd64.ConditionalRegisterStateE
  2974  	} else {
  2975  		condReg = amd64.ConditionalRegisterStateNE
  2976  	}
  2977  	loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(condReg)
  2978  	loc.valueType = runtimeValueTypeI32
  2979  	return nil
  2980  }
  2981  
  2982  // For float EQ and NE, we have to take NaN values into account.
  2983  // Notably, the Wasm specification states that if one of the operands is NaN,
  2984  // the result must be zero for EQ or one for NE.
  2985  func (c *amd64Compiler) compileEqOrNeForFloats(x1Reg, x2Reg asm.Register, cmpInstruction asm.Instruction, shouldEqual bool) error {
  2986  	// Before we allocate the result, we have to reserve two int registers.
  2987  	nanFragReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  2988  	if err != nil {
  2989  		return err
  2990  	}
  2991  	c.locationStack.markRegisterUsed(nanFragReg)
  2992  	cmpResultReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  2993  	if err != nil {
  2994  		return err
  2995  	}
  2996  
  2997  	// Then, execute the comparison.
  2998  	c.assembler.CompileRegisterToRegister(cmpInstruction, x2Reg, x1Reg)
  2999  
  3000  	// First, we get the parity flag, which indicates whether one of the values was NaN.
  3001  	if shouldEqual {
  3002  		// Set 1 if two values are NOT NaN.
  3003  		c.assembler.CompileNoneToRegister(amd64.SETPC, nanFragReg)
  3004  	} else {
  3005  		// Set 1 if one of values is NaN.
  3006  		c.assembler.CompileNoneToRegister(amd64.SETPS, nanFragReg)
  3007  	}
  3008  
  3009  	// Next, we get the usual comparison flag.
  3010  	if shouldEqual {
  3011  		// Set 1 if equal.
  3012  		c.assembler.CompileNoneToRegister(amd64.SETEQ, cmpResultReg)
  3013  	} else {
  3014  		// Set 1 if not equal.
  3015  		c.assembler.CompileNoneToRegister(amd64.SETNE, cmpResultReg)
  3016  	}
  3017  
  3018  	// Do "and" or "or" operations on these two flags to get the actual result.
  3019  	if shouldEqual {
  3020  		c.assembler.CompileRegisterToRegister(amd64.ANDL, nanFragReg, cmpResultReg)
  3021  	} else {
  3022  		c.assembler.CompileRegisterToRegister(amd64.ORL, nanFragReg, cmpResultReg)
  3023  	}
  3024  
  3025  	// Clear the unnecessary bits by zero-extending the lowest byte.
  3026  	// This is necessary because the SET* instructions only write the lowest byte of the register, leaving the upper bits undefined.
  3027  	c.assembler.CompileRegisterToRegister(amd64.MOVBLZX, cmpResultReg, cmpResultReg)
  3028  
  3029  	// Now we have the result in cmpResultReg register, so we record it.
  3030  	c.pushRuntimeValueLocationOnRegister(cmpResultReg, runtimeValueTypeI32)
  3031  	// Also, we no longer need nanFragRegister.
  3032  	c.locationStack.markRegisterUnused(nanFragReg)
  3033  	return nil
  3034  }
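
// The flag juggling above mirrors how UCOMIS* reports its result: ZF is set both for equal and
// for unordered (NaN) operands, so the parity-based byte is needed to tell the two apart.
// A Go sketch of the EQ case (illustrative only, not part of the compiler):
//
//	func f64Eq(x1, x2 float64) uint32 {
//		ordered := !(math.IsNaN(x1) || math.IsNaN(x2)) // what SETPC captures (PF == 0).
//		zf := x1 == x2 || !ordered                     // UCOMISD sets ZF for "equal" and for "unordered".
//		if ordered && zf {                             // the ANDL of the two flag bytes.
//			return 1
//		}
//		return 0
//	}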
  3035  
  3036  // compileEqz implements compiler.compileEqz for the amd64 architecture.
  3037  func (c *amd64Compiler) compileEqz(o *wazeroir.UnionOperation) (err error) {
  3038  	v := c.locationStack.pop()
  3039  	if err = c.compileEnsureOnRegister(v); err != nil {
  3040  		return err
  3041  	}
  3042  
  3043  	unsignedInt := wazeroir.UnsignedInt(o.B1)
  3044  	switch unsignedInt {
  3045  	case wazeroir.UnsignedInt32:
  3046  		err = c.assembler.CompileStaticConstToRegister(amd64.CMPL, c.fourZeros, v.register)
  3047  	case wazeroir.UnsignedInt64:
  3048  		err = c.assembler.CompileStaticConstToRegister(amd64.CMPQ, c.eightZeros, v.register)
  3049  	}
  3050  	if err != nil {
  3051  		return err
  3052  	}
  3053  
  3054  	// v is consumed by the cmp operation so release it.
  3055  	c.locationStack.releaseRegister(v)
  3056  
  3057  	// Finally, record that the result is on the conditional register.
  3058  	loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(amd64.ConditionalRegisterStateE)
  3059  	loc.valueType = runtimeValueTypeI32
  3060  	return nil
  3061  }
  3062  
  3063  // compileLt implements compiler.compileLt for the amd64 architecture.
  3064  func (c *amd64Compiler) compileLt(o *wazeroir.UnionOperation) error {
  3065  	x2 := c.locationStack.pop()
  3066  	if err := c.compileEnsureOnRegister(x2); err != nil {
  3067  		return err
  3068  	}
  3069  
  3070  	x1 := c.locationStack.pop()
  3071  	if err := c.compileEnsureOnRegister(x1); err != nil {
  3072  		return err
  3073  	}
  3074  
  3075  	// Emit the compare instruction.
  3076  	var resultConditionState asm.ConditionalRegisterState
  3077  	var inst asm.Instruction
  3078  	signedType := wazeroir.SignedType(o.B1)
  3079  	switch signedType {
  3080  	case wazeroir.SignedTypeInt32:
  3081  		resultConditionState = amd64.ConditionalRegisterStateL
  3082  		inst = amd64.CMPL
  3083  	case wazeroir.SignedTypeUint32:
  3084  		resultConditionState = amd64.ConditionalRegisterStateB
  3085  		inst = amd64.CMPL
  3086  	case wazeroir.SignedTypeInt64:
  3087  		inst = amd64.CMPQ
  3088  		resultConditionState = amd64.ConditionalRegisterStateL
  3089  	case wazeroir.SignedTypeUint64:
  3090  		resultConditionState = amd64.ConditionalRegisterStateB
  3091  		inst = amd64.CMPQ
  3092  	case wazeroir.SignedTypeFloat32:
  3093  		resultConditionState = amd64.ConditionalRegisterStateA
  3094  		inst = amd64.COMISS
  3095  	case wazeroir.SignedTypeFloat64:
  3096  		resultConditionState = amd64.ConditionalRegisterStateA
  3097  		inst = amd64.COMISD
  3098  	}
  3099  	c.assembler.CompileRegisterToRegister(inst, x1.register, x2.register)
  3100  
  3101  	// x1 and x2 are temporary registers only used for the cmp operation. Release them.
  3102  	c.locationStack.releaseRegister(x1)
  3103  	c.locationStack.releaseRegister(x2)
  3104  
  3105  	// Finally, record that the result is on the conditional register.
  3106  	loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState)
  3107  	loc.valueType = runtimeValueTypeI32
  3108  	return nil
  3109  }
  3110  
  3111  // compileGt implements compiler.compileGt for the amd64 architecture.
  3112  func (c *amd64Compiler) compileGt(o *wazeroir.UnionOperation) error {
  3113  	x2 := c.locationStack.pop()
  3114  	if err := c.compileEnsureOnRegister(x2); err != nil {
  3115  		return err
  3116  	}
  3117  
  3118  	x1 := c.locationStack.pop()
  3119  	if err := c.compileEnsureOnRegister(x1); err != nil {
  3120  		return err
  3121  	}
  3122  
  3123  	// Emit the compare instruction.
  3124  	var resultConditionState asm.ConditionalRegisterState
  3125  	signedType := wazeroir.SignedType(o.B1)
  3126  	switch signedType {
  3127  	case wazeroir.SignedTypeInt32:
  3128  		resultConditionState = amd64.ConditionalRegisterStateG
  3129  		c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register)
  3130  	case wazeroir.SignedTypeUint32:
  3131  		c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register)
  3132  		resultConditionState = amd64.ConditionalRegisterStateA
  3133  	case wazeroir.SignedTypeInt64:
  3134  		c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register)
  3135  		resultConditionState = amd64.ConditionalRegisterStateG
  3136  	case wazeroir.SignedTypeUint64:
  3137  		c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register)
  3138  		resultConditionState = amd64.ConditionalRegisterStateA
  3139  	case wazeroir.SignedTypeFloat32:
  3140  		c.assembler.CompileRegisterToRegister(amd64.UCOMISS, x2.register, x1.register)
  3141  		resultConditionState = amd64.ConditionalRegisterStateA
  3142  	case wazeroir.SignedTypeFloat64:
  3143  		c.assembler.CompileRegisterToRegister(amd64.UCOMISD, x2.register, x1.register)
  3144  		resultConditionState = amd64.ConditionalRegisterStateA
  3145  	}
  3146  
  3147  	// x1 and x2 are temporary registers only used for the cmp operation. Release them.
  3148  	c.locationStack.releaseRegister(x1)
  3149  	c.locationStack.releaseRegister(x2)
  3150  
  3151  	// Finally, record that the result is on the conditional register.
  3152  	loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState)
  3153  	loc.valueType = runtimeValueTypeI32
  3154  	return nil
  3155  }
  3156  
  3157  // compileLe implements compiler.compileLe for the amd64 architecture.
  3158  func (c *amd64Compiler) compileLe(o *wazeroir.UnionOperation) error {
  3159  	x2 := c.locationStack.pop()
  3160  	if err := c.compileEnsureOnRegister(x2); err != nil {
  3161  		return err
  3162  	}
  3163  
  3164  	x1 := c.locationStack.pop()
  3165  	if err := c.compileEnsureOnRegister(x1); err != nil {
  3166  		return err
  3167  	}
  3168  
  3169  	// Emit the compare instruction.
  3170  	var inst asm.Instruction
  3171  	var resultConditionState asm.ConditionalRegisterState
  3172  	signedType := wazeroir.SignedType(o.B1)
  3173  	switch signedType {
  3174  	case wazeroir.SignedTypeInt32:
  3175  		resultConditionState = amd64.ConditionalRegisterStateLE
  3176  		inst = amd64.CMPL
  3177  	case wazeroir.SignedTypeUint32:
  3178  		resultConditionState = amd64.ConditionalRegisterStateBE
  3179  		inst = amd64.CMPL
  3180  	case wazeroir.SignedTypeInt64:
  3181  		resultConditionState = amd64.ConditionalRegisterStateLE
  3182  		inst = amd64.CMPQ
  3183  	case wazeroir.SignedTypeUint64:
  3184  		resultConditionState = amd64.ConditionalRegisterStateBE
  3185  		inst = amd64.CMPQ
  3186  	case wazeroir.SignedTypeFloat32:
  3187  		resultConditionState = amd64.ConditionalRegisterStateAE
  3188  		inst = amd64.UCOMISS
  3189  	case wazeroir.SignedTypeFloat64:
  3190  		resultConditionState = amd64.ConditionalRegisterStateAE
  3191  		inst = amd64.UCOMISD
  3192  	}
  3193  	c.assembler.CompileRegisterToRegister(inst, x1.register, x2.register)
  3194  
  3195  	// x1 and x2 are temporary registers only used for the cmp operation. Release them.
  3196  	c.locationStack.releaseRegister(x1)
  3197  	c.locationStack.releaseRegister(x2)
  3198  
  3199  	// Finally, record that the result is on the conditional register.
  3200  	loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState)
  3201  	loc.valueType = runtimeValueTypeI32
  3202  	return nil
  3203  }
  3204  
  3205  // compileGe implements compiler.compileGe for the amd64 architecture.
  3206  func (c *amd64Compiler) compileGe(o *wazeroir.UnionOperation) error {
  3207  	x2 := c.locationStack.pop()
  3208  	if err := c.compileEnsureOnRegister(x2); err != nil {
  3209  		return err
  3210  	}
  3211  
  3212  	x1 := c.locationStack.pop()
  3213  	if err := c.compileEnsureOnRegister(x1); err != nil {
  3214  		return err
  3215  	}
  3216  
  3217  	// Emit the compare instruction.
  3218  	var resultConditionState asm.ConditionalRegisterState
  3219  	signedType := wazeroir.SignedType(o.B1)
  3220  	switch signedType {
  3221  	case wazeroir.SignedTypeInt32:
  3222  		c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register)
  3223  		resultConditionState = amd64.ConditionalRegisterStateGE
  3224  	case wazeroir.SignedTypeUint32:
  3225  		c.assembler.CompileRegisterToRegister(amd64.CMPL, x1.register, x2.register)
  3226  		resultConditionState = amd64.ConditionalRegisterStateAE
  3227  	case wazeroir.SignedTypeInt64:
  3228  		c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register)
  3229  		resultConditionState = amd64.ConditionalRegisterStateGE
  3230  	case wazeroir.SignedTypeUint64:
  3231  		c.assembler.CompileRegisterToRegister(amd64.CMPQ, x1.register, x2.register)
  3232  		resultConditionState = amd64.ConditionalRegisterStateAE
  3233  	case wazeroir.SignedTypeFloat32:
  3234  		c.assembler.CompileRegisterToRegister(amd64.COMISS, x2.register, x1.register)
  3235  		resultConditionState = amd64.ConditionalRegisterStateAE
  3236  	case wazeroir.SignedTypeFloat64:
  3237  		c.assembler.CompileRegisterToRegister(amd64.COMISD, x2.register, x1.register)
  3238  		resultConditionState = amd64.ConditionalRegisterStateAE
  3239  	}
  3240  
  3241  	// x1 and x2 are temporary registers only used for the cmp operation. Release them.
  3242  	c.locationStack.releaseRegister(x1)
  3243  	c.locationStack.releaseRegister(x2)
  3244  
  3245  	// Finally, record that the result is on the conditional register.
  3246  	loc := c.locationStack.pushRuntimeValueLocationOnConditionalRegister(resultConditionState)
  3247  	loc.valueType = runtimeValueTypeI32
  3248  	return nil
  3249  }
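
// For the float comparisons above, the "above"/"above or equal" conditions are what make the NaN
// semantics come out right: COMIS*/UCOMIS* set CF and ZF for unordered operands, so A and AE are
// both false whenever a NaN is involved. That matches the Wasm requirement, e.g. (illustrative only):
//
//	func f64Ge(x1, x2 float64) uint32 {
//		if x1 >= x2 { // false if either operand is NaN, just like the AE condition after COMISD.
//			return 1
//		}
//		return 0
//	}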
  3250  
  3251  // compileLoad implements compiler.compileLoad for the amd64 architecture.
  3252  func (c *amd64Compiler) compileLoad(o *wazeroir.UnionOperation) error {
  3253  	var (
  3254  		isIntType         bool
  3255  		movInst           asm.Instruction
  3256  		targetSizeInBytes int64
  3257  		vt                runtimeValueType
  3258  	)
  3259  
  3260  	unsignedType := wazeroir.UnsignedType(o.B1)
  3261  	offset := uint32(o.U2)
  3262  
  3263  	switch unsignedType {
  3264  	case wazeroir.UnsignedTypeI32:
  3265  		isIntType = true
  3266  		movInst = amd64.MOVL
  3267  		targetSizeInBytes = 32 / 8
  3268  		vt = runtimeValueTypeI32
  3269  	case wazeroir.UnsignedTypeI64:
  3270  		isIntType = true
  3271  		movInst = amd64.MOVQ
  3272  		targetSizeInBytes = 64 / 8
  3273  		vt = runtimeValueTypeI64
  3274  	case wazeroir.UnsignedTypeF32:
  3275  		isIntType = false
  3276  		movInst = amd64.MOVL
  3277  		targetSizeInBytes = 32 / 8
  3278  		vt = runtimeValueTypeF32
  3279  	case wazeroir.UnsignedTypeF64:
  3280  		isIntType = false
  3281  		movInst = amd64.MOVQ
  3282  		targetSizeInBytes = 64 / 8
  3283  		vt = runtimeValueTypeF64
  3284  	}
  3285  
  3286  	reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
  3287  	if err != nil {
  3288  		return err
  3289  	}
  3290  
  3291  	if isIntType {
  3292  		// For integer types, read the corresponding bytes from the offset to the memory
  3293  		// and store the value to the int register.
  3294  		c.assembler.CompileMemoryWithIndexToRegister(movInst,
  3295  			// we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
  3296  			amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  3297  			reg)
  3298  		c.pushRuntimeValueLocationOnRegister(reg, vt)
  3299  	} else {
  3300  		// For float types, we read the value to the float register.
  3301  		floatReg, err := c.allocateRegister(registerTypeVector)
  3302  		if err != nil {
  3303  			return err
  3304  		}
  3305  		c.assembler.CompileMemoryWithIndexToRegister(movInst,
  3306  			// we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
  3307  			amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  3308  			floatReg)
  3309  		c.pushRuntimeValueLocationOnRegister(floatReg, vt)
  3310  		// We no longer need the int register so mark it unused.
  3311  		c.locationStack.markRegisterUnused(reg)
  3312  	}
  3313  	return nil
  3314  }
  3315  
  3316  // compileLoad8 implements compiler.compileLoad8 for the amd64 architecture.
  3317  func (c *amd64Compiler) compileLoad8(o *wazeroir.UnionOperation) error {
  3318  	const targetSizeInBytes = 1
  3319  	offset := uint32(o.U2)
  3320  	reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
  3321  	if err != nil {
  3322  		return err
  3323  	}
  3324  
  3325  	// Then move a byte at the offset to the register.
  3326  	// Note that Load8 is only for integer types.
  3327  	var inst asm.Instruction
  3328  	var vt runtimeValueType
  3329  	signedInt := wazeroir.SignedInt(o.B1)
  3330  	switch signedInt {
  3331  	case wazeroir.SignedInt32:
  3332  		inst = amd64.MOVBLSX
  3333  		vt = runtimeValueTypeI32
  3334  	case wazeroir.SignedUint32:
  3335  		inst = amd64.MOVBLZX
  3336  		vt = runtimeValueTypeI32
  3337  	case wazeroir.SignedInt64:
  3338  		inst = amd64.MOVBQSX
  3339  		vt = runtimeValueTypeI64
  3340  	case wazeroir.SignedUint64:
  3341  		inst = amd64.MOVBQZX
  3342  		vt = runtimeValueTypeI64
  3343  	}
  3344  
  3345  	c.assembler.CompileMemoryWithIndexToRegister(inst,
  3346  		// we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
  3347  		amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  3348  		reg)
  3349  
  3350  	c.pushRuntimeValueLocationOnRegister(reg, vt)
  3351  	return nil
  3352  }
  3353  
  3354  // compileLoad16 implements compiler.compileLoad16 for the amd64 architecture.
  3355  func (c *amd64Compiler) compileLoad16(o *wazeroir.UnionOperation) error {
  3356  	const targetSizeInBytes = 16 / 8
  3357  	offset := uint32(o.U2)
  3358  	reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
  3359  	if err != nil {
  3360  		return err
  3361  	}
  3362  
  3363  	// Then move 2 bytes at the offset to the register.
  3364  	// Note that Load16 is only for integer types.
  3365  	var inst asm.Instruction
  3366  	var vt runtimeValueType
  3367  	signedInt := wazeroir.SignedInt(o.B1)
  3368  	switch signedInt {
  3369  	case wazeroir.SignedInt32:
  3370  		inst = amd64.MOVWLSX
  3371  		vt = runtimeValueTypeI32
  3372  	case wazeroir.SignedInt64:
  3373  		inst = amd64.MOVWQSX
  3374  		vt = runtimeValueTypeI64
  3375  	case wazeroir.SignedUint32:
  3376  		inst = amd64.MOVWLZX
  3377  		vt = runtimeValueTypeI32
  3378  	case wazeroir.SignedUint64:
  3379  		inst = amd64.MOVWQZX
  3380  		vt = runtimeValueTypeI64
  3381  	}
  3382  
  3383  	c.assembler.CompileMemoryWithIndexToRegister(inst,
  3384  		// we access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
  3385  		amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  3386  		reg)
  3387  
  3388  	c.pushRuntimeValueLocationOnRegister(reg, vt)
  3389  	return nil
  3390  }
  3391  
  3392  // compileLoad32 implements compiler.compileLoad32 for the amd64 architecture.
  3393  func (c *amd64Compiler) compileLoad32(o *wazeroir.UnionOperation) error {
  3394  	const targetSizeInBytes = 32 / 8
  3395  	offset := uint32(o.U2)
  3396  	reg, err := c.compileMemoryAccessCeilSetup(offset, targetSizeInBytes)
  3397  	if err != nil {
  3398  		return err
  3399  	}
  3400  
  3401  	// Then move 4 bytes at the offset to the register.
  3402  	var inst asm.Instruction
  3403  	signed := o.B1 == 1
  3404  	if signed {
  3405  		inst = amd64.MOVLQSX
  3406  	} else {
  3407  		inst = amd64.MOVLQZX
  3408  	}
  3409  	c.assembler.CompileMemoryWithIndexToRegister(inst,
  3410  		// We access memory as memory.Buffer[ceil-targetSizeInBytes: ceil].
  3411  		amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  3412  		reg)
  3413  	c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI64)
  3414  	return nil
  3415  }
  3416  
  3417  // compileMemoryAccessCeilSetup pops the top value from the stack (called "base"), stores "base + offsetArg + targetSizeInBytes"
  3418  // into a register, and returns the stored register. We call the result "ceil" because we access the memory
  3419  // as memory.Buffer[ceil-targetSizeInBytes: ceil].
  3420  //
// Note: this also emits the instructions to check for out-of-bounds memory access.
  3422  // In other words, if the ceil exceeds the memory size, the code exits with nativeCallStatusCodeMemoryOutOfBounds status.
  3423  func (c *amd64Compiler) compileMemoryAccessCeilSetup(offsetArg uint32, targetSizeInBytes int64) (asm.Register, error) {
  3424  	base := c.locationStack.pop()
  3425  	if err := c.compileEnsureOnRegister(base); err != nil {
  3426  		return asm.NilRegister, err
  3427  	}
  3428  
  3429  	result := base.register
  3430  	if offsetConst := int64(offsetArg) + targetSizeInBytes; offsetConst <= math.MaxInt32 {
  3431  		c.assembler.CompileConstToRegister(amd64.ADDQ, offsetConst, result)
  3432  	} else if offsetConst <= math.MaxUint32 {
  3433  		// Note: in practice, this branch rarely happens: it means the wasm binary knows that
  3434  		// the memory has more than 1 GiB, or at least tries to access a memory region above 1 GiB.
  3435  		//
  3436  		// In this case, we cannot directly add the offset to a register with the ADDQ(const) instruction.
  3437  		// That is because the imm32 const is sign-extended to 64-bit in ADDQ(const), and we would end up
  3438  		// treating offsetConst as a negative number, which is wrong.
  3439  		tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3440  		if err != nil {
  3441  			return asm.NilRegister, err
  3442  		}
  3443  		c.assembler.CompileConstToRegister(amd64.MOVL, int64(uint32(offsetConst)), tmp)
  3444  		c.assembler.CompileRegisterToRegister(amd64.ADDQ, tmp, result)
  3445  	} else {
  3446  		// If the offset const is too large, we exit with nativeCallStatusCodeMemoryOutOfBounds.
  3447  		c.compileExitFromNativeCode(nativeCallStatusCodeMemoryOutOfBounds)
  3448  		return result, nil
  3449  	}
  3450  
  3451  	// Now we compare the value with the memory length, which is held by the callEngine.
  3452  	c.assembler.CompileMemoryToRegister(amd64.CMPQ,
  3453  		amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset, result)
  3454  
  3455  	// Trap if the value exceeds the memory length (out-of-bounds access).
  3456  	c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusCodeMemoryOutOfBounds)
  3457  
  3458  	c.locationStack.markRegisterUnused(result)
  3459  	return result, nil
  3460  }
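
// In other words, the generated code performs the equivalent of the following Go sketch, where
// "base" is the address popped from the Wasm stack (illustrative only, not part of the compiler;
// panic stands in for the real trap/exit mechanism):
//
//	ceil := uint64(base) + uint64(offsetArg) + uint64(targetSizeInBytes)
//	if ceil > uint64(len(memory.Buffer)) {
//		panic(nativeCallStatusCodeMemoryOutOfBounds)
//	}
//	// The access then reads or writes memory.Buffer[ceil-targetSizeInBytes : ceil].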
  3461  
  3462  // compileStore implements compiler.compileStore for the amd64 architecture.
  3463  func (c *amd64Compiler) compileStore(o *wazeroir.UnionOperation) error {
  3464  	var movInst asm.Instruction
  3465  	var targetSizeInByte int64
  3466  	unsignedType := wazeroir.UnsignedType(o.B1)
  3467  	offset := uint32(o.U2)
  3468  	switch unsignedType {
  3469  	case wazeroir.UnsignedTypeI32, wazeroir.UnsignedTypeF32:
  3470  		movInst = amd64.MOVL
  3471  		targetSizeInByte = 32 / 8
  3472  	case wazeroir.UnsignedTypeI64, wazeroir.UnsignedTypeF64:
  3473  		movInst = amd64.MOVQ
  3474  		targetSizeInByte = 64 / 8
  3475  	}
  3476  	return c.compileStoreImpl(offset, movInst, targetSizeInByte)
  3477  }
  3478  
  3479  // compileStore8 implements compiler.compileStore8 for the amd64 architecture.
  3480  func (c *amd64Compiler) compileStore8(o *wazeroir.UnionOperation) error {
  3481  	return c.compileStoreImpl(uint32(o.U2), amd64.MOVB, 1)
  3482  }
  3483  
// compileStore16 implements compiler.compileStore16 for the amd64 architecture.
  3485  func (c *amd64Compiler) compileStore16(o *wazeroir.UnionOperation) error {
  3486  	return c.compileStoreImpl(uint32(o.U2), amd64.MOVW, 16/8)
  3487  }
  3488  
  3489  // compileStore32 implements compiler.compileStore32 for the amd64 architecture.
  3490  func (c *amd64Compiler) compileStore32(o *wazeroir.UnionOperation) error {
  3491  	return c.compileStoreImpl(uint32(o.U2), amd64.MOVL, 32/8)
  3492  }
  3493  
  3494  func (c *amd64Compiler) compileStoreImpl(offsetConst uint32, inst asm.Instruction, targetSizeInBytes int64) error {
  3495  	val := c.locationStack.pop()
  3496  	if err := c.compileEnsureOnRegister(val); err != nil {
  3497  		return err
  3498  	}
  3499  
  3500  	reg, err := c.compileMemoryAccessCeilSetup(offsetConst, targetSizeInBytes)
  3501  	if err != nil {
  3502  		return err
  3503  	}
  3504  
  3505  	c.assembler.CompileRegisterToMemoryWithIndex(
  3506  		inst, val.register,
  3507  		amd64ReservedRegisterForMemory, -targetSizeInBytes, reg, 1,
  3508  	)
  3509  
  3510  	// We no longer need both the value and base registers.
  3511  	c.locationStack.releaseRegister(val)
  3512  	c.locationStack.markRegisterUnused(reg)
  3513  	return nil
  3514  }
  3515  
  3516  // compileMemoryGrow implements compiler.compileMemoryGrow for the amd64 architecture.
  3517  func (c *amd64Compiler) compileMemoryGrow() error {
  3518  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3519  		return err
  3520  	}
  3521  
  3522  	if err := c.compileCallBuiltinFunction(builtinFunctionIndexMemoryGrow); err != nil {
  3523  		return err
  3524  	}
  3525  
  3526  	// After the function call, we have to initialize the stack base pointer and memory reserved registers.
  3527  	c.compileReservedStackBasePointerInitialization()
  3528  	c.compileReservedMemoryPointerInitialization()
  3529  	return nil
  3530  }
  3531  
  3532  // compileMemorySize implements compiler.compileMemorySize for the amd64 architecture.
  3533  func (c *amd64Compiler) compileMemorySize() error {
  3534  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3535  		return err
  3536  	}
  3537  
  3538  	reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  3539  	if err != nil {
  3540  		return err
  3541  	}
  3542  	loc := c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI32)
  3543  
  3544  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset, loc.register)
  3545  
  3546  	// WebAssembly's memory.size returns the current size of the memory in pages (each page is 65536 bytes).
  3547  	// That is equivalent to dividing the length of the memory slice by 65536, and
  3548  	// that can be calculated as a SHR by 16 bits since 65536 = 2^16.
  3549  	c.assembler.CompileConstToRegister(amd64.SHRQ, wasm.MemoryPageSizeInBits, loc.register)
  3550  	return nil
  3551  }
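
// That is, the generated code computes the equivalent of (illustrative only):
//
//	pages := uint32(uint64(len(memory.Buffer)) >> wasm.MemoryPageSizeInBits) // bytes / 65536.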
  3552  
  3553  // compileMemoryInit implements compiler.compileMemoryInit for the amd64 architecture.
  3554  func (c *amd64Compiler) compileMemoryInit(o *wazeroir.UnionOperation) error {
  3555  	dataIndex := uint32(o.U1)
  3556  	return c.compileInitImpl(false, dataIndex, 0)
  3557  }
  3558  
  3559  // compileInitImpl implements compileTableInit and compileMemoryInit.
  3560  //
// TODO: the compiled code in this function should be reused and compiled only once, as
// the code is independent of any module.
  3563  func (c *amd64Compiler) compileInitImpl(isTable bool, index, tableIndex uint32) error {
  3564  	outOfBoundsErrorStatus := nativeCallStatusCodeMemoryOutOfBounds
  3565  	if isTable {
  3566  		outOfBoundsErrorStatus = nativeCallStatusCodeInvalidTableAccess
  3567  	}
  3568  
  3569  	copySize := c.locationStack.pop()
  3570  	if err := c.compileEnsureOnRegister(copySize); err != nil {
  3571  		return err
  3572  	}
  3573  
  3574  	sourceOffset := c.locationStack.pop()
  3575  	if err := c.compileEnsureOnRegister(sourceOffset); err != nil {
  3576  		return err
  3577  	}
  3578  
  3579  	destinationOffset := c.locationStack.pop()
  3580  	if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
  3581  		return err
  3582  	}
  3583  
  3584  	instanceAddr, err := c.allocateRegister(registerTypeGeneralPurpose)
  3585  	if err != nil {
  3586  		return err
  3587  	}
  3588  	c.locationStack.markRegisterUsed(instanceAddr)
  3589  	if isTable {
  3590  		c.compileLoadElemInstanceAddress(index, instanceAddr)
  3591  	} else {
  3592  		c.compileLoadDataInstanceAddress(index, instanceAddr)
  3593  	}
  3594  
  3595  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3596  	if err != nil {
  3597  		return err
  3598  	}
  3599  	c.locationStack.markRegisterUsed(tmp)
  3600  
  3601  	// sourceOffset += size.
  3602  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, sourceOffset.register)
  3603  	// destinationOffset += size.
  3604  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)
  3605  
  3606  	// Check the source bounds: if sourceOffset exceeds the instance length, exit with an out-of-bounds error.
  3607  	c.assembler.CompileMemoryToRegister(amd64.CMPQ,
  3608  		instanceAddr, 8, // The lengths of DataInstance and ElementInstance are stored at offset 8.
  3609  		sourceOffset.register)
  3610  	c.compileMaybeExitFromNativeCode(amd64.JCC, outOfBoundsErrorStatus)
  3611  
  3612  	// Check the destination bounds: if destinationOffset exceeds the memory (or table) length, exit with an out-of-bounds error.
  3613  	if isTable {
  3614  		// Load the target table's address.
  3615  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
  3616  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(tableIndex*8), tmp)
  3617  		// Compare length.
  3618  		c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, destinationOffset.register)
  3619  	} else {
  3620  		c.assembler.CompileMemoryToRegister(amd64.CMPQ,
  3621  			amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
  3622  			destinationOffset.register)
  3623  	}
  3624  
  3625  	c.compileMaybeExitFromNativeCode(amd64.JCC, outOfBoundsErrorStatus)
  3626  
  3627  	// Otherwise, we are ready to copy the values from source to destination.
  3628  	//
  3629  	// If the copy size equals zero, we skip all the instructions below.
  3630  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, copySize.register, copySize.register)
  3631  	skipJump := c.assembler.CompileJump(amd64.JEQ)
  3632  
  3633  	var scale int16
  3634  	var memToReg, regToMem asm.Instruction
  3635  	if isTable {
  3636  		// Each element is of type uintptr; 2^3 = 1 << pointerSizeLog2.
  3637  		c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, sourceOffset.register)
  3638  		c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, destinationOffset.register)
  3639  		// destinationOffset += table buffer's absolute address.
  3640  		c.assembler.CompileMemoryToRegister(amd64.ADDQ,
  3641  			tmp, tableInstanceTableOffset, destinationOffset.register)
  3642  		// sourceOffset += data buffer's absolute address.
  3643  		c.assembler.CompileMemoryToRegister(amd64.ADDQ,
  3644  			instanceAddr, 0, sourceOffset.register)
  3645  
  3646  		// For tables, we move 8 bytes at a time.
  3647  		memToReg = amd64.MOVQ
  3648  		regToMem = memToReg
  3649  		scale = 8
  3650  	} else {
  3651  		// destinationOffset += memory buffer's absolute address.
  3652  		c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, destinationOffset.register)
  3653  
  3654  		// sourceOffset += data buffer's absolute address.
  3655  		c.assembler.CompileMemoryToRegister(amd64.ADDQ, instanceAddr, 0, sourceOffset.register)
  3656  
  3657  		// Move one byte at a time.
  3658  		memToReg = amd64.MOVBQZX
  3659  		regToMem = amd64.MOVB
  3660  		scale = 1
  3661  	}
  3662  
  3663  	// Negate the counter.
  3664  	c.assembler.CompileNoneToRegister(amd64.NEGQ, copySize.register)
  3665  
  3666  	beginCopyLoop := c.assembler.CompileStandAlone(amd64.NOP)
  3667  
  3668  	c.assembler.CompileMemoryWithIndexToRegister(memToReg,
  3669  		sourceOffset.register, 0, copySize.register, scale,
  3670  		tmp)
  3671  	// [destinationOffset.register + copySize.register*scale] = tmp.
  3672  	c.assembler.CompileRegisterToMemoryWithIndex(regToMem,
  3673  		tmp,
  3674  		destinationOffset.register, 0, copySize.register, scale,
  3675  	)
  3676  
  3677  	// copySize += 1 (the counter is negative, so this moves it toward zero).
  3678  	c.assembler.CompileNoneToRegister(amd64.INCQ, copySize.register)
  3679  	c.assembler.CompileJump(amd64.JMI).AssignJumpTarget(beginCopyLoop)
  3680  
  3681  	c.locationStack.markRegisterUnused(copySize.register, sourceOffset.register,
  3682  		destinationOffset.register, instanceAddr, tmp)
  3683  	c.assembler.SetJumpTargetOnNext(skipJump)
  3684  	return nil
  3685  }
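
        // The negated-counter loop emitted above behaves like this Go sketch (illustrative only;
        // dst, src and n are hypothetical names, and the offsets have already been advanced past the end):
        //
        //	func initCopySketch(dst, src []byte, dstEnd, srcEnd, n int) {
        //		for i := -n; i != 0; i++ { // NEGQ ... INCQ ... JMI
        //			dst[dstEnd+i] = src[srcEnd+i]
        //		}
        //	}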
  3686  
  3687  // compileDataDrop implements compiler.compileDataDrop for the amd64 architecture.
  3688  func (c *amd64Compiler) compileDataDrop(o *wazeroir.UnionOperation) error {
  3689  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  3690  		return err
  3691  	}
  3692  
  3693  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3694  	if err != nil {
  3695  		return err
  3696  	}
  3697  
  3698  	dataIndex := uint32(o.U1)
  3699  	c.compileLoadDataInstanceAddress(dataIndex, tmp)
  3700  
  3701  	// Clear the content of DataInstances[dataIndex] (a []byte, i.e. an 8-byte data pointer, length, and capacity).
  3702  	c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 0)
  3703  	c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 8)
  3704  	c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 16)
  3705  	return nil
  3706  }
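
        // The three MOVQ stores above zero the slice header fields (data pointer, length, capacity)
        // at offsets 0, 8, and 16. At the Go level the effect is simply (illustrative):
        //
        //	// moduleInstance.DataInstances[dataIndex] = nil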
  3707  
  3708  func (c *amd64Compiler) compileLoadDataInstanceAddress(dataIndex uint32, dst asm.Register) {
  3709  	// dst = dataIndex * dataInstanceStructSize.
  3710  	c.assembler.CompileConstToRegister(amd64.MOVQ, int64(dataIndex)*dataInstanceStructSize, dst)
  3711  
  3712  	// dst = &moduleInstance.DataInstances[0] + dst
  3713  	//     = &moduleInstance.DataInstances[0] + dataIndex*dataInstanceStructSize
  3714  	//     = &moduleInstance.DataInstances[dataIndex]
  3715  	c.assembler.CompileMemoryToRegister(amd64.ADDQ,
  3716  		amd64ReservedRegisterForCallEngine, callEngineModuleContextDataInstancesElement0AddressOffset,
  3717  		dst,
  3718  	)
  3719  }
  3720  
  3721  // compileCopyLoopImpl implements a REP MOVSQ memory copy for the given range with support for both directions.
  3722  func (c *amd64Compiler) compileCopyLoopImpl(destinationOffset, sourceOffset, copySize *runtimeValueLocation, backwards bool, bwOffset uint8) {
  3723  	// Skip if there is nothing to copy.
  3724  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, copySize.register, copySize.register)
  3725  	emptyEightGroupsJump := c.assembler.CompileJump(amd64.JEQ)
  3726  
  3727  	// Prepare registers for swaps. There will never be more than 3 XCHGs in total.
  3728  	restoreCrossing := c.compilePreventCrossedTargetRegisters(
  3729  		[]*runtimeValueLocation{destinationOffset, sourceOffset, copySize},
  3730  		[]asm.Register{amd64.RegDI, amd64.RegSI, amd64.RegCX})
  3731  
  3732  	// Prepare registers for REP MOVSQ: copy from rsi to rdi, rcx times.
  3733  	c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
  3734  	c.compileMaybeSwapRegisters(sourceOffset.register, amd64.RegSI)
  3735  	c.compileMaybeSwapRegisters(copySize.register, amd64.RegCX)
  3736  
  3737  	// Point at the first byte of the first quadword to copy.
  3738  	if backwards {
  3739  		c.assembler.CompileConstToRegister(amd64.ADDQ, -int64(bwOffset), amd64.RegDI)
  3740  		c.assembler.CompileConstToRegister(amd64.ADDQ, -int64(bwOffset), amd64.RegSI)
  3741  		// Set REP prefix direction backwards.
  3742  		c.assembler.CompileStandAlone(amd64.STD)
  3743  	}
  3744  
  3745  	c.assembler.CompileStandAlone(amd64.REPMOVSQ)
  3746  
  3747  	if backwards {
  3748  		// Reset direction.
  3749  		c.assembler.CompileStandAlone(amd64.CLD)
  3750  	}
  3751  
  3752  	// Restore registers.
  3753  	c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
  3754  	c.compileMaybeSwapRegisters(sourceOffset.register, amd64.RegSI)
  3755  	c.compileMaybeSwapRegisters(copySize.register, amd64.RegCX)
  3756  	restoreCrossing()
  3757  
  3758  	c.assembler.SetJumpTargetOnNext(emptyEightGroupsJump)
  3759  	c.assembler.CompileStandAlone(amd64.NOP)
  3760  }
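
        // REP MOVSQ copies RCX quadwords from [RSI] to [RDI], moving both pointers by 8 per step,
        // forwards with CLD or backwards with STD. An illustrative Go sketch of that behavior
        // (names are hypothetical, and indices are in quadwords):
        //
        //	func repMovsqSketch(dst, src []uint64, di, si, cx int, backwards bool) {
        //		step := 1
        //		if backwards { // STD sets the direction flag
        //			step = -1
        //		}
        //		for ; cx > 0; cx-- {
        //			dst[di] = src[si]
        //			di += step
        //			si += step
        //		}
        //	}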
  3761  
  3762  // compileMemoryCopyLoopImpl is used for directly copying after bounds/direction check.
  3763  func (c *amd64Compiler) compileMemoryCopyLoopImpl(destinationOffset, sourceOffset, copySize *runtimeValueLocation, tmp asm.Register, backwards bool) {
  3764  	// Point at the first byte to be copied, depending on the direction.
  3765  	if backwards {
  3766  		c.assembler.CompileNoneToRegister(amd64.DECQ, sourceOffset.register)
  3767  		c.assembler.CompileNoneToRegister(amd64.DECQ, destinationOffset.register)
  3768  	} else {
  3769  		c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, sourceOffset.register)
  3770  		c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, destinationOffset.register)
  3771  	}
  3772  
  3773  	// destinationOffset += memory buffer's absolute address.
  3774  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, destinationOffset.register)
  3775  	// sourceOffset += memory buffer's absolute address.
  3776  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, sourceOffset.register)
  3777  
  3778  	// Copy copySize % 8 bytes in a loop so the remainder can be copied in 8-byte groups afterward.
  3779  	beginLoop := c.assembler.CompileStandAlone(amd64.NOP)
  3780  
  3781  	// Check copySize % 8 == 0.
  3782  	c.assembler.CompileConstToRegister(amd64.TESTQ, 7, copySize.register)
  3783  	breakLoop := c.assembler.CompileJump(amd64.JEQ)
  3784  
  3785  	c.assembler.CompileMemoryToRegister(amd64.MOVBQZX, sourceOffset.register, 0, tmp)
  3786  	c.assembler.CompileRegisterToMemory(amd64.MOVB, tmp, destinationOffset.register, 0)
  3787  
  3788  	if backwards {
  3789  		c.assembler.CompileNoneToRegister(amd64.DECQ, sourceOffset.register)
  3790  		c.assembler.CompileNoneToRegister(amd64.DECQ, destinationOffset.register)
  3791  	} else {
  3792  		c.assembler.CompileNoneToRegister(amd64.INCQ, sourceOffset.register)
  3793  		c.assembler.CompileNoneToRegister(amd64.INCQ, destinationOffset.register)
  3794  	}
  3795  
  3796  	c.assembler.CompileNoneToRegister(amd64.DECQ, copySize.register)
  3797  	c.assembler.CompileJump(amd64.JMP).AssignJumpTarget(beginLoop)
  3798  	c.assembler.SetJumpTargetOnNext(breakLoop)
  3799  
  3800  	// compileCopyLoopImpl counts in groups of 8 bytes, so we have to divide the copySize by 8.
  3801  	c.assembler.CompileConstToRegister(amd64.SHRQ, 3, copySize.register)
  3802  
  3803  	c.compileCopyLoopImpl(destinationOffset, sourceOffset, copySize, backwards, 7)
  3804  }
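
        // Put together, the byte loop plus the quadword copy above amount to the following sketch
        // (illustrative only, forward direction, hypothetical names):
        //
        //	for size&7 != 0 { // copy single bytes until size is a multiple of 8
        //		dst[d] = src[s]
        //		d, s, size = d+1, s+1, size-1
        //	}
        //	// ...then hand the remaining size>>3 quadwords to the REP MOVSQ loop sketched above.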
  3805  
  3806  // compileMemoryCopy implements compiler.compileMemoryCopy for the amd64 architecture.
  3807  //
  3808  // This uses efficient `REP MOVSQ` instructions to copy in quadword (8-byte) batches. The remaining bytes
  3809  // are copied with a simple `MOV` loop. It copies backwards for overlapping segments.
  3810  func (c *amd64Compiler) compileMemoryCopy() error {
  3811  	copySize := c.locationStack.pop()
  3812  	if err := c.compileEnsureOnRegister(copySize); err != nil {
  3813  		return err
  3814  	}
  3815  
  3816  	sourceOffset := c.locationStack.pop()
  3817  	if err := c.compileEnsureOnRegister(sourceOffset); err != nil {
  3818  		return err
  3819  	}
  3820  
  3821  	destinationOffset := c.locationStack.pop()
  3822  	if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
  3823  		return err
  3824  	}
  3825  
  3826  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3827  	if err != nil {
  3828  		return err
  3829  	}
  3830  	c.locationStack.markRegisterUsed(tmp)
  3831  
  3832  	// sourceOffset += size.
  3833  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, sourceOffset.register)
  3834  	// destinationOffset += size.
  3835  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)
  3836  	// tmp = max(sourceOffset, destinationOffset).
  3837  	c.assembler.CompileRegisterToRegister(amd64.CMPQ, sourceOffset.register, destinationOffset.register)
  3838  	c.assembler.CompileRegisterToRegister(amd64.MOVQ, sourceOffset.register, tmp)
  3839  	c.assembler.CompileRegisterToRegister(amd64.CMOVQCS, destinationOffset.register, tmp)
  3840  
  3841  	// Check the maximum of the two bounds and, if it exceeds the memory length, exit with an out of bounds error.
  3842  	c.assembler.CompileMemoryToRegister(amd64.CMPQ,
  3843  		amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset, tmp)
  3844  	c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusCodeMemoryOutOfBounds)
  3845  
  3846  	// Skip zero size.
  3847  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, copySize.register, copySize.register)
  3848  	skipJump := c.assembler.CompileJump(amd64.JEQ)
  3849  
  3850  	// If dest < source, we can copy forwards
  3851  	c.assembler.CompileRegisterToRegister(amd64.CMPQ, destinationOffset.register, sourceOffset.register)
  3852  	destLowerThanSourceJump := c.assembler.CompileJump(amd64.JLS)
  3853  
  3854  	// If source + size < dest, we can copy forwards
  3855  	c.assembler.CompileRegisterToRegister(amd64.MOVQ, destinationOffset.register, tmp)
  3856  	c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, tmp)
  3857  	c.assembler.CompileRegisterToRegister(amd64.CMPQ, sourceOffset.register, tmp)
  3858  	sourceBoundLowerThanDestJump := c.assembler.CompileJump(amd64.JLS)
  3859  
  3860  	// Copy backwards.
  3861  	c.compileMemoryCopyLoopImpl(destinationOffset, sourceOffset, copySize, tmp, true)
  3862  	endJump := c.assembler.CompileJump(amd64.JMP)
  3863  
  3864  	// Copy forwards.
  3865  	c.assembler.SetJumpTargetOnNext(destLowerThanSourceJump)
  3866  	c.assembler.SetJumpTargetOnNext(sourceBoundLowerThanDestJump)
  3867  	c.compileMemoryCopyLoopImpl(destinationOffset, sourceOffset, copySize, tmp, false)
  3868  
  3869  	c.locationStack.markRegisterUnused(copySize.register, sourceOffset.register,
  3870  		destinationOffset.register, tmp)
  3871  	c.assembler.SetJumpTargetOnNext(skipJump)
  3872  	c.assembler.SetJumpTargetOnNext(endJump)
  3873  
  3874  	return nil
  3875  }
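
        // The direction choice above follows the usual memmove rule. In Go terms (illustrative
        // sketch; dst, src and size are hypothetical plain offsets, not the registers above):
        //
        //	func copyBackwardsSketch(dst, src, size uint64) bool {
        //		// Only a copy where the destination starts inside the source range must run backwards;
        //		// every other case is safe to copy forwards.
        //		return dst > src && src+size > dst
        //	}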
  3876  
  3877  // compileFillLoopImpl implements a REP STOSQ fill loop.
  3878  func (c *amd64Compiler) compileFillLoopImpl(destinationOffset, value, fillSize *runtimeValueLocation, tmp asm.Register, replicateByte bool) {
  3879  	// Skip if nothing to fill.
  3880  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, fillSize.register, fillSize.register)
  3881  	emptyEightGroupsJump := c.assembler.CompileJump(amd64.JEQ)
  3882  
  3883  	if replicateByte {
  3884  		// Truncate value.register to a single byte
  3885  		c.assembler.CompileConstToRegister(amd64.ANDQ, 0xff, value.register)
  3886  		// Replicate single byte onto full 8-byte register.
  3887  		c.assembler.CompileConstToRegister(amd64.MOVQ, 0x0101010101010101, tmp)
  3888  		c.assembler.CompileRegisterToRegister(amd64.IMULQ, tmp, value.register)
  3889  	}
  3890  
  3891  	// Prepare registers for swaps. There will never be more than 3 XCHGs in total.
  3892  	restoreCrossing := c.compilePreventCrossedTargetRegisters(
  3893  		[]*runtimeValueLocation{destinationOffset, value, fillSize},
  3894  		[]asm.Register{amd64.RegDI, amd64.RegAX, amd64.RegCX})
  3895  
  3896  	// Prepare registers for REP STOSQ: fill at [rdi] with rax, rcx times.
  3897  	c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
  3898  	c.compileMaybeSwapRegisters(value.register, amd64.RegAX)
  3899  	c.compileMaybeSwapRegisters(fillSize.register, amd64.RegCX)
  3900  
  3901  	c.assembler.CompileStandAlone(amd64.REPSTOSQ)
  3902  
  3903  	// Restore registers.
  3904  	c.compileMaybeSwapRegisters(destinationOffset.register, amd64.RegDI)
  3905  	c.compileMaybeSwapRegisters(value.register, amd64.RegAX)
  3906  	c.compileMaybeSwapRegisters(fillSize.register, amd64.RegCX)
  3907  	restoreCrossing()
  3908  
  3909  	c.assembler.SetJumpTargetOnNext(emptyEightGroupsJump)
  3910  }
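
        // The AND 0xff / IMUL 0x0101010101010101 pair above broadcasts one byte into all eight
        // bytes of a quadword, e.g. 0x42 becomes 0x4242424242424242 (illustrative sketch):
        //
        //	func broadcastByteSketch(v uint64) uint64 {
        //		return (v & 0xff) * 0x0101010101010101
        //	}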
  3911  
  3912  // compileFillImpl implements compileMemoryFill and compileTableFill for the amd64 architecture.
  3913  //
  3914  // This function uses efficient `REP STOSQ` instructions to fill in quadword (8-byte) batches
  3915  // if the size is above 15 bytes. For smaller sizes, a simple MOVB fill loop is the better
  3916  // option.
  3917  //
  3918  // TODO: the compiled code in this function should be compiled once and reused since
  3919  // it is independent of any particular module.
  3920  func (c *amd64Compiler) compileFillImpl(isTable bool, tableIndex uint32) error {
  3921  	copySize := c.locationStack.pop()
  3922  	if err := c.compileEnsureOnRegister(copySize); err != nil {
  3923  		return err
  3924  	}
  3925  
  3926  	value := c.locationStack.pop()
  3927  	if err := c.compileEnsureOnRegister(value); err != nil {
  3928  		return err
  3929  	}
  3930  
  3931  	destinationOffset := c.locationStack.pop()
  3932  	if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
  3933  		return err
  3934  	}
  3935  
  3936  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  3937  	if err != nil {
  3938  		return err
  3939  	}
  3940  	c.locationStack.markRegisterUsed(tmp)
  3941  
  3942  	// destinationOffset += size.
  3943  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)
  3944  
  3945  	// Check the destination bounds and, if the offset exceeds the length, exit with an out of bounds error.
  3946  	if isTable {
  3947  		// tmp = &tables[0]
  3948  		c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  3949  			amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  3950  			tmp)
  3951  
  3952  		// tmp = [tmp + TableIndex*8]
  3953  		//     = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
  3954  		//     = [&tables[TableIndex]] = tables[TableIndex].
  3955  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(tableIndex)*8, tmp)
  3956  
  3957  		c.assembler.CompileMemoryToRegister(amd64.CMPQ,
  3958  			tmp, tableInstanceTableLenOffset,
  3959  			destinationOffset.register)
  3960  	} else {
  3961  		c.assembler.CompileMemoryToRegister(amd64.CMPQ,
  3962  			amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset,
  3963  			destinationOffset.register)
  3964  	}
  3965  	if isTable {
  3966  		c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusCodeInvalidTableAccess)
  3967  	} else {
  3968  		c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusCodeMemoryOutOfBounds)
  3969  	}
  3970  
  3971  	// Otherwise, we are ready to fill the destination with the value.
  3972  	//
  3973  	// If the fill size equals zero, we skip all the instructions below.
  3974  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, copySize.register, copySize.register)
  3975  	skipJump := c.assembler.CompileJump(amd64.JEQ)
  3976  
  3977  	// destinationOffset -= size.
  3978  	c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, destinationOffset.register)
  3979  
  3980  	if isTable {
  3981  		// Each element is of type uintptr; 2^3 = 1 << pointerSizeLog2.
  3982  		c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, destinationOffset.register)
  3983  		// destinationOffset += table buffer's absolute address.
  3984  		c.assembler.CompileMemoryToRegister(amd64.ADDQ, tmp, tableInstanceTableOffset, destinationOffset.register)
  3985  
  3986  	} else {
  3987  		// destinationOffset += memory buffer's absolute address.
  3988  		c.assembler.CompileRegisterToRegister(amd64.ADDQ, amd64ReservedRegisterForMemory, destinationOffset.register)
  3989  
  3990  		// Fill bytes one at a time with MOVB until the remaining copySize is a multiple of 16.
  3991  		beginCopyLoop := c.assembler.CompileStandAlone(amd64.NOP)
  3992  		c.assembler.CompileConstToRegister(amd64.TESTQ, 15, copySize.register)
  3993  		breakLoop := c.assembler.CompileJump(amd64.JEQ)
  3994  
  3995  		c.assembler.CompileRegisterToMemory(amd64.MOVB, value.register, destinationOffset.register, 0)
  3996  
  3997  		c.assembler.CompileNoneToRegister(amd64.INCQ, destinationOffset.register)
  3998  		c.assembler.CompileNoneToRegister(amd64.DECQ, copySize.register)
  3999  		c.assembler.CompileJump(amd64.JMP).AssignJumpTarget(beginCopyLoop)
  4000  
  4001  		c.assembler.SetJumpTargetOnNext(breakLoop)
  4002  		// compileFillLoopImpl counts in groups of 8 bytes, so we have to divide the copySize by 8.
  4003  		c.assembler.CompileConstToRegister(amd64.SHRQ, 3, copySize.register)
  4004  	}
  4005  
  4006  	c.compileFillLoopImpl(destinationOffset, value, copySize, tmp, !isTable)
  4007  
  4008  	c.locationStack.markRegisterUnused(copySize.register, value.register,
  4009  		destinationOffset.register, tmp)
  4010  	c.assembler.SetJumpTargetOnNext(skipJump)
  4011  	return nil
  4012  }
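
        // For the memory case, the generated code behaves like this Go sketch (illustrative only;
        // mem, dst, value and size are hypothetical names):
        //
        //	func memoryFillSketch(mem []byte, dst int, value byte, size int) {
        //		for size&15 != 0 { // MOVB loop: fill until size is a multiple of 16
        //			mem[dst] = value
        //			dst++
        //			size--
        //		}
        //		// The remaining size>>3 quadwords are filled by the REP STOSQ loop above,
        //		// with the byte broadcast into a full quadword first.
        //	}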
  4013  
  4014  // compileMemoryFill implements compiler.compileMemoryFill for the amd64 architecture.
  4015  //
  4016  // TODO: the compiled code in this function should be compiled once and reused since
  4017  // it is independent of any particular module.
  4018  func (c *amd64Compiler) compileMemoryFill() error {
  4019  	return c.compileFillImpl(false, 0)
  4020  }
  4021  
  4022  // compileTableInit implements compiler.compileTableInit for the amd64 architecture.
  4023  func (c *amd64Compiler) compileTableInit(o *wazeroir.UnionOperation) error {
  4024  	elemIndex := uint32(o.U1)
  4025  	tableIndex := uint32(o.U2)
  4026  	return c.compileInitImpl(true, elemIndex, tableIndex)
  4027  }
  4028  
  4029  // compileTableCopyLoopImpl is used for directly copying after bounds/direction check.
  4030  func (c *amd64Compiler) compileTableCopyLoopImpl(srcTableIndex, dstTableIndex uint32, destinationOffset, sourceOffset, copySize *runtimeValueLocation, tmp asm.Register, backwards bool) {
  4031  	// Point at the first element to be copied.
  4032  	if !backwards {
  4033  		c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, sourceOffset.register)
  4034  		c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, destinationOffset.register)
  4035  	}
  4036  
  4037  	// Each element is of type uintptr; 2^3 = 1 << pointerSizeLog2.
  4038  	c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, sourceOffset.register)
  4039  	c.assembler.CompileConstToRegister(amd64.SHLQ, pointerSizeLog2, destinationOffset.register)
  4040  	// destinationOffset += table buffer's absolute address.
  4041  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
  4042  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(dstTableIndex*8), tmp)
  4043  	c.assembler.CompileMemoryToRegister(amd64.ADDQ, tmp, tableInstanceTableOffset, destinationOffset.register)
  4044  	// sourceOffset += table buffer's absolute address.
  4045  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
  4046  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(srcTableIndex*8), tmp)
  4047  	c.assembler.CompileMemoryToRegister(amd64.ADDQ, tmp, tableInstanceTableOffset, sourceOffset.register)
  4048  
  4049  	c.compileCopyLoopImpl(destinationOffset, sourceOffset, copySize, backwards, 8)
  4050  }
  4051  
  4052  // compileTableCopy implements compiler.compileTableCopy for the amd64 architecture.
  4053  //
  4054  // It uses efficient `REP MOVSQ` instructions for optimized copying. It copies backwards for
  4055  // overlapping segments.
  4056  func (c *amd64Compiler) compileTableCopy(o *wazeroir.UnionOperation) error {
  4057  	copySize := c.locationStack.pop()
  4058  	if err := c.compileEnsureOnRegister(copySize); err != nil {
  4059  		return err
  4060  	}
  4061  
  4062  	sourceOffset := c.locationStack.pop()
  4063  	if err := c.compileEnsureOnRegister(sourceOffset); err != nil {
  4064  		return err
  4065  	}
  4066  
  4067  	destinationOffset := c.locationStack.pop()
  4068  	if err := c.compileEnsureOnRegister(destinationOffset); err != nil {
  4069  		return err
  4070  	}
  4071  
  4072  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  4073  	if err != nil {
  4074  		return err
  4075  	}
  4076  
  4077  	// sourceOffset += size.
  4078  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, sourceOffset.register)
  4079  	// destinationOffset += size.
  4080  	c.assembler.CompileRegisterToRegister(amd64.ADDQ, copySize.register, destinationOffset.register)
  4081  
  4082  	srcTableIndex := uint32(o.U1)
  4083  	dstTableIndex := uint32(o.U2)
  4084  
  4085  	// Check the source bounds and, if the offset exceeds the table length, exit with an out of bounds error.
  4086  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
  4087  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(srcTableIndex*8), tmp)
  4088  	c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, sourceOffset.register)
  4089  	c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusCodeInvalidTableAccess)
  4090  
  4091  	// Check the destination bounds and, if the offset exceeds the table length, exit with an out of bounds error.
  4092  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset, tmp)
  4093  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, int64(dstTableIndex*8), tmp)
  4094  	c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, destinationOffset.register)
  4095  	c.compileMaybeExitFromNativeCode(amd64.JCC, nativeCallStatusCodeInvalidTableAccess)
  4096  
  4097  	// Skip zero size.
  4098  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, copySize.register, copySize.register)
  4099  	skipJump := c.assembler.CompileJump(amd64.JEQ)
  4100  
  4101  	// If dest < source, we can copy forwards.
  4102  	c.assembler.CompileRegisterToRegister(amd64.CMPQ, destinationOffset.register, sourceOffset.register)
  4103  	destLowerThanSourceJump := c.assembler.CompileJump(amd64.JLS)
  4104  
  4105  	// If source + size < dest, we can copy forwards.
  4106  	c.assembler.CompileRegisterToRegister(amd64.MOVQ, destinationOffset.register, tmp)
  4107  	c.assembler.CompileRegisterToRegister(amd64.SUBQ, copySize.register, tmp)
  4108  	c.assembler.CompileRegisterToRegister(amd64.CMPQ, sourceOffset.register, tmp)
  4109  	sourceBoundLowerThanDestJump := c.assembler.CompileJump(amd64.JLS)
  4110  
  4111  	// Copy backwards.
  4112  	c.compileTableCopyLoopImpl(srcTableIndex, dstTableIndex, destinationOffset, sourceOffset, copySize, tmp, true)
  4113  	endJump := c.assembler.CompileJump(amd64.JMP)
  4114  
  4115  	// Copy forwards.
  4116  	c.assembler.SetJumpTargetOnNext(destLowerThanSourceJump)
  4117  	c.assembler.SetJumpTargetOnNext(sourceBoundLowerThanDestJump)
  4118  	c.compileTableCopyLoopImpl(srcTableIndex, dstTableIndex, destinationOffset, sourceOffset, copySize, tmp, false)
  4119  
  4120  	c.locationStack.markRegisterUnused(copySize.register, sourceOffset.register,
  4121  		destinationOffset.register, tmp)
  4122  	c.assembler.SetJumpTargetOnNext(skipJump)
  4123  	c.assembler.SetJumpTargetOnNext(endJump)
  4124  	return nil
  4125  }
  4126  
  4127  // compileElemDrop implements compiler.compileElemDrop for the amd64 architecture.
  4128  func (c *amd64Compiler) compileElemDrop(o *wazeroir.UnionOperation) error {
  4129  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4130  		return err
  4131  	}
  4132  
  4133  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  4134  	if err != nil {
  4135  		return err
  4136  	}
  4137  
  4138  	elemIndex := uint32(o.U1)
  4139  	c.compileLoadElemInstanceAddress(elemIndex, tmp)
  4140  
  4141  	// Clear the content of ElementInstances[elemIndex].References (a []uintptr, i.e. data pointer, length, and capacity).
  4142  	c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 0)
  4143  	c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 8)
  4144  	c.assembler.CompileConstToMemory(amd64.MOVQ, 0, tmp, 16)
  4145  	return nil
  4146  }
  4147  
  4148  func (c *amd64Compiler) compileLoadElemInstanceAddress(elemIndex uint32, dst asm.Register) {
  4149  	// dst = elemIndex * elementInstanceStructSize
  4150  	c.assembler.CompileConstToRegister(amd64.MOVQ, int64(elemIndex)*elementInstanceStructSize, dst)
  4151  
  4152  	// dst = &moduleInstance.ElementInstances[0] + dst
  4153  	//     = &moduleInstance.ElementInstances[0] + elemIndex*elementInstanceStructSize
  4154  	//     = &moduleInstance.ElementInstances[elemIndex]
  4155  	c.assembler.CompileMemoryToRegister(amd64.ADDQ,
  4156  		amd64ReservedRegisterForCallEngine, callEngineModuleContextElementInstancesElement0AddressOffset,
  4157  		dst,
  4158  	)
  4159  }
  4160  
  4161  // compileTableGet implements compiler.compileTableGet for the amd64 architecture.
  4162  func (c *amd64Compiler) compileTableGet(o *wazeroir.UnionOperation) error {
  4163  	ref, err := c.allocateRegister(registerTypeGeneralPurpose)
  4164  	if err != nil {
  4165  		return err
  4166  	}
  4167  
  4168  	c.locationStack.markRegisterUsed(ref)
  4169  
  4170  	offset := c.locationStack.pop()
  4171  	if err := c.compileEnsureOnRegister(offset); err != nil {
  4172  		return err
  4173  	}
  4174  
  4175  	// ref = &tables[0]
  4176  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4177  		amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  4178  		ref)
  4179  
  4180  	// ref = [ref + TableIndex*8]
  4181  	//     = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
  4182  	//     = [&tables[TableIndex]] = tables[TableIndex].
  4183  	tableIndex := int64(o.U1)
  4184  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, ref, tableIndex*8, ref)
  4185  
  4186  	// Out of bounds check.
  4187  	c.assembler.CompileMemoryToRegister(amd64.CMPQ, ref, tableInstanceTableLenOffset, offset.register)
  4188  	c.compileMaybeExitFromNativeCode(amd64.JHI, nativeCallStatusCodeInvalidTableAccess)
  4189  
  4190  	// ref = [&tables[TableIndex] + tableInstanceTableOffset] = &tables[TableIndex].References[0]
  4191  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, ref, tableInstanceTableOffset, ref)
  4192  
  4193  	// ref = [ref + 0 + offset.register * 8]
  4194  	//     = [&tables[TableIndex].References[0] + sizeOf(uintptr) * offset]
  4195  	//     = [&tables[TableIndex].References[offset]]
  4196  	//     = tables[TableIndex].References[offset]
  4197  	c.assembler.CompileMemoryWithIndexToRegister(amd64.MOVQ, ref,
  4198  		0, offset.register, 8, ref,
  4199  	)
  4200  
  4201  	c.locationStack.markRegisterUnused(offset.register)
  4202  	c.pushRuntimeValueLocationOnRegister(ref, runtimeValueTypeI64) // Table elements are opaque 64-bit values at runtime.
  4203  	return nil
  4204  }
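
        // The sequence above is the machine-code counterpart of this Go-level access
        // (illustrative sketch; tableSketch is a simplified stand-in following the comments above):
        //
        //	type tableSketch struct{ References []uintptr }
        //
        //	func tableGetSketch(tables []*tableSketch, tableIndex, offset uint32) uintptr {
        //		table := tables[tableIndex]
        //		if int(offset) >= len(table.References) {
        //			panic("invalid table access") // the generated code exits with nativeCallStatusCodeInvalidTableAccess
        //		}
        //		return table.References[offset]
        //	}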
  4205  
  4206  // compileTableSet implements compiler.compileTableSet for the amd64 architecture.
  4207  func (c *amd64Compiler) compileTableSet(o *wazeroir.UnionOperation) error {
  4208  	ref := c.locationStack.pop()
  4209  	if err := c.compileEnsureOnRegister(ref); err != nil {
  4210  		return err
  4211  	}
  4212  
  4213  	offset := c.locationStack.pop()
  4214  	if err := c.compileEnsureOnRegister(offset); err != nil {
  4215  		return err
  4216  	}
  4217  
  4218  	tmp, err := c.allocateRegister(registerTypeGeneralPurpose)
  4219  	if err != nil {
  4220  		return err
  4221  	}
  4222  
  4223  	// tmp = &tables[0]
  4224  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4225  		amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  4226  		tmp)
  4227  
  4228  	// tmp = [tmp + TableIndex*8]
  4229  	//     = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
  4230  	//     = [&tables[TableIndex]] = tables[TableIndex].
  4231  	tableIndex := int64(o.U1)
  4232  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, tableIndex*8, tmp)
  4233  
  4234  	// Out of bounds check.
  4235  	c.assembler.CompileMemoryToRegister(amd64.CMPQ, tmp, tableInstanceTableLenOffset, offset.register)
  4236  	c.compileMaybeExitFromNativeCode(amd64.JHI, nativeCallStatusCodeInvalidTableAccess)
  4237  
  4238  	// tmp = [&tables[TableIndex] + tableInstanceTableOffset] = &tables[TableIndex].References[0]
  4239  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmp, tableInstanceTableOffset, tmp)
  4240  
  4241  	// [tmp + 0 + offset.register * 8] = ref
  4242  	// [&tables[TableIndex].References[0] + sizeOf(uintptr) * offset] = ref
  4243  	// [&tables[TableIndex].References[offset]] = ref
  4244  	// tables[TableIndex].References[offset] = ref
  4245  	c.assembler.CompileRegisterToMemoryWithIndex(amd64.MOVQ,
  4246  		ref.register,
  4247  		tmp, 0, offset.register, 8)
  4248  
  4249  	c.locationStack.markRegisterUnused(offset.register, ref.register)
  4250  	return nil
  4251  }
  4252  
  4253  // compileTableGrow implements compiler.compileTableGrow for the amd64 architecture.
  4254  func (c *amd64Compiler) compileTableGrow(o *wazeroir.UnionOperation) error {
  4255  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4256  		return err
  4257  	}
  4258  
  4259  	// Pushes the table index.
  4260  	tableIndex := uint32(o.U1)
  4261  	if err := c.compileConstI32Impl(tableIndex); err != nil {
  4262  		return err
  4263  	}
  4264  
  4265  	// Table grow cannot be done in assembly, just like memory grow, as it involves allocation in Go.
  4266  	// Therefore, call out to the builtin function for this purpose.
  4267  	if err := c.compileCallBuiltinFunction(builtinFunctionIndexTableGrow); err != nil {
  4268  		return err
  4269  	}
  4270  
  4271  	// TableGrow consumes three values (table index, number of items, initial value).
  4272  	for i := 0; i < 3; i++ {
  4273  		c.locationStack.pop()
  4274  	}
  4275  
  4276  	// Then, the previous length was pushed as the result.
  4277  	loc := c.locationStack.pushRuntimeValueLocationOnStack()
  4278  	loc.valueType = runtimeValueTypeI32
  4279  
  4280  	// After return, we re-initialize the reserved registers just like in the function preamble.
  4281  	c.compileReservedStackBasePointerInitialization()
  4282  	c.compileReservedMemoryPointerInitialization()
  4283  	return nil
  4284  }
  4285  
  4286  // compileTableSize implements compiler.compileTableSize for the amd64 architecture.
  4287  func (c *amd64Compiler) compileTableSize(o *wazeroir.UnionOperation) error {
  4288  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4289  		return err
  4290  	}
  4291  
  4292  	result, err := c.allocateRegister(registerTypeGeneralPurpose)
  4293  	if err != nil {
  4294  		return err
  4295  	}
  4296  
  4297  	// result = &tables[0]
  4298  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4299  		amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset,
  4300  		result)
  4301  
  4302  	// result = [result + TableIndex*8]
  4303  	//        = [&tables[0] + TableIndex*sizeOf(*tableInstance)]
  4304  	//        = [&tables[TableIndex]] = tables[TableIndex].
  4305  	tableIndex := int64(o.U1)
  4306  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, result, tableIndex*8, result)
  4307  
  4308  	// result = [result + tableInstanceTableLenOffset]
  4309  	//        = [tables[TableIndex] + tableInstanceTableLenOffset]
  4310  	//        = len(tables[TableIndex])
  4311  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, result, tableInstanceTableLenOffset, result)
  4312  
  4313  	c.pushRuntimeValueLocationOnRegister(result, runtimeValueTypeI32)
  4314  	return nil
  4315  }
  4316  
  4317  // compileTableFill implements compiler.compileTableFill for the amd64 architecture.
  4318  func (c *amd64Compiler) compileTableFill(o *wazeroir.UnionOperation) error {
  4319  	tableIndex := uint32(o.U1)
  4320  	return c.compileFillImpl(true, tableIndex)
  4321  }
  4322  
  4323  // compileRefFunc implements compiler.compileRefFunc for the amd64 architecture.
  4324  func (c *amd64Compiler) compileRefFunc(o *wazeroir.UnionOperation) error {
  4325  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4326  		return err
  4327  	}
  4328  
  4329  	ref, err := c.allocateRegister(registerTypeGeneralPurpose)
  4330  	if err != nil {
  4331  		return err
  4332  	}
  4333  
  4334  	functionIndex := int64(o.U1)
  4335  	c.assembler.CompileConstToRegister(amd64.MOVQ, functionIndex*functionSize, ref)
  4336  
  4337  	// ref = [amd64ReservedRegisterForCallEngine + callEngineModuleContextFunctionsElement0AddressOffset + int64(o.FunctionIndex)*functionSize]
  4338  	//     = &moduleEngine.functions[index]
  4339  	c.assembler.CompileMemoryToRegister(
  4340  		amd64.ADDQ, amd64ReservedRegisterForCallEngine, callEngineModuleContextFunctionsElement0AddressOffset,
  4341  		ref,
  4342  	)
  4343  
  4344  	c.pushRuntimeValueLocationOnRegister(ref, runtimeValueTypeI64)
  4345  	return nil
  4346  }
  4347  
  4348  // compileConstI32 implements compiler.compileConstI32 for the amd64 architecture.
  4349  func (c *amd64Compiler) compileConstI32(o *wazeroir.UnionOperation) error {
  4350  	return c.compileConstI32Impl(uint32(o.U1))
  4351  }
  4352  
  4353  func (c *amd64Compiler) compileConstI32Impl(v uint32) error {
  4354  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4355  		return err
  4356  	}
  4357  
  4358  	reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  4359  	if err != nil {
  4360  		return err
  4361  	}
  4362  	c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI32)
  4363  	c.assembler.CompileConstToRegister(amd64.MOVL, int64(v), reg)
  4364  	return nil
  4365  }
  4366  
  4367  // compileConstI64 implements compiler.compileConstI64 for the amd64 architecture.
  4368  func (c *amd64Compiler) compileConstI64(o *wazeroir.UnionOperation) error {
  4369  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4370  		return err
  4371  	}
  4372  
  4373  	reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  4374  	if err != nil {
  4375  		return err
  4376  	}
  4377  	c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeI64)
  4378  
  4379  	c.assembler.CompileConstToRegister(amd64.MOVQ, int64(o.U1), reg)
  4380  	return nil
  4381  }
  4382  
  4383  // compileConstF32 implements compiler.compileConstF32 for the amd64 architecture.
  4384  func (c *amd64Compiler) compileConstF32(o *wazeroir.UnionOperation) error {
  4385  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4386  		return err
  4387  	}
  4388  
  4389  	reg, err := c.allocateRegister(registerTypeVector)
  4390  	if err != nil {
  4391  		return err
  4392  	}
  4393  	c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeF32)
  4394  
  4395  	// We cannot load an immediate constant directly into a float register,
  4396  	// so we stage it in an integer register first.
  4397  	tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  4398  	if err != nil {
  4399  		return err
  4400  	}
  4401  
  4402  	c.assembler.CompileConstToRegister(amd64.MOVL, int64(o.U1) /*math.Float32bits(o.Value)*/, tmpReg)
  4403  	c.assembler.CompileRegisterToRegister(amd64.MOVL, tmpReg, reg)
  4404  	return nil
  4405  }
  4406  
  4407  // compileConstF64 implements compiler.compileConstF64 for the amd64 architecture.
  4408  func (c *amd64Compiler) compileConstF64(o *wazeroir.UnionOperation) error {
  4409  	if err := c.maybeCompileMoveTopConditionalToGeneralPurposeRegister(); err != nil {
  4410  		return err
  4411  	}
  4412  
  4413  	reg, err := c.allocateRegister(registerTypeVector)
  4414  	if err != nil {
  4415  		return err
  4416  	}
  4417  	c.pushRuntimeValueLocationOnRegister(reg, runtimeValueTypeF64)
  4418  
  4419  	// We cannot load an immediate constant directly into a float register,
  4420  	// so we stage it in an integer register first.
  4421  	tmpReg, err := c.allocateRegister(registerTypeGeneralPurpose)
  4422  	if err != nil {
  4423  		return err
  4424  	}
  4425  
  4426  	c.assembler.CompileConstToRegister(amd64.MOVQ, int64(o.U1) /* math.Float64bits(o.Value) */, tmpReg)
  4427  	c.assembler.CompileRegisterToRegister(amd64.MOVQ, tmpReg, reg)
  4428  	return nil
  4429  }
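
        // For both float constants above, the two-step load (immediate into an integer register,
        // then a move into the vector register) has the same effect as this Go expression
        // (illustrative; o.U1 carries the raw bit pattern):
        //
        //	f64 := math.Float64frombits(o.U1) // and math.Float32frombits(uint32(o.U1)) for the f32 case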
  4430  
  4431  // compileLoadValueOnStackToRegister implements compiler.compileLoadValueOnStackToRegister for amd64.
  4432  func (c *amd64Compiler) compileLoadValueOnStackToRegister(loc *runtimeValueLocation) {
  4433  	var inst asm.Instruction
  4434  	switch loc.valueType {
  4435  	case runtimeValueTypeV128Lo:
  4436  		inst = amd64.MOVDQU
  4437  	case runtimeValueTypeV128Hi:
  4438  		panic("BUG: V128Hi must be loaded to a register along with V128Lo")
  4439  	case runtimeValueTypeI32, runtimeValueTypeF32:
  4440  		inst = amd64.MOVL
  4441  	case runtimeValueTypeI64, runtimeValueTypeF64:
  4442  		inst = amd64.MOVQ
  4443  	default:
  4444  		panic("BUG: unknown runtime value type")
  4445  	}
  4446  
  4447  	// Copy the value from the stack.
  4448  	c.assembler.CompileMemoryToRegister(inst,
  4449  		// Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range.
  4450  		amd64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8,
  4451  		loc.register)
  4452  
  4453  	if loc.valueType == runtimeValueTypeV128Lo {
  4454  		// Higher 64-bits are loaded as well ^^.
  4455  		hi := &c.locationStack.stack[loc.stackPointer+1]
  4456  		hi.setRegister(loc.register)
  4457  	}
  4458  }
  4459  
  4460  // maybeCompileMoveTopConditionalToGeneralPurposeRegister moves the top value on the stack
  4461  // to a general purpose register if the value is located on a conditional register.
  4462  //
  4463  // This is usually called at the beginning of methods on the compiler interface where we might
  4464  // compile instructions without otherwise saving the conditional register value.
  4465  // The compileXXX functions that do not call this function instead save the conditional
  4466  // value to the stack or a register by invoking compileEnsureOnRegister for the top value.
  4467  func (c *amd64Compiler) maybeCompileMoveTopConditionalToGeneralPurposeRegister() (err error) {
  4468  	if c.locationStack.sp > 0 {
  4469  		if loc := c.locationStack.peek(); loc.onConditionalRegister() {
  4470  			if err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc); err != nil {
  4471  				return err
  4472  			}
  4473  		}
  4474  	}
  4475  	return
  4476  }
  4477  
  4478  // compileLoadConditionalRegisterToGeneralPurposeRegister saves the conditional register value
  4479  // to a newly allocated general purpose register.
  4480  func (c *amd64Compiler) compileLoadConditionalRegisterToGeneralPurposeRegister(loc *runtimeValueLocation) error {
  4481  	reg, err := c.allocateRegister(registerTypeGeneralPurpose)
  4482  	if err != nil {
  4483  		return err
  4484  	}
  4485  	c.compileMoveConditionalToGeneralPurposeRegister(loc, reg)
  4486  	return nil
  4487  }
  4488  
  4489  func (c *amd64Compiler) compileMoveConditionalToGeneralPurposeRegister(loc *runtimeValueLocation, reg asm.Register) {
  4490  	// Set the flag bit to the destination. See
  4491  	// - https://c9x.me/x86/html/file_module_x86_id_288.html
  4492  	// - https://github.com/golang/go/blob/master/src/cmd/internal/obj/x86/asm6.go#L1453-L1468
  4493  	// to translate conditionalRegisterState* to amd64.SET*
  4494  	var inst asm.Instruction
  4495  	switch loc.conditionalRegister {
  4496  	case amd64.ConditionalRegisterStateE:
  4497  		inst = amd64.SETEQ
  4498  	case amd64.ConditionalRegisterStateNE:
  4499  		inst = amd64.SETNE
  4500  	case amd64.ConditionalRegisterStateS:
  4501  		inst = amd64.SETMI
  4502  	case amd64.ConditionalRegisterStateNS:
  4503  		inst = amd64.SETPL
  4504  	case amd64.ConditionalRegisterStateG:
  4505  		inst = amd64.SETGT
  4506  	case amd64.ConditionalRegisterStateGE:
  4507  		inst = amd64.SETGE
  4508  	case amd64.ConditionalRegisterStateL:
  4509  		inst = amd64.SETLT
  4510  	case amd64.ConditionalRegisterStateLE:
  4511  		inst = amd64.SETLE
  4512  	case amd64.ConditionalRegisterStateA:
  4513  		inst = amd64.SETHI
  4514  	case amd64.ConditionalRegisterStateAE:
  4515  		inst = amd64.SETCC
  4516  	case amd64.ConditionalRegisterStateB:
  4517  		inst = amd64.SETCS
  4518  	case amd64.ConditionalRegisterStateBE:
  4519  		inst = amd64.SETLS
  4520  	}
  4521  
  4522  	c.assembler.CompileNoneToRegister(inst, reg)
  4523  
  4524  	// Then we clear all bits above the lowest one, since SETcc only writes the low byte of the register.
  4525  	c.assembler.CompileConstToRegister(amd64.ANDQ, 0x1, reg)
  4526  
  4527  	// Mark that the location now uses the register.
  4528  	loc.setRegister(reg)
  4529  	c.locationStack.markRegisterUsed(reg)
  4530  }
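
        // SETcc writes only the low byte of the destination, so the ANDQ 0x1 above clears the
        // stale upper bits, leaving exactly 0 or 1 in the register. Conceptually (illustrative sketch):
        //
        //	func flagToI32Sketch(conditionHolds bool) uint64 {
        //		if conditionHolds {
        //			return 1
        //		}
        //		return 0
        //	}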
  4531  
  4532  // allocateRegister implements compiler.allocateRegister for amd64.
  4533  func (c *amd64Compiler) allocateRegister(t registerType) (reg asm.Register, err error) {
  4534  	var ok bool
  4535  	// Try to get the unused register.
  4536  	reg, ok = c.locationStack.takeFreeRegister(t)
  4537  	if ok {
  4538  		return
  4539  	}
  4540  
  4541  	// If not found, we have to steal the register.
  4542  	stealTarget, ok := c.locationStack.takeStealTargetFromUsedRegister(t)
  4543  	if !ok {
  4544  		err = fmt.Errorf("cannot steal register")
  4545  		return
  4546  	}
  4547  
  4548  	// Release the steal target register value onto stack location.
  4549  	reg = stealTarget.register
  4550  	c.compileReleaseRegisterToStack(stealTarget)
  4551  	return
  4552  }
  4553  
  4554  // compileCallFunctionImpl adds instructions to call the function whose address is held in functionAddressRegister.
  4555  //
  4556  // Note: this is the counterpart of compileReturnFunction; see the comments there as well
  4557  // to understand how function calls are achieved.
  4558  func (c *amd64Compiler) compileCallFunctionImpl(functionAddressRegister asm.Register, functype *wasm.FunctionType) error {
  4559  	// Release all the registers as our calling convention requires the caller-save.
  4560  	if err := c.compileReleaseAllRegistersToStack(); err != nil {
  4561  		return err
  4562  	}
  4563  
  4564  	c.locationStack.markRegisterUsed(functionAddressRegister)
  4565  
  4566  	// Obtain a temporary register to be used in the following.
  4567  	tmpRegister, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  4568  	if !found {
  4569  		// This in theory never happens as all the registers must be free except functionAddressRegister.
  4570  		return fmt.Errorf("could not find enough free registers")
  4571  	}
  4572  
  4573  	// The stack should look like:
  4574  	//
  4575  	//               reserved slots for results (if len(results) > len(args))
  4576  	//                      |     |
  4577  	//    ,arg0, ..., argN, ..., _, .returnAddress, .returnStackBasePointerInBytes, .function, ....
  4578  	//      |                       |                                                        |
  4579  	//      |             callFrame{^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^}
  4580  	//      |
  4581  	// nextStackBasePointerOffset
  4582  	//
  4583  	// where callFrame is used to return to this currently executed function.
  4584  
  4585  	nextStackBasePointerOffset := int64(c.locationStack.sp) - int64(functype.ParamNumInUint64)
  4586  
  4587  	callFrameReturnAddressLoc, callFrameStackBasePointerInBytesLoc, callFrameFunctionLoc := c.locationStack.pushCallFrame(functype)
  4588  
  4589  	// Save the current stack base pointer at callFrameStackBasePointerInBytesLoc.
  4590  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4591  		amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
  4592  		tmpRegister)
  4593  	callFrameStackBasePointerInBytesLoc.setRegister(tmpRegister)
  4594  	c.compileReleaseRegisterToStack(callFrameStackBasePointerInBytesLoc)
  4595  
  4596  	// Set callEngine.stackContext.stackBasePointer for the next function.
  4597  	c.assembler.CompileConstToRegister(amd64.ADDQ, nextStackBasePointerOffset<<3, tmpRegister)
  4598  
  4599  	// Write the calculated value to callEngine.stackContext.stackBasePointer.
  4600  	c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister,
  4601  		amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset)
  4602  
  4603  	// Save the currently executed *function (placed at callEngine.moduleContext.fn) into callFrameFunctionLoc.
  4604  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4605  		amd64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset,
  4606  		tmpRegister)
  4607  	callFrameFunctionLoc.setRegister(tmpRegister)
  4608  	c.compileReleaseRegisterToStack(callFrameFunctionLoc)
  4609  
  4610  	// Set callEngine.moduleContext.fn to the next *function.
  4611  	c.assembler.CompileRegisterToMemory(amd64.MOVQ, functionAddressRegister,
  4612  		amd64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset)
  4613  
  4614  	// Write the return address into callFrameReturnAddressLoc.
  4615  	c.assembler.CompileReadInstructionAddress(tmpRegister, amd64.JMP)
  4616  	callFrameReturnAddressLoc.setRegister(tmpRegister)
  4617  	c.compileReleaseRegisterToStack(callFrameReturnAddressLoc)
  4618  
  4619  	if amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister == functionAddressRegister {
  4620  		// In this case we must move the value in functionAddressRegister to another register, otherwise
  4621  		// the address (the jump target below) would be clobbered and result in a segfault.
  4622  		// See #526.
  4623  		c.assembler.CompileRegisterToRegister(amd64.MOVQ, functionAddressRegister, tmpRegister)
  4624  		functionAddressRegister = tmpRegister
  4625  	}
  4626  
  4627  	// Also, we have to put the target function's *wasm.ModuleInstance into amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister.
  4628  	c.assembler.CompileMemoryToRegister(amd64.MOVQ, functionAddressRegister, functionModuleInstanceOffset,
  4629  		amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
  4630  
  4631  	// And jump into the initial address of the target function.
  4632  	c.assembler.CompileJumpToMemory(amd64.JMP, functionAddressRegister, functionCodeInitialAddressOffset)
  4633  
  4634  	// All the registers used are temporary, so we mark them unused.
  4635  	c.locationStack.markRegisterUnused(tmpRegister, functionAddressRegister)
  4636  
  4637  	// On the function return, we have to initialize the state.
  4638  	if err := c.compileModuleContextInitialization(); err != nil {
  4639  		return err
  4640  	}
  4641  
  4642  	// Due to the change to callEngine.stackContext.stackBasePointer.
  4643  	c.compileReservedStackBasePointerInitialization()
  4644  
  4645  	// Due to the change to callEngine.moduleContext.moduleInstance as that might result in
  4646  	// the memory instance manipulation.
  4647  	c.compileReservedMemoryPointerInitialization()
  4648  
  4649  	// We consumed the function parameters, the call frame stack and reserved slots during the call.
  4650  	c.locationStack.sp = uint64(nextStackBasePointerOffset)
  4651  
  4652  	// Now the function results are pushed by the call.
  4653  	for _, t := range functype.Results {
  4654  		loc := c.locationStack.pushRuntimeValueLocationOnStack()
  4655  		switch t {
  4656  		case wasm.ValueTypeI32:
  4657  			loc.valueType = runtimeValueTypeI32
  4658  		case wasm.ValueTypeI64, wasm.ValueTypeFuncref, wasm.ValueTypeExternref:
  4659  			loc.valueType = runtimeValueTypeI64
  4660  		case wasm.ValueTypeF32:
  4661  			loc.valueType = runtimeValueTypeF32
  4662  		case wasm.ValueTypeF64:
  4663  			loc.valueType = runtimeValueTypeF64
  4664  		case wasm.ValueTypeV128:
  4665  			loc.valueType = runtimeValueTypeV128Lo
  4666  			hi := c.locationStack.pushRuntimeValueLocationOnStack()
  4667  			hi.valueType = runtimeValueTypeV128Hi
  4668  		default:
  4669  			panic("BUG: invalid type: " + wasm.ValueTypeName(t))
  4670  		}
  4671  	}
  4672  	return nil
  4673  }
  4674  
  4675  // compileReturnFunction adds instructions to return from the current call frame back to the caller's frame.
  4676  // If the current frame is the origin, we return to callEngine.execWasmFunction with the Returned status.
  4677  // Otherwise, we jump to the caller's return address stored in callFrame.returnAddress while making
  4678  // all the necessary changes to the callEngine's state.
  4679  //
  4680  // Note: this is the counterpart of compileCallFunctionImpl; see the comments there as well
  4681  // to understand how function calls are achieved.
  4682  func (c *amd64Compiler) compileReturnFunction() error {
  4683  	// Release all the registers as our calling convention requires the caller-save.
  4684  	if err := c.compileReleaseAllRegistersToStack(); err != nil {
  4685  		return err
  4686  	}
  4687  
  4688  	if c.withListener {
  4689  		if err := c.compileCallBuiltinFunction(builtinFunctionIndexFunctionListenerAfter); err != nil {
  4690  			return err
  4691  		}
  4692  		// After return, we re-initialize the stack base pointer as that is used to return to the caller below.
  4693  		c.compileReservedStackBasePointerInitialization()
  4694  	}
  4695  
  4696  	// amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister holds the module instance's address
  4697  	// so mark it used so that it won't be used as a free register.
  4698  	c.locationStack.markRegisterUsed(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
  4699  	defer c.locationStack.markRegisterUnused(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
  4700  
  4701  	// Obtain a temporary register to be used in the following.
  4702  	returnAddressRegister, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  4703  	if !found {
  4704  		panic("BUG: all the registers should be free at this point: " + c.locationStack.String())
  4705  	}
  4706  
  4707  	returnAddress, callerStackBasePointerInBytes, callerFunction := c.locationStack.getCallFrameLocations(c.typ)
  4708  
  4709  	// A zero return address means this is the bottom frame, so we exit the native code with the Returned status.
  4710  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4711  		amd64ReservedRegisterForStackBasePointerAddress, int64(returnAddress.stackPointer)*8,
  4712  		returnAddressRegister,
  4713  	)
  4714  	c.assembler.CompileRegisterToRegister(amd64.TESTQ, returnAddressRegister, returnAddressRegister)
  4715  
  4716  	c.compileMaybeExitFromNativeCode(amd64.JNE, nativeCallStatusCodeReturned)
  4717  
  4718  	// Alias for readability.
  4719  	tmpRegister := amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister
  4720  
  4721  	// First, restore the stackContext.stackBasePointerInBytesOffset from callerStackBasePointerInBytes.
  4722  	callerStackBasePointerInBytes.setRegister(tmpRegister)
  4723  	c.compileLoadValueOnStackToRegister(callerStackBasePointerInBytes)
  4724  	c.assembler.CompileRegisterToMemory(amd64.MOVQ,
  4725  		tmpRegister, amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset)
  4726  
  4727  	// Next, restore moduleContext.fn from callerFunction.
  4728  	callerFunction.setRegister(tmpRegister)
  4729  	c.compileLoadValueOnStackToRegister(callerFunction)
  4730  	c.assembler.CompileRegisterToMemory(amd64.MOVQ,
  4731  		tmpRegister, amd64ReservedRegisterForCallEngine, callEngineModuleContextFnOffset)
  4732  
  4733  	// Also, we have to put the target function's *wasm.ModuleInstance into amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister.
  4734  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4735  		tmpRegister, functionModuleInstanceOffset,
  4736  		amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
  4737  
  4738  	// Then, jump into the return address!
  4739  	c.assembler.CompileJumpToRegister(amd64.JMP, returnAddressRegister)
  4740  	return nil
  4741  }
  4742  
  4743  func (c *amd64Compiler) compileCallGoHostFunction() error {
  4744  	return c.compileCallGoFunction(nativeCallStatusCodeCallGoHostFunction)
  4745  }
  4746  
  4747  func (c *amd64Compiler) compileCallBuiltinFunction(index wasm.Index) error {
  4748  	// Set the builtin function index on callEngine.exitContext.builtinFunctionCallIndex.
  4749  	c.assembler.CompileConstToMemory(amd64.MOVL, int64(index), amd64ReservedRegisterForCallEngine, callEngineExitContextBuiltinFunctionCallIndexOffset)
  4750  	return c.compileCallGoFunction(nativeCallStatusCodeCallBuiltInFunction)
  4751  }
  4752  
  4753  func (c *amd64Compiler) compileCallGoFunction(compilerStatus nativeCallStatusCode) error {
  4754  	// Release all the registers as our calling convention requires the caller-save.
  4755  	if err := c.compileReleaseAllRegistersToStack(); err != nil {
  4756  		return err
  4757  	}
  4758  
  4759  	c.compileExitFromNativeCode(compilerStatus)
  4760  	return nil
  4761  }
  4762  
  4763  // compileReleaseAllRegistersToStack adds the instructions to release all the live values
  4764  // in the value location stack at this point into their stack memory locations.
  4765  func (c *amd64Compiler) compileReleaseAllRegistersToStack() (err error) {
  4766  	for i := uint64(0); i < c.locationStack.sp; i++ {
  4767  		if loc := &c.locationStack.stack[i]; loc.onRegister() {
  4768  			c.compileReleaseRegisterToStack(loc)
  4769  		} else if loc.onConditionalRegister() {
  4770  			if err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc); err != nil {
  4771  				return
  4772  			}
  4773  			c.compileReleaseRegisterToStack(loc)
  4774  		}
  4775  	}
  4776  	return
  4777  }
  4778  
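        // onValueReleaseRegisterToStack spills the value (if any) currently assigned to reg onto its stack
        // location so that reg becomes available for reuse.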
  4779  func (c *amd64Compiler) onValueReleaseRegisterToStack(reg asm.Register) {
  4780  	for i := uint64(0); i < c.locationStack.sp; i++ {
  4781  		prevValue := &c.locationStack.stack[i]
  4782  		if prevValue.register == reg {
  4783  			c.compileReleaseRegisterToStack(prevValue)
  4784  			break
  4785  		}
  4786  	}
  4787  }
  4788  
  4789  // compileReleaseRegisterToStack implements compiler.compileReleaseRegisterToStack for amd64.
  4790  func (c *amd64Compiler) compileReleaseRegisterToStack(loc *runtimeValueLocation) {
  4791  	var inst asm.Instruction
  4792  	switch loc.valueType {
  4793  	case runtimeValueTypeV128Lo:
  4794  		inst = amd64.MOVDQU
  4795  	case runtimeValueTypeV128Hi:
  4796  		panic("BUG: V128Hi must be released to the stack along with V128Lo")
  4797  	case runtimeValueTypeI32, runtimeValueTypeF32:
  4798  		inst = amd64.MOVL
  4799  	case runtimeValueTypeI64, runtimeValueTypeF64:
  4800  		inst = amd64.MOVQ
  4801  	default:
  4802  		panic("BUG: unknown runtime value type")
  4803  	}
  4804  
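        	// Note: for v128 values a single MOVDQU below stores 16 bytes at once, covering both the Lo slot
        	// and the adjacent Hi slot on the stack.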
  4805  	c.assembler.CompileRegisterToMemory(inst, loc.register,
  4806  		// Note: stack pointers are ensured not to exceed 2^27 so this offset never exceeds 32-bit range.
  4807  		amd64ReservedRegisterForStackBasePointerAddress, int64(loc.stackPointer)*8)
  4808  
  4809  	// Mark the register as free.
  4810  	c.locationStack.releaseRegister(loc)
  4811  
  4812  	if loc.valueType == runtimeValueTypeV128Lo {
  4813  		// Release the register for the higher 64-bits as well.
  4814  		hi := &c.locationStack.stack[loc.stackPointer+1]
  4815  		c.locationStack.releaseRegister(hi)
  4816  	}
  4817  }
  4818  
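        // compileMaybeExitFromNativeCode emits a conditional exit with the given status: execution continues
        // when skipCondition holds and exits otherwise. If an exit sequence for this status has already been
        // compiled, a single jump on the inverted condition to that target is emitted instead of duplicating
        // the exit code.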
  4819  func (c *amd64Compiler) compileMaybeExitFromNativeCode(skipCondition asm.Instruction, status nativeCallStatusCode) {
  4820  	if target := c.compiledTrapTargets[status]; target != nil {
  4821  		// We've already compiled the exit sequence for this status.
  4822  		// Invert the skip condition to jump to that target directly.
  4823  		var returnCondition asm.Instruction
  4824  		switch skipCondition {
  4825  		case amd64.JHI:
  4826  			returnCondition = amd64.JLS
  4827  		case amd64.JLS:
  4828  			returnCondition = amd64.JHI
  4829  		case amd64.JNE:
  4830  			returnCondition = amd64.JEQ
  4831  		case amd64.JEQ:
  4832  			returnCondition = amd64.JNE
  4833  		case amd64.JCC:
  4834  			returnCondition = amd64.JCS
  4835  		case amd64.JCS:
  4836  			returnCondition = amd64.JCC
  4837  		case amd64.JPC:
  4838  			returnCondition = amd64.JPS
  4839  		case amd64.JPS:
  4840  			returnCondition = amd64.JPC
  4841  		case amd64.JPL:
  4842  			returnCondition = amd64.JMI
  4843  		case amd64.JMI:
  4844  			returnCondition = amd64.JPL
  4845  		default:
  4846  			panic("BUG: couldn't invert condition")
  4847  		}
  4848  		c.assembler.CompileJump(returnCondition).AssignJumpTarget(target)
  4849  	} else {
  4850  		skip := c.assembler.CompileJump(skipCondition)
  4851  		c.compileExitFromNativeCode(status)
  4852  		c.assembler.SetJumpTargetOnNext(skip)
  4853  	}
  4854  }
  4855  
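        // compileExitFromNativeCode emits the code that transfers control back to callEngine: it writes the
        // status code and the cached stack pointer into callEngine and executes RET. For statuses that call
        // into Go it also records a return address so native execution can resume right after the exit, and
        // reusable exit sequences are remembered in c.compiledTrapTargets.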
  4856  func (c *amd64Compiler) compileExitFromNativeCode(status nativeCallStatusCode) {
  4857  	if target := c.compiledTrapTargets[status]; target != nil {
  4858  		c.assembler.CompileJump(amd64.JMP).AssignJumpTarget(target)
  4859  		return
  4860  	}
  4861  
  4862  	switch status {
  4863  	case nativeCallStatusCodeReturned:
  4864  		// Save the target for reuse.
  4865  		c.compiledTrapTargets[status] = c.compileNOP()
  4866  	case nativeCallStatusCodeCallGoHostFunction, nativeCallStatusCodeCallBuiltInFunction:
  4867  		// Read the return address, and write it to callEngine.exitContext.returnAddress.
  4868  		returnAddressReg, ok := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  4869  		if !ok {
  4870  			panic("BUG: cannot take free register")
  4871  		}
  4872  		c.assembler.CompileReadInstructionAddress(returnAddressReg, amd64.RET)
  4873  		c.assembler.CompileRegisterToMemory(amd64.MOVQ,
  4874  			returnAddressReg, amd64ReservedRegisterForCallEngine, callEngineExitContextReturnAddressOffset)
  4875  	default:
  4876  		if c.ir.IROperationSourceOffsetsInWasmBinary != nil {
  4877  			// In this case, the execution traps and we want the top frame's source position in the stack trace.
  4878  			// Use RegR15 to store the current instruction's address into callEngine.exitContext.returnAddress.
  4879  			returnAddressReg := amd64.RegR15
  4880  			c.assembler.CompileReadInstructionAddress(returnAddressReg, amd64.MOVQ)
  4881  			c.assembler.CompileRegisterToMemory(amd64.MOVQ,
  4882  				returnAddressReg, amd64ReservedRegisterForCallEngine, callEngineExitContextReturnAddressOffset)
  4883  		} else {
  4884  			// We won't use the source position, so just save the target for reuse.
  4885  			c.compiledTrapTargets[status] = c.compileNOP()
  4886  		}
  4887  	}
  4888  
  4889  	// Write the status to callEngine.exitContext.statusCode.
  4890  	c.assembler.CompileConstToMemory(amd64.MOVB, int64(status),
  4891  		amd64ReservedRegisterForCallEngine, callEngineExitContextNativeCallStatusCodeOffset)
  4892  
  4893  	// Write back the cached stack pointer to callEngine.stackContext.stackPointer.
  4894  	c.assembler.CompileConstToMemory(amd64.MOVQ, int64(c.locationStack.sp),
  4895  		amd64ReservedRegisterForCallEngine, callEngineStackContextStackPointerOffset)
  4896  
  4897  	c.assembler.CompileStandAlone(amd64.RET)
  4898  }
  4899  
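        // compilePreamble emits the function prologue: it initializes the value location stack from the
        // function signature, refreshes the module context, grows the value stack if necessary, optionally
        // notifies the function listener, and initializes the reserved stack-base-pointer and memory registers.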
  4900  func (c *amd64Compiler) compilePreamble() (err error) {
  4901  	// We assume all function parameters are already pushed onto the stack by
  4902  	// the caller.
  4903  	c.locationStack.init(c.typ)
  4904  
  4905  	if err := c.compileModuleContextInitialization(); err != nil {
  4906  		return err
  4907  	}
  4908  
  4909  	// Check if it's necessary to grow the value stack using the maximum stack pointer height.
  4910  	if err = c.compileMaybeGrowStack(); err != nil {
  4911  		return err
  4912  	}
  4913  
  4914  	if c.withListener {
  4915  		if err = c.compileCallBuiltinFunction(builtinFunctionIndexFunctionListenerBefore); err != nil {
  4916  			return err
  4917  		}
  4918  	}
  4919  
  4920  	c.compileReservedStackBasePointerInitialization()
  4921  
  4922  	// Finally, we initialize the reserved memory register based on the module context.
  4923  	c.compileReservedMemoryPointerInitialization()
  4924  	return
  4925  }
  4926  
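        // compileReservedStackBasePointerInitialization makes amd64ReservedRegisterForStackBasePointerAddress
        // point to the base of the current frame, i.e. &stack[0] + callEngine.stackContext.stackBasePointerInBytes.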
  4927  func (c *amd64Compiler) compileReservedStackBasePointerInitialization() {
  4928  	// First, make amd64ReservedRegisterForStackBasePointerAddress point to the beginning of the slice's backing array.
  4929  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4930  		amd64ReservedRegisterForCallEngine, callEngineStackContextStackElement0AddressOffset,
  4931  		amd64ReservedRegisterForStackBasePointerAddress)
  4932  
  4933  	// Next, add the stack base pointer in bytes (callEngine.stackContext.stackBasePointerInBytes) so the register points to the base of the current frame.
  4934  	c.assembler.CompileMemoryToRegister(amd64.ADDQ,
  4935  		amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset,
  4936  		amd64ReservedRegisterForStackBasePointerAddress,
  4937  	)
  4938  }
  4939  
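        // compileReservedMemoryPointerInitialization loads the cached memory data pointer
        // (callEngine.moduleContext.memoryElement0Address) into amd64ReservedRegisterForMemory, but only when
        // the module defines or uses a memory; otherwise the register is left untouched.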
  4940  func (c *amd64Compiler) compileReservedMemoryPointerInitialization() {
  4941  	if c.ir.HasMemory || c.ir.UsesMemory {
  4942  		c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4943  			amd64ReservedRegisterForCallEngine, callEngineModuleContextMemoryElement0AddressOffset,
  4944  			amd64ReservedRegisterForMemory,
  4945  		)
  4946  	}
  4947  }
  4948  
  4949  // compileMaybeGrowStack adds the instructions to check whether the value stack needs to grow,
  4950  // and if so, makes the builtin function call to do so. These instructions are emitted in the
  4951  // function's preamble.
  4952  func (c *amd64Compiler) compileMaybeGrowStack() error {
  4953  	tmpRegister, ok := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  4954  	if !ok {
  4955  		panic("BUG: cannot take free register")
  4956  	}
  4957  
  4958  	c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  4959  		amd64ReservedRegisterForCallEngine, callEngineStackContextStackLenInBytesOffset, tmpRegister)
  4960  	c.assembler.CompileMemoryToRegister(amd64.SUBQ,
  4961  		amd64ReservedRegisterForCallEngine, callEngineStackContextStackBasePointerInBytesOffset, tmpRegister)
  4962  
  4963  	// If the remaining space (stackLen - stackBasePointerInBytes) is smaller than this function's maximum stack pointer, we need to grow the stack.
  4964  	cmpWithStackPointerCeil := c.assembler.CompileRegisterToConst(amd64.CMPQ, tmpRegister, 0)
  4965  	c.assignStackPointerCeilNeeded = cmpWithStackPointerCeil
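        	// Note: the constant 0 above is only a placeholder; the actual stack pointer ceiling (in bytes) for
        	// this function is presumably assigned later through c.assignStackPointerCeilNeeded once the whole
        	// body has been compiled and the maximum stack height is known.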
  4966  
  4967  	// Jump if we have no need to grow.
  4968  	jmpIfNoNeedToGrowStack := c.assembler.CompileJump(amd64.JCC)
  4969  
  4970  	// Otherwise, we have to make the builtin function call to grow the call stack.
  4971  	if err := c.compileCallBuiltinFunction(builtinFunctionIndexGrowStack); err != nil {
  4972  		return err
  4973  	}
  4974  
  4975  	c.assembler.SetJumpTargetOnNext(jmpIfNoNeedToGrowStack)
  4976  	return nil
  4977  }
  4978  
  4979  // compileModuleContextInitialization adds instructions to initialize callEngine.ModuleContext's fields based on
  4980  // callEngine.ModuleContext.ModuleInstanceAddress.
  4981  // This is called in two cases: in function preamble, and on the return from (non-Go) function calls.
  4982  func (c *amd64Compiler) compileModuleContextInitialization() error {
  4983  	// amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister holds the module instance's address
  4984  	// so mark it used so that it won't be used as a free register until the module context initialization finishes.
  4985  	c.locationStack.markRegisterUsed(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
  4986  	defer c.locationStack.markRegisterUnused(amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
  4987  
  4988  	// Obtain the temporary registers to be used in the followings.
  4989  	tmpRegister, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  4990  	if !found {
  4991  		// This in theory never happens as all the registers must be free at this point.
  4992  		return fmt.Errorf("could not find enough free registers")
  4993  	}
  4994  	c.locationStack.markRegisterUsed(tmpRegister)
  4995  	tmpRegister2, found := c.locationStack.takeFreeRegister(registerTypeGeneralPurpose)
  4996  	if !found {
  4997  		// This in theory never happens as all the registers must be free at this point.
  4998  		return fmt.Errorf("could not find enough free registers")
  4999  	}
  5000  	c.locationStack.markRegisterUsed(tmpRegister2)
  5001  
  5002  	// If the module instance address stays the same, we can skip the entire code below.
  5003  	// The rationale/idea for this is that, in almost all use cases, users instantiate a single
  5004  	// Wasm binary and run its functions, rather than importing/exporting across multiple
  5005  	// binaries. As a result, the cmp-and-jmp sequence below is easy for the x64 CPU to
  5006  	// branch-predict, since the jump is taken nearly 100% of the time across function calls.
  5007  	c.assembler.CompileMemoryToRegister(amd64.CMPQ,
  5008  		amd64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister)
  5009  	jmpIfModuleNotChange := c.assembler.CompileJump(amd64.JEQ)
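        	// Everything below this jump is the slow path, taken only when a function from a different module
        	// instance is entered. As a rough sketch (not the exact encoding), the guard above is:
        	//
        	//	cmpq [amd64ReservedRegisterForCallEngine + moduleInstanceOffset], moduleInstanceAddressRegister
        	//	je   .module_unchanged ; skip all the refresh stores below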
  5010  
  5011  	// If callEngine.moduleContext.moduleInstance is not equal to the value in amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister,
  5012  	// we have to put the new value there.
  5013  	c.assembler.CompileRegisterToMemory(amd64.MOVQ, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister,
  5014  		amd64ReservedRegisterForCallEngine, callEngineModuleContextModuleInstanceOffset)
  5015  
  5016  	// Also, we have to update the following fields:
  5017  	// * callEngine.moduleContext.globalElement0Address
  5018  	// * callEngine.moduleContext.tablesElement0Address
  5019  	// * callEngine.moduleContext.memoryInstance
  5020  	// * callEngine.moduleContext.memoryElement0Address
  5021  	// * callEngine.moduleContext.memorySliceLen
  5022  	// * callEngine.moduleContext.functionsElement0Address
  5023  	// * callEngine.moduleContext.typeIDsElement0Address
  5024  	// * callEngine.moduleContext.dataInstancesElement0Address
  5025  	// * callEngine.moduleContext.elementInstancesElement0Address
  5026  
  5027  	// Update globalElement0Address.
  5028  	//
  5029  	// Note: if there's a global.get or global.set instruction in the function, the existence of the globals
  5030  	// is ensured by function validation at the module instantiation phase, and that's why it is ok to
  5031  	// skip the initialization if the module's globals slice is empty.
  5032  	if len(c.ir.Globals) > 0 {
  5033  		// Since ModuleInstance.Globals is a slice of pointers to global instances, the quad word at
  5034  		// moduleInstanceGlobalsOffset is the address of the first item in the underlying array.
  5035  		// See https://go.dev/blog/slices-intro if unfamiliar.
  5036  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceGlobalsOffset, tmpRegister)
  5037  
  5038  		c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister, amd64ReservedRegisterForCallEngine, callEngineModuleContextGlobalElement0AddressOffset)
  5039  	}
  5040  
  5041  	// Update tableElement0Address.
  5042  	//
  5043  	// Note: if there's a table instruction in the function, the existence of the table
  5044  	// is ensured by function validation at the module instantiation phase, and that's
  5045  	// why it is ok to skip the initialization if the module's table doesn't exist.
  5046  	if c.ir.HasTable {
  5047  		// First, we read the address of the first element of ModuleInstance.Tables into tmpRegister.
  5048  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceTablesOffset, tmpRegister)
  5049  
  5050  		// At this point, tmpRegister holds the address of the first element of
  5051  		// ModuleInstance.Tables, so we store it directly into
  5052  		// callEngine.moduleContext.tablesElement0Address.
  5053  		c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister,
  5054  			amd64ReservedRegisterForCallEngine, callEngineModuleContextTablesElement0AddressOffset)
  5055  
  5056  		// Finally, we put &ModuleInstance.TypeIDs[0] into moduleContext.typeIDsElement0Address.
  5057  		c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  5058  			amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceTypeIDsOffset, tmpRegister)
  5059  		c.assembler.CompileRegisterToMemory(amd64.MOVQ,
  5060  			tmpRegister, amd64ReservedRegisterForCallEngine, callEngineModuleContextTypeIDsElement0AddressOffset)
  5061  	}
  5062  
  5063  	// Update memoryElement0Address and memorySliceLen.
  5064  	//
  5065  	// Note: if there's a memory instruction in the function, the memory instance must be non-nil.
  5066  	// That is ensured by function validation at the module instantiation phase, and that's
  5067  	// why it is ok to skip the initialization if the module's memory instance is nil.
  5068  	if c.ir.HasMemory {
  5069  		c.assembler.CompileMemoryToRegister(amd64.MOVQ,
  5070  			amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceMemoryOffset,
  5071  			tmpRegister)
  5072  
  5073  		// Set memory instance.
  5074  		c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister,
  5075  			amd64ReservedRegisterForCallEngine, callEngineModuleContextMemoryInstanceOffset)
  5076  
  5077  		// Set length.
  5078  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmpRegister, memoryInstanceBufferLenOffset, tmpRegister2)
  5079  		c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister2,
  5080  			amd64ReservedRegisterForCallEngine, callEngineModuleContextMemorySliceLenOffset)
  5081  
  5082  		// Set element zero address.
  5083  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmpRegister, memoryInstanceBufferOffset, tmpRegister2)
  5084  		c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister2,
  5085  			amd64ReservedRegisterForCallEngine, callEngineModuleContextMemoryElement0AddressOffset)
  5086  	}
  5087  
  5088  	// Update moduleContext.functionsElement0Address
  5089  	{
  5090  		// "tmpRegister = [moduleInstanceAddressRegister + moduleInstanceEngineOffset + interfaceDataOffset] (== *moduleEngine)"
  5091  		//
  5092  		// Go's interface is laid out in memory as two quad words, i.e. struct{tab, data uintptr},
  5093  		// where tab points to the interface table and data points to the actual implementation of
  5094  		// the interface. In this case, we extract the "data" pointer as the *moduleEngine.
  5095  		// See the following references for detail:
  5096  		// * https://research.swtch.com/interfaces
  5097  		// * https://github.com/golang/go/blob/release-branch.go1.20/src/runtime/runtime2.go#L207-L210
  5098  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceEngineOffset+interfaceDataOffset, tmpRegister)
  5099  
  5100  		// "tmpRegister = [tmpRegister + moduleEngineFunctionsOffset] (== &moduleEngine.functions[0])"
  5101  		c.assembler.CompileMemoryToRegister(amd64.MOVQ, tmpRegister, moduleEngineFunctionsOffset, tmpRegister)
  5102  
  5103  		// "callEngine.moduleContext.functionsElement0Address = tmpRegister".
  5104  		c.assembler.CompileRegisterToMemory(amd64.MOVQ, tmpRegister, amd64ReservedRegisterForCallEngine,
  5105  			callEngineModuleContextFunctionsElement0AddressOffset)
  5106  	}
  5107  
  5108  	// Update dataInstancesElement0Address.
  5109  	if c.ir.HasDataInstances {
  5110  		// "tmpRegister = &moduleInstance.DataInstances[0]"
  5111  		c.assembler.CompileMemoryToRegister(
  5112  			amd64.MOVQ,
  5113  			amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceDataInstancesOffset,
  5114  			tmpRegister,
  5115  		)
  5116  		// "callEngine.moduleContext.dataInstancesElement0Address = tmpRegister".
  5117  		c.assembler.CompileRegisterToMemory(
  5118  			amd64.MOVQ,
  5119  			tmpRegister,
  5120  			amd64ReservedRegisterForCallEngine, callEngineModuleContextDataInstancesElement0AddressOffset,
  5121  		)
  5122  	}
  5123  
  5124  	// Update callEngine.moduleContext.elementInstancesElement0Address
  5125  	if c.ir.HasElementInstances {
  5126  		// "tmpRegister = &moduleInstance.ElementInstances[0]"
  5127  		c.assembler.CompileMemoryToRegister(
  5128  			amd64.MOVQ,
  5129  			amd64CallingConventionDestinationFunctionModuleInstanceAddressRegister, moduleInstanceElementInstancesOffset,
  5130  			tmpRegister,
  5131  		)
  5132  		// "callEngine.moduleContext.elementInstancesElement0Address = tmpRegister".
  5133  		c.assembler.CompileRegisterToMemory(
  5134  			amd64.MOVQ,
  5135  			tmpRegister,
  5136  			amd64ReservedRegisterForCallEngine, callEngineModuleContextElementInstancesElement0AddressOffset,
  5137  		)
  5138  	}
  5139  
  5140  	c.locationStack.markRegisterUnused(tmpRegister, tmpRegister2)
  5141  
  5142  	// Set the jump target to the next instruction for the case where the module instance address hasn't changed.
  5143  	c.assembler.SetJumpTargetOnNext(jmpIfModuleNotChange)
  5144  	return nil
  5145  }
  5146  
  5147  // compileEnsureOnRegister ensures that the given value is located on a
  5148  // register of the appropriate type.
  5149  func (c *amd64Compiler) compileEnsureOnRegister(loc *runtimeValueLocation) (err error) {
  5150  	if loc.onStack() {
  5151  		// Allocate the register.
  5152  		reg, err := c.allocateRegister(loc.getRegisterType())
  5153  		if err != nil {
  5154  			return err
  5155  		}
  5156  
  5157  		// Mark the value as located on the register.
  5158  		loc.setRegister(reg)
  5159  		c.locationStack.markRegisterUsed(reg)
  5160  
  5161  		c.compileLoadValueOnStackToRegister(loc)
  5162  	} else if loc.onConditionalRegister() {
  5163  		err = c.compileLoadConditionalRegisterToGeneralPurposeRegister(loc)
  5164  	}
  5165  	return
  5166  }
  5167  
  5168  // compileMaybeSwapRegisters swaps two registers if they're not equal.
  5169  func (c *amd64Compiler) compileMaybeSwapRegisters(reg1, reg2 asm.Register) {
  5170  	if reg1 != reg2 {
  5171  		c.assembler.CompileRegisterToRegister(amd64.XCHGQ, reg1, reg2)
  5172  	}
  5173  }
  5174  
  5175  // compilePreventCrossedTargetRegisters swaps registers so that, for every runtimeValueLocation in locs, the
  5176  // register at the same index in targets is not occupied by another runtimeValueLocation from locs. It returns a
  5177  // closure that restores the original register placement.
  5178  //
  5179  // This function makes it possible to safely exchange one set of registers with another, where a register might be in both sets.
  5180  // Each register will correspond either to itself or another register not present in its own set.
  5181  //
  5182  // For example, if we have locs = [AX, BX, CX], targets = [BX, SI, AX], then it'll do two swaps
  5183  // to make locs = [BX, CX, AX].
  5184  func (c *amd64Compiler) compilePreventCrossedTargetRegisters(locs []*runtimeValueLocation, targets []asm.Register) (restore func()) {
  5185  	type swap struct{ srcIndex, dstIndex int }
  5186  	var swaps []swap
  5187  	for i := range locs {
  5188  		targetLocation := -1 // -1 means not found.
  5189  		for j := range locs {
  5190  			if locs[j].register == targets[i] {
  5191  				targetLocation = j
  5192  				break
  5193  			}
  5194  		}
  5195  		if targetLocation != -1 && targetLocation != i {
  5196  			c.compileMaybeSwapRegisters(locs[i].register, locs[targetLocation].register)
  5197  			locs[i].register, locs[targetLocation].register = locs[targetLocation].register, locs[i].register
  5198  			swaps = append(swaps, swap{i, targetLocation})
  5199  		}
  5200  	}
  5201  	return func() {
  5202  		// Restore in reverse order because a register can be moved multiple times.
  5203  		for i := len(swaps) - 1; i >= 0; i -= 1 {
  5204  			r1, r2 := swaps[i].srcIndex, swaps[i].dstIndex
  5205  			c.compileMaybeSwapRegisters(locs[r1].register, locs[r2].register)
  5206  			locs[r1].register, locs[r2].register = locs[r2].register, locs[r1].register
  5207  		}
  5208  	}
  5209  }