wa-lang.org/wazero@v1.0.2/internal/asm/amd64/impl.go (about)

     1  package amd64
     2  
     3  import (
     4  	"bytes"
     5  	"encoding/binary"
     6  	"errors"
     7  	"fmt"
     8  	"math"
     9  
    10  	"wa-lang.org/wazero/internal/asm"
    11  )
    12  
// nodeImpl implements asm.Node for amd64.
//
// Nodes are chained through next into a singly linked list owned by
// AssemblerImpl, and are encoded in list order.
type nodeImpl struct {
	// instruction is the instruction this node represents.
	instruction asm.Instruction

	// offsetInBinaryField is the value returned by OffsetInBinary. It is assigned
	// while encoding, from the length of the output buffer at that point.
	offsetInBinaryField asm.NodeOffsetInBinary // Field suffix to dodge conflict with OffsetInBinary

	// jumpTarget holds the target node in the linked list for the jump-kind instruction.
	// A nil jumpTarget means this node is not treated as a jump (see isJumpNode).
	jumpTarget *nodeImpl
	flag       nodeFlag // bit set of nodeFlag values
	// next holds the next node from this node in the assembled linked list.
	next *nodeImpl

	// types tells which of the src*/dst* fields below are meaningful for this node.
	// For memory operands, the *Reg field is the base register, the *Const field is
	// the offset, and *MemIndex/*MemScale are the optional index register and scale.
	types                    operandTypes
	srcReg, dstReg           asm.Register
	srcConst, dstConst       asm.ConstantValue
	srcMemIndex, dstMemIndex asm.Register
	srcMemScale, dstMemScale byte

	// arg is the extra byte argument given to the Compile*WithArg methods.
	// NOTE(review): presumably an immediate operand for the instruction — confirm against the encoders.
	arg byte

	// readInstructionAddressBeforeTargetInstruction holds the instruction right before the target of
	// read instruction address instruction. See asm.assemblerBase.CompileReadInstructionAddress.
	readInstructionAddressBeforeTargetInstruction asm.Instruction

	// jumpOrigins hold all the nodes trying to jump into this node. In other words, all the nodes with .jumpTarget == this.
	jumpOrigins map[*nodeImpl]struct{}

	// staticConst is the constant referenced by nodes whose operand types involve
	// operandTypeStaticConst.
	staticConst *asm.StaticConst
}
    42  
// nodeFlag is a bit set stored in nodeImpl.flag.
type nodeFlag byte

const (
	// nodeFlagInitializedForEncoding is set on every node by InitializeNodesForEncoding. Notably, this is used to judge
	// whether a jump is backward or forward before encoding: a jump whose target already carries this flag
	// points backward.
	nodeFlagInitializedForEncoding nodeFlag = 1 << iota
	// nodeFlagBackwardJump is set on a jump node whose target had already been initialized
	// when the jump itself was initialized.
	nodeFlagBackwardJump
	// nodeFlagShortForwardJump is only meaningful for forward branch jumps, which means .jumpTarget != nil and
	// the target node is encoded after this node. InitializeNodesForEncoding sets it on every forward jump, so
	// that all of them are first attempted as short jumps (i.e. relative signed 8-bit integer offset jump) in
	// order to Encode as small as possible.
	// NOTE(review): the code that changes this flag when the offset does not fit is outside this view — confirm there.
	nodeFlagShortForwardJump
)
    55  
    56  func (n *nodeImpl) isInitializedForEncoding() bool {
    57  	return n.flag&nodeFlagInitializedForEncoding != 0
    58  }
    59  
    60  func (n *nodeImpl) isJumpNode() bool {
    61  	return n.jumpTarget != nil
    62  }
    63  
    64  func (n *nodeImpl) isBackwardJump() bool {
    65  	return n.isJumpNode() && (n.flag&nodeFlagBackwardJump != 0)
    66  }
    67  
    68  func (n *nodeImpl) isForwardJump() bool {
    69  	return n.isJumpNode() && (n.flag&nodeFlagBackwardJump == 0)
    70  }
    71  
    72  func (n *nodeImpl) isForwardShortJump() bool {
    73  	return n.isForwardJump() && n.flag&nodeFlagShortForwardJump != 0
    74  }
    75  
    76  // AssignJumpTarget implements asm.Node.AssignJumpTarget.
    77  func (n *nodeImpl) AssignJumpTarget(target asm.Node) {
    78  	n.jumpTarget = target.(*nodeImpl)
    79  }
    80  
// AssignDestinationConstant implements asm.Node.AssignDestinationConstant.
//
// It overwrites the destination constant (dstConst) of this node with value.
func (n *nodeImpl) AssignDestinationConstant(value asm.ConstantValue) {
	n.dstConst = value
}
    85  
// AssignSourceConstant implements asm.Node.AssignSourceConstant.
//
// It overwrites the source constant (srcConst) of this node with value.
func (n *nodeImpl) AssignSourceConstant(value asm.ConstantValue) {
	n.srcConst = value
}
    90  
// OffsetInBinary implements asm.Node.OffsetInBinary.
//
// It returns the offset recorded in offsetInBinaryField, which is assigned
// while the node is being encoded.
func (n *nodeImpl) OffsetInBinary() asm.NodeOffsetInBinary {
	return n.offsetInBinaryField
}
    95  
    96  // String implements fmt.Stringer.
    97  //
    98  // This is for debugging purpose, and the format is almost same as the AT&T assembly syntax,
    99  // meaning that this should look like "INSTRUCTION ${from}, ${to}" where each operand
   100  // might be embraced by '[]' to represent the memory location.
   101  func (n *nodeImpl) String() (ret string) {
   102  	instName := InstructionName(n.instruction)
   103  	switch n.types {
   104  	case operandTypesNoneToNone:
   105  		ret = instName
   106  	case operandTypesNoneToRegister:
   107  		ret = fmt.Sprintf("%s %s", instName, RegisterName(n.dstReg))
   108  	case operandTypesNoneToMemory:
   109  		if n.dstMemIndex != asm.NilRegister {
   110  			ret = fmt.Sprintf("%s [%s + 0x%x + %s*0x%x]", instName,
   111  				RegisterName(n.dstReg), n.dstConst, RegisterName(n.dstMemIndex), n.dstMemScale)
   112  		} else {
   113  			ret = fmt.Sprintf("%s [%s + 0x%x]", instName, RegisterName(n.dstReg), n.dstConst)
   114  		}
   115  	case operandTypesNoneToBranch:
   116  		ret = fmt.Sprintf("%s {%v}", instName, n.jumpTarget)
   117  	case operandTypesRegisterToNone:
   118  		ret = fmt.Sprintf("%s %s", instName, RegisterName(n.srcReg))
   119  	case operandTypesRegisterToRegister:
   120  		ret = fmt.Sprintf("%s %s, %s", instName, RegisterName(n.srcReg), RegisterName(n.dstReg))
   121  	case operandTypesRegisterToMemory:
   122  		if n.dstMemIndex != asm.NilRegister {
   123  			ret = fmt.Sprintf("%s %s, [%s + 0x%x + %s*0x%x]", instName, RegisterName(n.srcReg),
   124  				RegisterName(n.dstReg), n.dstConst, RegisterName(n.dstMemIndex), n.dstMemScale)
   125  		} else {
   126  			ret = fmt.Sprintf("%s %s, [%s + 0x%x]", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.dstConst)
   127  		}
   128  	case operandTypesRegisterToConst:
   129  		ret = fmt.Sprintf("%s %s, 0x%x", instName, RegisterName(n.srcReg), n.dstConst)
   130  	case operandTypesMemoryToRegister:
   131  		if n.srcMemIndex != asm.NilRegister {
   132  			ret = fmt.Sprintf("%s [%s + %#x + %s*%#x], %s", instName,
   133  				RegisterName(n.srcReg), n.srcConst, RegisterName(n.srcMemIndex), n.srcMemScale, RegisterName(n.dstReg))
   134  		} else {
   135  			ret = fmt.Sprintf("%s [%s + 0x%x], %s", instName, RegisterName(n.srcReg), n.srcConst, RegisterName(n.dstReg))
   136  		}
   137  	case operandTypesMemoryToConst:
   138  		if n.srcMemIndex != asm.NilRegister {
   139  			ret = fmt.Sprintf("%s [%s + %#x + %s*0x%x], 0x%x", instName,
   140  				RegisterName(n.srcReg), n.srcConst, RegisterName(n.srcMemIndex), n.srcMemScale, n.dstConst)
   141  		} else {
   142  			ret = fmt.Sprintf("%s [%s + %#x], 0x%x", instName, RegisterName(n.srcReg), n.srcConst, n.dstConst)
   143  		}
   144  	case operandTypesConstToMemory:
   145  		if n.dstMemIndex != asm.NilRegister {
   146  			ret = fmt.Sprintf("%s 0x%x, [%s + 0x%x + %s*0x%x]", instName, n.srcConst,
   147  				RegisterName(n.dstReg), n.dstConst, RegisterName(n.dstMemIndex), n.dstMemScale)
   148  		} else {
   149  			ret = fmt.Sprintf("%s 0x%x, [%s + 0x%x]", instName, n.srcConst, RegisterName(n.dstReg), n.dstConst)
   150  		}
   151  	case operandTypesConstToRegister:
   152  		ret = fmt.Sprintf("%s 0x%x, %s", instName, n.srcConst, RegisterName(n.dstReg))
   153  	case operandTypesStaticConstToRegister:
   154  		ret = fmt.Sprintf("%s $%#x, %s", instName, n.staticConst.Raw, RegisterName(n.dstReg))
   155  	case operandTypesRegisterToStaticConst:
   156  		ret = fmt.Sprintf("%s %s, $%#x", instName, RegisterName(n.srcReg), n.staticConst.Raw)
   157  	}
   158  	return
   159  }
   160  
// operandType represents where an operand is placed for an instruction.
// Note: this is almost the same as obj.AddrType in GO assembler.
type operandType byte

const (
	// operandTypeNone means the operand is absent. It is the zero value.
	operandTypeNone operandType = iota
	// operandTypeRegister means the operand is a register.
	operandTypeRegister
	// operandTypeMemory means the operand is a memory location.
	operandTypeMemory
	// operandTypeConst means the operand is a constant value.
	operandTypeConst
	// operandTypeStaticConst means the operand is an asm.StaticConst.
	operandTypeStaticConst
	// operandTypeBranch means the operand is a jump target node.
	operandTypeBranch
)
   173  
   174  func (o operandType) String() (ret string) {
   175  	switch o {
   176  	case operandTypeNone:
   177  		ret = "none"
   178  	case operandTypeRegister:
   179  		ret = "register"
   180  	case operandTypeMemory:
   181  		ret = "memory"
   182  	case operandTypeConst:
   183  		ret = "const"
   184  	case operandTypeBranch:
   185  		ret = "branch"
   186  	case operandTypeStaticConst:
   187  		ret = "static-const"
   188  	}
   189  	return
   190  }
   191  
// operandTypes represents the only combinations of two operandTypes used by wazero.
// src is the type of the source operand and dst the type of the destination operand.
// The struct is comparable, so values can be used directly as switch cases.
type operandTypes struct{ src, dst operandType }
   194  
   195  var (
   196  	operandTypesNoneToNone            = operandTypes{operandTypeNone, operandTypeNone}
   197  	operandTypesNoneToRegister        = operandTypes{operandTypeNone, operandTypeRegister}
   198  	operandTypesNoneToMemory          = operandTypes{operandTypeNone, operandTypeMemory}
   199  	operandTypesNoneToBranch          = operandTypes{operandTypeNone, operandTypeBranch}
   200  	operandTypesRegisterToNone        = operandTypes{operandTypeRegister, operandTypeNone}
   201  	operandTypesRegisterToRegister    = operandTypes{operandTypeRegister, operandTypeRegister}
   202  	operandTypesRegisterToMemory      = operandTypes{operandTypeRegister, operandTypeMemory}
   203  	operandTypesRegisterToConst       = operandTypes{operandTypeRegister, operandTypeConst}
   204  	operandTypesMemoryToRegister      = operandTypes{operandTypeMemory, operandTypeRegister}
   205  	operandTypesMemoryToConst         = operandTypes{operandTypeMemory, operandTypeConst}
   206  	operandTypesConstToRegister       = operandTypes{operandTypeConst, operandTypeRegister}
   207  	operandTypesConstToMemory         = operandTypes{operandTypeConst, operandTypeMemory}
   208  	operandTypesStaticConstToRegister = operandTypes{operandTypeStaticConst, operandTypeRegister}
   209  	operandTypesRegisterToStaticConst = operandTypes{operandTypeRegister, operandTypeStaticConst}
   210  )
   211  
   212  // String implements fmt.Stringer
   213  func (o operandTypes) String() string {
   214  	return fmt.Sprintf("from:%s,to:%s", o.src, o.dst)
   215  }
   216  
// AssemblerImpl implements Assembler.
//
// Instructions are accumulated as a linked list of nodeImpl via the Compile*
// methods and turned into machine code by Assemble.
type AssemblerImpl struct {
	// BaseAssemblerImpl supplies the shared assembler state used here, such as
	// SetBranchTargetOnNextNodes and OnGenerateCallbacks.
	asm.BaseAssemblerImpl
	enablePadding   bool          // when true, Encode calls maybeNOPPadding before encoding each node
	root, current   *nodeImpl     // first and most recently added nodes of the linked list
	nodeCount       int           // number of nodes created by newNode; used to pre-size buf
	buf             *bytes.Buffer // buffer the machine code is encoded into
	forceReAssemble bool          // if set once Encode returns, Assemble resets buf and encodes again
	// MaxDisplacementForConstantPool is fixed to defaultMaxDisplacementForConstantPool
	// but have it as an exported field here for testability.
	MaxDisplacementForConstantPool int

	// pool is the static constant pool, created by asm.NewStaticConstPool in NewAssembler.
	pool *asm.StaticConstPool
}
   231  
   232  func NewAssembler() *AssemblerImpl {
   233  	return &AssemblerImpl{
   234  		buf: bytes.NewBuffer(nil), enablePadding: true, pool: asm.NewStaticConstPool(),
   235  		MaxDisplacementForConstantPool: defaultMaxDisplacementForConstantPool,
   236  	}
   237  }
   238  
   239  // newNode creates a new Node and appends it into the linked list.
   240  func (a *AssemblerImpl) newNode(instruction asm.Instruction, types operandTypes) *nodeImpl {
   241  	n := &nodeImpl{
   242  		instruction: instruction,
   243  		next:        nil,
   244  		types:       types,
   245  		jumpOrigins: map[*nodeImpl]struct{}{},
   246  	}
   247  	a.addNode(n)
   248  	a.nodeCount++
   249  	return n
   250  }
   251  
   252  // addNode appends the new node into the linked list.
   253  func (a *AssemblerImpl) addNode(node *nodeImpl) {
   254  	if a.root == nil {
   255  		a.root = node
   256  		a.current = node
   257  	} else {
   258  		parent := a.current
   259  		parent.next = node
   260  		a.current = node
   261  	}
   262  
   263  	for _, o := range a.SetBranchTargetOnNextNodes {
   264  		origin := o.(*nodeImpl)
   265  		origin.jumpTarget = node
   266  	}
   267  	a.SetBranchTargetOnNextNodes = nil
   268  }
   269  
   270  // EncodeNode encodes the given node into writer.
   271  func (a *AssemblerImpl) EncodeNode(n *nodeImpl) (err error) {
   272  	switch n.types {
   273  	case operandTypesNoneToNone:
   274  		err = a.encodeNoneToNone(n)
   275  	case operandTypesNoneToRegister:
   276  		err = a.encodeNoneToRegister(n)
   277  	case operandTypesNoneToMemory:
   278  		err = a.encodeNoneToMemory(n)
   279  	case operandTypesNoneToBranch:
   280  		// Branching operand can be encoded as relative jumps.
   281  		err = a.encodeRelativeJump(n)
   282  	case operandTypesRegisterToNone:
   283  		err = a.encodeRegisterToNone(n)
   284  	case operandTypesRegisterToRegister:
   285  		err = a.encodeRegisterToRegister(n)
   286  	case operandTypesRegisterToMemory:
   287  		err = a.encodeRegisterToMemory(n)
   288  	case operandTypesRegisterToConst:
   289  		err = a.encodeRegisterToConst(n)
   290  	case operandTypesMemoryToRegister:
   291  		err = a.encodeMemoryToRegister(n)
   292  	case operandTypesConstToRegister:
   293  		err = a.encodeConstToRegister(n)
   294  	case operandTypesConstToMemory:
   295  		err = a.encodeConstToMemory(n)
   296  	case operandTypesMemoryToConst:
   297  		err = a.encodeMemoryToConst(n)
   298  	case operandTypesStaticConstToRegister:
   299  		err = a.encodeStaticConstToRegister(n)
   300  	case operandTypesRegisterToStaticConst:
   301  		err = a.encodeRegisterToStaticConst(n)
   302  	default:
   303  		err = fmt.Errorf("encoder undefined for [%s] operand type", n.types)
   304  	}
   305  	if err != nil {
   306  		err = fmt.Errorf("%w: %s", err, n) // Ensure the error is debuggable by including the string value of the node.
   307  	}
   308  	return
   309  }
   310  
   311  // Assemble implements asm.AssemblerBase
   312  func (a *AssemblerImpl) Assemble() ([]byte, error) {
   313  	a.InitializeNodesForEncoding()
   314  
   315  	// Continue encoding until we are not forced to re-assemble which happens when
   316  	// a short relative jump ends up the offset larger than 8-bit length.
   317  	for {
   318  		err := a.Encode()
   319  		if err != nil {
   320  			return nil, err
   321  		}
   322  
   323  		if !a.forceReAssemble {
   324  			break
   325  		} else {
   326  			// We reset the length of buffer but don't delete the underlying slice since
   327  			// the binary size will roughly the same after reassemble.
   328  			a.buf.Reset()
   329  			// Reset the re-assemble flag in order to avoid the infinite loop!
   330  			a.forceReAssemble = false
   331  		}
   332  	}
   333  
   334  	code := a.buf.Bytes()
   335  	for _, cb := range a.OnGenerateCallbacks {
   336  		if err := cb(code); err != nil {
   337  			return nil, err
   338  		}
   339  	}
   340  	return code, nil
   341  }
   342  
   343  // InitializeNodesForEncoding initializes nodeImpl.flag and determine all the jumps
   344  // are forward or backward jump.
   345  func (a *AssemblerImpl) InitializeNodesForEncoding() {
   346  	for n := a.root; n != nil; n = n.next {
   347  		n.flag |= nodeFlagInitializedForEncoding
   348  		if target := n.jumpTarget; target != nil {
   349  			if target.isInitializedForEncoding() {
   350  				// This means the target exists behind.
   351  				n.flag |= nodeFlagBackwardJump
   352  			} else {
   353  				// Otherwise, this is forward jump.
   354  				// We start with assuming that the jump can be short (8-bit displacement).
   355  				// If it doens't fit, we change this flag in resolveRelativeForwardJump.
   356  				n.flag |= nodeFlagShortForwardJump
   357  			}
   358  		}
   359  	}
   360  
   361  	// Roughly allocate the buffer by assuming an instruction has 5-bytes length on average.
   362  	a.buf.Grow(a.nodeCount * 5)
   363  }
   364  
   365  func (a *AssemblerImpl) Encode() (err error) {
   366  	for n := a.root; n != nil; n = n.next {
   367  		// If an instruction needs NOP padding, we do so before encoding it.
   368  		// https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
   369  		if a.enablePadding {
   370  			if err = a.maybeNOPPadding(n); err != nil {
   371  				return
   372  			}
   373  		}
   374  
   375  		// After the padding, we can finalize the offset of this instruction in the binary.
   376  		n.offsetInBinaryField = uint64(a.buf.Len())
   377  
   378  		if err = a.EncodeNode(n); err != nil {
   379  			return
   380  		}
   381  
   382  		err = a.ResolveForwardRelativeJumps(n)
   383  		if err != nil {
   384  			err = fmt.Errorf("invalid relative forward jumps: %w", err)
   385  			break
   386  		}
   387  
   388  		a.maybeFlushConstants(n.next == nil)
   389  	}
   390  	return
   391  }
   392  
   393  // maybeNOPPadding maybe appends NOP instructions before the node `n`.
   394  // This is necessary to avoid Intel's jump erratum:
   395  // https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
   396  func (a *AssemblerImpl) maybeNOPPadding(n *nodeImpl) (err error) {
   397  	var instructionLen int32
   398  
   399  	// See in Section 2.1 in for when we have to pad NOP.
   400  	// https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
   401  	switch n.instruction {
   402  	case RET, JMP, JCC, JCS, JEQ, JGE, JGT, JHI, JLE, JLS, JLT, JMI, JNE, JPC, JPS:
   403  		// In order to know the instruction length before writing into the binary,
   404  		// we try encoding it with the temporary buffer.
   405  		saved := a.buf
   406  		a.buf = bytes.NewBuffer(nil)
   407  
   408  		// Assign the temporary offset which may or may not be correct depending on the padding decision.
   409  		n.offsetInBinaryField = uint64(saved.Len())
   410  
   411  		// Encode the node and get the instruction length.
   412  		if err = a.EncodeNode(n); err != nil {
   413  			return
   414  		}
   415  		instructionLen = int32(a.buf.Len())
   416  
   417  		// Revert the temporary buffer.
   418  		a.buf = saved
   419  	case // The possible fused jump instructions if the next node is a conditional jump instruction.
   420  		CMPL, CMPQ, TESTL, TESTQ, ADDL, ADDQ, SUBL, SUBQ, ANDL, ANDQ, INCQ, DECQ:
   421  		instructionLen, err = a.fusedInstructionLength(n)
   422  		if err != nil {
   423  			return err
   424  		}
   425  	}
   426  
   427  	if instructionLen == 0 {
   428  		return
   429  	}
   430  
   431  	const boundaryInBytes int32 = 32
   432  	const mask int32 = boundaryInBytes - 1
   433  
   434  	var padNum int
   435  	currentPos := int32(a.buf.Len())
   436  	if used := currentPos & mask; used+instructionLen >= boundaryInBytes {
   437  		padNum = int(boundaryInBytes - used)
   438  	}
   439  
   440  	a.padNOP(padNum)
   441  	return
   442  }
   443  
   444  // fusedInstructionLength returns the length of "macro fused instruction" if the
   445  // instruction sequence starting from `n` can be fused by processor. Otherwise,
   446  // returns zero.
   447  func (a *AssemblerImpl) fusedInstructionLength(n *nodeImpl) (ret int32, err error) {
   448  	// Find the next non-NOP instruction.
   449  	next := n.next
   450  	for ; next != nil && next.instruction == NOP; next = next.next {
   451  	}
   452  
   453  	if next == nil {
   454  		return
   455  	}
   456  
   457  	inst, jmpInst := n.instruction, next.instruction
   458  
   459  	if !(jmpInst == JCC || jmpInst == JCS || jmpInst == JEQ || jmpInst == JGE || jmpInst == JGT ||
   460  		jmpInst == JHI || jmpInst == JLE || jmpInst == JLS || jmpInst == JLT || jmpInst == JMI ||
   461  		jmpInst == JNE || jmpInst == JPC || jmpInst == JPS) {
   462  		// If the next instruction is not jump kind, the instruction will not be fused.
   463  		return
   464  	}
   465  
   466  	// How to determine whether the instruction can be fused is described in
   467  	// Section 3.4.2.2 of "Intel Optimization Manual":
   468  	// https://www.intel.com/content/dam/doc/manual/64-ia-32-architectures-optimization-manual.pdf
   469  	isTest := inst == TESTL || inst == TESTQ
   470  	isCmp := inst == CMPQ || inst == CMPL
   471  	isTestCmp := isTest || isCmp
   472  	if isTestCmp && ((n.types.src == operandTypeMemory && n.types.dst == operandTypeConst) ||
   473  		(n.types.src == operandTypeConst && n.types.dst == operandTypeMemory)) {
   474  		// The manual says: "CMP and TEST can not be fused when comparing MEM-IMM".
   475  		return
   476  	}
   477  
   478  	// Implement the decision according to the table 3-1 in the manual.
   479  	isAnd := inst == ANDL || inst == ANDQ
   480  	if !isTest && !isAnd {
   481  		if jmpInst == JMI || jmpInst == JPL || jmpInst == JPS || jmpInst == JPC {
   482  			// These jumps are only fused for TEST or AND.
   483  			return
   484  		}
   485  		isAdd := inst == ADDL || inst == ADDQ
   486  		isSub := inst == SUBL || inst == SUBQ
   487  		if !isCmp && !isAdd && !isSub {
   488  			if jmpInst == JCS || jmpInst == JCC || jmpInst == JHI || jmpInst == JLS {
   489  				// Thses jumpst are only fused for TEST, AND, CMP, ADD, or SUB.
   490  				return
   491  			}
   492  		}
   493  	}
   494  
   495  	// Now the instruction is ensured to be fused by the processor.
   496  	// In order to know the fused instruction length before writing into the binary,
   497  	// we try encoding it with the temporary buffer.
   498  	saved := a.buf
   499  	savedLen := uint64(saved.Len())
   500  	a.buf = bytes.NewBuffer(nil)
   501  
   502  	for _, fused := range []*nodeImpl{n, next} {
   503  		// Assign the temporary offset which may or may not be correct depending on the padding decision.
   504  		fused.offsetInBinaryField = savedLen + uint64(a.buf.Len())
   505  
   506  		// Encode the node into the temporary buffer.
   507  		if err = a.EncodeNode(fused); err != nil {
   508  			return
   509  		}
   510  	}
   511  
   512  	ret = int32(a.buf.Len())
   513  
   514  	// Revert the temporary buffer.
   515  	a.buf = saved
   516  	return
   517  }
   518  
// nopOpcodes is the multi byte NOP instructions table derived from section 5.8 "Code Padding with Operand-Size Override and Multibyte NOP"
// in "AMD Software Optimization Guide for AMD Family 15h Processors" https://www.amd.com/system/files/TechDocs/47414_15h_sw_opt_guide.pdf
//
// The entry at index i holds the NOP encoding that is i+1 bytes long in its first
// i+1 bytes. The remaining bytes of each fixed-size array are zero and are not
// emitted: padNOP slices an entry down to its real length before writing it.
var nopOpcodes = [][11]byte{
	{0x90},
	{0x66, 0x90},
	{0x0f, 0x1f, 0x00},
	{0x0f, 0x1f, 0x40, 0x00},
	{0x0f, 0x1f, 0x44, 0x00, 0x00},
	{0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
	{0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
	{0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
	{0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
	{0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
	{0x66, 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
}
   534  
   535  func (a *AssemblerImpl) padNOP(num int) {
   536  	for num > 0 {
   537  		singleNopNum := num
   538  		if singleNopNum > len(nopOpcodes) {
   539  			singleNopNum = len(nopOpcodes)
   540  		}
   541  		a.buf.Write(nopOpcodes[singleNopNum-1][:singleNopNum])
   542  		num -= singleNopNum
   543  	}
   544  }
   545  
   546  // CompileStandAlone implements the same method as documented on asm.AssemblerBase.
   547  func (a *AssemblerImpl) CompileStandAlone(instruction asm.Instruction) asm.Node {
   548  	return a.newNode(instruction, operandTypesNoneToNone)
   549  }
   550  
   551  // CompileConstToRegister implements the same method as documented on asm.AssemblerBase.
   552  func (a *AssemblerImpl) CompileConstToRegister(
   553  	instruction asm.Instruction,
   554  	value asm.ConstantValue,
   555  	destinationReg asm.Register,
   556  ) (inst asm.Node) {
   557  	n := a.newNode(instruction, operandTypesConstToRegister)
   558  	n.srcConst = value
   559  	n.dstReg = destinationReg
   560  	return n
   561  }
   562  
   563  // CompileRegisterToRegister implements the same method as documented on asm.AssemblerBase.
   564  func (a *AssemblerImpl) CompileRegisterToRegister(instruction asm.Instruction, from, to asm.Register) {
   565  	n := a.newNode(instruction, operandTypesRegisterToRegister)
   566  	n.srcReg = from
   567  	n.dstReg = to
   568  }
   569  
   570  // CompileMemoryToRegister implements the same method as documented on asm.AssemblerBase.
   571  func (a *AssemblerImpl) CompileMemoryToRegister(
   572  	instruction asm.Instruction,
   573  	sourceBaseReg asm.Register,
   574  	sourceOffsetConst asm.ConstantValue,
   575  	destinationReg asm.Register,
   576  ) {
   577  	n := a.newNode(instruction, operandTypesMemoryToRegister)
   578  	n.srcReg = sourceBaseReg
   579  	n.srcConst = sourceOffsetConst
   580  	n.dstReg = destinationReg
   581  }
   582  
   583  // CompileRegisterToMemory implements the same method as documented on asm.AssemblerBase.
   584  func (a *AssemblerImpl) CompileRegisterToMemory(
   585  	instruction asm.Instruction,
   586  	sourceRegister, destinationBaseRegister asm.Register,
   587  	destinationOffsetConst asm.ConstantValue,
   588  ) {
   589  	n := a.newNode(instruction, operandTypesRegisterToMemory)
   590  	n.srcReg = sourceRegister
   591  	n.dstReg = destinationBaseRegister
   592  	n.dstConst = destinationOffsetConst
   593  }
   594  
   595  // CompileJump implements the same method as documented on asm.AssemblerBase.
   596  func (a *AssemblerImpl) CompileJump(jmpInstruction asm.Instruction) asm.Node {
   597  	return a.newNode(jmpInstruction, operandTypesNoneToBranch)
   598  }
   599  
   600  // CompileJumpToMemory implements the same method as documented on asm.AssemblerBase.
   601  func (a *AssemblerImpl) CompileJumpToMemory(
   602  	jmpInstruction asm.Instruction,
   603  	baseReg asm.Register,
   604  	offset asm.ConstantValue,
   605  ) {
   606  	n := a.newNode(jmpInstruction, operandTypesNoneToMemory)
   607  	n.dstReg = baseReg
   608  	n.dstConst = offset
   609  }
   610  
   611  // CompileJumpToRegister implements the same method as documented on asm.AssemblerBase.
   612  func (a *AssemblerImpl) CompileJumpToRegister(jmpInstruction asm.Instruction, reg asm.Register) {
   613  	n := a.newNode(jmpInstruction, operandTypesNoneToRegister)
   614  	n.dstReg = reg
   615  }
   616  
   617  // CompileReadInstructionAddress implements the same method as documented on asm.AssemblerBase.
   618  func (a *AssemblerImpl) CompileReadInstructionAddress(
   619  	destinationRegister asm.Register,
   620  	beforeAcquisitionTargetInstruction asm.Instruction,
   621  ) {
   622  	n := a.newNode(LEAQ, operandTypesMemoryToRegister)
   623  	n.dstReg = destinationRegister
   624  	n.readInstructionAddressBeforeTargetInstruction = beforeAcquisitionTargetInstruction
   625  }
   626  
   627  // CompileRegisterToRegisterWithArg implements the same method as documented on amd64.Assembler.
   628  func (a *AssemblerImpl) CompileRegisterToRegisterWithArg(
   629  	instruction asm.Instruction,
   630  	from, to asm.Register,
   631  	arg byte,
   632  ) {
   633  	n := a.newNode(instruction, operandTypesRegisterToRegister)
   634  	n.srcReg = from
   635  	n.dstReg = to
   636  	n.arg = arg
   637  }
   638  
   639  // CompileMemoryWithIndexToRegister implements the same method as documented on amd64.Assembler.
   640  func (a *AssemblerImpl) CompileMemoryWithIndexToRegister(
   641  	instruction asm.Instruction,
   642  	srcBaseReg asm.Register,
   643  	srcOffsetConst asm.ConstantValue,
   644  	srcIndex asm.Register,
   645  	srcScale int16,
   646  	dstReg asm.Register,
   647  ) {
   648  	n := a.newNode(instruction, operandTypesMemoryToRegister)
   649  	n.srcReg = srcBaseReg
   650  	n.srcConst = srcOffsetConst
   651  	n.srcMemIndex = srcIndex
   652  	n.srcMemScale = byte(srcScale)
   653  	n.dstReg = dstReg
   654  }
   655  
   656  // CompileMemoryWithIndexAndArgToRegister implements the same method as documented on amd64.Assembler.
   657  func (a *AssemblerImpl) CompileMemoryWithIndexAndArgToRegister(
   658  	instruction asm.Instruction,
   659  	srcBaseReg asm.Register,
   660  	srcOffsetConst asm.ConstantValue,
   661  	srcIndex asm.Register,
   662  	srcScale int16,
   663  	dstReg asm.Register,
   664  	arg byte,
   665  ) {
   666  	n := a.newNode(instruction, operandTypesMemoryToRegister)
   667  	n.srcReg = srcBaseReg
   668  	n.srcConst = srcOffsetConst
   669  	n.srcMemIndex = srcIndex
   670  	n.srcMemScale = byte(srcScale)
   671  	n.dstReg = dstReg
   672  	n.arg = arg
   673  }
   674  
   675  // CompileRegisterToMemoryWithIndex implements the same method as documented on amd64.Assembler.
   676  func (a *AssemblerImpl) CompileRegisterToMemoryWithIndex(
   677  	instruction asm.Instruction,
   678  	srcReg, dstBaseReg asm.Register,
   679  	dstOffsetConst asm.ConstantValue,
   680  	dstIndex asm.Register,
   681  	dstScale int16,
   682  ) {
   683  	n := a.newNode(instruction, operandTypesRegisterToMemory)
   684  	n.srcReg = srcReg
   685  	n.dstReg = dstBaseReg
   686  	n.dstConst = dstOffsetConst
   687  	n.dstMemIndex = dstIndex
   688  	n.dstMemScale = byte(dstScale)
   689  }
   690  
   691  // CompileRegisterToMemoryWithIndexAndArg implements the same method as documented on amd64.Assembler.
   692  func (a *AssemblerImpl) CompileRegisterToMemoryWithIndexAndArg(
   693  	instruction asm.Instruction,
   694  	srcReg, dstBaseReg asm.Register,
   695  	dstOffsetConst asm.ConstantValue,
   696  	dstIndex asm.Register,
   697  	dstScale int16,
   698  	arg byte,
   699  ) {
   700  	n := a.newNode(instruction, operandTypesRegisterToMemory)
   701  	n.srcReg = srcReg
   702  	n.dstReg = dstBaseReg
   703  	n.dstConst = dstOffsetConst
   704  	n.dstMemIndex = dstIndex
   705  	n.dstMemScale = byte(dstScale)
   706  	n.arg = arg
   707  }
   708  
   709  // CompileRegisterToConst implements the same method as documented on amd64.Assembler.
   710  func (a *AssemblerImpl) CompileRegisterToConst(
   711  	instruction asm.Instruction,
   712  	srcRegister asm.Register,
   713  	value asm.ConstantValue,
   714  ) asm.Node {
   715  	n := a.newNode(instruction, operandTypesRegisterToConst)
   716  	n.srcReg = srcRegister
   717  	n.dstConst = value
   718  	return n
   719  }
   720  
   721  // CompileRegisterToNone implements the same method as documented on amd64.Assembler.
   722  func (a *AssemblerImpl) CompileRegisterToNone(instruction asm.Instruction, register asm.Register) {
   723  	n := a.newNode(instruction, operandTypesRegisterToNone)
   724  	n.srcReg = register
   725  }
   726  
   727  // CompileNoneToRegister implements the same method as documented on amd64.Assembler.
   728  func (a *AssemblerImpl) CompileNoneToRegister(instruction asm.Instruction, register asm.Register) {
   729  	n := a.newNode(instruction, operandTypesNoneToRegister)
   730  	n.dstReg = register
   731  }
   732  
   733  // CompileNoneToMemory implements the same method as documented on amd64.Assembler.
   734  func (a *AssemblerImpl) CompileNoneToMemory(
   735  	instruction asm.Instruction,
   736  	baseReg asm.Register,
   737  	offset asm.ConstantValue,
   738  ) {
   739  	n := a.newNode(instruction, operandTypesNoneToMemory)
   740  	n.dstReg = baseReg
   741  	n.dstConst = offset
   742  }
   743  
   744  // CompileConstToMemory implements the same method as documented on amd64.Assembler.
   745  func (a *AssemblerImpl) CompileConstToMemory(
   746  	instruction asm.Instruction,
   747  	value asm.ConstantValue,
   748  	dstbaseReg asm.Register,
   749  	dstOffset asm.ConstantValue,
   750  ) asm.Node {
   751  	n := a.newNode(instruction, operandTypesConstToMemory)
   752  	n.srcConst = value
   753  	n.dstReg = dstbaseReg
   754  	n.dstConst = dstOffset
   755  	return n
   756  }
   757  
   758  // CompileMemoryToConst implements the same method as documented on amd64.Assembler.
   759  func (a *AssemblerImpl) CompileMemoryToConst(
   760  	instruction asm.Instruction,
   761  	srcBaseReg asm.Register,
   762  	srcOffset, value asm.ConstantValue,
   763  ) asm.Node {
   764  	n := a.newNode(instruction, operandTypesMemoryToConst)
   765  	n.srcReg = srcBaseReg
   766  	n.srcConst = srcOffset
   767  	n.dstConst = value
   768  	return n
   769  }
   770  
   771  func errorEncodingUnsupported(n *nodeImpl) error {
   772  	return fmt.Errorf("%s is unsupported for %s type", InstructionName(n.instruction), n.types)
   773  }
   774  
   775  func (a *AssemblerImpl) encodeNoneToNone(n *nodeImpl) (err error) {
   776  	switch n.instruction {
   777  	case CDQ:
   778  		// https://www.felixcloutier.com/x86/cwd:cdq:cqo
   779  		err = a.buf.WriteByte(0x99)
   780  	case CQO:
   781  		// https://www.felixcloutier.com/x86/cwd:cdq:cqo
   782  		_, err = a.buf.Write([]byte{RexPrefixW, 0x99})
   783  	case NOP:
   784  		// Simply optimize out the NOP instructions.
   785  	case RET:
   786  		// https://www.felixcloutier.com/x86/ret
   787  		err = a.buf.WriteByte(0xc3)
   788  	case UD2:
   789  		// https://mudongliang.github.io/x86/html/file_module_x86_id_318.html
   790  		_, err = a.buf.Write([]byte{0x0f, 0x0b})
   791  	case REPMOVSQ:
   792  		_, err = a.buf.Write([]byte{0xf3, RexPrefixW, 0xa5})
   793  	case REPSTOSQ:
   794  		_, err = a.buf.Write([]byte{0xf3, RexPrefixW, 0xab})
   795  	case STD:
   796  		_, err = a.buf.Write([]byte{0xfd})
   797  	case CLD:
   798  		_, err = a.buf.Write([]byte{0xfc})
   799  	default:
   800  		err = errorEncodingUnsupported(n)
   801  	}
   802  	return
   803  }
   804  
   805  func (a *AssemblerImpl) encodeNoneToRegister(n *nodeImpl) (err error) {
   806  	regBits, prefix, err := register3bits(n.dstReg, registerSpecifierPositionModRMFieldRM)
   807  	if err != nil {
   808  		return err
   809  	}
   810  
   811  	// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
   812  	modRM := 0b11_000_000 | // Specifying that opeand is register.
   813  		regBits
   814  	if n.instruction == JMP {
   815  		// JMP's opcode is defined as "FF /4" meaning that we have to have "4"
   816  		// in 4-6th bits in the ModRM byte. https://www.felixcloutier.com/x86/jmp
   817  		modRM |= 0b00_100_000
   818  	} else if n.instruction == NEGQ {
   819  		prefix |= RexPrefixW
   820  		modRM |= 0b00_011_000
   821  	} else if n.instruction == INCQ {
   822  		prefix |= RexPrefixW
   823  	} else if n.instruction == DECQ {
   824  		prefix |= RexPrefixW
   825  		modRM |= 0b00_001_000
   826  	} else {
   827  		if RegSP <= n.dstReg && n.dstReg <= RegDI {
   828  			// If the destination is one byte length register, we need to have the default prefix.
   829  			// https: //wiki.osdev.org/X86-64_Instruction_Encoding#Registers
   830  			prefix |= RexPrefixDefault
   831  		}
   832  	}
   833  
   834  	if prefix != RexPrefixNone {
   835  		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Encoding
   836  		if err = a.buf.WriteByte(prefix); err != nil {
   837  			return
   838  		}
   839  	}
   840  
   841  	switch n.instruction {
   842  	case JMP:
   843  		// https://www.felixcloutier.com/x86/jmp
   844  		_, err = a.buf.Write([]byte{0xff, modRM})
   845  	case SETCC:
   846  		// https://www.felixcloutier.com/x86/setcc
   847  		_, err = a.buf.Write([]byte{0x0f, 0x93, modRM})
   848  	case SETCS:
   849  		// https://www.felixcloutier.com/x86/setcc
   850  		_, err = a.buf.Write([]byte{0x0f, 0x92, modRM})
   851  	case SETEQ:
   852  		// https://www.felixcloutier.com/x86/setcc
   853  		_, err = a.buf.Write([]byte{0x0f, 0x94, modRM})
   854  	case SETGE:
   855  		// https://www.felixcloutier.com/x86/setcc
   856  		_, err = a.buf.Write([]byte{0x0f, 0x9d, modRM})
   857  	case SETGT:
   858  		// https://www.felixcloutier.com/x86/setcc
   859  		_, err = a.buf.Write([]byte{0x0f, 0x9f, modRM})
   860  	case SETHI:
   861  		// https://www.felixcloutier.com/x86/setcc
   862  		_, err = a.buf.Write([]byte{0x0f, 0x97, modRM})
   863  	case SETLE:
   864  		// https://www.felixcloutier.com/x86/setcc
   865  		_, err = a.buf.Write([]byte{0x0f, 0x9e, modRM})
   866  	case SETLS:
   867  		// https://www.felixcloutier.com/x86/setcc
   868  		_, err = a.buf.Write([]byte{0x0f, 0x96, modRM})
   869  	case SETLT:
   870  		// https://www.felixcloutier.com/x86/setcc
   871  		_, err = a.buf.Write([]byte{0x0f, 0x9c, modRM})
   872  	case SETNE:
   873  		// https://www.felixcloutier.com/x86/setcc
   874  		_, err = a.buf.Write([]byte{0x0f, 0x95, modRM})
   875  	case SETPC:
   876  		// https://www.felixcloutier.com/x86/setcc
   877  		_, err = a.buf.Write([]byte{0x0f, 0x9b, modRM})
   878  	case SETPS:
   879  		// https://www.felixcloutier.com/x86/setcc
   880  		_, err = a.buf.Write([]byte{0x0f, 0x9a, modRM})
   881  	case NEGQ:
   882  		// https://www.felixcloutier.com/x86/neg
   883  		_, err = a.buf.Write([]byte{0xf7, modRM})
   884  	case INCQ:
   885  		// https://www.felixcloutier.com/x86/inc
   886  		_, err = a.buf.Write([]byte{0xff, modRM})
   887  	case DECQ:
   888  		// https://www.felixcloutier.com/x86/dec
   889  		_, err = a.buf.Write([]byte{0xff, modRM})
   890  	default:
   891  		err = errorEncodingUnsupported(n)
   892  	}
   893  	return
   894  }
   895  
   896  func (a *AssemblerImpl) encodeNoneToMemory(n *nodeImpl) (err error) {
   897  	RexPrefix, modRM, sbi, displacementWidth, err := n.GetMemoryLocation()
   898  	if err != nil {
   899  		return err
   900  	}
   901  
   902  	var opcode byte
   903  	switch n.instruction {
   904  	case INCQ:
   905  		// https://www.felixcloutier.com/x86/inc
   906  		RexPrefix |= RexPrefixW
   907  		opcode = 0xff
   908  	case DECQ:
   909  		// https://www.felixcloutier.com/x86/dec
   910  		RexPrefix |= RexPrefixW
   911  		modRM |= 0b00_001_000 // DEC needs "/1" extension in ModRM.
   912  		opcode = 0xff
   913  	case JMP:
   914  		// https://www.felixcloutier.com/x86/jmp
   915  		modRM |= 0b00_100_000 // JMP needs "/4" extension in ModRM.
   916  		opcode = 0xff
   917  	default:
   918  		return errorEncodingUnsupported(n)
   919  	}
   920  
   921  	if RexPrefix != RexPrefixNone {
   922  		a.buf.WriteByte(RexPrefix)
   923  	}
   924  
   925  	a.buf.Write([]byte{opcode, modRM})
   926  
   927  	if sbi != nil {
   928  		a.buf.WriteByte(*sbi)
   929  	}
   930  
   931  	if displacementWidth != 0 {
   932  		a.WriteConst(n.dstConst, displacementWidth)
   933  	}
   934  	return
   935  }
   936  
   937  type relativeJumpOpcode struct{ short, long []byte }
   938  
   939  func (o relativeJumpOpcode) instructionLen(short bool) int64 {
   940  	if short {
   941  		return int64(len(o.short)) + 1 // 1 byte = 8 bit offset
   942  	} else {
   943  		return int64(len(o.long)) + 4 // 4 byte = 32 bit offset
   944  	}
   945  }
   946  
   947  var relativeJumpOpcodes = map[asm.Instruction]relativeJumpOpcode{
   948  	// https://www.felixcloutier.com/x86/jcc
   949  	JCC: {short: []byte{0x73}, long: []byte{0x0f, 0x83}},
   950  	JCS: {short: []byte{0x72}, long: []byte{0x0f, 0x82}},
   951  	JEQ: {short: []byte{0x74}, long: []byte{0x0f, 0x84}},
   952  	JGE: {short: []byte{0x7d}, long: []byte{0x0f, 0x8d}},
   953  	JGT: {short: []byte{0x7f}, long: []byte{0x0f, 0x8f}},
   954  	JHI: {short: []byte{0x77}, long: []byte{0x0f, 0x87}},
   955  	JLE: {short: []byte{0x7e}, long: []byte{0x0f, 0x8e}},
   956  	JLS: {short: []byte{0x76}, long: []byte{0x0f, 0x86}},
   957  	JLT: {short: []byte{0x7c}, long: []byte{0x0f, 0x8c}},
   958  	JMI: {short: []byte{0x78}, long: []byte{0x0f, 0x88}},
   959  	JPL: {short: []byte{0x79}, long: []byte{0x0f, 0x89}},
   960  	JNE: {short: []byte{0x75}, long: []byte{0x0f, 0x85}},
   961  	JPC: {short: []byte{0x7b}, long: []byte{0x0f, 0x8b}},
   962  	JPS: {short: []byte{0x7a}, long: []byte{0x0f, 0x8a}},
   963  	// https://www.felixcloutier.com/x86/jmp
   964  	JMP: {short: []byte{0xeb}, long: []byte{0xe9}},
   965  }
   966  
   967  func (a *AssemblerImpl) ResolveForwardRelativeJumps(target *nodeImpl) (err error) {
   968  	offsetInBinary := int64(target.OffsetInBinary())
   969  	for origin := range target.jumpOrigins {
   970  		shortJump := origin.isForwardShortJump()
   971  		op := relativeJumpOpcodes[origin.instruction]
   972  		instructionLen := op.instructionLen(shortJump)
   973  
   974  		// Calculate the offset from the EIP (at the time of executing this jump instruction)
   975  		// to the target instruction. This value is always >= 0 as here we only handle forward jumps.
   976  		offset := offsetInBinary - (int64(origin.OffsetInBinary()) + instructionLen)
   977  		if shortJump {
   978  			if offset > math.MaxInt8 {
   979  				// This forces reassemble in the outer loop inside AssemblerImpl.Assemble().
   980  				a.forceReAssemble = true
   981  				// From the next reAssemble phases, this forward jump will be encoded long jump and
   982  				// allocate 32-bit offset bytes by default. This means that this `origin` node
   983  				// will always enter the "long jump offset encoding" block below
   984  				origin.flag ^= nodeFlagShortForwardJump
   985  			} else {
   986  				a.buf.Bytes()[origin.OffsetInBinary()+uint64(instructionLen)-1] = byte(offset)
   987  			}
   988  		} else { // long jump offset encoding.
   989  			if offset > math.MaxInt32 {
   990  				return fmt.Errorf("too large jump offset %d for encoding %s", offset, InstructionName(origin.instruction))
   991  			}
   992  			binary.LittleEndian.PutUint32(a.buf.Bytes()[origin.OffsetInBinary()+uint64(instructionLen)-4:], uint32(offset))
   993  		}
   994  	}
   995  	return nil
   996  }
   997  
   998  func (a *AssemblerImpl) encodeRelativeJump(n *nodeImpl) (err error) {
   999  	if n.jumpTarget == nil {
  1000  		err = fmt.Errorf("jump target must not be nil for relative %s", InstructionName(n.instruction))
  1001  		return
  1002  	}
  1003  
  1004  	op, ok := relativeJumpOpcodes[n.instruction]
  1005  	if !ok {
  1006  		return errorEncodingUnsupported(n)
  1007  	}
  1008  
  1009  	var isShortJump bool
  1010  	// offsetOfEIP means the offset of EIP register at the time of executing this jump instruction.
  1011  	// Relative jump instructions can be encoded with the signed 8-bit or 32-bit integer offsets from the EIP.
  1012  	var offsetOfEIP int64 = 0 // We set zero and resolve later once the target instruction is encoded for forward jumps
  1013  	if n.isBackwardJump() {
  1014  		// If this is the backward jump, we can calculate the exact offset now.
  1015  		offsetOfJumpInstruction := int64(n.jumpTarget.OffsetInBinary()) - int64(n.OffsetInBinary())
  1016  		isShortJump = offsetOfJumpInstruction-2 >= math.MinInt8
  1017  		offsetOfEIP = offsetOfJumpInstruction - op.instructionLen(isShortJump)
  1018  	} else {
  1019  		// For forward jumps, we resolve the offset when we Encode the target node. See AssemblerImpl.ResolveForwardRelativeJumps.
  1020  		n.jumpTarget.jumpOrigins[n] = struct{}{}
  1021  		isShortJump = n.isForwardShortJump()
  1022  	}
  1023  
  1024  	if offsetOfEIP < math.MinInt32 { // offsetOfEIP is always <= 0 as we don't calculate it for forward jump here.
  1025  		return fmt.Errorf("too large jump offset %d for encoding %s", offsetOfEIP, InstructionName(n.instruction))
  1026  	}
  1027  
  1028  	if isShortJump {
  1029  		a.buf.Write(op.short)
  1030  		a.WriteConst(offsetOfEIP, 8)
  1031  	} else {
  1032  		a.buf.Write(op.long)
  1033  		a.WriteConst(offsetOfEIP, 32)
  1034  	}
  1035  	return
  1036  }
  1037  
  1038  func (a *AssemblerImpl) encodeRegisterToNone(n *nodeImpl) (err error) {
  1039  	regBits, prefix, err := register3bits(n.srcReg, registerSpecifierPositionModRMFieldRM)
  1040  	if err != nil {
  1041  		return err
  1042  	}
  1043  
  1044  	// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  1045  	modRM := 0b11_000_000 | // Specifying that opeand is register.
  1046  		regBits
  1047  
  1048  	var opcode byte
  1049  	switch n.instruction {
  1050  	case DIVL:
  1051  		// https://www.felixcloutier.com/x86/div
  1052  		modRM |= 0b00_110_000
  1053  		opcode = 0xf7
  1054  	case DIVQ:
  1055  		// https://www.felixcloutier.com/x86/div
  1056  		prefix |= RexPrefixW
  1057  		modRM |= 0b00_110_000
  1058  		opcode = 0xf7
  1059  	case IDIVL:
  1060  		// https://www.felixcloutier.com/x86/idiv
  1061  		modRM |= 0b00_111_000
  1062  		opcode = 0xf7
  1063  	case IDIVQ:
  1064  		// https://www.felixcloutier.com/x86/idiv
  1065  		prefix |= RexPrefixW
  1066  		modRM |= 0b00_111_000
  1067  		opcode = 0xf7
  1068  	case MULL:
  1069  		// https://www.felixcloutier.com/x86/mul
  1070  		modRM |= 0b00_100_000
  1071  		opcode = 0xf7
  1072  	case MULQ:
  1073  		// https://www.felixcloutier.com/x86/mul
  1074  		prefix |= RexPrefixW
  1075  		modRM |= 0b00_100_000
  1076  		opcode = 0xf7
  1077  	default:
  1078  		err = errorEncodingUnsupported(n)
  1079  	}
  1080  
  1081  	if prefix != RexPrefixNone {
  1082  		a.buf.WriteByte(prefix)
  1083  	}
  1084  
  1085  	a.buf.Write([]byte{opcode, modRM})
  1086  	return
  1087  }
  1088  
  1089  var registerToRegisterOpcode = map[asm.Instruction]struct {
  1090  	opcode                           []byte
  1091  	rPrefix                          RexPrefix
  1092  	mandatoryPrefix                  byte
  1093  	srcOnModRMReg                    bool
  1094  	isSrc8bit                        bool
  1095  	needArg                          bool
  1096  	requireSrcFloat, requireDstFloat bool
  1097  }{
  1098  	// https://www.felixcloutier.com/x86/add
  1099  	ADDL: {opcode: []byte{0x1}, srcOnModRMReg: true},
  1100  	ADDQ: {opcode: []byte{0x1}, rPrefix: RexPrefixW, srcOnModRMReg: true},
  1101  	// https://www.felixcloutier.com/x86/and
  1102  	ANDL: {opcode: []byte{0x21}, srcOnModRMReg: true},
  1103  	ANDQ: {opcode: []byte{0x21}, rPrefix: RexPrefixW, srcOnModRMReg: true},
  1104  	// https://www.felixcloutier.com/x86/cmp
  1105  	CMPL: {opcode: []byte{0x39}},
  1106  	CMPQ: {opcode: []byte{0x39}, rPrefix: RexPrefixW},
  1107  	// https://www.felixcloutier.com/x86/cmovcc
  1108  	CMOVQCS: {opcode: []byte{0x0f, 0x42}, rPrefix: RexPrefixW},
  1109  	// https://www.felixcloutier.com/x86/addsd
  1110  	ADDSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x58}, requireSrcFloat: true, requireDstFloat: true},
  1111  	// https://www.felixcloutier.com/x86/addss
  1112  	ADDSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x58}, requireSrcFloat: true, requireDstFloat: true},
  1113  	// https://www.felixcloutier.com/x86/andpd
  1114  	ANDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x54}, requireSrcFloat: true, requireDstFloat: true},
  1115  	// https://www.felixcloutier.com/x86/andps
  1116  	ANDPS: {opcode: []byte{0x0f, 0x54}, requireSrcFloat: true, requireDstFloat: true},
  1117  	// https://www.felixcloutier.com/x86/bsr
  1118  	BSRL: {opcode: []byte{0xf, 0xbd}},
  1119  	BSRQ: {opcode: []byte{0xf, 0xbd}, rPrefix: RexPrefixW},
  1120  	// https://www.felixcloutier.com/x86/comisd
  1121  	COMISD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x2f}, requireSrcFloat: true, requireDstFloat: true},
  1122  	// https://www.felixcloutier.com/x86/comiss
  1123  	COMISS: {opcode: []byte{0x0f, 0x2f}, requireSrcFloat: true, requireDstFloat: true},
  1124  	// https://www.felixcloutier.com/x86/cvtsd2ss
  1125  	CVTSD2SS: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5a}, requireSrcFloat: true, requireDstFloat: true},
  1126  	// https://www.felixcloutier.com/x86/cvtsi2sd
  1127  	CVTSL2SD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2a}, requireDstFloat: true},
  1128  	// https://www.felixcloutier.com/x86/cvtsi2sd
  1129  	CVTSQ2SD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2a}, rPrefix: RexPrefixW, requireDstFloat: true},
  1130  	// https://www.felixcloutier.com/x86/cvtsi2ss
  1131  	CVTSL2SS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2a}, requireDstFloat: true},
  1132  	// https://www.felixcloutier.com/x86/cvtsi2ss
  1133  	CVTSQ2SS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2a}, rPrefix: RexPrefixW, requireDstFloat: true},
  1134  	// https://www.felixcloutier.com/x86/cvtss2sd
  1135  	CVTSS2SD: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5a}, requireSrcFloat: true, requireDstFloat: true},
  1136  	// https://www.felixcloutier.com/x86/cvttsd2si
  1137  	CVTTSD2SL: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2c}, requireSrcFloat: true},
  1138  	CVTTSD2SQ: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2c}, rPrefix: RexPrefixW, requireSrcFloat: true},
  1139  	// https://www.felixcloutier.com/x86/cvttss2si
  1140  	CVTTSS2SL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2c}, requireSrcFloat: true},
  1141  	CVTTSS2SQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2c}, rPrefix: RexPrefixW, requireSrcFloat: true},
  1142  	// https://www.felixcloutier.com/x86/divsd
  1143  	DIVSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5e}, requireSrcFloat: true, requireDstFloat: true},
  1144  	// https://www.felixcloutier.com/x86/divss
  1145  	DIVSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5e}, requireSrcFloat: true, requireDstFloat: true},
  1146  	// https://www.felixcloutier.com/x86/lzcnt
  1147  	LZCNTL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbd}},
  1148  	LZCNTQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbd}, rPrefix: RexPrefixW},
  1149  	// https://www.felixcloutier.com/x86/maxsd
  1150  	MAXSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5f}, requireSrcFloat: true, requireDstFloat: true},
  1151  	// https://www.felixcloutier.com/x86/maxss
  1152  	MAXSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5f}, requireSrcFloat: true, requireDstFloat: true},
  1153  	// https://www.felixcloutier.com/x86/minsd
  1154  	MINSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5d}, requireSrcFloat: true, requireDstFloat: true},
  1155  	// https://www.felixcloutier.com/x86/minss
  1156  	MINSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5d}, requireSrcFloat: true, requireDstFloat: true},
  1157  	// https://www.felixcloutier.com/x86/movsx:movsxd
  1158  	MOVBLSX: {opcode: []byte{0x0f, 0xbe}, isSrc8bit: true},
  1159  	// https://www.felixcloutier.com/x86/movzx
  1160  	MOVBLZX: {opcode: []byte{0x0f, 0xb6}, isSrc8bit: true},
  1161  	// https://www.felixcloutier.com/x86/movzx
  1162  	MOVWLZX: {opcode: []byte{0x0f, 0xb7}, isSrc8bit: true},
  1163  	// https://www.felixcloutier.com/x86/movsx:movsxd
  1164  	MOVBQSX: {opcode: []byte{0x0f, 0xbe}, rPrefix: RexPrefixW, isSrc8bit: true},
  1165  	// https://www.felixcloutier.com/x86/movsx:movsxd
  1166  	MOVLQSX: {opcode: []byte{0x63}, rPrefix: RexPrefixW},
  1167  	// https://www.felixcloutier.com/x86/movsx:movsxd
  1168  	MOVWQSX: {opcode: []byte{0x0f, 0xbf}, rPrefix: RexPrefixW},
  1169  	// https://www.felixcloutier.com/x86/movsx:movsxd
  1170  	MOVWLSX: {opcode: []byte{0x0f, 0xbf}},
  1171  	// https://www.felixcloutier.com/x86/imul
  1172  	IMULQ: {opcode: []byte{0x0f, 0xaf}, rPrefix: RexPrefixW},
  1173  	// https://www.felixcloutier.com/x86/mulss
  1174  	MULSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x59}, requireSrcFloat: true, requireDstFloat: true},
  1175  	// https://www.felixcloutier.com/x86/mulsd
  1176  	MULSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x59}, requireSrcFloat: true, requireDstFloat: true},
  1177  	// https://www.felixcloutier.com/x86/or
  1178  	ORL: {opcode: []byte{0x09}, srcOnModRMReg: true},
  1179  	ORQ: {opcode: []byte{0x09}, rPrefix: RexPrefixW, srcOnModRMReg: true},
  1180  	// https://www.felixcloutier.com/x86/orpd
  1181  	ORPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x56}, requireSrcFloat: true, requireDstFloat: true},
  1182  	// https://www.felixcloutier.com/x86/orps
  1183  	ORPS: {opcode: []byte{0x0f, 0x56}, requireSrcFloat: true, requireDstFloat: true},
  1184  	// https://www.felixcloutier.com/x86/popcnt
  1185  	POPCNTL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xb8}},
  1186  	POPCNTQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xb8}, rPrefix: RexPrefixW},
  1187  	// https://www.felixcloutier.com/x86/roundss
  1188  	ROUNDSS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0a}, needArg: true, requireSrcFloat: true, requireDstFloat: true},
  1189  	// https://www.felixcloutier.com/x86/roundsd
  1190  	ROUNDSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0b}, needArg: true, requireSrcFloat: true, requireDstFloat: true},
  1191  	// https://www.felixcloutier.com/x86/sqrtss
  1192  	SQRTSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x51}, requireSrcFloat: true, requireDstFloat: true},
  1193  	// https://www.felixcloutier.com/x86/sqrtsd
  1194  	SQRTSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x51}, requireSrcFloat: true, requireDstFloat: true},
  1195  	// https://www.felixcloutier.com/x86/sub
  1196  	SUBL: {opcode: []byte{0x29}, srcOnModRMReg: true},
  1197  	SUBQ: {opcode: []byte{0x29}, rPrefix: RexPrefixW, srcOnModRMReg: true},
  1198  	// https://www.felixcloutier.com/x86/subss
  1199  	SUBSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5c}, requireSrcFloat: true, requireDstFloat: true},
  1200  	// https://www.felixcloutier.com/x86/subsd
  1201  	SUBSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5c}, requireSrcFloat: true, requireDstFloat: true},
  1202  	// https://www.felixcloutier.com/x86/test
  1203  	TESTL: {opcode: []byte{0x85}, srcOnModRMReg: true},
  1204  	TESTQ: {opcode: []byte{0x85}, rPrefix: RexPrefixW, srcOnModRMReg: true},
  1205  	// https://www.felixcloutier.com/x86/tzcnt
  1206  	TZCNTL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbc}},
  1207  	TZCNTQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbc}, rPrefix: RexPrefixW},
  1208  	// https://www.felixcloutier.com/x86/ucomisd
  1209  	UCOMISD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x2e}, requireSrcFloat: true, requireDstFloat: true},
  1210  	// https://www.felixcloutier.com/x86/ucomiss
  1211  	UCOMISS: {opcode: []byte{0x0f, 0x2e}, requireSrcFloat: true, requireDstFloat: true},
  1212  	// https://www.felixcloutier.com/x86/xchg
  1213  	XCHGQ: {opcode: []byte{0x87}, rPrefix: RexPrefixW, srcOnModRMReg: true},
  1214  	// https://www.felixcloutier.com/x86/xor
  1215  	XORL: {opcode: []byte{0x31}, srcOnModRMReg: true},
  1216  	XORQ: {opcode: []byte{0x31}, rPrefix: RexPrefixW, srcOnModRMReg: true},
  1217  	// https://www.felixcloutier.com/x86/xorpd
  1218  	XORPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x57}, requireSrcFloat: true, requireDstFloat: true},
  1219  	XORPS: {opcode: []byte{0x0f, 0x57}, requireSrcFloat: true, requireDstFloat: true},
  1220  	// https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
  1221  	PINSRB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x20}, requireSrcFloat: false, requireDstFloat: true, needArg: true},
  1222  	// https://www.felixcloutier.com/x86/pinsrw
  1223  	PINSRW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc4}, requireSrcFloat: false, requireDstFloat: true, needArg: true},
  1224  	// https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
  1225  	PINSRD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x22}, requireSrcFloat: false, requireDstFloat: true, needArg: true},
  1226  	// https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
  1227  	PINSRQ: {mandatoryPrefix: 0x66, rPrefix: RexPrefixW, opcode: []byte{0x0f, 0x3a, 0x22}, requireSrcFloat: false, requireDstFloat: true, needArg: true},
  1228  	// https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
  1229  	MOVDQU: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x6f}, requireSrcFloat: true, requireDstFloat: true},
  1230  	// https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
  1231  	MOVDQA: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x6f}, requireSrcFloat: true, requireDstFloat: true},
  1232  	// https://www.felixcloutier.com/x86/paddb:paddw:paddd:paddq
  1233  	PADDB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfc}, requireSrcFloat: true, requireDstFloat: true},
  1234  	PADDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfd}, requireSrcFloat: true, requireDstFloat: true},
  1235  	PADDD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfe}, requireSrcFloat: true, requireDstFloat: true},
  1236  	PADDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd4}, requireSrcFloat: true, requireDstFloat: true},
  1237  	// https://www.felixcloutier.com/x86/psubb:psubw:psubd
  1238  	PSUBB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf8}, requireSrcFloat: true, requireDstFloat: true},
  1239  	PSUBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf9}, requireSrcFloat: true, requireDstFloat: true},
  1240  	PSUBD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfa}, requireSrcFloat: true, requireDstFloat: true},
  1241  	// https://www.felixcloutier.com/x86/psubq
  1242  	PSUBQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfb}, requireSrcFloat: true, requireDstFloat: true},
  1243  	// https://www.felixcloutier.com/x86/addps
  1244  	ADDPS: {opcode: []byte{0x0f, 0x58}, requireSrcFloat: true, requireDstFloat: true},
  1245  	// https://www.felixcloutier.com/x86/addpd
  1246  	ADDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x58}, requireSrcFloat: true, requireDstFloat: true},
  1247  	// https://www.felixcloutier.com/x86/subps
  1248  	SUBPS: {opcode: []byte{0x0f, 0x5c}, requireSrcFloat: true, requireDstFloat: true},
  1249  	// https://www.felixcloutier.com/x86/subpd
  1250  	SUBPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5c}, requireSrcFloat: true, requireDstFloat: true},
  1251  	// https://www.felixcloutier.com/x86/pxor
  1252  	PXOR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xef}, requireSrcFloat: true, requireDstFloat: true},
  1253  	// https://www.felixcloutier.com/x86/pand
  1254  	PAND: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdb}, requireSrcFloat: true, requireDstFloat: true},
  1255  	// https://www.felixcloutier.com/x86/por
  1256  	POR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xeb}, requireSrcFloat: true, requireDstFloat: true},
  1257  	// https://www.felixcloutier.com/x86/pandn
  1258  	PANDN: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdf}, requireSrcFloat: true, requireDstFloat: true},
  1259  	// https://www.felixcloutier.com/x86/pshufb
  1260  	PSHUFB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x0}, requireSrcFloat: true, requireDstFloat: true},
  1261  	// https://www.felixcloutier.com/x86/pshufd
  1262  	PSHUFD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x70}, requireSrcFloat: true, requireDstFloat: true, needArg: true},
  1263  	// https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
  1264  	PEXTRB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x14}, requireSrcFloat: true, requireDstFloat: false, needArg: true, srcOnModRMReg: true},
  1265  	// https://www.felixcloutier.com/x86/pextrw
  1266  	PEXTRW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc5}, requireSrcFloat: true, requireDstFloat: false, needArg: true},
  1267  	// https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
  1268  	PEXTRD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x16}, requireSrcFloat: true, requireDstFloat: false, needArg: true, srcOnModRMReg: true},
  1269  	// https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
  1270  	PEXTRQ: {rPrefix: RexPrefixW, mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x16}, requireSrcFloat: true, requireDstFloat: false, needArg: true, srcOnModRMReg: true},
  1271  	// https://www.felixcloutier.com/x86/insertps
  1272  	INSERTPS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x21}, requireSrcFloat: true, requireDstFloat: true, needArg: true},
  1273  	// https://www.felixcloutier.com/x86/movlhps
  1274  	MOVLHPS: {opcode: []byte{0x0f, 0x16}, requireSrcFloat: true, requireDstFloat: true},
  1275  	// https://www.felixcloutier.com/x86/ptest
  1276  	PTEST: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x17}, requireSrcFloat: true, requireDstFloat: true},
  1277  	// https://www.felixcloutier.com/x86/pcmpeqb:pcmpeqw:pcmpeqd
  1278  	PCMPEQB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x74}, requireSrcFloat: true, requireDstFloat: true},
  1279  	PCMPEQW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x75}, requireSrcFloat: true, requireDstFloat: true},
  1280  	PCMPEQD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x76}, requireSrcFloat: true, requireDstFloat: true},
  1281  	// https://www.felixcloutier.com/x86/pcmpeqq
  1282  	PCMPEQQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x29}, requireSrcFloat: true, requireDstFloat: true},
  1283  	// https://www.felixcloutier.com/x86/paddusb:paddusw
  1284  	PADDUSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdc}, requireSrcFloat: true, requireDstFloat: true},
  1285  	// https://www.felixcloutier.com/x86/movsd
  1286  	MOVSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x10}, requireSrcFloat: true, requireDstFloat: true},
  1287  	// https://www.felixcloutier.com/x86/packsswb:packssdw
  1288  	PACKSSWB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x63}, requireSrcFloat: true, requireDstFloat: true},
  1289  	// https://www.felixcloutier.com/x86/pmovmskb
  1290  	PMOVMSKB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd7}, requireSrcFloat: true, requireDstFloat: false},
  1291  	// https://www.felixcloutier.com/x86/movmskps
  1292  	MOVMSKPS: {opcode: []byte{0x0f, 0x50}, requireSrcFloat: true, requireDstFloat: false},
  1293  	// https://www.felixcloutier.com/x86/movmskpd
  1294  	MOVMSKPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x50}, requireSrcFloat: true, requireDstFloat: false},
  1295  	// https://www.felixcloutier.com/x86/psraw:psrad:psraq
  1296  	PSRAD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe2}, requireSrcFloat: true, requireDstFloat: true},
  1297  	// https://www.felixcloutier.com/x86/psraw:psrad:psraq
  1298  	PSRAW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe1}, requireSrcFloat: true, requireDstFloat: true},
  1299  	// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
  1300  	PSRLQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd3}, requireSrcFloat: true, requireDstFloat: true},
  1301  	// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
  1302  	PSRLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd2}, requireSrcFloat: true, requireDstFloat: true},
  1303  	// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
  1304  	PSRLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd1}, requireSrcFloat: true, requireDstFloat: true},
  1305  	// https://www.felixcloutier.com/x86/psllw:pslld:psllq
  1306  	PSLLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf1}, requireSrcFloat: true, requireDstFloat: true},
  1307  	// https://www.felixcloutier.com/x86/psllw:pslld:psllq
  1308  	PSLLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf2}, requireSrcFloat: true, requireDstFloat: true},
  1309  	// https://www.felixcloutier.com/x86/psllw:pslld:psllq
  1310  	PSLLQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf3}, requireSrcFloat: true, requireDstFloat: true},
  1311  	// https://www.felixcloutier.com/x86/punpcklbw:punpcklwd:punpckldq:punpcklqdq
  1312  	PUNPCKLBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x60}, requireSrcFloat: true, requireDstFloat: true},
  1313  	// https://www.felixcloutier.com/x86/punpckhbw:punpckhwd:punpckhdq:punpckhqdq
  1314  	PUNPCKHBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x68}, requireSrcFloat: true, requireDstFloat: true},
  1315  	// https://www.felixcloutier.com/x86/cmpps
  1316  	CMPPS: {opcode: []byte{0x0f, 0xc2}, requireSrcFloat: true, requireDstFloat: true, needArg: true},
  1317  	// https://www.felixcloutier.com/x86/cmppd
  1318  	CMPPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc2}, requireSrcFloat: true, requireDstFloat: true, needArg: true},
  1319  	// https://www.felixcloutier.com/x86/pcmpgtq
  1320  	PCMPGTQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x37}, requireSrcFloat: true, requireDstFloat: true},
  1321  	// https://www.felixcloutier.com/x86/pcmpgtb:pcmpgtw:pcmpgtd
  1322  	PCMPGTD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x66}, requireSrcFloat: true, requireDstFloat: true},
  1323  	// https://www.felixcloutier.com/x86/pcmpgtb:pcmpgtw:pcmpgtd
  1324  	PCMPGTW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x65}, requireSrcFloat: true, requireDstFloat: true},
  1325  	// https://www.felixcloutier.com/x86/pcmpgtb:pcmpgtw:pcmpgtd
  1326  	PCMPGTB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x64}, requireSrcFloat: true, requireDstFloat: true},
  1327  	// https://www.felixcloutier.com/x86/pminsd:pminsq
  1328  	PMINSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x39}, requireSrcFloat: true, requireDstFloat: true},
  1329  	// https://www.felixcloutier.com/x86/pmaxsb:pmaxsw:pmaxsd:pmaxsq
  1330  	PMAXSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3d}, requireSrcFloat: true, requireDstFloat: true},
  1331  	// https://www.felixcloutier.com/x86/pmaxsb:pmaxsw:pmaxsd:pmaxsq
  1332  	PMAXSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xee}, requireSrcFloat: true, requireDstFloat: true},
  1333  	// https://www.felixcloutier.com/x86/pmaxsb:pmaxsw:pmaxsd:pmaxsq
  1334  	PMAXSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3c}, requireSrcFloat: true, requireDstFloat: true},
  1335  	// https://www.felixcloutier.com/x86/pminsb:pminsw
  1336  	PMINSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xea}, requireSrcFloat: true, requireDstFloat: true},
  1337  	// https://www.felixcloutier.com/x86/pminsb:pminsw
  1338  	PMINSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x38}, requireSrcFloat: true, requireDstFloat: true},
  1339  	// https://www.felixcloutier.com/x86/pminud:pminuq
  1340  	PMINUD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3b}, requireSrcFloat: true, requireDstFloat: true},
  1341  	// https://www.felixcloutier.com/x86/pminub:pminuw
  1342  	PMINUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3a}, requireSrcFloat: true, requireDstFloat: true},
  1343  	// https://www.felixcloutier.com/x86/pminub:pminuw
  1344  	PMINUB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xda}, requireSrcFloat: true, requireDstFloat: true},
  1345  	// https://www.felixcloutier.com/x86/pmaxud:pmaxuq
  1346  	PMAXUD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3f}, requireSrcFloat: true, requireDstFloat: true},
  1347  	// https://www.felixcloutier.com/x86/pmaxub:pmaxuw
  1348  	PMAXUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3e}, requireSrcFloat: true, requireDstFloat: true},
  1349  	// https://www.felixcloutier.com/x86/pmaxub:pmaxuw
  1350  	PMAXUB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xde}, requireSrcFloat: true, requireDstFloat: true},
  1351  	// https://www.felixcloutier.com/x86/pmullw
  1352  	PMULLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd5}, requireSrcFloat: true, requireDstFloat: true},
  1353  	// https://www.felixcloutier.com/x86/pmulld:pmullq
  1354  	PMULLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x40}, requireSrcFloat: true, requireDstFloat: true},
  1355  	// https://www.felixcloutier.com/x86/pmuludq
  1356  	PMULUDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf4}, requireSrcFloat: true, requireDstFloat: true},
  1357  	// https://www.felixcloutier.com/x86/psubsb:psubsw
  1358  	PSUBSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe8}, requireSrcFloat: true, requireDstFloat: true},
  1359  	// https://www.felixcloutier.com/x86/psubsb:psubsw
  1360  	PSUBSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe9}, requireSrcFloat: true, requireDstFloat: true},
  1361  	// https://www.felixcloutier.com/x86/psubusb:psubusw
  1362  	PSUBUSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd8}, requireSrcFloat: true, requireDstFloat: true},
  1363  	// https://www.felixcloutier.com/x86/psubusb:psubusw
  1364  	PSUBUSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd9}, requireSrcFloat: true, requireDstFloat: true},
  1365  	// https://www.felixcloutier.com/x86/paddsb:paddsw
  1366  	PADDSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xed}, requireSrcFloat: true, requireDstFloat: true},
  1367  	// https://www.felixcloutier.com/x86/paddsb:paddsw
  1368  	PADDSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xec}, requireSrcFloat: true, requireDstFloat: true},
  1369  	// https://www.felixcloutier.com/x86/paddusb:paddusw
  1370  	PADDUSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdd}, requireSrcFloat: true, requireDstFloat: true},
  1371  	// https://www.felixcloutier.com/x86/pavgb:pavgw
  1372  	PAVGB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe0}, requireSrcFloat: true, requireDstFloat: true},
  1373  	// https://www.felixcloutier.com/x86/pavgb:pavgw
  1374  	PAVGW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe3}, requireSrcFloat: true, requireDstFloat: true},
  1375  	// https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq
  1376  	PABSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1c}, requireSrcFloat: true, requireDstFloat: true},
  1377  	// https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq
  1378  	PABSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1d}, requireSrcFloat: true, requireDstFloat: true},
  1379  	// https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq
  1380  	PABSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1e}, requireSrcFloat: true, requireDstFloat: true},
  1381  	// https://www.felixcloutier.com/x86/blendvpd
  1382  	BLENDVPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x15}, requireSrcFloat: true, requireDstFloat: true},
  1383  	// https://www.felixcloutier.com/x86/maxpd
  1384  	MAXPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5f}, requireSrcFloat: true, requireDstFloat: true},
  1385  	// https://www.felixcloutier.com/x86/maxps
  1386  	MAXPS: {opcode: []byte{0x0f, 0x5f}, requireSrcFloat: true, requireDstFloat: true},
  1387  	// https://www.felixcloutier.com/x86/minpd
  1388  	MINPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5d}, requireSrcFloat: true, requireDstFloat: true},
  1389  	// https://www.felixcloutier.com/x86/minps
  1390  	MINPS: {opcode: []byte{0x0f, 0x5d}, requireSrcFloat: true, requireDstFloat: true},
  1391  	// https://www.felixcloutier.com/x86/andnpd
  1392  	ANDNPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x55}, requireSrcFloat: true, requireDstFloat: true},
  1393  	// https://www.felixcloutier.com/x86/andnps
  1394  	ANDNPS: {opcode: []byte{0x0f, 0x55}, requireSrcFloat: true, requireDstFloat: true},
  1395  	// https://www.felixcloutier.com/x86/mulps
  1396  	MULPS: {opcode: []byte{0x0f, 0x59}, requireSrcFloat: true, requireDstFloat: true},
  1397  	// https://www.felixcloutier.com/x86/mulpd
  1398  	MULPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x59}, requireSrcFloat: true, requireDstFloat: true},
  1399  	// https://www.felixcloutier.com/x86/divps
  1400  	DIVPS: {opcode: []byte{0x0f, 0x5e}, requireSrcFloat: true, requireDstFloat: true},
  1401  	// https://www.felixcloutier.com/x86/divpd
  1402  	DIVPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5e}, requireSrcFloat: true, requireDstFloat: true},
  1403  	// https://www.felixcloutier.com/x86/sqrtps
  1404  	SQRTPS: {opcode: []byte{0x0f, 0x51}, requireSrcFloat: true, requireDstFloat: true},
  1405  	// https://www.felixcloutier.com/x86/sqrtpd
  1406  	SQRTPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x51}, requireSrcFloat: true, requireDstFloat: true},
  1407  	// https://www.felixcloutier.com/x86/roundps
  1408  	ROUNDPS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x08}, requireSrcFloat: true, requireDstFloat: true, needArg: true},
  1409  	// https://www.felixcloutier.com/x86/roundpd
  1410  	ROUNDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x09}, requireSrcFloat: true, requireDstFloat: true, needArg: true},
  1411  	// https://www.felixcloutier.com/x86/palignr
  1412  	PALIGNR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0f}, requireSrcFloat: true, requireDstFloat: true, needArg: true},
  1413  	// https://www.felixcloutier.com/x86/punpcklbw:punpcklwd:punpckldq:punpcklqdq
  1414  	PUNPCKLWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x61}, requireSrcFloat: true, requireDstFloat: true},
  1415  	// https://www.felixcloutier.com/x86/punpckhbw:punpckhwd:punpckhdq:punpckhqdq
  1416  	PUNPCKHWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x69}, requireSrcFloat: true, requireDstFloat: true},
  1417  	// https://www.felixcloutier.com/x86/pmulhuw
  1418  	PMULHUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe4}, requireSrcFloat: true, requireDstFloat: true},
  1419  	// https://www.felixcloutier.com/x86/pmuldq
  1420  	PMULDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x28}, requireSrcFloat: true, requireDstFloat: true},
  1421  	// https://www.felixcloutier.com/x86/pmulhrsw
  1422  	PMULHRSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x0b}, requireSrcFloat: true, requireDstFloat: true},
  1423  	// https://www.felixcloutier.com/x86/pmovsx
  1424  	PMOVSXBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x20}, requireSrcFloat: true, requireDstFloat: true},
  1425  	// https://www.felixcloutier.com/x86/pmovsx
  1426  	PMOVSXWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x23}, requireSrcFloat: true, requireDstFloat: true},
  1427  	// https://www.felixcloutier.com/x86/pmovsx
  1428  	PMOVSXDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x25}, requireSrcFloat: true, requireDstFloat: true},
  1429  	// https://www.felixcloutier.com/x86/pmovzx
  1430  	PMOVZXBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x30}, requireSrcFloat: true, requireDstFloat: true},
  1431  	// https://www.felixcloutier.com/x86/pmovzx
  1432  	PMOVZXWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x33}, requireSrcFloat: true, requireDstFloat: true},
  1433  	// https://www.felixcloutier.com/x86/pmovzx
  1434  	PMOVZXDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x35}, requireSrcFloat: true, requireDstFloat: true},
  1435  	// https://www.felixcloutier.com/x86/pmulhw
  1436  	PMULHW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe5}, requireSrcFloat: true, requireDstFloat: true},
  1437  	// https://www.felixcloutier.com/x86/cmpps
  1438  	CMPEQPS: {opcode: []byte{0x0f, 0xc2}, requireSrcFloat: true, requireDstFloat: true, needArg: true},
  1439  	// https://www.felixcloutier.com/x86/cmppd
  1440  	CMPEQPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc2}, requireSrcFloat: true, requireDstFloat: true, needArg: true},
  1441  	// https://www.felixcloutier.com/x86/cvttps2dq
  1442  	CVTTPS2DQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5b}, requireSrcFloat: true, requireDstFloat: true},
  1443  	// https://www.felixcloutier.com/x86/cvtdq2ps
  1444  	CVTDQ2PS: {opcode: []byte{0x0f, 0x5b}, requireSrcFloat: true, requireDstFloat: true},
  1445  	// https://www.felixcloutier.com/x86/cvtdq2pd
  1446  	CVTDQ2PD: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xe6}, requireSrcFloat: true, requireDstFloat: true},
  1447  	// https://www.felixcloutier.com/x86/cvtpd2ps
  1448  	CVTPD2PS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5a}, requireSrcFloat: true, requireDstFloat: true},
  1449  	// https://www.felixcloutier.com/x86/cvtps2pd
  1450  	CVTPS2PD: {opcode: []byte{0x0f, 0x5a}, requireSrcFloat: true, requireDstFloat: true},
  1451  	// https://www.felixcloutier.com/x86/movupd
  1452  	MOVUPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x10}, requireSrcFloat: true, requireDstFloat: true},
  1453  	// https://www.felixcloutier.com/x86/shufps
  1454  	SHUFPS: {opcode: []byte{0x0f, 0xc6}, requireSrcFloat: true, requireDstFloat: true, needArg: true},
  1455  	// https://www.felixcloutier.com/x86/pmaddwd
  1456  	PMADDWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf5}, requireSrcFloat: true, requireDstFloat: true},
  1457  	// https://www.felixcloutier.com/x86/unpcklps
  1458  	UNPCKLPS: {opcode: []byte{0x0f, 0x14}, requireSrcFloat: true, requireDstFloat: true},
  1459  	// https://www.felixcloutier.com/x86/packuswb
  1460  	PACKUSWB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x67}, requireSrcFloat: true, requireDstFloat: true},
  1461  	// https://www.felixcloutier.com/x86/packsswb:packssdw
  1462  	PACKSSDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x6b}, requireSrcFloat: true, requireDstFloat: true},
  1463  	// https://www.felixcloutier.com/x86/packusdw
  1464  	PACKUSDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x2b}, requireSrcFloat: true, requireDstFloat: true},
  1465  	// https://www.felixcloutier.com/x86/pmaddubsw
  1466  	PMADDUBSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x04}, requireSrcFloat: true, requireDstFloat: true},
  1467  	// https://www.felixcloutier.com/x86/cvttpd2dq
  1468  	CVTTPD2DQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe6}, requireDstFloat: true, requireSrcFloat: true},
  1469  }
  1470  
  1471  var RegisterToRegisterShiftOpcode = map[asm.Instruction]struct {
  1472  	opcode         []byte
  1473  	rPrefix        RexPrefix
  1474  	modRMExtension byte
  1475  }{
  1476  	// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
  1477  	ROLL: {opcode: []byte{0xd3}},
  1478  	ROLQ: {opcode: []byte{0xd3}, rPrefix: RexPrefixW},
  1479  	RORL: {opcode: []byte{0xd3}, modRMExtension: 0b00_001_000},
  1480  	RORQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_001_000, rPrefix: RexPrefixW},
  1481  	// https://www.felixcloutier.com/x86/sal:sar:shl:shr
  1482  	SARL: {opcode: []byte{0xd3}, modRMExtension: 0b00_111_000},
  1483  	SARQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_111_000, rPrefix: RexPrefixW},
  1484  	SHLL: {opcode: []byte{0xd3}, modRMExtension: 0b00_100_000},
  1485  	SHLQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_100_000, rPrefix: RexPrefixW},
  1486  	SHRL: {opcode: []byte{0xd3}, modRMExtension: 0b00_101_000},
  1487  	SHRQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_101_000, rPrefix: RexPrefixW},
  1488  }
  1489  
  1490  type registerToRegisterMOVOpcode struct {
  1491  	opcode          []byte
  1492  	mandatoryPrefix byte
  1493  	srcOnModRMReg   bool
  1494  	rPrefix         RexPrefix
  1495  }
  1496  
  1497  var registerToRegisterMOVOpcodes = map[asm.Instruction]struct {
  1498  	i2i, i2f, f2i, f2f registerToRegisterMOVOpcode
  1499  }{
  1500  	MOVL: {
  1501  		// https://www.felixcloutier.com/x86/mov
  1502  		i2i: registerToRegisterMOVOpcode{opcode: []byte{0x89}, srcOnModRMReg: true},
  1503  		// https://www.felixcloutier.com/x86/movd:movq
  1504  		i2f: registerToRegisterMOVOpcode{opcode: []byte{0x0f, 0x6e}, mandatoryPrefix: 0x66, srcOnModRMReg: false},
  1505  		f2i: registerToRegisterMOVOpcode{opcode: []byte{0x0f, 0x7e}, mandatoryPrefix: 0x66, srcOnModRMReg: true},
  1506  	},
  1507  	MOVQ: {
  1508  		// https://www.felixcloutier.com/x86/mov
  1509  		i2i: registerToRegisterMOVOpcode{opcode: []byte{0x89}, srcOnModRMReg: true, rPrefix: RexPrefixW},
  1510  		// https://www.felixcloutier.com/x86/movd:movq
  1511  		i2f: registerToRegisterMOVOpcode{opcode: []byte{0x0f, 0x6e}, mandatoryPrefix: 0x66, srcOnModRMReg: false, rPrefix: RexPrefixW},
  1512  		f2i: registerToRegisterMOVOpcode{opcode: []byte{0x0f, 0x7e}, mandatoryPrefix: 0x66, srcOnModRMReg: true, rPrefix: RexPrefixW},
  1513  		// https://www.felixcloutier.com/x86/movq
  1514  		f2f: registerToRegisterMOVOpcode{opcode: []byte{0x0f, 0x7e}, mandatoryPrefix: 0xf3},
  1515  	},
  1516  }
  1517  
  1518  func (a *AssemblerImpl) encodeRegisterToRegister(n *nodeImpl) (err error) {
  1519  	// Alias for readability
  1520  	inst := n.instruction
  1521  
  1522  	if op, ok := registerToRegisterMOVOpcodes[inst]; ok {
  1523  		var opcode registerToRegisterMOVOpcode
  1524  		srcIsFloat, dstIsFloat := IsVectorRegister(n.srcReg), IsVectorRegister(n.dstReg)
  1525  		if srcIsFloat && dstIsFloat {
  1526  			if inst == MOVL {
  1527  				return errors.New("MOVL for float to float is undefined")
  1528  			}
  1529  			opcode = op.f2f
  1530  		} else if srcIsFloat && !dstIsFloat {
  1531  			opcode = op.f2i
  1532  		} else if !srcIsFloat && dstIsFloat {
  1533  			opcode = op.i2f
  1534  		} else {
  1535  			opcode = op.i2i
  1536  		}
  1537  
  1538  		rexPrefix, modRM, err := n.GetRegisterToRegisterModRM(opcode.srcOnModRMReg)
  1539  		if err != nil {
  1540  			return err
  1541  		}
  1542  		rexPrefix |= opcode.rPrefix
  1543  
  1544  		if opcode.mandatoryPrefix != 0 {
  1545  			a.buf.WriteByte(opcode.mandatoryPrefix)
  1546  		}
  1547  
  1548  		if rexPrefix != RexPrefixNone {
  1549  			a.buf.WriteByte(rexPrefix)
  1550  		}
  1551  		a.buf.Write(opcode.opcode)
  1552  
  1553  		a.buf.WriteByte(modRM)
  1554  		return nil
  1555  	} else if op, ok := registerToRegisterOpcode[inst]; ok {
  1556  		srcIsFloat, dstIsFloat := IsVectorRegister(n.srcReg), IsVectorRegister(n.dstReg)
  1557  		if op.requireSrcFloat && !srcIsFloat {
  1558  			return fmt.Errorf("%s require float src register but got %s", InstructionName(inst), RegisterName(n.srcReg))
  1559  		} else if op.requireDstFloat && !dstIsFloat {
  1560  			return fmt.Errorf("%s require float dst register but got %s", InstructionName(inst), RegisterName(n.dstReg))
  1561  		} else if !op.requireSrcFloat && srcIsFloat {
  1562  			return fmt.Errorf("%s require integer src register but got %s", InstructionName(inst), RegisterName(n.srcReg))
  1563  		} else if !op.requireDstFloat && dstIsFloat {
  1564  			return fmt.Errorf("%s require integer dst register but got %s", InstructionName(inst), RegisterName(n.dstReg))
  1565  		}
  1566  
  1567  		rexPrefix, modRM, err := n.GetRegisterToRegisterModRM(op.srcOnModRMReg)
  1568  		if err != nil {
  1569  			return err
  1570  		}
  1571  		rexPrefix |= op.rPrefix
  1572  
  1573  		if op.isSrc8bit && RegSP <= n.srcReg && n.srcReg <= RegDI {
  1574  			// If an operand register is 8-bit length of SP, BP, DI, or SI register, we need to have the default prefix.
  1575  			// https: //wiki.osdev.org/X86-64_Instruction_Encoding#Registers
  1576  			rexPrefix |= RexPrefixDefault
  1577  		}
  1578  
  1579  		if op.mandatoryPrefix != 0 {
  1580  			a.buf.WriteByte(op.mandatoryPrefix)
  1581  		}
  1582  
  1583  		if rexPrefix != RexPrefixNone {
  1584  			a.buf.WriteByte(rexPrefix)
  1585  		}
  1586  		a.buf.Write(op.opcode)
  1587  
  1588  		a.buf.WriteByte(modRM)
  1589  
  1590  		if op.needArg {
  1591  			a.WriteConst(int64(n.arg), 8)
  1592  		}
  1593  		return nil
  1594  	} else if op, ok := RegisterToRegisterShiftOpcode[inst]; ok {
  1595  		if n.srcReg != RegCX {
  1596  			return fmt.Errorf("shifting instruction %s require CX register as src but got %s", InstructionName(inst), RegisterName(n.srcReg))
  1597  		} else if IsVectorRegister(n.dstReg) {
  1598  			return fmt.Errorf("shifting instruction %s require integer register as dst but got %s", InstructionName(inst), RegisterName(n.srcReg))
  1599  		}
  1600  
  1601  		reg3bits, rexPrefix, err := register3bits(n.dstReg, registerSpecifierPositionModRMFieldRM)
  1602  		if err != nil {
  1603  			return err
  1604  		}
  1605  
  1606  		rexPrefix |= op.rPrefix
  1607  		if rexPrefix != RexPrefixNone {
  1608  			a.buf.WriteByte(rexPrefix)
  1609  		}
  1610  
  1611  		// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  1612  		modRM := 0b11_000_000 |
  1613  			(op.modRMExtension) |
  1614  			reg3bits
  1615  		a.buf.Write(append(op.opcode, modRM))
  1616  		return nil
  1617  	} else {
  1618  		return errorEncodingUnsupported(n)
  1619  	}
  1620  }
  1621  
  1622  func (a *AssemblerImpl) encodeRegisterToMemory(n *nodeImpl) (err error) {
  1623  	rexPrefix, modRM, sbi, displacementWidth, err := n.GetMemoryLocation()
  1624  	if err != nil {
  1625  		return err
  1626  	}
  1627  
  1628  	var opcode []byte
  1629  	var mandatoryPrefix byte
  1630  	var isShiftInstruction bool
  1631  	var needArg bool
  1632  	switch n.instruction {
  1633  	case CMPL:
  1634  		// https://www.felixcloutier.com/x86/cmp
  1635  		opcode = []byte{0x3b}
  1636  	case CMPQ:
  1637  		// https://www.felixcloutier.com/x86/cmp
  1638  		rexPrefix |= RexPrefixW
  1639  		opcode = []byte{0x3b}
  1640  	case MOVB:
  1641  		// https://www.felixcloutier.com/x86/mov
  1642  		opcode = []byte{0x88}
  1643  		// 1 byte register operands need default prefix for the following registers.
  1644  		if n.srcReg >= RegSP && n.srcReg <= RegDI {
  1645  			rexPrefix |= RexPrefixDefault
  1646  		}
  1647  	case MOVL:
  1648  		if IsVectorRegister(n.srcReg) {
  1649  			// https://www.felixcloutier.com/x86/movd:movq
  1650  			opcode = []byte{0x0f, 0x7e}
  1651  			mandatoryPrefix = 0x66
  1652  		} else {
  1653  			// https://www.felixcloutier.com/x86/mov
  1654  			opcode = []byte{0x89}
  1655  		}
  1656  	case MOVQ:
  1657  		if IsVectorRegister(n.srcReg) {
  1658  			// https://www.felixcloutier.com/x86/movq
  1659  			opcode = []byte{0x0f, 0xd6}
  1660  			mandatoryPrefix = 0x66
  1661  		} else {
  1662  			// https://www.felixcloutier.com/x86/mov
  1663  			rexPrefix |= RexPrefixW
  1664  			opcode = []byte{0x89}
  1665  		}
  1666  	case MOVW:
  1667  		// https://www.felixcloutier.com/x86/mov
  1668  		// Note: Need 0x66 to indicate that the operand size is 16-bit.
  1669  		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Operand-size_and_address-size_override_prefix
  1670  		mandatoryPrefix = 0x66
  1671  		opcode = []byte{0x89}
  1672  	case SARL:
  1673  		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
  1674  		modRM |= 0b00_111_000
  1675  		opcode = []byte{0xd3}
  1676  		isShiftInstruction = true
  1677  	case SARQ:
  1678  		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
  1679  		rexPrefix |= RexPrefixW
  1680  		modRM |= 0b00_111_000
  1681  		opcode = []byte{0xd3}
  1682  		isShiftInstruction = true
  1683  	case SHLL:
  1684  		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
  1685  		modRM |= 0b00_100_000
  1686  		opcode = []byte{0xd3}
  1687  		isShiftInstruction = true
  1688  	case SHLQ:
  1689  		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
  1690  		rexPrefix |= RexPrefixW
  1691  		modRM |= 0b00_100_000
  1692  		opcode = []byte{0xd3}
  1693  		isShiftInstruction = true
  1694  	case SHRL:
  1695  		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
  1696  		modRM |= 0b00_101_000
  1697  		opcode = []byte{0xd3}
  1698  		isShiftInstruction = true
  1699  	case SHRQ:
  1700  		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
  1701  		rexPrefix |= RexPrefixW
  1702  		modRM |= 0b00_101_000
  1703  		opcode = []byte{0xd3}
  1704  		isShiftInstruction = true
  1705  	case ROLL:
  1706  		// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
  1707  		opcode = []byte{0xd3}
  1708  		isShiftInstruction = true
  1709  	case ROLQ:
  1710  		// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
  1711  		rexPrefix |= RexPrefixW
  1712  		opcode = []byte{0xd3}
  1713  		isShiftInstruction = true
  1714  	case RORL:
  1715  		// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
  1716  		modRM |= 0b00_001_000
  1717  		opcode = []byte{0xd3}
  1718  		isShiftInstruction = true
  1719  	case RORQ:
  1720  		// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
  1721  		rexPrefix |= RexPrefixW
  1722  		opcode = []byte{0xd3}
  1723  		modRM |= 0b00_001_000
  1724  		isShiftInstruction = true
  1725  	case MOVDQU:
  1726  		// https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
  1727  		mandatoryPrefix = 0xf3
  1728  		opcode = []byte{0x0f, 0x7f}
  1729  	case PEXTRB: // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
  1730  		mandatoryPrefix = 0x66
  1731  		opcode = []byte{0x0f, 0x3a, 0x14}
  1732  		needArg = true
  1733  	case PEXTRW: // https://www.felixcloutier.com/x86/pextrw
  1734  		mandatoryPrefix = 0x66
  1735  		opcode = []byte{0x0f, 0x3a, 0x15}
  1736  		needArg = true
  1737  	case PEXTRD: // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
  1738  		mandatoryPrefix = 0x66
  1739  		opcode = []byte{0x0f, 0x3a, 0x16}
  1740  		needArg = true
  1741  	case PEXTRQ: // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
  1742  		mandatoryPrefix = 0x66
  1743  		rexPrefix |= RexPrefixW // REX.W
  1744  		opcode = []byte{0x0f, 0x3a, 0x16}
  1745  		needArg = true
  1746  	default:
  1747  		return errorEncodingUnsupported(n)
  1748  	}
  1749  
  1750  	if !isShiftInstruction {
  1751  		srcReg3Bits, prefix, err := register3bits(n.srcReg, registerSpecifierPositionModRMFieldReg)
  1752  		if err != nil {
  1753  			return err
  1754  		}
  1755  
  1756  		rexPrefix |= prefix
  1757  		modRM |= srcReg3Bits << 3 // Place the source register on ModRM:reg
  1758  	} else {
  1759  		if n.srcReg != RegCX {
  1760  			return fmt.Errorf("shifting instruction %s require CX register as src but got %s", InstructionName(n.instruction), RegisterName(n.srcReg))
  1761  		}
  1762  	}
  1763  
  1764  	if mandatoryPrefix != 0 {
  1765  		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Mandatory_prefix
  1766  		a.buf.WriteByte(mandatoryPrefix)
  1767  	}
  1768  
  1769  	if rexPrefix != RexPrefixNone {
  1770  		a.buf.WriteByte(rexPrefix)
  1771  	}
  1772  
  1773  	a.buf.Write(opcode)
  1774  
  1775  	a.buf.WriteByte(modRM)
  1776  
  1777  	if sbi != nil {
  1778  		a.buf.WriteByte(*sbi)
  1779  	}
  1780  
  1781  	if displacementWidth != 0 {
  1782  		a.WriteConst(n.dstConst, displacementWidth)
  1783  	}
  1784  
  1785  	if needArg {
  1786  		a.WriteConst(int64(n.arg), 8)
  1787  	}
  1788  	return
  1789  }
  1790  
  1791  func (a *AssemblerImpl) encodeRegisterToConst(n *nodeImpl) (err error) {
  1792  	regBits, prefix, err := register3bits(n.srcReg, registerSpecifierPositionModRMFieldRM)
  1793  	if err != nil {
  1794  		return err
  1795  	}
  1796  
  1797  	switch n.instruction {
  1798  	case CMPL, CMPQ:
  1799  		if n.instruction == CMPQ {
  1800  			prefix |= RexPrefixW
  1801  		}
  1802  		if prefix != RexPrefixNone {
  1803  			a.buf.WriteByte(prefix)
  1804  		}
  1805  		is8bitConst := fitInSigned8bit(n.dstConst)
  1806  		// https://www.felixcloutier.com/x86/cmp
  1807  		if n.srcReg == RegAX && !is8bitConst {
  1808  			a.buf.Write([]byte{0x3d})
  1809  		} else {
  1810  			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  1811  			modRM := 0b11_000_000 | // Specifying that opeand is register.
  1812  				0b00_111_000 | // CMP with immediate needs "/7" extension.
  1813  				regBits
  1814  			if is8bitConst {
  1815  				a.buf.Write([]byte{0x83, modRM})
  1816  			} else {
  1817  				a.buf.Write([]byte{0x81, modRM})
  1818  			}
  1819  		}
  1820  	default:
  1821  		err = errorEncodingUnsupported(n)
  1822  	}
  1823  
  1824  	if fitInSigned8bit(n.dstConst) {
  1825  		a.WriteConst(n.dstConst, 8)
  1826  	} else {
  1827  		a.WriteConst(n.dstConst, 32)
  1828  	}
  1829  	return
  1830  }
  1831  
  1832  func (a *AssemblerImpl) encodeReadInstructionAddress(n *nodeImpl) error {
  1833  	dstReg3Bits, rexPrefix, err := register3bits(n.dstReg, registerSpecifierPositionModRMFieldReg)
  1834  	if err != nil {
  1835  		return err
  1836  	}
  1837  
  1838  	a.AddOnGenerateCallBack(func(code []byte) error {
  1839  		// Find the target instruction node.
  1840  		targetNode := n
  1841  		for ; targetNode != nil; targetNode = targetNode.next {
  1842  			if targetNode.instruction == n.readInstructionAddressBeforeTargetInstruction {
  1843  				targetNode = targetNode.next
  1844  				break
  1845  			}
  1846  		}
  1847  
  1848  		if targetNode == nil {
  1849  			return errors.New("BUG: target instruction not found for read instruction address")
  1850  		}
  1851  
  1852  		offset := targetNode.OffsetInBinary() - (n.OffsetInBinary() + 7 /* 7 = the length of the LEAQ instruction */)
  1853  		if offset >= math.MaxInt32 {
  1854  			return errors.New("BUG: too large offset for LEAQ instruction")
  1855  		}
  1856  
  1857  		binary.LittleEndian.PutUint32(code[n.OffsetInBinary()+3:], uint32(int32(offset)))
  1858  		return nil
  1859  	})
  1860  
  1861  	// https://www.felixcloutier.com/x86/lea
  1862  	opcode := byte(0x8d)
  1863  	rexPrefix |= RexPrefixW
  1864  
  1865  	// https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing
  1866  	modRM := 0b00_000_101 | // Indicate "LEAQ [RIP + 32bit displacement], dstReg" encoding.
  1867  		(dstReg3Bits << 3) // Place the dstReg on ModRM:reg.
  1868  
  1869  	a.buf.Write([]byte{rexPrefix, opcode, modRM})
  1870  	a.WriteConst(int64(0), 32) // Preserve
  1871  	return nil
  1872  }
  1873  
  1874  func (a *AssemblerImpl) encodeMemoryToRegister(n *nodeImpl) (err error) {
  1875  	if n.instruction == LEAQ && n.readInstructionAddressBeforeTargetInstruction != NONE {
  1876  		return a.encodeReadInstructionAddress(n)
  1877  	}
  1878  
  1879  	rexPrefix, modRM, sbi, displacementWidth, err := n.GetMemoryLocation()
  1880  	if err != nil {
  1881  		return err
  1882  	}
  1883  
  1884  	dstReg3Bits, prefix, err := register3bits(n.dstReg, registerSpecifierPositionModRMFieldReg)
  1885  	if err != nil {
  1886  		return err
  1887  	}
  1888  
  1889  	rexPrefix |= prefix
  1890  	modRM |= dstReg3Bits << 3 // Place the destination register on ModRM:reg
  1891  
  1892  	var mandatoryPrefix byte
  1893  	var opcode []byte
  1894  	var needArg bool
  1895  	switch n.instruction {
  1896  	case ADDL:
  1897  		// https://www.felixcloutier.com/x86/add
  1898  		opcode = []byte{0x03}
  1899  	case ADDQ:
  1900  		// https://www.felixcloutier.com/x86/add
  1901  		rexPrefix |= RexPrefixW
  1902  		opcode = []byte{0x03}
  1903  	case CMPL:
  1904  		// https://www.felixcloutier.com/x86/cmp
  1905  		opcode = []byte{0x39}
  1906  	case CMPQ:
  1907  		// https://www.felixcloutier.com/x86/cmp
  1908  		rexPrefix |= RexPrefixW
  1909  		opcode = []byte{0x39}
  1910  	case LEAQ:
  1911  		// https://www.felixcloutier.com/x86/lea
  1912  		rexPrefix |= RexPrefixW
  1913  		opcode = []byte{0x8d}
  1914  	case MOVBLSX:
  1915  		// https://www.felixcloutier.com/x86/movsx:movsxd
  1916  		opcode = []byte{0x0f, 0xbe}
  1917  	case MOVBLZX:
  1918  		// https://www.felixcloutier.com/x86/movzx
  1919  		opcode = []byte{0x0f, 0xb6}
  1920  	case MOVBQSX:
  1921  		// https://www.felixcloutier.com/x86/movsx:movsxd
  1922  		rexPrefix |= RexPrefixW
  1923  		opcode = []byte{0x0f, 0xbe}
  1924  	case MOVBQZX:
  1925  		// https://www.felixcloutier.com/x86/movzx
  1926  		rexPrefix |= RexPrefixW
  1927  		opcode = []byte{0x0f, 0xb6}
  1928  	case MOVLQSX:
  1929  		// https://www.felixcloutier.com/x86/movsx:movsxd
  1930  		rexPrefix |= RexPrefixW
  1931  		opcode = []byte{0x63}
  1932  	case MOVLQZX:
  1933  		// https://www.felixcloutier.com/x86/mov
  1934  		// Note: MOVLQZX means zero extending 32bit reg to 64-bit reg and
  1935  		// that is semantically equivalent to MOV 32bit to 32bit.
  1936  		opcode = []byte{0x8B}
  1937  	case MOVL:
  1938  		// https://www.felixcloutier.com/x86/mov
  1939  		// Note: MOVLQZX means zero extending 32bit reg to 64-bit reg and
  1940  		// that is semantically equivalent to MOV 32bit to 32bit.
  1941  		if IsVectorRegister(n.dstReg) {
  1942  			// https://www.felixcloutier.com/x86/movd:movq
  1943  			opcode = []byte{0x0f, 0x6e}
  1944  			mandatoryPrefix = 0x66
  1945  		} else {
  1946  			// https://www.felixcloutier.com/x86/mov
  1947  			opcode = []byte{0x8B}
  1948  		}
  1949  	case MOVQ:
  1950  		if IsVectorRegister(n.dstReg) {
  1951  			// https://www.felixcloutier.com/x86/movq
  1952  			opcode = []byte{0x0f, 0x7e}
  1953  			mandatoryPrefix = 0xf3
  1954  		} else {
  1955  			// https://www.felixcloutier.com/x86/mov
  1956  			rexPrefix |= RexPrefixW
  1957  			opcode = []byte{0x8B}
  1958  		}
  1959  	case MOVWLSX:
  1960  		// https://www.felixcloutier.com/x86/movsx:movsxd
  1961  		opcode = []byte{0x0f, 0xbf}
  1962  	case MOVWLZX:
  1963  		// https://www.felixcloutier.com/x86/movzx
  1964  		opcode = []byte{0x0f, 0xb7}
  1965  	case MOVWQSX:
  1966  		// https://www.felixcloutier.com/x86/movsx:movsxd
  1967  		rexPrefix |= RexPrefixW
  1968  		opcode = []byte{0x0f, 0xbf}
  1969  	case MOVWQZX:
  1970  		// https://www.felixcloutier.com/x86/movzx
  1971  		rexPrefix |= RexPrefixW
  1972  		opcode = []byte{0x0f, 0xb7}
  1973  	case SUBQ:
  1974  		// https://www.felixcloutier.com/x86/sub
  1975  		rexPrefix |= RexPrefixW
  1976  		opcode = []byte{0x2b}
  1977  	case SUBSD:
  1978  		// https://www.felixcloutier.com/x86/subsd
  1979  		opcode = []byte{0x0f, 0x5c}
  1980  		mandatoryPrefix = 0xf2
  1981  	case SUBSS:
  1982  		// https://www.felixcloutier.com/x86/subss
  1983  		opcode = []byte{0x0f, 0x5c}
  1984  		mandatoryPrefix = 0xf3
  1985  	case UCOMISD:
  1986  		// https://www.felixcloutier.com/x86/ucomisd
  1987  		opcode = []byte{0x0f, 0x2e}
  1988  		mandatoryPrefix = 0x66
  1989  	case UCOMISS:
  1990  		// https://www.felixcloutier.com/x86/ucomiss
  1991  		opcode = []byte{0x0f, 0x2e}
  1992  	case MOVDQU:
  1993  		// https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
  1994  		mandatoryPrefix = 0xf3
  1995  		opcode = []byte{0x0f, 0x6f}
  1996  	case PMOVSXBW: // https://www.felixcloutier.com/x86/pmovsx
  1997  		mandatoryPrefix = 0x66
  1998  		opcode = []byte{0x0f, 0x38, 0x20}
  1999  	case PMOVSXWD: // https://www.felixcloutier.com/x86/pmovsx
  2000  		mandatoryPrefix = 0x66
  2001  		opcode = []byte{0x0f, 0x38, 0x23}
  2002  	case PMOVSXDQ: // https://www.felixcloutier.com/x86/pmovsx
  2003  		mandatoryPrefix = 0x66
  2004  		opcode = []byte{0x0f, 0x38, 0x25}
  2005  	case PMOVZXBW: // https://www.felixcloutier.com/x86/pmovzx
  2006  		mandatoryPrefix = 0x66
  2007  		opcode = []byte{0x0f, 0x38, 0x30}
  2008  	case PMOVZXWD: // https://www.felixcloutier.com/x86/pmovzx
  2009  		mandatoryPrefix = 0x66
  2010  		opcode = []byte{0x0f, 0x38, 0x33}
  2011  	case PMOVZXDQ: // https://www.felixcloutier.com/x86/pmovzx
  2012  		mandatoryPrefix = 0x66
  2013  		opcode = []byte{0x0f, 0x38, 0x35}
  2014  	case PINSRB: // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
  2015  		mandatoryPrefix = 0x66
  2016  		opcode = []byte{0x0f, 0x3a, 0x20}
  2017  		needArg = true
  2018  	case PINSRW: // https://www.felixcloutier.com/x86/pinsrw
  2019  		mandatoryPrefix = 0x66
  2020  		opcode = []byte{0x0f, 0xc4}
  2021  		needArg = true
  2022  	case PINSRD: // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
  2023  		mandatoryPrefix = 0x66
  2024  		opcode = []byte{0x0f, 0x3a, 0x22}
  2025  		needArg = true
  2026  	case PINSRQ: // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
  2027  		rexPrefix |= RexPrefixW
  2028  		mandatoryPrefix = 0x66
  2029  		opcode = []byte{0x0f, 0x3a, 0x22}
  2030  		needArg = true
  2031  	default:
  2032  		return errorEncodingUnsupported(n)
  2033  	}
  2034  
  2035  	if mandatoryPrefix != 0 {
  2036  		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Mandatory_prefix
  2037  		a.buf.WriteByte(mandatoryPrefix)
  2038  	}
  2039  
  2040  	if rexPrefix != RexPrefixNone {
  2041  		a.buf.WriteByte(rexPrefix)
  2042  	}
  2043  
  2044  	a.buf.Write(opcode)
  2045  
  2046  	a.buf.WriteByte(modRM)
  2047  
  2048  	if sbi != nil {
  2049  		a.buf.WriteByte(*sbi)
  2050  	}
  2051  
  2052  	if displacementWidth != 0 {
  2053  		a.WriteConst(n.srcConst, displacementWidth)
  2054  	}
  2055  
  2056  	if needArg {
  2057  		a.WriteConst(int64(n.arg), 8)
  2058  	}
  2059  	return
  2060  }
  2061  
  2062  func (a *AssemblerImpl) encodeConstToRegister(n *nodeImpl) (err error) {
  2063  	regBits, rexPrefix, err := register3bits(n.dstReg, registerSpecifierPositionModRMFieldRM)
  2064  	if err != nil {
  2065  		return err
  2066  	}
  2067  
  2068  	isFloatReg := IsVectorRegister(n.dstReg)
  2069  	switch n.instruction {
  2070  	case PSLLD, PSLLQ, PSRLD, PSRLQ, PSRAW, PSRLW, PSLLW, PSRAD:
  2071  		if !isFloatReg {
  2072  			return fmt.Errorf("%s needs float register but got %s", InstructionName(n.instruction), RegisterName(n.dstReg))
  2073  		}
  2074  	default:
  2075  		if isFloatReg {
  2076  			return fmt.Errorf("%s needs int register but got %s", InstructionName(n.instruction), RegisterName(n.dstReg))
  2077  		}
  2078  	}
  2079  
  2080  	if n.instruction != MOVQ && !FitIn32bit(n.srcConst) {
  2081  		return fmt.Errorf("constant must fit in 32-bit integer for %s, but got %d", InstructionName(n.instruction), n.srcConst)
  2082  	} else if (n.instruction == SHLQ || n.instruction == SHRQ) && (n.srcConst < 0 || n.srcConst > math.MaxUint8) {
  2083  		return fmt.Errorf("constant must fit in positive 8-bit integer for %s, but got %d", InstructionName(n.instruction), n.srcConst)
  2084  	} else if (n.instruction == PSLLD ||
  2085  		n.instruction == PSLLQ ||
  2086  		n.instruction == PSRLD ||
  2087  		n.instruction == PSRLQ) && (n.srcConst < math.MinInt8 || n.srcConst > math.MaxInt8) {
  2088  		return fmt.Errorf("constant must fit in signed 8-bit integer for %s, but got %d", InstructionName(n.instruction), n.srcConst)
  2089  	}
  2090  
  2091  	isSigned8bitConst := fitInSigned8bit(n.srcConst)
  2092  	switch inst := n.instruction; inst {
  2093  	case ADDQ:
  2094  		// https://www.felixcloutier.com/x86/add
  2095  		rexPrefix |= RexPrefixW
  2096  		if n.dstReg == RegAX && !isSigned8bitConst {
  2097  			a.buf.Write([]byte{rexPrefix, 0x05})
  2098  		} else {
  2099  			modRM := 0b11_000_000 | // Specifying that opeand is register.
  2100  				regBits
  2101  			if isSigned8bitConst {
  2102  				a.buf.Write([]byte{rexPrefix, 0x83, modRM})
  2103  			} else {
  2104  				a.buf.Write([]byte{rexPrefix, 0x81, modRM})
  2105  			}
  2106  		}
  2107  		if isSigned8bitConst {
  2108  			a.WriteConst(n.srcConst, 8)
  2109  		} else {
  2110  			a.WriteConst(n.srcConst, 32)
  2111  		}
  2112  	case ANDQ:
  2113  		// https://www.felixcloutier.com/x86/and
  2114  		rexPrefix |= RexPrefixW
  2115  		if n.dstReg == RegAX && !isSigned8bitConst {
  2116  			a.buf.Write([]byte{rexPrefix, 0x25})
  2117  		} else {
  2118  			modRM := 0b11_000_000 | // Specifying that opeand is register.
  2119  				0b00_100_000 | // AND with immediate needs "/4" extension.
  2120  				regBits
  2121  			if isSigned8bitConst {
  2122  				a.buf.Write([]byte{rexPrefix, 0x83, modRM})
  2123  			} else {
  2124  				a.buf.Write([]byte{rexPrefix, 0x81, modRM})
  2125  			}
  2126  		}
  2127  		if fitInSigned8bit(n.srcConst) {
  2128  			a.WriteConst(n.srcConst, 8)
  2129  		} else {
  2130  			a.WriteConst(n.srcConst, 32)
  2131  		}
  2132  	case TESTQ:
  2133  		// https://www.felixcloutier.com/x86/test
  2134  		rexPrefix |= RexPrefixW
  2135  		if n.dstReg == RegAX && !isSigned8bitConst {
  2136  			a.buf.Write([]byte{rexPrefix, 0xa9})
  2137  		} else {
  2138  			modRM := 0b11_000_000 | // Specifying that operand is register
  2139  				regBits
  2140  			a.buf.Write([]byte{rexPrefix, 0xf7, modRM})
  2141  		}
  2142  		a.WriteConst(n.srcConst, 32)
  2143  	case MOVL:
  2144  		// https://www.felixcloutier.com/x86/mov
  2145  		if rexPrefix != RexPrefixNone {
  2146  			a.buf.WriteByte(rexPrefix)
  2147  		}
  2148  		a.buf.Write([]byte{0xb8 | regBits})
  2149  		a.WriteConst(n.srcConst, 32)
  2150  	case MOVQ:
  2151  		// https://www.felixcloutier.com/x86/mov
  2152  		if FitIn32bit(n.srcConst) {
  2153  			if n.srcConst > math.MaxInt32 {
  2154  				if rexPrefix != RexPrefixNone {
  2155  					a.buf.WriteByte(rexPrefix)
  2156  				}
  2157  				a.buf.Write([]byte{0xb8 | regBits})
  2158  			} else {
  2159  				rexPrefix |= RexPrefixW
  2160  				modRM := 0b11_000_000 | // Specifying that opeand is register.
  2161  					regBits
  2162  				a.buf.Write([]byte{rexPrefix, 0xc7, modRM})
  2163  			}
  2164  			a.WriteConst(n.srcConst, 32)
  2165  		} else {
  2166  			rexPrefix |= RexPrefixW
  2167  			a.buf.Write([]byte{rexPrefix, 0xb8 | regBits})
  2168  			a.WriteConst(n.srcConst, 64)
  2169  		}
  2170  	case SHLQ:
  2171  		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
  2172  		rexPrefix |= RexPrefixW
  2173  		modRM := 0b11_000_000 | // Specifying that opeand is register.
  2174  			0b00_100_000 | // SHL with immediate needs "/4" extension.
  2175  			regBits
  2176  		if n.srcConst == 1 {
  2177  			a.buf.Write([]byte{rexPrefix, 0xd1, modRM})
  2178  		} else {
  2179  			a.buf.Write([]byte{rexPrefix, 0xc1, modRM})
  2180  			a.WriteConst(n.srcConst, 8)
  2181  		}
  2182  	case SHRQ:
  2183  		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
  2184  		rexPrefix |= RexPrefixW
  2185  		modRM := 0b11_000_000 | // Specifying that opeand is register.
  2186  			0b00_101_000 | // SHR with immediate needs "/5" extension.
  2187  			regBits
  2188  		if n.srcConst == 1 {
  2189  			a.buf.Write([]byte{rexPrefix, 0xd1, modRM})
  2190  		} else {
  2191  			a.buf.Write([]byte{rexPrefix, 0xc1, modRM})
  2192  			a.WriteConst(n.srcConst, 8)
  2193  		}
  2194  	case PSLLD:
  2195  		// https://www.felixcloutier.com/x86/psllw:pslld:psllq
  2196  		modRM := 0b11_000_000 | // Specifying that opeand is register.
  2197  			0b00_110_000 | // PSLL with immediate needs "/6" extension.
  2198  			regBits
  2199  		if rexPrefix != RexPrefixNone {
  2200  			a.buf.Write([]byte{0x66, rexPrefix, 0x0f, 0x72, modRM})
  2201  			a.WriteConst(n.srcConst, 8)
  2202  		} else {
  2203  			a.buf.Write([]byte{0x66, 0x0f, 0x72, modRM})
  2204  			a.WriteConst(n.srcConst, 8)
  2205  		}
  2206  	case PSLLQ:
  2207  		// https://www.felixcloutier.com/x86/psllw:pslld:psllq
  2208  		modRM := 0b11_000_000 | // Specifying that opeand is register.
  2209  			0b00_110_000 | // PSLL with immediate needs "/6" extension.
  2210  			regBits
  2211  		if rexPrefix != RexPrefixNone {
  2212  			a.buf.Write([]byte{0x66, rexPrefix, 0x0f, 0x73, modRM})
  2213  			a.WriteConst(n.srcConst, 8)
  2214  		} else {
  2215  			a.buf.Write([]byte{0x66, 0x0f, 0x73, modRM})
  2216  			a.WriteConst(n.srcConst, 8)
  2217  		}
  2218  	case PSRLD:
  2219  		// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
  2220  		// https://www.felixcloutier.com/x86/psllw:pslld:psllq
  2221  		modRM := 0b11_000_000 | // Specifying that operand is register.
  2222  			0b00_010_000 | // PSRL with immediate needs "/2" extension.
  2223  			regBits
  2224  		if rexPrefix != RexPrefixNone {
  2225  			a.buf.Write([]byte{0x66, rexPrefix, 0x0f, 0x72, modRM})
  2226  			a.WriteConst(n.srcConst, 8)
  2227  		} else {
  2228  			a.buf.Write([]byte{0x66, 0x0f, 0x72, modRM})
  2229  			a.WriteConst(n.srcConst, 8)
  2230  		}
  2231  	case PSRLQ:
  2232  		// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
  2233  		modRM := 0b11_000_000 | // Specifying that operand is register.
  2234  			0b00_010_000 | // PSRL with immediate needs "/2" extension.
  2235  			regBits
  2236  		if rexPrefix != RexPrefixNone {
  2237  			a.buf.Write([]byte{0x66, rexPrefix, 0x0f, 0x73, modRM})
  2238  			a.WriteConst(n.srcConst, 8)
  2239  		} else {
  2240  			a.buf.Write([]byte{0x66, 0x0f, 0x73, modRM})
  2241  			a.WriteConst(n.srcConst, 8)
  2242  		}
  2243  	case PSRAW, PSRAD:
  2244  		// https://www.felixcloutier.com/x86/psraw:psrad:psraq
  2245  		modRM := 0b11_000_000 | // Specifying that operand is register.
  2246  			0b00_100_000 | // PSRAW with immediate needs "/4" extension.
  2247  			regBits
  2248  		a.buf.WriteByte(0x66)
  2249  		if rexPrefix != RexPrefixNone {
  2250  			a.buf.WriteByte(rexPrefix)
  2251  		}
  2252  
  2253  		var op byte
  2254  		if inst == PSRAD {
  2255  			op = 0x72
  2256  		} else { // PSRAW
  2257  			op = 0x71
  2258  		}
  2259  
  2260  		a.buf.Write([]byte{0x0f, op, modRM})
  2261  		a.WriteConst(n.srcConst, 8)
  2262  	case PSRLW:
  2263  		// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
  2264  		modRM := 0b11_000_000 | // Specifying that operand is register.
  2265  			0b00_010_000 | // PSRLW with immediate needs "/2" extension.
  2266  			regBits
  2267  		a.buf.WriteByte(0x66)
  2268  		if rexPrefix != RexPrefixNone {
  2269  			a.buf.WriteByte(rexPrefix)
  2270  		}
  2271  		a.buf.Write([]byte{0x0f, 0x71, modRM})
  2272  		a.WriteConst(n.srcConst, 8)
  2273  	case PSLLW:
  2274  		// https://www.felixcloutier.com/x86/psllw:pslld:psllq
  2275  		modRM := 0b11_000_000 | // Specifying that operand is register.
  2276  			0b00_110_000 | // PSLLW with immediate needs "/6" extension.
  2277  			regBits
  2278  		a.buf.WriteByte(0x66)
  2279  		if rexPrefix != RexPrefixNone {
  2280  			a.buf.WriteByte(rexPrefix)
  2281  		}
  2282  		a.buf.Write([]byte{0x0f, 0x71, modRM})
  2283  		a.WriteConst(n.srcConst, 8)
  2284  	case XORL, XORQ:
  2285  		// https://www.felixcloutier.com/x86/xor
  2286  		if inst == XORQ {
  2287  			rexPrefix |= RexPrefixW
  2288  		}
  2289  		if rexPrefix != RexPrefixNone {
  2290  			a.buf.WriteByte(rexPrefix)
  2291  		}
  2292  		if n.dstReg == RegAX && !isSigned8bitConst {
  2293  			a.buf.Write([]byte{0x35})
  2294  		} else {
  2295  			modRM := 0b11_000_000 | // Specifying that opeand is register.
  2296  				0b00_110_000 | // XOR with immediate needs "/6" extension.
  2297  				regBits
  2298  			if isSigned8bitConst {
  2299  				a.buf.Write([]byte{0x83, modRM})
  2300  			} else {
  2301  				a.buf.Write([]byte{0x81, modRM})
  2302  			}
  2303  		}
  2304  		if fitInSigned8bit(n.srcConst) {
  2305  			a.WriteConst(n.srcConst, 8)
  2306  		} else {
  2307  			a.WriteConst(n.srcConst, 32)
  2308  		}
  2309  	default:
  2310  		err = errorEncodingUnsupported(n)
  2311  	}
  2312  	return
  2313  }
  2314  
  2315  func (a *AssemblerImpl) encodeMemoryToConst(n *nodeImpl) (err error) {
  2316  	if !FitIn32bit(n.dstConst) {
  2317  		return fmt.Errorf("too large target const %d for %s", n.dstConst, InstructionName(n.instruction))
  2318  	}
  2319  
  2320  	rexPrefix, modRM, sbi, displacementWidth, err := n.GetMemoryLocation()
  2321  	if err != nil {
  2322  		return err
  2323  	}
  2324  
  2325  	// Alias for readability.
  2326  	c := n.dstConst
  2327  
  2328  	var opcode, constWidth byte
  2329  	switch n.instruction {
  2330  	case CMPL:
  2331  		// https://www.felixcloutier.com/x86/cmp
  2332  		if fitInSigned8bit(c) {
  2333  			opcode = 0x83
  2334  			constWidth = 8
  2335  		} else {
  2336  			opcode = 0x81
  2337  			constWidth = 32
  2338  		}
  2339  		modRM |= 0b00_111_000
  2340  	default:
  2341  		return errorEncodingUnsupported(n)
  2342  	}
  2343  
  2344  	if rexPrefix != RexPrefixNone {
  2345  		a.buf.WriteByte(rexPrefix)
  2346  	}
  2347  
  2348  	a.buf.Write([]byte{opcode, modRM})
  2349  
  2350  	if sbi != nil {
  2351  		a.buf.WriteByte(*sbi)
  2352  	}
  2353  
  2354  	if displacementWidth != 0 {
  2355  		a.WriteConst(n.srcConst, displacementWidth)
  2356  	}
  2357  
  2358  	a.WriteConst(c, constWidth)
  2359  	return
  2360  }
  2361  
  2362  func (a *AssemblerImpl) encodeConstToMemory(n *nodeImpl) (err error) {
  2363  	rexPrefix, modRM, sbi, displacementWidth, err := n.GetMemoryLocation()
  2364  	if err != nil {
  2365  		return err
  2366  	}
  2367  
  2368  	// Alias for readability.
  2369  	inst := n.instruction
  2370  	c := n.srcConst
  2371  
  2372  	if inst == MOVB && !fitInSigned8bit(c) {
  2373  		return fmt.Errorf("too large load target const %d for MOVB", c)
  2374  	} else if !FitIn32bit(c) {
  2375  		return fmt.Errorf("too large load target const %d for %s", c, InstructionName(n.instruction))
  2376  	}
  2377  
  2378  	var constWidth, opcode byte
  2379  	switch inst {
  2380  	case MOVB:
  2381  		opcode = 0xc6
  2382  		constWidth = 8
  2383  	case MOVL:
  2384  		opcode = 0xc7
  2385  		constWidth = 32
  2386  	case MOVQ:
  2387  		rexPrefix |= RexPrefixW
  2388  		opcode = 0xc7
  2389  		constWidth = 32
  2390  	default:
  2391  		return errorEncodingUnsupported(n)
  2392  	}
  2393  
  2394  	if rexPrefix != RexPrefixNone {
  2395  		a.buf.WriteByte(rexPrefix)
  2396  	}
  2397  
  2398  	a.buf.Write([]byte{opcode, modRM})
  2399  
  2400  	if sbi != nil {
  2401  		a.buf.WriteByte(*sbi)
  2402  	}
  2403  
  2404  	if displacementWidth != 0 {
  2405  		a.WriteConst(n.dstConst, displacementWidth)
  2406  	}
  2407  
  2408  	a.WriteConst(c, constWidth)
  2409  	return
  2410  }
  2411  
  2412  func (a *AssemblerImpl) WriteConst(v int64, length byte) {
  2413  	switch length {
  2414  	case 8:
  2415  		a.buf.WriteByte(byte(int8(v)))
  2416  	case 32:
  2417  		// TODO: any way to directly put little endian bytes into bytes.Buffer?
  2418  		offsetBytes := make([]byte, 4)
  2419  		binary.LittleEndian.PutUint32(offsetBytes, uint32(int32(v)))
  2420  		a.buf.Write(offsetBytes)
  2421  	case 64:
  2422  		// TODO: any way to directly put little endian bytes into bytes.Buffer?
  2423  		offsetBytes := make([]byte, 8)
  2424  		binary.LittleEndian.PutUint64(offsetBytes, uint64(v))
  2425  		a.buf.Write(offsetBytes)
  2426  	default:
  2427  		panic("BUG: length must be one of 8, 32 or 64")
  2428  	}
  2429  }
  2430  
  2431  func (n *nodeImpl) GetMemoryLocation() (p RexPrefix, modRM byte, sbi *byte, displacementWidth byte, err error) {
  2432  	var baseReg, indexReg asm.Register
  2433  	var offset asm.ConstantValue
  2434  	var scale byte
  2435  	if n.types.dst == operandTypeMemory {
  2436  		baseReg, offset, indexReg, scale = n.dstReg, n.dstConst, n.dstMemIndex, n.dstMemScale
  2437  	} else if n.types.src == operandTypeMemory {
  2438  		baseReg, offset, indexReg, scale = n.srcReg, n.srcConst, n.srcMemIndex, n.srcMemScale
  2439  	} else {
  2440  		err = fmt.Errorf("memory location is not supported for %s", n.types)
  2441  		return
  2442  	}
  2443  
  2444  	if !FitIn32bit(offset) {
  2445  		err = errors.New("offset does not fit in 32-bit integer")
  2446  		return
  2447  	}
  2448  
  2449  	if baseReg == asm.NilRegister && indexReg != asm.NilRegister {
  2450  		// [(index*scale) + displacement] addressing is possible, but we haven't used it for now.
  2451  		err = errors.New("addressing without base register but with index is not implemented")
  2452  	} else if baseReg == asm.NilRegister {
  2453  		modRM = 0b00_000_100 // Indicate that the memory location is specified by SIB.
  2454  		sbiValue := byte(0b00_100_101)
  2455  		sbi = &sbiValue
  2456  		displacementWidth = 32
  2457  	} else if indexReg == asm.NilRegister {
  2458  		modRM, p, err = register3bits(baseReg, registerSpecifierPositionModRMFieldRM)
  2459  		if err != nil {
  2460  			return
  2461  		}
  2462  
  2463  		// Create ModR/M byte so that this instruction takes [R/M + displacement] operand if displacement !=0
  2464  		// and otherwise [R/M].
  2465  		withoutDisplacement := offset == 0 &&
  2466  			// If the target register is R13 or BP, we have to keep [R/M + displacement] even if the value
  2467  			// is zero since it's not [R/M] operand is not defined for these two registers.
  2468  			// https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing
  2469  			baseReg != RegR13 && baseReg != RegBP
  2470  		if withoutDisplacement {
  2471  			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  2472  			modRM |= 0b00_000_000 // Specifying that operand is memory without displacement
  2473  			displacementWidth = 0
  2474  		} else if fitInSigned8bit(offset) {
  2475  			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  2476  			modRM |= 0b01_000_000 // Specifying that operand is memory + 8bit displacement.
  2477  			displacementWidth = 8
  2478  		} else {
  2479  			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  2480  			modRM |= 0b10_000_000 // Specifying that operand is memory + 32bit displacement.
  2481  			displacementWidth = 32
  2482  		}
  2483  
  2484  		// For SP and R12 register, we have [SIB + displacement] if the const is non-zero, otherwise [SIP].
  2485  		// https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing
  2486  		//
  2487  		// Thefore we emit the SIB byte before the const so that [SIB + displacement] ends up [register + displacement].
  2488  		// https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing_2
  2489  		if baseReg == RegSP || baseReg == RegR12 {
  2490  			sbiValue := byte(0b00_100_100)
  2491  			sbi = &sbiValue
  2492  		}
  2493  	} else {
  2494  		if indexReg == RegSP {
  2495  			err = errors.New("SP cannot be used for SIB index")
  2496  			return
  2497  		}
  2498  
  2499  		modRM = 0b00_000_100 // Indicate that the memory location is specified by SIB.
  2500  
  2501  		withoutDisplacement := offset == 0 &&
  2502  			// For R13 and BP, base registers cannot be encoded "without displacement" mod (i.e. 0b00 mod).
  2503  			baseReg != RegR13 && baseReg != RegBP
  2504  		if withoutDisplacement {
  2505  			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  2506  			modRM |= 0b00_000_000 // Specifying that operand is SIB without displacement
  2507  			displacementWidth = 0
  2508  		} else if fitInSigned8bit(offset) {
  2509  			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  2510  			modRM |= 0b01_000_000 // Specifying that operand is SIB + 8bit displacement.
  2511  			displacementWidth = 8
  2512  		} else {
  2513  			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  2514  			modRM |= 0b10_000_000 // Specifying that operand is SIB + 32bit displacement.
  2515  			displacementWidth = 32
  2516  		}
  2517  
  2518  		var baseRegBits byte
  2519  		baseRegBits, p, err = register3bits(baseReg, registerSpecifierPositionModRMFieldRM)
  2520  		if err != nil {
  2521  			return
  2522  		}
  2523  
  2524  		var indexRegBits byte
  2525  		var indexRegPrefix RexPrefix
  2526  		indexRegBits, indexRegPrefix, err = register3bits(indexReg, registerSpecifierPositionSIBIndex)
  2527  		if err != nil {
  2528  			return
  2529  		}
  2530  		p |= indexRegPrefix
  2531  
  2532  		sbiValue := baseRegBits | (indexRegBits << 3)
  2533  		switch scale {
  2534  		case 1:
  2535  			sbiValue |= 0b00_000_000
  2536  		case 2:
  2537  			sbiValue |= 0b01_000_000
  2538  		case 4:
  2539  			sbiValue |= 0b10_000_000
  2540  		case 8:
  2541  			sbiValue |= 0b11_000_000
  2542  		default:
  2543  			err = fmt.Errorf("scale in SIB must be one of 1, 2, 4, 8 but got %d", scale)
  2544  			return
  2545  		}
  2546  
  2547  		sbi = &sbiValue
  2548  	}
  2549  	return
  2550  }
  2551  
  2552  // GetRegisterToRegisterModRM does XXXX
  2553  //
  2554  // TODO: srcOnModRMReg can be deleted after golang-asm removal. This is necessary to match our implementation
  2555  // with golang-asm, but in practice, there are equivalent opcodes to always have src on ModRM:reg without ambiguity.
  2556  func (n *nodeImpl) GetRegisterToRegisterModRM(srcOnModRMReg bool) (RexPrefix, modRM byte, err error) {
  2557  	var reg3bits, rm3bits byte
  2558  	if srcOnModRMReg {
  2559  		reg3bits, RexPrefix, err = register3bits(n.srcReg,
  2560  			// Indicate that srcReg will be specified by ModRM:reg.
  2561  			registerSpecifierPositionModRMFieldReg)
  2562  		if err != nil {
  2563  			return
  2564  		}
  2565  
  2566  		var dstRexPrefix byte
  2567  		rm3bits, dstRexPrefix, err = register3bits(n.dstReg,
  2568  			// Indicate that dstReg will be specified by ModRM:r/m.
  2569  			registerSpecifierPositionModRMFieldRM)
  2570  		if err != nil {
  2571  			return
  2572  		}
  2573  		RexPrefix |= dstRexPrefix
  2574  	} else {
  2575  		rm3bits, RexPrefix, err = register3bits(n.srcReg,
  2576  			// Indicate that srcReg will be specified by ModRM:r/m.
  2577  			registerSpecifierPositionModRMFieldRM)
  2578  		if err != nil {
  2579  			return
  2580  		}
  2581  
  2582  		var dstRexPrefix byte
  2583  		reg3bits, dstRexPrefix, err = register3bits(n.dstReg,
  2584  			// Indicate that dstReg will be specified by ModRM:reg.
  2585  			registerSpecifierPositionModRMFieldReg)
  2586  		if err != nil {
  2587  			return
  2588  		}
  2589  		RexPrefix |= dstRexPrefix
  2590  	}
  2591  
  2592  	// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  2593  	modRM = 0b11_000_000 | // Specifying that dst operand is register.
  2594  		(reg3bits << 3) |
  2595  		rm3bits
  2596  
  2597  	return
  2598  }
  2599  
  2600  // RexPrefix represents REX prefix https://wiki.osdev.org/X86-64_Instruction_Encoding#REX_prefix
  2601  type RexPrefix = byte
  2602  
  2603  // REX prefixes are independent of each other and can be combined with OR.
  2604  const (
  2605  	RexPrefixNone    RexPrefix = 0x0000_0000 // Indicates that the instruction doesn't need RexPrefix.
  2606  	RexPrefixDefault RexPrefix = 0b0100_0000
  2607  	RexPrefixW                 = 0b0000_1000 | RexPrefixDefault // REX.W
  2608  	RexPrefixR                 = 0b0000_0100 | RexPrefixDefault // REX.R
  2609  	RexPrefixX                 = 0b0000_0010 | RexPrefixDefault // REX.X
  2610  	RexPrefixB                 = 0b0000_0001 | RexPrefixDefault // REX.B
  2611  )
  2612  
  2613  // registerSpecifierPosition represents the position in the instruction bytes where an operand register is placed.
  2614  type registerSpecifierPosition byte
  2615  
  2616  const (
  2617  	registerSpecifierPositionModRMFieldReg registerSpecifierPosition = iota
  2618  	registerSpecifierPositionModRMFieldRM
  2619  	registerSpecifierPositionSIBIndex
  2620  )
  2621  
  2622  func register3bits(
  2623  	reg asm.Register,
  2624  	registerSpecifierPosition registerSpecifierPosition,
  2625  ) (bits byte, prefix RexPrefix, err error) {
  2626  	prefix = RexPrefixNone
  2627  	if RegR8 <= reg && reg <= RegR15 || RegX8 <= reg && reg <= RegX15 {
  2628  		// https://wiki.osdev.org/X86-64_Instruction_Encoding#REX_prefix
  2629  		switch registerSpecifierPosition {
  2630  		case registerSpecifierPositionModRMFieldReg:
  2631  			prefix = RexPrefixR
  2632  		case registerSpecifierPositionModRMFieldRM:
  2633  			prefix = RexPrefixB
  2634  		case registerSpecifierPositionSIBIndex:
  2635  			prefix = RexPrefixX
  2636  		}
  2637  	}
  2638  
  2639  	// https://wiki.osdev.org/X86-64_Instruction_Encoding#Registers
  2640  	switch reg {
  2641  	case RegAX, RegR8, RegX0, RegX8:
  2642  		bits = 0b000
  2643  	case RegCX, RegR9, RegX1, RegX9:
  2644  		bits = 0b001
  2645  	case RegDX, RegR10, RegX2, RegX10:
  2646  		bits = 0b010
  2647  	case RegBX, RegR11, RegX3, RegX11:
  2648  		bits = 0b011
  2649  	case RegSP, RegR12, RegX4, RegX12:
  2650  		bits = 0b100
  2651  	case RegBP, RegR13, RegX5, RegX13:
  2652  		bits = 0b101
  2653  	case RegSI, RegR14, RegX6, RegX14:
  2654  		bits = 0b110
  2655  	case RegDI, RegR15, RegX7, RegX15:
  2656  		bits = 0b111
  2657  	default:
  2658  		err = fmt.Errorf("invalid register [%s]", RegisterName(reg))
  2659  	}
  2660  	return
  2661  }
  2662  
  2663  func FitIn32bit(v int64) bool {
  2664  	return math.MinInt32 <= v && v <= math.MaxUint32
  2665  }
  2666  
  2667  func fitInSigned8bit(v int64) bool {
  2668  	return math.MinInt8 <= v && v <= math.MaxInt8
  2669  }
  2670  
  2671  func IsVectorRegister(r asm.Register) bool {
  2672  	return RegX0 <= r && r <= RegX15
  2673  }