github.com/wasilibs/wazerox@v0.0.0-20240124024944-4923be63ab5f/internal/asm/amd64/impl.go (about)

     1  package amd64
     2  
     3  import (
     4  	"encoding/binary"
     5  	"errors"
     6  	"fmt"
     7  	"math"
     8  
     9  	"github.com/wasilibs/wazerox/internal/asm"
    10  )
    11  
// nodeImpl implements asm.Node for amd64.
type nodeImpl struct {
	// jumpTarget holds the target node in the linked list for the jump-kind instruction.
	jumpTarget *nodeImpl

	// prev and next hold the prev/next node from this node in the assembled linked list.
	prev, next *nodeImpl

	// forwardJumpOrigins hold all the nodes trying to jump into this node as a
	// singly linked list. In other words, all the nodes with .jumpTarget == this.
	forwardJumpOrigins *nodeImpl

	// staticConst is the static constant operand, if any, referenced by this instruction.
	staticConst *asm.StaticConst

	// dstConst is the destination constant operand (e.g. a memory displacement).
	dstConst asm.ConstantValue
	// offsetInBinary is this instruction's byte offset, finalized during encoding.
	offsetInBinary asm.NodeOffsetInBinary
	// srcConst is the source constant operand.
	srcConst    asm.ConstantValue
	instruction asm.Instruction

	// readInstructionAddressBeforeTargetInstruction holds the instruction right before the target of
	// read instruction address instruction. See asm.assemblerBase.CompileReadInstructionAddress.
	readInstructionAddressBeforeTargetInstruction asm.Instruction
	flag                                          nodeFlag
	types                                         operandTypes
	srcReg, dstReg                                asm.Register
	srcMemIndex, dstMemIndex                      asm.Register
	srcMemScale, dstMemScale                      byte
	arg                                           byte

	// staticConstReferrersAdded true if this node is already added into AssemblerImpl.staticConstReferrers.
	// Only used when staticConst is not nil. Through re-assembly, we might end up adding multiple times which causes unnecessary
	// allocations, so we use this flag to do it once.
	staticConstReferrersAdded bool
}
    46  
// nodeFlag is a bit set of per-node encoding flags.
type nodeFlag byte

const (
	// nodeFlagInitializedForEncoding is always set to indicate that node is already initialized. Notably, this is used to judge
	// whether a jump is backward or forward before encoding.
	nodeFlagInitializedForEncoding nodeFlag = 1 << iota
	// nodeFlagBackwardJump marks a jump whose target appears earlier in the node list.
	nodeFlagBackwardJump
	// nodeFlagShortForwardJump is set to false by default and only used by forward branch jumps, which means .jumpTarget != nil and
	// the target node is encoded after this node. False by default means that we Encode all the jumps with jumpTarget
	// as short jump (i.e. relative signed 8-bit integer offset jump) and try to Encode as small as possible.
	nodeFlagShortForwardJump
	// nodeFlagLock indicates the encoded instruction should include the LOCK prefix
	nodeFlagLock
)
    61  
    62  func (n *nodeImpl) isInitializedForEncoding() bool {
    63  	return n.flag&nodeFlagInitializedForEncoding != 0
    64  }
    65  
    66  func (n *nodeImpl) isJumpNode() bool {
    67  	return n.jumpTarget != nil
    68  }
    69  
    70  func (n *nodeImpl) isBackwardJump() bool {
    71  	return n.isJumpNode() && (n.flag&nodeFlagBackwardJump != 0)
    72  }
    73  
    74  func (n *nodeImpl) isForwardJump() bool {
    75  	return n.isJumpNode() && (n.flag&nodeFlagBackwardJump == 0)
    76  }
    77  
    78  func (n *nodeImpl) isForwardShortJump() bool {
    79  	return n.isForwardJump() && n.flag&nodeFlagShortForwardJump != 0
    80  }
    81  
    82  func (n *nodeImpl) isLock() bool {
    83  	return n.flag&nodeFlagLock != 0
    84  }
    85  
    86  // AssignJumpTarget implements asm.Node.AssignJumpTarget.
    87  func (n *nodeImpl) AssignJumpTarget(target asm.Node) {
    88  	n.jumpTarget = target.(*nodeImpl)
    89  }
    90  
    91  // AssignDestinationConstant implements asm.Node.AssignDestinationConstant.
    92  func (n *nodeImpl) AssignDestinationConstant(value asm.ConstantValue) {
    93  	n.dstConst = value
    94  }
    95  
    96  // AssignSourceConstant implements asm.Node.AssignSourceConstant.
    97  func (n *nodeImpl) AssignSourceConstant(value asm.ConstantValue) {
    98  	n.srcConst = value
    99  }
   100  
   101  // OffsetInBinary implements asm.Node.OffsetInBinary.
   102  func (n *nodeImpl) OffsetInBinary() asm.NodeOffsetInBinary {
   103  	return n.offsetInBinary
   104  }
   105  
// String implements fmt.Stringer.
//
// This is for debugging purpose, and the format is almost same as the AT&T assembly syntax,
// meaning that this should look like "INSTRUCTION ${from}, ${to}" where each operand
// might be embraced by '[]' to represent the memory location.
//
// NOTE(review): the hex verbs are mixed between "0x%x" and "%#x" across cases; both
// render the same for positive values, so output is left untouched here.
func (n *nodeImpl) String() (ret string) {
	instName := InstructionName(n.instruction)
	switch n.types {
	case operandTypesNoneToNone:
		ret = instName
	case operandTypesNoneToRegister:
		ret = fmt.Sprintf("%s %s", instName, RegisterName(n.dstReg))
	case operandTypesNoneToMemory:
		if n.dstMemIndex != asm.NilRegister {
			ret = fmt.Sprintf("%s [%s + 0x%x + %s*0x%x]", instName,
				RegisterName(n.dstReg), n.dstConst, RegisterName(n.dstMemIndex), n.dstMemScale)
		} else {
			ret = fmt.Sprintf("%s [%s + 0x%x]", instName, RegisterName(n.dstReg), n.dstConst)
		}
	case operandTypesNoneToBranch:
		ret = fmt.Sprintf("%s {%v}", instName, n.jumpTarget)
	case operandTypesRegisterToNone:
		ret = fmt.Sprintf("%s %s", instName, RegisterName(n.srcReg))
	case operandTypesRegisterToRegister:
		ret = fmt.Sprintf("%s %s, %s", instName, RegisterName(n.srcReg), RegisterName(n.dstReg))
	case operandTypesRegisterToMemory:
		if n.dstMemIndex != asm.NilRegister {
			ret = fmt.Sprintf("%s %s, [%s + 0x%x + %s*0x%x]", instName, RegisterName(n.srcReg),
				RegisterName(n.dstReg), n.dstConst, RegisterName(n.dstMemIndex), n.dstMemScale)
		} else {
			ret = fmt.Sprintf("%s %s, [%s + 0x%x]", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.dstConst)
		}
	case operandTypesRegisterToConst:
		ret = fmt.Sprintf("%s %s, 0x%x", instName, RegisterName(n.srcReg), n.dstConst)
	case operandTypesMemoryToRegister:
		if n.srcMemIndex != asm.NilRegister {
			ret = fmt.Sprintf("%s [%s + %#x + %s*%#x], %s", instName,
				RegisterName(n.srcReg), n.srcConst, RegisterName(n.srcMemIndex), n.srcMemScale, RegisterName(n.dstReg))
		} else {
			ret = fmt.Sprintf("%s [%s + 0x%x], %s", instName, RegisterName(n.srcReg), n.srcConst, RegisterName(n.dstReg))
		}
	case operandTypesMemoryToConst:
		if n.srcMemIndex != asm.NilRegister {
			ret = fmt.Sprintf("%s [%s + %#x + %s*0x%x], 0x%x", instName,
				RegisterName(n.srcReg), n.srcConst, RegisterName(n.srcMemIndex), n.srcMemScale, n.dstConst)
		} else {
			ret = fmt.Sprintf("%s [%s + %#x], 0x%x", instName, RegisterName(n.srcReg), n.srcConst, n.dstConst)
		}
	case operandTypesConstToMemory:
		if n.dstMemIndex != asm.NilRegister {
			ret = fmt.Sprintf("%s 0x%x, [%s + 0x%x + %s*0x%x]", instName, n.srcConst,
				RegisterName(n.dstReg), n.dstConst, RegisterName(n.dstMemIndex), n.dstMemScale)
		} else {
			ret = fmt.Sprintf("%s 0x%x, [%s + 0x%x]", instName, n.srcConst, RegisterName(n.dstReg), n.dstConst)
		}
	case operandTypesConstToRegister:
		ret = fmt.Sprintf("%s 0x%x, %s", instName, n.srcConst, RegisterName(n.dstReg))
	case operandTypesStaticConstToRegister:
		ret = fmt.Sprintf("%s $%#x, %s", instName, n.staticConst.Raw, RegisterName(n.dstReg))
	case operandTypesRegisterToStaticConst:
		ret = fmt.Sprintf("%s %s, $%#x", instName, RegisterName(n.srcReg), n.staticConst.Raw)
	}
	return
}
   170  
// operandTypes classifies a node by the kinds of its source and destination operands,
// which selects the encoder used in AssemblerImpl.encodeNode.
type operandTypes byte

const (
	operandTypesNoneToNone operandTypes = iota
	operandTypesNoneToRegister
	operandTypesNoneToMemory
	operandTypesNoneToBranch
	operandTypesRegisterToNone
	operandTypesRegisterToRegister
	operandTypesRegisterToMemory
	operandTypesRegisterToConst
	operandTypesMemoryToRegister
	operandTypesMemoryToConst
	operandTypesConstToRegister
	operandTypesConstToMemory
	operandTypesStaticConstToRegister
	operandTypesRegisterToStaticConst
)
   189  
   190  // String implements fmt.Stringer
   191  func (o operandTypes) String() (ret string) {
   192  	switch o {
   193  	case operandTypesNoneToNone:
   194  		ret = "NoneToNone"
   195  	case operandTypesNoneToRegister:
   196  		ret = "NoneToRegister"
   197  	case operandTypesNoneToMemory:
   198  		ret = "NoneToMemory"
   199  	case operandTypesNoneToBranch:
   200  		ret = "NoneToBranch"
   201  	case operandTypesRegisterToNone:
   202  		ret = "RegisterToNone"
   203  	case operandTypesRegisterToRegister:
   204  		ret = "RegisterToRegister"
   205  	case operandTypesRegisterToMemory:
   206  		ret = "RegisterToMemory"
   207  	case operandTypesRegisterToConst:
   208  		ret = "RegisterToConst"
   209  	case operandTypesMemoryToRegister:
   210  		ret = "MemoryToRegister"
   211  	case operandTypesMemoryToConst:
   212  		ret = "MemoryToConst"
   213  	case operandTypesConstToRegister:
   214  		ret = "ConstToRegister"
   215  	case operandTypesConstToMemory:
   216  		ret = "ConstToMemory"
   217  	case operandTypesStaticConstToRegister:
   218  		ret = "StaticConstToRegister"
   219  	case operandTypesRegisterToStaticConst:
   220  		ret = "RegisterToStaticConst"
   221  	}
   222  	return
   223  }
   224  
type (
	// AssemblerImpl implements Assembler.
	AssemblerImpl struct {
		// root and current are the head and tail of the singly-built linked list of nodes.
		root    *nodeImpl
		current *nodeImpl
		asm.BaseAssemblerImpl
		// readInstructionAddressNodes are the LEAQ nodes whose displacement is resolved
		// after encoding in Assemble via finalizeReadInstructionAddressNode.
		readInstructionAddressNodes []*nodeImpl

		// staticConstReferrers maintains the list of static const referrers which requires the
		// offset resolution after finalizing the binary layout.
		staticConstReferrers []staticConstReferrer

		nodePool nodePool
		pool     asm.StaticConstPool

		// MaxDisplacementForConstantPool is fixed to defaultMaxDisplacementForConstantPool
		// but have it as an exported field here for testability.
		MaxDisplacementForConstantPool int

		// forceReAssemble is set during encoding when a short forward jump turned out to
		// need a 32-bit displacement, requiring another encoding pass.
		forceReAssemble bool
	}

	// staticConstReferrer represents a referrer of a asm.StaticConst.
	staticConstReferrer struct {
		n *nodeImpl
		// instLen is the encoded length of the instruction for `n`.
		instLen int
	}
)
   254  
   255  func NewAssembler() *AssemblerImpl {
   256  	return &AssemblerImpl{
   257  		nodePool:                       nodePool{index: nodePageSize},
   258  		pool:                           asm.NewStaticConstPool(),
   259  		MaxDisplacementForConstantPool: defaultMaxDisplacementForConstantPool,
   260  	}
   261  }
   262  
// nodePageSize is the number of nodeImpl entries in one pre-allocated page.
const nodePageSize = 128

type nodePage = [nodePageSize]nodeImpl

// nodePool is the central allocation pool for nodeImpl used by a single AssemblerImpl.
// This reduces the allocations over compilation by reusing AssemblerImpl.
type nodePool struct {
	// pages holds the allocated pages; reset truncates the slice but keeps capacity.
	pages []*nodePage
	// index is the next free slot in the last page; nodePageSize means "page full".
	index int
}
   273  
   274  // allocNode allocates a new nodeImpl for use from the pool.
   275  // This expands the pool if there is no space left for it.
   276  func (n *nodePool) allocNode() *nodeImpl {
   277  	if n.index == nodePageSize {
   278  		if len(n.pages) == cap(n.pages) {
   279  			n.pages = append(n.pages, new(nodePage))
   280  		} else {
   281  			i := len(n.pages)
   282  			n.pages = n.pages[:i+1]
   283  			if n.pages[i] == nil {
   284  				n.pages[i] = new(nodePage)
   285  			}
   286  		}
   287  		n.index = 0
   288  	}
   289  	ret := &n.pages[len(n.pages)-1][n.index]
   290  	n.index++
   291  	return ret
   292  }
   293  
   294  func (n *nodePool) reset() {
   295  	for _, ns := range n.pages {
   296  		pages := ns[:]
   297  		for i := range pages {
   298  			pages[i] = nodeImpl{}
   299  		}
   300  	}
   301  	n.pages = n.pages[:0]
   302  	n.index = nodePageSize
   303  }
   304  
   305  // AllocateNOP implements asm.AssemblerBase.
   306  func (a *AssemblerImpl) AllocateNOP() asm.Node {
   307  	n := a.nodePool.allocNode()
   308  	n.instruction = NOP
   309  	n.types = operandTypesNoneToNone
   310  	return n
   311  }
   312  
   313  // Add implements asm.AssemblerBase.
   314  func (a *AssemblerImpl) Add(n asm.Node) {
   315  	a.addNode(n.(*nodeImpl))
   316  }
   317  
// Reset implements asm.AssemblerBase.
//
// The assembler is returned to its initial state while retaining the already
// allocated backing storage (node pool pages, slices, and the const pool) so
// that the next compilation avoids re-allocation.
func (a *AssemblerImpl) Reset() {
	// Keep the const pool instance but clear its contents.
	pool := a.pool
	pool.Reset()
	// Replace the whole struct; slices are truncated to length zero so their
	// backing arrays are reused on the next compilation.
	*a = AssemblerImpl{
		nodePool:                    a.nodePool,
		pool:                        pool,
		readInstructionAddressNodes: a.readInstructionAddressNodes[:0],
		staticConstReferrers:        a.staticConstReferrers[:0],
		BaseAssemblerImpl: asm.BaseAssemblerImpl{
			SetBranchTargetOnNextNodes: a.SetBranchTargetOnNextNodes[:0],
			JumpTableEntries:           a.JumpTableEntries[:0],
		},
	}
	// Zero the pooled nodes last, after the struct replacement above copied nodePool.
	a.nodePool.reset()
}
   334  
   335  // newNode creates a new Node and appends it into the linked list.
   336  func (a *AssemblerImpl) newNode(instruction asm.Instruction, types operandTypes) *nodeImpl {
   337  	n := a.nodePool.allocNode()
   338  	n.instruction = instruction
   339  	n.types = types
   340  	a.addNode(n)
   341  	return n
   342  }
   343  
   344  // addNode appends the new node into the linked list.
   345  func (a *AssemblerImpl) addNode(node *nodeImpl) {
   346  	if a.root == nil {
   347  		a.root = node
   348  		a.current = node
   349  	} else {
   350  		parent := a.current
   351  		parent.next = node
   352  		node.prev = parent
   353  		a.current = node
   354  	}
   355  
   356  	for _, o := range a.SetBranchTargetOnNextNodes {
   357  		origin := o.(*nodeImpl)
   358  		origin.jumpTarget = node
   359  	}
   360  	// Reuse the underlying slice to avoid re-allocations.
   361  	a.SetBranchTargetOnNextNodes = a.SetBranchTargetOnNextNodes[:0]
   362  }
   363  
// encodeNode encodes the given node into buf, dispatching to the encoder that
// matches the node's operand types. Any encoder error is wrapped with the
// node's string form for debuggability.
func (a *AssemblerImpl) encodeNode(buf asm.Buffer, n *nodeImpl) (err error) {
	switch n.types {
	case operandTypesNoneToNone:
		err = a.encodeNoneToNone(buf, n)
	case operandTypesNoneToRegister:
		err = a.encodeNoneToRegister(buf, n)
	case operandTypesNoneToMemory:
		err = a.encodeNoneToMemory(buf, n)
	case operandTypesNoneToBranch:
		// Branching operand can be encoded as relative jumps.
		err = a.encodeRelativeJump(buf, n)
	case operandTypesRegisterToNone:
		err = a.encodeRegisterToNone(buf, n)
	case operandTypesRegisterToRegister:
		err = a.encodeRegisterToRegister(buf, n)
	case operandTypesRegisterToMemory:
		err = a.encodeRegisterToMemory(buf, n)
	case operandTypesRegisterToConst:
		err = a.encodeRegisterToConst(buf, n)
	case operandTypesMemoryToRegister:
		err = a.encodeMemoryToRegister(buf, n)
	case operandTypesMemoryToConst:
		err = a.encodeMemoryToConst(buf, n)
	case operandTypesConstToRegister:
		err = a.encodeConstToRegister(buf, n)
	case operandTypesConstToMemory:
		err = a.encodeConstToMemory(buf, n)
	case operandTypesStaticConstToRegister:
		err = a.encodeStaticConstToRegister(buf, n)
	case operandTypesRegisterToStaticConst:
		err = a.encodeRegisterToStaticConst(buf, n)
	default:
		err = fmt.Errorf("encoder undefined for [%s] operand type", n.types)
	}
	if err != nil {
		err = fmt.Errorf("%w: %s", err, n) // Ensure the error is debuggable by including the string value of the node.
	}
	return
}
   404  
// Assemble implements asm.AssemblerBase.
//
// It encodes the node list into buf (re-encoding when short-jump assumptions
// fail), then patches read-instruction-address nodes, static-constant
// displacements, and jump table entries in the finalized byte slice.
func (a *AssemblerImpl) Assemble(buf asm.Buffer) error {
	a.initializeNodesForEncoding()

	// Continue encoding until we are not forced to re-assemble which happens when
	// a short relative jump ends up the offset larger than 8-bit length.
	for {
		err := a.encode(buf)
		if err != nil {
			return err
		}

		if !a.forceReAssemble {
			break
		} else {
			// We reset the length of buffer but don't delete the underlying slice since
			// the binary size will be roughly the same after reassembly.
			buf.Reset()
			// Reset the re-assemble flag in order to avoid the infinite loop!
			a.forceReAssemble = false
		}
	}

	code := buf.Bytes()
	for _, n := range a.readInstructionAddressNodes {
		if err := a.finalizeReadInstructionAddressNode(code, n); err != nil {
			return err
		}
	}

	// Now that we've finished the layout, fill out static consts offsets.
	for i := range a.staticConstReferrers {
		ref := &a.staticConstReferrers[i]
		n, instLen := ref.n, ref.instLen
		// Calculate the displacement between the RIP (the offset _after_ n) and the static constant.
		displacement := int(n.staticConst.OffsetInBinary) - int(n.OffsetInBinary()) - instLen
		// The offset must be stored at the 4 bytes from the tail of this n. See AssemblerImpl.encodeStaticConstImpl for detail.
		displacementOffsetInInstruction := n.OffsetInBinary() + uint64(instLen-4)
		binary.LittleEndian.PutUint32(code[displacementOffsetInInstruction:], uint32(int32(displacement)))
	}

	return a.FinalizeJumpTableEntry(code)
}
   448  
// initializeNodesForEncoding initializes nodeImpl.flag and determines whether each
// jump is a forward or backward jump, and records forward-jump origins on their targets.
func (a *AssemblerImpl) initializeNodesForEncoding() {
	for n := a.root; n != nil; n = n.next {
		n.flag |= nodeFlagInitializedForEncoding
		if target := n.jumpTarget; target != nil {
			if target.isInitializedForEncoding() {
				// This means the target exists behind.
				n.flag |= nodeFlagBackwardJump
			} else {
				// Otherwise, this is forward jump.
				// We start with assuming that the jump can be short (8-bit displacement).
				// If it doesn't fit, we change this flag in resolveRelativeForwardJump.
				n.flag |= nodeFlagShortForwardJump

				// If the target node is also the branching instruction, we replace the target with the NOP
				// node so that we can avoid the collision of the target.forwardJumpOrigins both as destination and origins.
				if target.types == operandTypesNoneToBranch {
					// Allocate the NOP node from the pool.
					nop := a.nodePool.allocNode()
					nop.instruction = NOP
					nop.types = operandTypesNoneToNone
					// Insert it between target.prev and target: [target.prev, target] -> [target.prev, nop, target]
					// NOTE(review): assumes target.prev is non-nil here, i.e. a branch-typed
					// target never appears as the very first node — TODO confirm with callers.
					prev := target.prev
					nop.prev = prev
					prev.next = nop
					nop.next = target
					target.prev = nop
					n.jumpTarget = nop
					target = nop
				}

				// We add this node `n` into the end of the linked list (.forwardJumpOrigins) beginning from the `target.forwardJumpOrigins`.
				// Insert the current `n` as the head of the list.
				n.forwardJumpOrigins = target.forwardJumpOrigins
				target.forwardJumpOrigins = n
			}
		}
	}
}
   489  
// encode writes the machine code of every node into buf, inserting NOP padding
// where the Intel jump erratum requires it, resolving forward jumps whose
// targets have now been placed, and flushing pending constants.
func (a *AssemblerImpl) encode(buf asm.Buffer) error {
	for n := a.root; n != nil; n = n.next {
		// If an instruction needs NOP padding, we do so before encoding it.
		//
		// This is necessary to avoid Intel's jump erratum; see in Section 2.1
		// in for when we have to pad NOP:
		// https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
		//
		// This logic used to be implemented in a function called maybeNOPPadding,
		// but the complexity of the logic made it impossible for the compiler to
		// inline. Since this function is on a hot code path, we inlined the
		// initial checks to skip the function call when instructions do not need
		// NOP padding.
		switch info := nopPaddingInfo[n.instruction]; {
		case info.jmp:
			if err := a.encodeJmpNOPPadding(buf, n); err != nil {
				return err
			}
		case info.onNextJmp:
			if err := a.encodeOnNextJmpNOPPAdding(buf, n); err != nil {
				return err
			}
		}

		// After the padding, we can finalize the offset of this instruction in the binary.
		n.offsetInBinary = uint64(buf.Len())

		if err := a.encodeNode(buf, n); err != nil {
			return err
		}

		// Any forward jumps that target this node can now have their displacement resolved.
		if n.forwardJumpOrigins != nil {
			if err := a.resolveForwardRelativeJumps(buf, n); err != nil {
				return fmt.Errorf("invalid relative forward jumps: %w", err)
			}
		}

		a.maybeFlushConstants(buf, n.next == nil)
	}
	return nil
}
   531  
// nopPaddingInfo records, per instruction, whether it needs NOP padding itself
// (jmp, for jump-kind instructions including RET) or may require padding when
// followed by a jump it can macro-fuse with (onNextJmp). See Intel's
// jump-conditional-code erratum mitigation document referenced in encode.
var nopPaddingInfo = [instructionEnd]struct {
	jmp, onNextJmp bool
}{
	RET: {jmp: true},
	JMP: {jmp: true},
	JCC: {jmp: true},
	JCS: {jmp: true},
	JEQ: {jmp: true},
	JGE: {jmp: true},
	JGT: {jmp: true},
	JHI: {jmp: true},
	JLE: {jmp: true},
	JLS: {jmp: true},
	JLT: {jmp: true},
	JMI: {jmp: true},
	JNE: {jmp: true},
	JPC: {jmp: true},
	JPS: {jmp: true},
	// The possible fused jump instructions if the next node is a conditional jump instruction.
	CMPL:  {onNextJmp: true},
	CMPQ:  {onNextJmp: true},
	TESTL: {onNextJmp: true},
	TESTQ: {onNextJmp: true},
	ADDL:  {onNextJmp: true},
	ADDQ:  {onNextJmp: true},
	SUBL:  {onNextJmp: true},
	SUBQ:  {onNextJmp: true},
	ANDL:  {onNextJmp: true},
	ANDQ:  {onNextJmp: true},
	INCQ:  {onNextJmp: true},
	DECQ:  {onNextJmp: true},
}
   564  
   565  func (a *AssemblerImpl) encodeJmpNOPPadding(buf asm.Buffer, n *nodeImpl) error {
   566  	// In order to know the instruction length before writing into the binary,
   567  	// we try encoding it.
   568  	prevLen := buf.Len()
   569  
   570  	// Assign the temporary offset which may or may not be correct depending on the padding decision.
   571  	n.offsetInBinary = uint64(prevLen)
   572  
   573  	// Encode the node and get the instruction length.
   574  	if err := a.encodeNode(buf, n); err != nil {
   575  		return err
   576  	}
   577  	instructionLen := int32(buf.Len() - prevLen)
   578  
   579  	// Revert the written bytes.
   580  	buf.Truncate(prevLen)
   581  	return a.encodeNOPPadding(buf, instructionLen)
   582  }
   583  
   584  func (a *AssemblerImpl) encodeOnNextJmpNOPPAdding(buf asm.Buffer, n *nodeImpl) error {
   585  	instructionLen, err := a.fusedInstructionLength(buf, n)
   586  	if err != nil {
   587  		return err
   588  	}
   589  	return a.encodeNOPPadding(buf, instructionLen)
   590  }
   591  
   592  // encodeNOPPadding maybe appends NOP instructions before the node `n`.
   593  // This is necessary to avoid Intel's jump erratum:
   594  // https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
   595  func (a *AssemblerImpl) encodeNOPPadding(buf asm.Buffer, instructionLen int32) error {
   596  	const boundaryInBytes int32 = 32
   597  	const mask = boundaryInBytes - 1
   598  	var padNum int
   599  	currentPos := int32(buf.Len())
   600  	if used := currentPos & mask; used+instructionLen >= boundaryInBytes {
   601  		padNum = int(boundaryInBytes - used)
   602  	}
   603  	a.padNOP(buf, padNum)
   604  	return nil
   605  }
   606  
// fusedInstructionLength returns the length of "macro fused instruction" if the
// instruction sequence starting from `n` can be fused by processor. Otherwise,
// returns zero.
func (a *AssemblerImpl) fusedInstructionLength(buf asm.Buffer, n *nodeImpl) (ret int32, err error) {
	// Find the next non-NOP instruction.
	next := n.next
	for ; next != nil && next.instruction == NOP; next = next.next {
	}

	if next == nil {
		return
	}

	inst, jmpInst := n.instruction, next.instruction

	if !nopPaddingInfo[jmpInst].jmp {
		// If the next instruction is not jump kind, the instruction will not be fused.
		return
	}

	// How to determine whether the instruction can be fused is described in
	// Section 3.4.2.2 of "Intel Optimization Manual":
	// https://www.intel.com/content/dam/doc/manual/64-ia-32-architectures-optimization-manual.pdf
	isTest := inst == TESTL || inst == TESTQ
	isCmp := inst == CMPQ || inst == CMPL
	isTestCmp := isTest || isCmp
	if isTestCmp && (n.types == operandTypesMemoryToConst || n.types == operandTypesConstToMemory) {
		// The manual says: "CMP and TEST can not be fused when comparing MEM-IMM".
		return
	}

	// Implement the decision according to the table 3-1 in the manual.
	isAnd := inst == ANDL || inst == ANDQ
	if !isTest && !isAnd {
		if jmpInst == JMI || jmpInst == JPL || jmpInst == JPS || jmpInst == JPC {
			// These jumps are only fused for TEST or AND.
			return
		}
		isAdd := inst == ADDL || inst == ADDQ
		isSub := inst == SUBL || inst == SUBQ
		if !isCmp && !isAdd && !isSub {
			if jmpInst == JCS || jmpInst == JCC || jmpInst == JHI || jmpInst == JLS {
				// These jumps are only fused for TEST, AND, CMP, ADD, or SUB.
				return
			}
		}
	}

	// Now the instruction is ensured to be fused by the processor.
	// In order to know the fused instruction length before writing into the binary,
	// we try encoding it.
	savedLen := uint64(buf.Len())

	// Encode the nodes into the buffer.
	if err = a.encodeNode(buf, n); err != nil {
		return
	}
	if err = a.encodeNode(buf, next); err != nil {
		return
	}

	ret = int32(uint64(buf.Len()) - savedLen)

	// Revert the written bytes.
	buf.Truncate(int(savedLen))
	return
}
   674  
// nopOpcodes is the multi byte NOP instructions table derived from section 5.8 "Code Padding with Operand-Size Override and Multibyte NOP"
// in "AMD Software Optimization Guide for AMD Family 15h Processors" https://www.amd.com/system/files/TechDocs/47414_15h_sw_opt_guide.pdf
//
// Row i holds the encoding of the (i+1)-byte NOP in its first i+1 bytes; see padNOP.
var nopOpcodes = [][11]byte{
	{0x90},
	{0x66, 0x90},
	{0x0f, 0x1f, 0x00},
	{0x0f, 0x1f, 0x40, 0x00},
	{0x0f, 0x1f, 0x44, 0x00, 0x00},
	{0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
	{0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
	{0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
	{0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
	{0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
	{0x66, 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
}
   690  
   691  func (a *AssemblerImpl) padNOP(buf asm.Buffer, num int) {
   692  	for num > 0 {
   693  		singleNopNum := num
   694  		if singleNopNum > len(nopOpcodes) {
   695  			singleNopNum = len(nopOpcodes)
   696  		}
   697  		buf.AppendBytes(nopOpcodes[singleNopNum-1][:singleNopNum])
   698  		num -= singleNopNum
   699  	}
   700  }
   701  
   702  // CompileStandAlone implements the same method as documented on asm.AssemblerBase.
   703  func (a *AssemblerImpl) CompileStandAlone(instruction asm.Instruction) asm.Node {
   704  	return a.newNode(instruction, operandTypesNoneToNone)
   705  }
   706  
   707  // CompileConstToRegister implements the same method as documented on asm.AssemblerBase.
   708  func (a *AssemblerImpl) CompileConstToRegister(
   709  	instruction asm.Instruction,
   710  	value asm.ConstantValue,
   711  	destinationReg asm.Register,
   712  ) (inst asm.Node) {
   713  	n := a.newNode(instruction, operandTypesConstToRegister)
   714  	n.srcConst = value
   715  	n.dstReg = destinationReg
   716  	return n
   717  }
   718  
   719  // CompileRegisterToRegister implements the same method as documented on asm.AssemblerBase.
   720  func (a *AssemblerImpl) CompileRegisterToRegister(instruction asm.Instruction, from, to asm.Register) {
   721  	n := a.newNode(instruction, operandTypesRegisterToRegister)
   722  	n.srcReg = from
   723  	n.dstReg = to
   724  }
   725  
   726  // CompileMemoryToRegister implements the same method as documented on asm.AssemblerBase.
   727  func (a *AssemblerImpl) CompileMemoryToRegister(
   728  	instruction asm.Instruction,
   729  	sourceBaseReg asm.Register,
   730  	sourceOffsetConst asm.ConstantValue,
   731  	destinationReg asm.Register,
   732  ) {
   733  	n := a.newNode(instruction, operandTypesMemoryToRegister)
   734  	n.srcReg = sourceBaseReg
   735  	n.srcConst = sourceOffsetConst
   736  	n.dstReg = destinationReg
   737  }
   738  
   739  // CompileRegisterToMemory implements the same method as documented on asm.AssemblerBase.
   740  func (a *AssemblerImpl) CompileRegisterToMemory(
   741  	instruction asm.Instruction,
   742  	sourceRegister, destinationBaseRegister asm.Register,
   743  	destinationOffsetConst asm.ConstantValue,
   744  ) {
   745  	n := a.newNode(instruction, operandTypesRegisterToMemory)
   746  	n.srcReg = sourceRegister
   747  	n.dstReg = destinationBaseRegister
   748  	n.dstConst = destinationOffsetConst
   749  }
   750  
   751  // CompileRegisterToMemoryWithIndexAndLock implements the same method as documented on asm.AssemblerBase.
   752  func (a *AssemblerImpl) CompileRegisterToMemoryWithIndexAndLock(
   753  	instruction asm.Instruction,
   754  	srcReg asm.Register,
   755  	dstBaseReg asm.Register,
   756  	dstOffsetConst int64,
   757  	dstIndex asm.Register,
   758  	dstScale int16,
   759  ) {
   760  	n := a.newNode(instruction, operandTypesRegisterToMemory)
   761  	n.srcReg = srcReg
   762  	n.dstReg = dstBaseReg
   763  	n.dstConst = dstOffsetConst
   764  	n.dstMemIndex = dstIndex
   765  	n.dstMemScale = byte(dstScale)
   766  	n.flag |= nodeFlagLock
   767  }
   768  
   769  // CompileJump implements the same method as documented on asm.AssemblerBase.
   770  func (a *AssemblerImpl) CompileJump(jmpInstruction asm.Instruction) asm.Node {
   771  	return a.newNode(jmpInstruction, operandTypesNoneToBranch)
   772  }
   773  
   774  // CompileJumpToMemory implements the same method as documented on asm.AssemblerBase.
   775  func (a *AssemblerImpl) CompileJumpToMemory(
   776  	jmpInstruction asm.Instruction,
   777  	baseReg asm.Register,
   778  	offset asm.ConstantValue,
   779  ) {
   780  	n := a.newNode(jmpInstruction, operandTypesNoneToMemory)
   781  	n.dstReg = baseReg
   782  	n.dstConst = offset
   783  }
   784  
   785  // CompileJumpToRegister implements the same method as documented on asm.AssemblerBase.
   786  func (a *AssemblerImpl) CompileJumpToRegister(jmpInstruction asm.Instruction, reg asm.Register) {
   787  	n := a.newNode(jmpInstruction, operandTypesNoneToRegister)
   788  	n.dstReg = reg
   789  }
   790  
   791  // CompileReadInstructionAddress implements the same method as documented on asm.AssemblerBase.
   792  func (a *AssemblerImpl) CompileReadInstructionAddress(
   793  	destinationRegister asm.Register,
   794  	beforeAcquisitionTargetInstruction asm.Instruction,
   795  ) {
   796  	n := a.newNode(LEAQ, operandTypesMemoryToRegister)
   797  	n.dstReg = destinationRegister
   798  	n.readInstructionAddressBeforeTargetInstruction = beforeAcquisitionTargetInstruction
   799  }
   800  
   801  // CompileRegisterToRegisterWithArg implements the same method as documented on amd64.Assembler.
   802  func (a *AssemblerImpl) CompileRegisterToRegisterWithArg(
   803  	instruction asm.Instruction,
   804  	from, to asm.Register,
   805  	arg byte,
   806  ) {
   807  	n := a.newNode(instruction, operandTypesRegisterToRegister)
   808  	n.srcReg = from
   809  	n.dstReg = to
   810  	n.arg = arg
   811  }
   812  
   813  // CompileMemoryWithIndexToRegister implements the same method as documented on amd64.Assembler.
   814  func (a *AssemblerImpl) CompileMemoryWithIndexToRegister(
   815  	instruction asm.Instruction,
   816  	srcBaseReg asm.Register,
   817  	srcOffsetConst asm.ConstantValue,
   818  	srcIndex asm.Register,
   819  	srcScale int16,
   820  	dstReg asm.Register,
   821  ) {
   822  	n := a.newNode(instruction, operandTypesMemoryToRegister)
   823  	n.srcReg = srcBaseReg
   824  	n.srcConst = srcOffsetConst
   825  	n.srcMemIndex = srcIndex
   826  	n.srcMemScale = byte(srcScale)
   827  	n.dstReg = dstReg
   828  }
   829  
   830  // CompileMemoryWithIndexAndArgToRegister implements the same method as documented on amd64.Assembler.
   831  func (a *AssemblerImpl) CompileMemoryWithIndexAndArgToRegister(
   832  	instruction asm.Instruction,
   833  	srcBaseReg asm.Register,
   834  	srcOffsetConst asm.ConstantValue,
   835  	srcIndex asm.Register,
   836  	srcScale int16,
   837  	dstReg asm.Register,
   838  	arg byte,
   839  ) {
   840  	n := a.newNode(instruction, operandTypesMemoryToRegister)
   841  	n.srcReg = srcBaseReg
   842  	n.srcConst = srcOffsetConst
   843  	n.srcMemIndex = srcIndex
   844  	n.srcMemScale = byte(srcScale)
   845  	n.dstReg = dstReg
   846  	n.arg = arg
   847  }
   848  
   849  // CompileRegisterToMemoryWithIndex implements the same method as documented on amd64.Assembler.
   850  func (a *AssemblerImpl) CompileRegisterToMemoryWithIndex(
   851  	instruction asm.Instruction,
   852  	srcReg, dstBaseReg asm.Register,
   853  	dstOffsetConst asm.ConstantValue,
   854  	dstIndex asm.Register,
   855  	dstScale int16,
   856  ) {
   857  	n := a.newNode(instruction, operandTypesRegisterToMemory)
   858  	n.srcReg = srcReg
   859  	n.dstReg = dstBaseReg
   860  	n.dstConst = dstOffsetConst
   861  	n.dstMemIndex = dstIndex
   862  	n.dstMemScale = byte(dstScale)
   863  }
   864  
   865  // CompileRegisterToMemoryWithIndexAndArg implements the same method as documented on amd64.Assembler.
   866  func (a *AssemblerImpl) CompileRegisterToMemoryWithIndexAndArg(
   867  	instruction asm.Instruction,
   868  	srcReg, dstBaseReg asm.Register,
   869  	dstOffsetConst asm.ConstantValue,
   870  	dstIndex asm.Register,
   871  	dstScale int16,
   872  	arg byte,
   873  ) {
   874  	n := a.newNode(instruction, operandTypesRegisterToMemory)
   875  	n.srcReg = srcReg
   876  	n.dstReg = dstBaseReg
   877  	n.dstConst = dstOffsetConst
   878  	n.dstMemIndex = dstIndex
   879  	n.dstMemScale = byte(dstScale)
   880  	n.arg = arg
   881  }
   882  
   883  // CompileRegisterToConst implements the same method as documented on amd64.Assembler.
   884  func (a *AssemblerImpl) CompileRegisterToConst(
   885  	instruction asm.Instruction,
   886  	srcRegister asm.Register,
   887  	value asm.ConstantValue,
   888  ) asm.Node {
   889  	n := a.newNode(instruction, operandTypesRegisterToConst)
   890  	n.srcReg = srcRegister
   891  	n.dstConst = value
   892  	return n
   893  }
   894  
   895  // CompileRegisterToNone implements the same method as documented on amd64.Assembler.
   896  func (a *AssemblerImpl) CompileRegisterToNone(instruction asm.Instruction, register asm.Register) {
   897  	n := a.newNode(instruction, operandTypesRegisterToNone)
   898  	n.srcReg = register
   899  }
   900  
   901  // CompileNoneToRegister implements the same method as documented on amd64.Assembler.
   902  func (a *AssemblerImpl) CompileNoneToRegister(instruction asm.Instruction, register asm.Register) {
   903  	n := a.newNode(instruction, operandTypesNoneToRegister)
   904  	n.dstReg = register
   905  }
   906  
   907  // CompileNoneToMemory implements the same method as documented on amd64.Assembler.
   908  func (a *AssemblerImpl) CompileNoneToMemory(
   909  	instruction asm.Instruction,
   910  	baseReg asm.Register,
   911  	offset asm.ConstantValue,
   912  ) {
   913  	n := a.newNode(instruction, operandTypesNoneToMemory)
   914  	n.dstReg = baseReg
   915  	n.dstConst = offset
   916  }
   917  
   918  // CompileConstToMemory implements the same method as documented on amd64.Assembler.
   919  func (a *AssemblerImpl) CompileConstToMemory(
   920  	instruction asm.Instruction,
   921  	value asm.ConstantValue,
   922  	dstbaseReg asm.Register,
   923  	dstOffset asm.ConstantValue,
   924  ) asm.Node {
   925  	n := a.newNode(instruction, operandTypesConstToMemory)
   926  	n.srcConst = value
   927  	n.dstReg = dstbaseReg
   928  	n.dstConst = dstOffset
   929  	return n
   930  }
   931  
   932  // CompileMemoryToConst implements the same method as documented on amd64.Assembler.
   933  func (a *AssemblerImpl) CompileMemoryToConst(
   934  	instruction asm.Instruction,
   935  	srcBaseReg asm.Register,
   936  	srcOffset, value asm.ConstantValue,
   937  ) asm.Node {
   938  	n := a.newNode(instruction, operandTypesMemoryToConst)
   939  	n.srcReg = srcBaseReg
   940  	n.srcConst = srcOffset
   941  	n.dstConst = value
   942  	return n
   943  }
   944  
   945  func errorEncodingUnsupported(n *nodeImpl) error {
   946  	return fmt.Errorf("%s is unsupported for %s type", InstructionName(n.instruction), n.types)
   947  }
   948  
   949  func (a *AssemblerImpl) encodeNoneToNone(buf asm.Buffer, n *nodeImpl) (err error) {
   950  	// Throughout the encoding methods, we use this pair of base offset and
   951  	// code buffer to write instructions.
   952  	//
   953  	// The code buffer is allocated at the end of the current buffer to a size
   954  	// large enough to hold all the bytes that may be written by the method.
   955  	//
   956  	// We use Go's append builtin to write to the buffer because it allows the
   957  	// compiler to generate much better code than if we made calls to write
   958  	// methods to mutate an encapsulated byte slice.
   959  	//
   960  	// At the end of the method, we truncate the buffer size back to the base
   961  	// plus the length of the code buffer so the end of the buffer points right
   962  	// after the last byte that was written.
   963  	base := buf.Len()
   964  	code := buf.Append(4)[:0]
   965  
   966  	switch n.instruction {
   967  	case CDQ:
   968  		// https://www.felixcloutier.com/x86/cwd:cdq:cqo
   969  		code = append(code, 0x99)
   970  	case CQO:
   971  		// https://www.felixcloutier.com/x86/cwd:cdq:cqo
   972  		code = append(code, rexPrefixW, 0x99)
   973  	case NOP:
   974  		// Simply optimize out the NOP instructions.
   975  	case RET:
   976  		// https://www.felixcloutier.com/x86/ret
   977  		code = append(code, 0xc3)
   978  	case UD2:
   979  		// https://mudongliang.github.io/x86/html/file_module_x86_id_318.html
   980  		code = append(code, 0x0f, 0x0b)
   981  	case REPMOVSQ:
   982  		code = append(code, 0xf3, rexPrefixW, 0xa5)
   983  	case REPSTOSQ:
   984  		code = append(code, 0xf3, rexPrefixW, 0xab)
   985  	case STD:
   986  		code = append(code, 0xfd)
   987  	case CLD:
   988  		code = append(code, 0xfc)
   989  	case MFENCE:
   990  		code = append(code, 0x0F, 0xAE, 0xF0)
   991  	default:
   992  		err = errorEncodingUnsupported(n)
   993  	}
   994  
   995  	buf.Truncate(base + len(code))
   996  	return
   997  }
   998  
   999  func (a *AssemblerImpl) encodeNoneToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
  1000  	regBits, prefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldRM)
  1001  
  1002  	// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  1003  	modRM := 0b11_000_000 | // Specifying that opeand is register.
  1004  		regBits
  1005  	var mandatoryPrefix byte
  1006  	switch n.instruction {
  1007  	case JMP:
  1008  		// JMP's opcode is defined as "FF /4" meaning that we have to have "4"
  1009  		// in 4-6th bits in the ModRM byte. https://www.felixcloutier.com/x86/jmp
  1010  		modRM |= 0b00_100_000
  1011  	case NEGQ:
  1012  		prefix |= rexPrefixW
  1013  		modRM |= 0b00_011_000
  1014  	case NEGL:
  1015  		modRM |= 0b00_011_000
  1016  	case NEGW:
  1017  		// Note: Need 0x66 to indicate that the operand size is 16-bit.
  1018  		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Operand-size_and_address-size_override_prefix
  1019  		mandatoryPrefix = 0x66
  1020  		modRM |= 0b00_011_000
  1021  	case NEGB:
  1022  		modRM |= 0b00_011_000
  1023  		// 1 byte register operands need default prefix for the following registers.
  1024  		if n.srcReg >= RegSP && n.srcReg <= RegDI {
  1025  			prefix |= rexPrefixDefault
  1026  		}
  1027  	case INCQ:
  1028  		prefix |= rexPrefixW
  1029  	case DECQ:
  1030  		prefix |= rexPrefixW
  1031  		modRM |= 0b00_001_000
  1032  	default:
  1033  		if RegSP <= n.dstReg && n.dstReg <= RegDI {
  1034  			// If the destination is one byte length register, we need to have the default prefix.
  1035  			// https: //wiki.osdev.org/X86-64_Instruction_Encoding#Registers
  1036  			prefix |= rexPrefixDefault
  1037  		}
  1038  	}
  1039  
  1040  	base := buf.Len()
  1041  	code := buf.Append(8)[:0]
  1042  
  1043  	if mandatoryPrefix != 0 {
  1044  		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Mandatory_prefix
  1045  		code = append(code, mandatoryPrefix)
  1046  	}
  1047  
  1048  	if prefix != rexPrefixNone {
  1049  		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Encoding
  1050  		code = append(code, prefix)
  1051  	}
  1052  
  1053  	switch n.instruction {
  1054  	case JMP:
  1055  		// https://www.felixcloutier.com/x86/jmp
  1056  		code = append(code, 0xff, modRM)
  1057  	case SETCC:
  1058  		// https://www.felixcloutier.com/x86/setcc
  1059  		code = append(code, 0x0f, 0x93, modRM)
  1060  	case SETCS:
  1061  		// https://www.felixcloutier.com/x86/setcc
  1062  		code = append(code, 0x0f, 0x92, modRM)
  1063  	case SETEQ:
  1064  		// https://www.felixcloutier.com/x86/setcc
  1065  		code = append(code, 0x0f, 0x94, modRM)
  1066  	case SETGE:
  1067  		// https://www.felixcloutier.com/x86/setcc
  1068  		code = append(code, 0x0f, 0x9d, modRM)
  1069  	case SETGT:
  1070  		// https://www.felixcloutier.com/x86/setcc
  1071  		code = append(code, 0x0f, 0x9f, modRM)
  1072  	case SETHI:
  1073  		// https://www.felixcloutier.com/x86/setcc
  1074  		code = append(code, 0x0f, 0x97, modRM)
  1075  	case SETLE:
  1076  		// https://www.felixcloutier.com/x86/setcc
  1077  		code = append(code, 0x0f, 0x9e, modRM)
  1078  	case SETLS:
  1079  		// https://www.felixcloutier.com/x86/setcc
  1080  		code = append(code, 0x0f, 0x96, modRM)
  1081  	case SETLT:
  1082  		// https://www.felixcloutier.com/x86/setcc
  1083  		code = append(code, 0x0f, 0x9c, modRM)
  1084  	case SETNE:
  1085  		// https://www.felixcloutier.com/x86/setcc
  1086  		code = append(code, 0x0f, 0x95, modRM)
  1087  	case SETPC:
  1088  		// https://www.felixcloutier.com/x86/setcc
  1089  		code = append(code, 0x0f, 0x9b, modRM)
  1090  	case SETPS:
  1091  		// https://www.felixcloutier.com/x86/setcc
  1092  		code = append(code, 0x0f, 0x9a, modRM)
  1093  	case NEGQ, NEGL, NEGW:
  1094  		// https://www.felixcloutier.com/x86/neg
  1095  		code = append(code, 0xf7, modRM)
  1096  	case NEGB:
  1097  		// https://www.felixcloutier.com/x86/neg
  1098  		code = append(code, 0xf6, modRM)
  1099  	case INCQ:
  1100  		// https://www.felixcloutier.com/x86/inc
  1101  		code = append(code, 0xff, modRM)
  1102  	case DECQ:
  1103  		// https://www.felixcloutier.com/x86/dec
  1104  		code = append(code, 0xff, modRM)
  1105  	default:
  1106  		err = errorEncodingUnsupported(n)
  1107  	}
  1108  
  1109  	buf.Truncate(base + len(code))
  1110  	return
  1111  }
  1112  
// encodeNoneToMemory encodes instructions whose sole operand is a memory
// location held in the node's destination fields (INC/DEC of a 64-bit memory
// operand, and memory-indirect JMP).
func (a *AssemblerImpl) encodeNoneToMemory(buf asm.Buffer, n *nodeImpl) (err error) {
	// Resolve the ModRM / SIB / displacement encoding of the destination
	// memory operand (base register, optional index*scale, offset constant).
	rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(true)
	if err != nil {
		return err
	}

	var opcode byte
	switch n.instruction {
	case INCQ:
		// https://www.felixcloutier.com/x86/inc
		rexPrefix |= rexPrefixW
		opcode = 0xff
	case DECQ:
		// https://www.felixcloutier.com/x86/dec
		rexPrefix |= rexPrefixW
		modRM |= 0b00_001_000 // DEC needs "/1" extension in ModRM.
		opcode = 0xff
	case JMP:
		// https://www.felixcloutier.com/x86/jmp
		modRM |= 0b00_100_000 // JMP needs "/4" extension in ModRM.
		opcode = 0xff
	default:
		return errorEncodingUnsupported(n)
	}

	// Reserve scratch space, append the instruction bytes, then truncate the
	// buffer back to the exact number of bytes emitted.
	base := buf.Len()
	code := buf.Append(12)[:0]

	if rexPrefix != rexPrefixNone {
		code = append(code, rexPrefix)
	}

	code = append(code, opcode, modRM)

	// Emit the SIB byte only when the addressing mode requires one.
	if sbiExist {
		code = append(code, sbi)
	}

	// Emit the displacement (offset constant) with the width chosen by
	// getMemoryLocation: 0, 1, or 4 bytes.
	if displacementWidth != 0 {
		code = appendConst(code, n.dstConst, displacementWidth)
	}

	buf.Truncate(base + len(code))
	return
}
  1158  
  1159  type relativeJumpOpcode struct{ short, long []byte }
  1160  
  1161  func (o relativeJumpOpcode) instructionLen(short bool) int64 {
  1162  	if short {
  1163  		return int64(len(o.short)) + 1 // 1 byte = 8 bit offset
  1164  	} else {
  1165  		return int64(len(o.long)) + 4 // 4 byte = 32 bit offset
  1166  	}
  1167  }
  1168  
// relativeJumpOpcodes maps each relative-jump instruction to its short
// (8-bit offset) and long (32-bit offset) opcode byte sequences.
var relativeJumpOpcodes = [...]relativeJumpOpcode{
	// https://www.felixcloutier.com/x86/jcc
	JCC: {short: []byte{0x73}, long: []byte{0x0f, 0x83}},
	JCS: {short: []byte{0x72}, long: []byte{0x0f, 0x82}},
	JEQ: {short: []byte{0x74}, long: []byte{0x0f, 0x84}},
	JGE: {short: []byte{0x7d}, long: []byte{0x0f, 0x8d}},
	JGT: {short: []byte{0x7f}, long: []byte{0x0f, 0x8f}},
	JHI: {short: []byte{0x77}, long: []byte{0x0f, 0x87}},
	JLE: {short: []byte{0x7e}, long: []byte{0x0f, 0x8e}},
	JLS: {short: []byte{0x76}, long: []byte{0x0f, 0x86}},
	JLT: {short: []byte{0x7c}, long: []byte{0x0f, 0x8c}},
	JMI: {short: []byte{0x78}, long: []byte{0x0f, 0x88}},
	JPL: {short: []byte{0x79}, long: []byte{0x0f, 0x89}},
	JNE: {short: []byte{0x75}, long: []byte{0x0f, 0x85}},
	JPC: {short: []byte{0x7b}, long: []byte{0x0f, 0x8b}},
	JPS: {short: []byte{0x7a}, long: []byte{0x0f, 0x8a}},
	// https://www.felixcloutier.com/x86/jmp
	JMP: {short: []byte{0xeb}, long: []byte{0xe9}},
}
  1188  
// resolveForwardRelativeJumps back-patches the relative offsets of every
// forward jump whose target is `target`, now that the target's final offset
// in the binary is known.
func (a *AssemblerImpl) resolveForwardRelativeJumps(buf asm.Buffer, target *nodeImpl) (err error) {
	offsetInBinary := int64(target.OffsetInBinary())
	// Walk the singly-linked list of origin nodes that jump to this target
	// (see nodeImpl.forwardJumpOrigins).
	origin := target.forwardJumpOrigins
	for ; origin != nil; origin = origin.forwardJumpOrigins {
		shortJump := origin.isForwardShortJump()
		op := relativeJumpOpcodes[origin.instruction]
		instructionLen := op.instructionLen(shortJump)

		// Calculate the offset from the EIP (at the time of executing this jump instruction)
		// to the target instruction. This value is always >= 0 as here we only handle forward jumps.
		offset := offsetInBinary - (int64(origin.OffsetInBinary()) + instructionLen)
		if shortJump {
			if offset > math.MaxInt8 {
				// The short (8-bit) form cannot hold this offset.
				// This forces reassemble in the outer loop inside AssemblerImpl.Assemble().
				a.forceReAssemble = true
				// From the next reAssemble phases, this forward jump will be encoded long jump and
				// allocate 32-bit offset bytes by default. This means that this `origin` node
				// will always enter the "long jump offset encoding" block below
				origin.flag ^= nodeFlagShortForwardJump
			} else {
				// Patch the single offset byte at the end of the short-jump encoding.
				buf.Bytes()[origin.OffsetInBinary()+uint64(instructionLen)-1] = byte(offset)
			}
		} else { // long jump offset encoding.
			if offset > math.MaxInt32 {
				return fmt.Errorf("too large jump offset %d for encoding %s", offset, InstructionName(origin.instruction))
			}
			// Patch the 4-byte little-endian offset at the end of the long-jump encoding.
			binary.LittleEndian.PutUint32(buf.Bytes()[origin.OffsetInBinary()+uint64(instructionLen)-4:], uint32(offset))
		}
	}
	return nil
}
  1220  
// encodeRelativeJump encodes a jump (conditional or unconditional) whose
// target is another node. Backward jumps are fully resolved here; forward
// jumps emit a placeholder offset that resolveForwardRelativeJumps patches
// once the target is encoded.
func (a *AssemblerImpl) encodeRelativeJump(buf asm.Buffer, n *nodeImpl) (err error) {
	if n.jumpTarget == nil {
		err = fmt.Errorf("jump target must not be nil for relative %s", InstructionName(n.instruction))
		return
	}

	op := relativeJumpOpcodes[n.instruction]
	var isShortJump bool
	// offsetOfEIP means the offset of EIP register at the time of executing this jump instruction.
	// Relative jump instructions can be encoded with the signed 8-bit or 32-bit integer offsets from the EIP.
	var offsetOfEIP int64 = 0 // We set zero and resolve later once the target instruction is encoded for forward jumps
	if n.isBackwardJump() {
		// If this is the backward jump, we can calculate the exact offset now.
		offsetOfJumpInstruction := int64(n.jumpTarget.OffsetInBinary()) - int64(n.OffsetInBinary())
		// -2 accounts for the smallest encoding (1 opcode byte + 1 offset byte).
		isShortJump = offsetOfJumpInstruction-2 >= math.MinInt8
		offsetOfEIP = offsetOfJumpInstruction - op.instructionLen(isShortJump)
	} else {
		// For forward jumps, we resolve the offset when we Encode the target node. See AssemblerImpl.ResolveForwardRelativeJumps.
		isShortJump = n.isForwardShortJump()
	}

	if offsetOfEIP < math.MinInt32 { // offsetOfEIP is always <= 0 as we don't calculate it for forward jump here.
		return fmt.Errorf("too large jump offset %d for encoding %s", offsetOfEIP, InstructionName(n.instruction))
	}

	// Reserve scratch space, append opcode + offset immediate, then truncate
	// back to the exact number of bytes emitted.
	base := buf.Len()
	code := buf.Append(6)[:0]

	if isShortJump {
		code = append(code, op.short...)
		code = append(code, byte(offsetOfEIP))
	} else {
		code = append(code, op.long...)
		code = appendUint32(code, uint32(offsetOfEIP))
	}

	buf.Truncate(base + len(code))
	return
}
  1260  
  1261  func (a *AssemblerImpl) encodeRegisterToNone(buf asm.Buffer, n *nodeImpl) (err error) {
  1262  	regBits, prefix := register3bits(n.srcReg, registerSpecifierPositionModRMFieldRM)
  1263  
  1264  	// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  1265  	modRM := 0b11_000_000 | // Specifying that opeand is register.
  1266  		regBits
  1267  
  1268  	var opcode byte
  1269  	switch n.instruction {
  1270  	case DIVL:
  1271  		// https://www.felixcloutier.com/x86/div
  1272  		modRM |= 0b00_110_000
  1273  		opcode = 0xf7
  1274  	case DIVQ:
  1275  		// https://www.felixcloutier.com/x86/div
  1276  		prefix |= rexPrefixW
  1277  		modRM |= 0b00_110_000
  1278  		opcode = 0xf7
  1279  	case IDIVL:
  1280  		// https://www.felixcloutier.com/x86/idiv
  1281  		modRM |= 0b00_111_000
  1282  		opcode = 0xf7
  1283  	case IDIVQ:
  1284  		// https://www.felixcloutier.com/x86/idiv
  1285  		prefix |= rexPrefixW
  1286  		modRM |= 0b00_111_000
  1287  		opcode = 0xf7
  1288  	case MULL:
  1289  		// https://www.felixcloutier.com/x86/mul
  1290  		modRM |= 0b00_100_000
  1291  		opcode = 0xf7
  1292  	case MULQ:
  1293  		// https://www.felixcloutier.com/x86/mul
  1294  		prefix |= rexPrefixW
  1295  		modRM |= 0b00_100_000
  1296  		opcode = 0xf7
  1297  	default:
  1298  		err = errorEncodingUnsupported(n)
  1299  	}
  1300  
  1301  	base := buf.Len()
  1302  	code := buf.Append(3)[:0]
  1303  
  1304  	if prefix != rexPrefixNone {
  1305  		code = append(code, prefix)
  1306  	}
  1307  
  1308  	code = append(code, opcode, modRM)
  1309  
  1310  	buf.Truncate(base + len(code))
  1311  	return
  1312  }
  1313  
  1314  var registerToRegisterOpcode = [instructionEnd]*struct {
  1315  	opcode          []byte
  1316  	rPrefix         rexPrefix
  1317  	mandatoryPrefix byte
  1318  	srcOnModRMReg   bool
  1319  	isSrc8bit       bool
  1320  	needArg         bool
  1321  }{
  1322  	// https://www.felixcloutier.com/x86/add
  1323  	ADDL: {opcode: []byte{0x1}, srcOnModRMReg: true},
  1324  	ADDQ: {opcode: []byte{0x1}, rPrefix: rexPrefixW, srcOnModRMReg: true},
  1325  	// https://www.felixcloutier.com/x86/and
  1326  	ANDL: {opcode: []byte{0x21}, srcOnModRMReg: true},
  1327  	ANDQ: {opcode: []byte{0x21}, rPrefix: rexPrefixW, srcOnModRMReg: true},
  1328  	// https://www.felixcloutier.com/x86/cmp
  1329  	CMPL: {opcode: []byte{0x39}},
  1330  	CMPQ: {opcode: []byte{0x39}, rPrefix: rexPrefixW},
  1331  	// https://www.felixcloutier.com/x86/cmovcc
  1332  	CMOVQCS: {opcode: []byte{0x0f, 0x42}, rPrefix: rexPrefixW},
  1333  	// https://www.felixcloutier.com/x86/addsd
  1334  	ADDSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x58}},
  1335  	// https://www.felixcloutier.com/x86/addss
  1336  	ADDSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x58}},
  1337  	// https://www.felixcloutier.com/x86/addpd
  1338  	ANDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x54}},
  1339  	// https://www.felixcloutier.com/x86/addps
  1340  	ANDPS: {opcode: []byte{0x0f, 0x54}},
  1341  	// https://www.felixcloutier.com/x86/bsr
  1342  	BSRL: {opcode: []byte{0xf, 0xbd}},
  1343  	BSRQ: {opcode: []byte{0xf, 0xbd}, rPrefix: rexPrefixW},
  1344  	// https://www.felixcloutier.com/x86/comisd
  1345  	COMISD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x2f}},
  1346  	// https://www.felixcloutier.com/x86/comiss
  1347  	COMISS: {opcode: []byte{0x0f, 0x2f}},
  1348  	// https://www.felixcloutier.com/x86/cvtsd2ss
  1349  	CVTSD2SS: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5a}},
  1350  	// https://www.felixcloutier.com/x86/cvtsi2sd
  1351  	CVTSL2SD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2a}},
  1352  	// https://www.felixcloutier.com/x86/cvtsi2sd
  1353  	CVTSQ2SD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2a}, rPrefix: rexPrefixW},
  1354  	// https://www.felixcloutier.com/x86/cvtsi2ss
  1355  	CVTSL2SS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2a}},
  1356  	// https://www.felixcloutier.com/x86/cvtsi2ss
  1357  	CVTSQ2SS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2a}, rPrefix: rexPrefixW},
  1358  	// https://www.felixcloutier.com/x86/cvtss2sd
  1359  	CVTSS2SD: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5a}},
  1360  	// https://www.felixcloutier.com/x86/cvttsd2si
  1361  	CVTTSD2SL: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2c}},
  1362  	CVTTSD2SQ: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2c}, rPrefix: rexPrefixW},
  1363  	// https://www.felixcloutier.com/x86/cvttss2si
  1364  	CVTTSS2SL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2c}},
  1365  	CVTTSS2SQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2c}, rPrefix: rexPrefixW},
  1366  	// https://www.felixcloutier.com/x86/divsd
  1367  	DIVSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5e}},
  1368  	// https://www.felixcloutier.com/x86/divss
  1369  	DIVSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5e}},
  1370  	// https://www.felixcloutier.com/x86/lzcnt
  1371  	LZCNTL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbd}},
  1372  	LZCNTQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbd}, rPrefix: rexPrefixW},
  1373  	// https://www.felixcloutier.com/x86/maxsd
  1374  	MAXSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5f}},
  1375  	// https://www.felixcloutier.com/x86/maxss
  1376  	MAXSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5f}},
  1377  	// https://www.felixcloutier.com/x86/minsd
  1378  	MINSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5d}},
  1379  	// https://www.felixcloutier.com/x86/minss
  1380  	MINSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5d}},
  1381  	// https://www.felixcloutier.com/x86/movsx:movsxd
  1382  	MOVBLSX: {opcode: []byte{0x0f, 0xbe}, isSrc8bit: true},
  1383  	// https://www.felixcloutier.com/x86/movzx
  1384  	MOVBLZX: {opcode: []byte{0x0f, 0xb6}, isSrc8bit: true},
  1385  	// https://www.felixcloutier.com/x86/movzx
  1386  	MOVWLZX: {opcode: []byte{0x0f, 0xb7}, isSrc8bit: true},
  1387  	// https://www.felixcloutier.com/x86/movsx:movsxd
  1388  	MOVBQSX: {opcode: []byte{0x0f, 0xbe}, rPrefix: rexPrefixW, isSrc8bit: true},
  1389  	// https://www.felixcloutier.com/x86/movsx:movsxd
  1390  	MOVLQSX: {opcode: []byte{0x63}, rPrefix: rexPrefixW},
  1391  	// https://www.felixcloutier.com/x86/movsx:movsxd
  1392  	MOVWQSX: {opcode: []byte{0x0f, 0xbf}, rPrefix: rexPrefixW},
  1393  	// https://www.felixcloutier.com/x86/movsx:movsxd
  1394  	MOVWLSX: {opcode: []byte{0x0f, 0xbf}},
  1395  	// https://www.felixcloutier.com/x86/imul
  1396  	IMULQ: {opcode: []byte{0x0f, 0xaf}, rPrefix: rexPrefixW},
  1397  	// https://www.felixcloutier.com/x86/mulss
  1398  	MULSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x59}},
  1399  	// https://www.felixcloutier.com/x86/mulsd
  1400  	MULSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x59}},
  1401  	// https://www.felixcloutier.com/x86/or
  1402  	ORL: {opcode: []byte{0x09}, srcOnModRMReg: true},
  1403  	ORQ: {opcode: []byte{0x09}, rPrefix: rexPrefixW, srcOnModRMReg: true},
  1404  	// https://www.felixcloutier.com/x86/orpd
  1405  	ORPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x56}},
  1406  	// https://www.felixcloutier.com/x86/orps
  1407  	ORPS: {opcode: []byte{0x0f, 0x56}},
  1408  	// https://www.felixcloutier.com/x86/popcnt
  1409  	POPCNTL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xb8}},
  1410  	POPCNTQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xb8}, rPrefix: rexPrefixW},
  1411  	// https://www.felixcloutier.com/x86/roundss
  1412  	ROUNDSS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0a}, needArg: true},
  1413  	// https://www.felixcloutier.com/x86/roundsd
  1414  	ROUNDSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0b}, needArg: true},
  1415  	// https://www.felixcloutier.com/x86/sqrtss
  1416  	SQRTSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x51}},
  1417  	// https://www.felixcloutier.com/x86/sqrtsd
  1418  	SQRTSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x51}},
  1419  	// https://www.felixcloutier.com/x86/sub
  1420  	SUBL: {opcode: []byte{0x29}, srcOnModRMReg: true},
  1421  	SUBQ: {opcode: []byte{0x29}, rPrefix: rexPrefixW, srcOnModRMReg: true},
  1422  	// https://www.felixcloutier.com/x86/subss
  1423  	SUBSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5c}},
  1424  	// https://www.felixcloutier.com/x86/subsd
  1425  	SUBSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5c}},
  1426  	// https://www.felixcloutier.com/x86/test
  1427  	TESTL: {opcode: []byte{0x85}, srcOnModRMReg: true},
  1428  	TESTQ: {opcode: []byte{0x85}, rPrefix: rexPrefixW, srcOnModRMReg: true},
  1429  	// https://www.felixcloutier.com/x86/tzcnt
  1430  	TZCNTL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbc}},
  1431  	TZCNTQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbc}, rPrefix: rexPrefixW},
  1432  	// https://www.felixcloutier.com/x86/ucomisd
  1433  	UCOMISD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x2e}},
  1434  	// https://www.felixcloutier.com/x86/ucomiss
  1435  	UCOMISS: {opcode: []byte{0x0f, 0x2e}},
  1436  	// https://www.felixcloutier.com/x86/xchg
  1437  	XCHGQ: {opcode: []byte{0x87}, rPrefix: rexPrefixW, srcOnModRMReg: true},
  1438  	// https://www.felixcloutier.com/x86/xor
  1439  	XORL: {opcode: []byte{0x31}, srcOnModRMReg: true},
  1440  	XORQ: {opcode: []byte{0x31}, rPrefix: rexPrefixW, srcOnModRMReg: true},
  1441  	// https://www.felixcloutier.com/x86/xorpd
  1442  	XORPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x57}},
  1443  	XORPS: {opcode: []byte{0x0f, 0x57}},
  1444  	// https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
  1445  	PINSRB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x20}, needArg: true},
  1446  	// https://www.felixcloutier.com/x86/pinsrw
  1447  	PINSRW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc4}, needArg: true},
  1448  	// https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
  1449  	PINSRD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x22}, needArg: true},
  1450  	// https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
  1451  	PINSRQ: {mandatoryPrefix: 0x66, rPrefix: rexPrefixW, opcode: []byte{0x0f, 0x3a, 0x22}, needArg: true},
  1452  	// https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
  1453  	MOVDQU: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x6f}},
  1454  	// https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
  1455  	MOVDQA: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x6f}},
  1456  	// https://www.felixcloutier.com/x86/paddb:paddw:paddd:paddq
  1457  	PADDB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfc}},
  1458  	PADDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfd}},
  1459  	PADDD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfe}},
  1460  	PADDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd4}},
  1461  	// https://www.felixcloutier.com/x86/psubb:psubw:psubd
  1462  	PSUBB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf8}},
  1463  	PSUBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf9}},
  1464  	PSUBD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfa}},
  1465  	// https://www.felixcloutier.com/x86/psubq
  1466  	PSUBQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfb}},
  1467  	// https://www.felixcloutier.com/x86/addps
  1468  	ADDPS: {opcode: []byte{0x0f, 0x58}},
  1469  	// https://www.felixcloutier.com/x86/addpd
  1470  	ADDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x58}},
  1471  	// https://www.felixcloutier.com/x86/subps
  1472  	SUBPS: {opcode: []byte{0x0f, 0x5c}},
  1473  	// https://www.felixcloutier.com/x86/subpd
  1474  	SUBPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5c}},
  1475  	// https://www.felixcloutier.com/x86/pxor
  1476  	PXOR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xef}},
  1477  	// https://www.felixcloutier.com/x86/pand
  1478  	PAND: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdb}},
  1479  	// https://www.felixcloutier.com/x86/por
  1480  	POR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xeb}},
  1481  	// https://www.felixcloutier.com/x86/pandn
  1482  	PANDN: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdf}},
  1483  	// https://www.felixcloutier.com/x86/pshufb
  1484  	PSHUFB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x0}},
  1485  	// https://www.felixcloutier.com/x86/pshufd
  1486  	PSHUFD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x70}, needArg: true},
  1487  	// https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
  1488  	PEXTRB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x14}, needArg: true, srcOnModRMReg: true},
  1489  	// https://www.felixcloutier.com/x86/pextrw
  1490  	PEXTRW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc5}, needArg: true},
  1491  	// https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
  1492  	PEXTRD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x16}, needArg: true, srcOnModRMReg: true},
  1493  	// https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
  1494  	PEXTRQ: {rPrefix: rexPrefixW, mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x16}, needArg: true, srcOnModRMReg: true},
  1495  	// https://www.felixcloutier.com/x86/insertps
  1496  	INSERTPS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x21}, needArg: true},
  1497  	// https://www.felixcloutier.com/x86/movlhps
  1498  	MOVLHPS: {opcode: []byte{0x0f, 0x16}},
  1499  	// https://www.felixcloutier.com/x86/ptest
  1500  	PTEST: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x17}},
  1501  	// https://www.felixcloutier.com/x86/pcmpeqb:pcmpeqw:pcmpeqd
  1502  	PCMPEQB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x74}},
  1503  	PCMPEQW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x75}},
  1504  	PCMPEQD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x76}},
  1505  	// https://www.felixcloutier.com/x86/pcmpeqq
  1506  	PCMPEQQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x29}},
  1507  	// https://www.felixcloutier.com/x86/paddusb:paddusw
  1508  	PADDUSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdc}},
  1509  	// https://www.felixcloutier.com/x86/movsd
  1510  	MOVSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x10}},
  1511  	// https://www.felixcloutier.com/x86/packsswb:packssdw
  1512  	PACKSSWB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x63}},
  1513  	// https://www.felixcloutier.com/x86/pmovmskb
  1514  	PMOVMSKB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd7}},
  1515  	// https://www.felixcloutier.com/x86/movmskps
  1516  	MOVMSKPS: {opcode: []byte{0x0f, 0x50}},
  1517  	// https://www.felixcloutier.com/x86/movmskpd
  1518  	MOVMSKPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x50}},
  1519  	// https://www.felixcloutier.com/x86/psraw:psrad:psraq
  1520  	PSRAD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe2}},
  1521  	// https://www.felixcloutier.com/x86/psraw:psrad:psraq
  1522  	PSRAW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe1}},
  1523  	// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
  1524  	PSRLQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd3}},
  1525  	// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
  1526  	PSRLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd2}},
  1527  	// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
  1528  	PSRLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd1}},
  1529  	// https://www.felixcloutier.com/x86/psllw:pslld:psllq
  1530  	PSLLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf1}},
  1531  	// https://www.felixcloutier.com/x86/psllw:pslld:psllq
  1532  	PSLLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf2}},
  1533  	// https://www.felixcloutier.com/x86/psllw:pslld:psllq
  1534  	PSLLQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf3}},
  1535  	// https://www.felixcloutier.com/x86/punpcklbw:punpcklwd:punpckldq:punpcklqdq
  1536  	PUNPCKLBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x60}},
  1537  	// https://www.felixcloutier.com/x86/punpckhbw:punpckhwd:punpckhdq:punpckhqdq
  1538  	PUNPCKHBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x68}},
  1539  	// https://www.felixcloutier.com/x86/cmpps
  1540  	CMPPS: {opcode: []byte{0x0f, 0xc2}, needArg: true},
  1541  	// https://www.felixcloutier.com/x86/cmppd
  1542  	CMPPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc2}, needArg: true},
  1543  	// https://www.felixcloutier.com/x86/pcmpgtq
  1544  	PCMPGTQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x37}},
  1545  	// https://www.felixcloutier.com/x86/pcmpgtb:pcmpgtw:pcmpgtd
  1546  	PCMPGTD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x66}},
  1547  	// https://www.felixcloutier.com/x86/pcmpgtb:pcmpgtw:pcmpgtd
  1548  	PCMPGTW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x65}},
  1549  	// https://www.felixcloutier.com/x86/pcmpgtb:pcmpgtw:pcmpgtd
  1550  	PCMPGTB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x64}},
  1551  	// https://www.felixcloutier.com/x86/pminsd:pminsq
  1552  	PMINSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x39}},
  1553  	// https://www.felixcloutier.com/x86/pmaxsb:pmaxsw:pmaxsd:pmaxsq
  1554  	PMAXSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3d}},
  1555  	// https://www.felixcloutier.com/x86/pmaxsb:pmaxsw:pmaxsd:pmaxsq
  1556  	PMAXSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xee}},
  1557  	// https://www.felixcloutier.com/x86/pmaxsb:pmaxsw:pmaxsd:pmaxsq
  1558  	PMAXSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3c}},
  1559  	// https://www.felixcloutier.com/x86/pminsb:pminsw
  1560  	PMINSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xea}},
  1561  	// https://www.felixcloutier.com/x86/pminsb:pminsw
  1562  	PMINSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x38}},
  1563  	// https://www.felixcloutier.com/x86/pminud:pminuq
  1564  	PMINUD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3b}},
  1565  	// https://www.felixcloutier.com/x86/pminub:pminuw
  1566  	PMINUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3a}},
  1567  	// https://www.felixcloutier.com/x86/pminub:pminuw
  1568  	PMINUB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xda}},
  1569  	// https://www.felixcloutier.com/x86/pmaxud:pmaxuq
  1570  	PMAXUD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3f}},
  1571  	// https://www.felixcloutier.com/x86/pmaxub:pmaxuw
  1572  	PMAXUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3e}},
  1573  	// https://www.felixcloutier.com/x86/pmaxub:pmaxuw
  1574  	PMAXUB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xde}},
  1575  	// https://www.felixcloutier.com/x86/pmullw
  1576  	PMULLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd5}},
  1577  	// https://www.felixcloutier.com/x86/pmulld:pmullq
  1578  	PMULLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x40}},
  1579  	// https://www.felixcloutier.com/x86/pmuludq
  1580  	PMULUDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf4}},
  1581  	// https://www.felixcloutier.com/x86/psubsb:psubsw
  1582  	PSUBSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe8}},
  1583  	// https://www.felixcloutier.com/x86/psubsb:psubsw
  1584  	PSUBSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe9}},
  1585  	// https://www.felixcloutier.com/x86/psubusb:psubusw
  1586  	PSUBUSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd8}},
  1587  	// https://www.felixcloutier.com/x86/psubusb:psubusw
  1588  	PSUBUSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd9}},
  1589  	// https://www.felixcloutier.com/x86/paddsb:paddsw
  1590  	PADDSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xed}},
  1591  	// https://www.felixcloutier.com/x86/paddsb:paddsw
  1592  	PADDSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xec}},
  1593  	// https://www.felixcloutier.com/x86/paddusb:paddusw
  1594  	PADDUSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdd}},
  1595  	// https://www.felixcloutier.com/x86/pavgb:pavgw
  1596  	PAVGB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe0}},
  1597  	// https://www.felixcloutier.com/x86/pavgb:pavgw
  1598  	PAVGW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe3}},
  1599  	// https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq
  1600  	PABSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1c}},
  1601  	// https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq
  1602  	PABSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1d}},
  1603  	// https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq
  1604  	PABSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1e}},
  1605  	// https://www.felixcloutier.com/x86/blendvpd
  1606  	BLENDVPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x15}},
  1607  	// https://www.felixcloutier.com/x86/maxpd
  1608  	MAXPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5f}},
  1609  	// https://www.felixcloutier.com/x86/maxps
  1610  	MAXPS: {opcode: []byte{0x0f, 0x5f}},
  1611  	// https://www.felixcloutier.com/x86/minpd
  1612  	MINPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5d}},
  1613  	// https://www.felixcloutier.com/x86/minps
  1614  	MINPS: {opcode: []byte{0x0f, 0x5d}},
  1615  	// https://www.felixcloutier.com/x86/andnpd
  1616  	ANDNPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x55}},
  1617  	// https://www.felixcloutier.com/x86/andnps
  1618  	ANDNPS: {opcode: []byte{0x0f, 0x55}},
  1619  	// https://www.felixcloutier.com/x86/mulps
  1620  	MULPS: {opcode: []byte{0x0f, 0x59}},
  1621  	// https://www.felixcloutier.com/x86/mulpd
  1622  	MULPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x59}},
  1623  	// https://www.felixcloutier.com/x86/divps
  1624  	DIVPS: {opcode: []byte{0x0f, 0x5e}},
  1625  	// https://www.felixcloutier.com/x86/divpd
  1626  	DIVPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5e}},
  1627  	// https://www.felixcloutier.com/x86/sqrtps
  1628  	SQRTPS: {opcode: []byte{0x0f, 0x51}},
  1629  	// https://www.felixcloutier.com/x86/sqrtpd
  1630  	SQRTPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x51}},
  1631  	// https://www.felixcloutier.com/x86/roundps
  1632  	ROUNDPS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x08}, needArg: true},
  1633  	// https://www.felixcloutier.com/x86/roundpd
  1634  	ROUNDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x09}, needArg: true},
  1635  	// https://www.felixcloutier.com/x86/palignr
  1636  	PALIGNR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0f}, needArg: true},
  1637  	// https://www.felixcloutier.com/x86/punpcklbw:punpcklwd:punpckldq:punpcklqdq
  1638  	PUNPCKLWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x61}},
  1639  	// https://www.felixcloutier.com/x86/punpckhbw:punpckhwd:punpckhdq:punpckhqdq
  1640  	PUNPCKHWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x69}},
  1641  	// https://www.felixcloutier.com/x86/pmulhuw
  1642  	PMULHUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe4}},
  1643  	// https://www.felixcloutier.com/x86/pmuldq
  1644  	PMULDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x28}},
  1645  	// https://www.felixcloutier.com/x86/pmulhrsw
  1646  	PMULHRSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x0b}},
  1647  	// https://www.felixcloutier.com/x86/pmovsx
  1648  	PMOVSXBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x20}},
  1649  	// https://www.felixcloutier.com/x86/pmovsx
  1650  	PMOVSXWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x23}},
  1651  	// https://www.felixcloutier.com/x86/pmovsx
  1652  	PMOVSXDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x25}},
  1653  	// https://www.felixcloutier.com/x86/pmovzx
  1654  	PMOVZXBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x30}},
  1655  	// https://www.felixcloutier.com/x86/pmovzx
  1656  	PMOVZXWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x33}},
  1657  	// https://www.felixcloutier.com/x86/pmovzx
  1658  	PMOVZXDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x35}},
  1659  	// https://www.felixcloutier.com/x86/pmulhw
  1660  	PMULHW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe5}},
  1661  	// https://www.felixcloutier.com/x86/cmpps
  1662  	CMPEQPS: {opcode: []byte{0x0f, 0xc2}, needArg: true},
  1663  	// https://www.felixcloutier.com/x86/cmppd
  1664  	CMPEQPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc2}, needArg: true},
  1665  	// https://www.felixcloutier.com/x86/cvttps2dq
  1666  	CVTTPS2DQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5b}},
  1667  	// https://www.felixcloutier.com/x86/cvtdq2ps
  1668  	CVTDQ2PS: {opcode: []byte{0x0f, 0x5b}},
  1669  	// https://www.felixcloutier.com/x86/cvtdq2pd
  1670  	CVTDQ2PD: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xe6}},
  1671  	// https://www.felixcloutier.com/x86/cvtpd2ps
  1672  	CVTPD2PS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5a}},
  1673  	// https://www.felixcloutier.com/x86/cvtps2pd
  1674  	CVTPS2PD: {opcode: []byte{0x0f, 0x5a}},
  1675  	// https://www.felixcloutier.com/x86/movupd
  1676  	MOVUPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x10}},
  1677  	// https://www.felixcloutier.com/x86/shufps
  1678  	SHUFPS: {opcode: []byte{0x0f, 0xc6}, needArg: true},
  1679  	// https://www.felixcloutier.com/x86/pmaddwd
  1680  	PMADDWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf5}},
  1681  	// https://www.felixcloutier.com/x86/unpcklps
  1682  	UNPCKLPS: {opcode: []byte{0x0f, 0x14}},
  1683  	// https://www.felixcloutier.com/x86/packuswb
  1684  	PACKUSWB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x67}},
  1685  	// https://www.felixcloutier.com/x86/packsswb:packssdw
  1686  	PACKSSDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x6b}},
  1687  	// https://www.felixcloutier.com/x86/packusdw
  1688  	PACKUSDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x2b}},
  1689  	// https://www.felixcloutier.com/x86/pmaddubsw
  1690  	PMADDUBSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x04}},
  1691  	// https://www.felixcloutier.com/x86/cvttpd2dq
  1692  	CVTTPD2DQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe6}},
  1693  }
  1694  
// registerToRegisterShiftOpcode maps each shift/rotate instruction to the
// opcode, optional REX prefix, and ModRM "/digit" extension used to encode its
// "r/m, CL" form, i.e. the variant whose shift amount is taken from the CX
// register. Indexed by asm.Instruction; entries for all other instructions are nil.
var registerToRegisterShiftOpcode = [instructionEnd]*struct {
	opcode         []byte
	rPrefix        rexPrefix
	modRMExtension byte
}{
	// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
	ROLL: {opcode: []byte{0xd3}},                                                   // ROL r/m32, CL is D3 /0.
	ROLQ: {opcode: []byte{0xd3}, rPrefix: rexPrefixW},                              // REX.W selects the 64-bit operand size.
	RORL: {opcode: []byte{0xd3}, modRMExtension: 0b00_001_000},                     // ROR is D3 /1.
	RORQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_001_000, rPrefix: rexPrefixW},
	// https://www.felixcloutier.com/x86/sal:sar:shl:shr
	SARL: {opcode: []byte{0xd3}, modRMExtension: 0b00_111_000},                     // SAR is D3 /7.
	SARQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_111_000, rPrefix: rexPrefixW},
	SHLL: {opcode: []byte{0xd3}, modRMExtension: 0b00_100_000},                     // SHL is D3 /4.
	SHLQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_100_000, rPrefix: rexPrefixW},
	SHRL: {opcode: []byte{0xd3}, modRMExtension: 0b00_101_000},                     // SHR is D3 /5.
	SHRQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_101_000, rPrefix: rexPrefixW},
}
  1713  
  1714  func (a *AssemblerImpl) encodeRegisterToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
  1715  	// Alias for readability
  1716  	inst := n.instruction
  1717  	base := buf.Len()
  1718  	code := buf.Append(8)[:0]
  1719  
  1720  	switch inst {
  1721  	case MOVL, MOVQ:
  1722  		var (
  1723  			opcode          []byte
  1724  			mandatoryPrefix byte
  1725  			srcOnModRMReg   bool
  1726  			rPrefix         rexPrefix
  1727  		)
  1728  		srcIsFloat, dstIsFloat := isVectorRegister(n.srcReg), isVectorRegister(n.dstReg)
  1729  		f2f := srcIsFloat && dstIsFloat
  1730  		if f2f {
  1731  			// https://www.felixcloutier.com/x86/movq
  1732  			opcode, mandatoryPrefix = []byte{0x0f, 0x7e}, 0xf3
  1733  		} else if srcIsFloat && !dstIsFloat {
  1734  			// https://www.felixcloutier.com/x86/movd:movq
  1735  			opcode, mandatoryPrefix, srcOnModRMReg = []byte{0x0f, 0x7e}, 0x66, true
  1736  		} else if !srcIsFloat && dstIsFloat {
  1737  			// https://www.felixcloutier.com/x86/movd:movq
  1738  			opcode, mandatoryPrefix, srcOnModRMReg = []byte{0x0f, 0x6e}, 0x66, false
  1739  		} else {
  1740  			// https://www.felixcloutier.com/x86/mov
  1741  			opcode, srcOnModRMReg = []byte{0x89}, true
  1742  		}
  1743  
  1744  		rexPrefix, modRM, err := n.getRegisterToRegisterModRM(srcOnModRMReg)
  1745  		if err != nil {
  1746  			return err
  1747  		}
  1748  		rexPrefix |= rPrefix
  1749  
  1750  		if inst == MOVQ && !f2f {
  1751  			rexPrefix |= rexPrefixW
  1752  		}
  1753  		if mandatoryPrefix != 0 {
  1754  			code = append(code, mandatoryPrefix)
  1755  		}
  1756  		if rexPrefix != rexPrefixNone {
  1757  			code = append(code, rexPrefix)
  1758  		}
  1759  		code = append(code, opcode...)
  1760  		code = append(code, modRM)
  1761  		buf.Truncate(base + len(code))
  1762  		return nil
  1763  	}
  1764  
  1765  	if op := registerToRegisterOpcode[inst]; op != nil {
  1766  		rexPrefix, modRM, err := n.getRegisterToRegisterModRM(op.srcOnModRMReg)
  1767  		if err != nil {
  1768  			return err
  1769  		}
  1770  		rexPrefix |= op.rPrefix
  1771  
  1772  		if op.isSrc8bit && RegSP <= n.srcReg && n.srcReg <= RegDI {
  1773  			// If an operand register is 8-bit length of SP, BP, DI, or SI register, we need to have the default prefix.
  1774  			// https://wiki.osdev.org/X86-64_Instruction_Encoding#Registers
  1775  			rexPrefix |= rexPrefixDefault
  1776  		}
  1777  
  1778  		if op.mandatoryPrefix != 0 {
  1779  			code = append(code, op.mandatoryPrefix)
  1780  		}
  1781  
  1782  		if rexPrefix != rexPrefixNone {
  1783  			code = append(code, rexPrefix)
  1784  		}
  1785  		code = append(code, op.opcode...)
  1786  		code = append(code, modRM)
  1787  
  1788  		if op.needArg {
  1789  			code = append(code, n.arg)
  1790  		}
  1791  	} else if op := registerToRegisterShiftOpcode[inst]; op != nil {
  1792  		reg3bits, rexPrefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldRM)
  1793  		rexPrefix |= op.rPrefix
  1794  		if rexPrefix != rexPrefixNone {
  1795  			code = append(code, rexPrefix)
  1796  		}
  1797  
  1798  		// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  1799  		modRM := 0b11_000_000 |
  1800  			(op.modRMExtension) |
  1801  			reg3bits
  1802  		code = append(code, op.opcode...)
  1803  		code = append(code, modRM)
  1804  	} else {
  1805  		return errorEncodingUnsupported(n)
  1806  	}
  1807  
  1808  	buf.Truncate(base + len(code))
  1809  	return nil
  1810  }
  1811  
// encodeRegisterToMemory encodes the instruction of node n whose source
// operand is a register (general-purpose or vector) and whose destination is a
// memory location (base register plus optional index*scale and displacement),
// appending the resulting machine code to buf.
//
// For the shift/rotate instructions (SAR/SHL/SHR/ROL/ROR) the "source
// register" is the implicit shift count and must be CX (the "r/m, CL" forms),
// so it does not occupy ModRM.reg.
func (a *AssemblerImpl) encodeRegisterToMemory(buf asm.Buffer, n *nodeImpl) (err error) {
	// Resolve the destination memory operand: REX bits for extended base/index
	// registers, ModRM mode/rm bits, an optional SIB byte, and the
	// displacement width in bits (0, 8, or 32).
	rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(true)
	if err != nil {
		return err
	}

	var opcode []byte
	var mandatoryPrefix byte
	var isShiftInstruction bool
	// needArg is true for instructions carrying a trailing immediate byte in n.arg.
	var needArg bool
	switch n.instruction {
	case CMPL:
		// https://www.felixcloutier.com/x86/cmp
		opcode = []byte{0x3b}
	case CMPQ:
		// https://www.felixcloutier.com/x86/cmp
		rexPrefix |= rexPrefixW
		opcode = []byte{0x3b}
	case MOVB:
		// https://www.felixcloutier.com/x86/mov
		opcode = []byte{0x88}
		// 1 byte register operands need default prefix for the following registers.
		if n.srcReg >= RegSP && n.srcReg <= RegDI {
			rexPrefix |= rexPrefixDefault
		}
	case MOVL:
		if isVectorRegister(n.srcReg) {
			// https://www.felixcloutier.com/x86/movd:movq
			opcode = []byte{0x0f, 0x7e}
			mandatoryPrefix = 0x66
		} else {
			// https://www.felixcloutier.com/x86/mov
			opcode = []byte{0x89}
		}
	case MOVQ:
		if isVectorRegister(n.srcReg) {
			// https://www.felixcloutier.com/x86/movq
			opcode = []byte{0x0f, 0xd6}
			mandatoryPrefix = 0x66
		} else {
			// https://www.felixcloutier.com/x86/mov
			rexPrefix |= rexPrefixW
			opcode = []byte{0x89}
		}
	case MOVW:
		// https://www.felixcloutier.com/x86/mov
		// Note: Need 0x66 to indicate that the operand size is 16-bit.
		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Operand-size_and_address-size_override_prefix
		mandatoryPrefix = 0x66
		opcode = []byte{0x89}
	case SARL:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		modRM |= 0b00_111_000 // SAR is /7.
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case SARQ:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		rexPrefix |= rexPrefixW
		modRM |= 0b00_111_000 // SAR is /7.
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case SHLL:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		modRM |= 0b00_100_000 // SHL is /4.
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case SHLQ:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		rexPrefix |= rexPrefixW
		modRM |= 0b00_100_000 // SHL is /4.
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case SHRL:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		modRM |= 0b00_101_000 // SHR is /5.
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case SHRQ:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		rexPrefix |= rexPrefixW
		modRM |= 0b00_101_000 // SHR is /5.
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case ROLL:
		// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
		opcode = []byte{0xd3} // ROL is /0, so no ModRM extension bits.
		isShiftInstruction = true
	case ROLQ:
		// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
		rexPrefix |= rexPrefixW
		opcode = []byte{0xd3} // ROL is /0, so no ModRM extension bits.
		isShiftInstruction = true
	case RORL:
		// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
		modRM |= 0b00_001_000 // ROR is /1.
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case RORQ:
		// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
		rexPrefix |= rexPrefixW
		opcode = []byte{0xd3}
		modRM |= 0b00_001_000 // ROR is /1.
		isShiftInstruction = true
	case MOVDQU:
		// https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
		mandatoryPrefix = 0xf3
		opcode = []byte{0x0f, 0x7f}
	case PEXTRB: // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x3a, 0x14}
		needArg = true
	case PEXTRW: // https://www.felixcloutier.com/x86/pextrw
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x3a, 0x15}
		needArg = true
	case PEXTRD: // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x3a, 0x16}
		needArg = true
	case PEXTRQ: // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
		mandatoryPrefix = 0x66
		rexPrefix |= rexPrefixW // REX.W distinguishes PEXTRQ from PEXTRD (same opcode).
		opcode = []byte{0x0f, 0x3a, 0x16}
		needArg = true
	case XCHGB:
		// https://www.felixcloutier.com/x86/xchg
		opcode = []byte{0x86}
		// 1 byte register operands need default prefix for the following registers.
		if n.srcReg >= RegSP && n.srcReg <= RegDI {
			rexPrefix |= rexPrefixDefault
		}
	case XCHGW:
		// https://www.felixcloutier.com/x86/xchg
		// Note: Need 0x66 to indicate that the operand size is 16-bit.
		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Operand-size_and_address-size_override_prefix
		mandatoryPrefix = 0x66
		opcode = []byte{0x87}
	case XCHGL:
		// https://www.felixcloutier.com/x86/xchg
		opcode = []byte{0x87}
	case XCHGQ:
		// https://www.felixcloutier.com/x86/xchg
		rexPrefix |= rexPrefixW
		opcode = []byte{0x87}
	case XADDB:
		// https://www.felixcloutier.com/x86/xadd
		opcode = []byte{0x0F, 0xC0}
		// 1 byte register operands need default prefix for the following registers.
		if n.srcReg >= RegSP && n.srcReg <= RegDI {
			rexPrefix |= rexPrefixDefault
		}
	case XADDW:
		// https://www.felixcloutier.com/x86/xadd
		// Note: Need 0x66 to indicate that the operand size is 16-bit.
		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Operand-size_and_address-size_override_prefix
		mandatoryPrefix = 0x66
		opcode = []byte{0x0F, 0xC1}
	case XADDL:
		// https://www.felixcloutier.com/x86/xadd
		opcode = []byte{0x0F, 0xC1}
	case XADDQ:
		// https://www.felixcloutier.com/x86/xadd
		rexPrefix |= rexPrefixW
		opcode = []byte{0x0F, 0xC1}
	case CMPXCHGB:
		// https://www.felixcloutier.com/x86/cmpxchg
		opcode = []byte{0x0F, 0xB0}
		// 1 byte register operands need default prefix for the following registers.
		if n.srcReg >= RegSP && n.srcReg <= RegDI {
			rexPrefix |= rexPrefixDefault
		}
	case CMPXCHGW:
		// https://www.felixcloutier.com/x86/cmpxchg
		// Note: Need 0x66 to indicate that the operand size is 16-bit.
		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Operand-size_and_address-size_override_prefix
		mandatoryPrefix = 0x66
		opcode = []byte{0x0F, 0xB1}
	case CMPXCHGL:
		// https://www.felixcloutier.com/x86/cmpxchg
		opcode = []byte{0x0F, 0xB1}
	case CMPXCHGQ:
		// https://www.felixcloutier.com/x86/cmpxchg
		rexPrefix |= rexPrefixW
		opcode = []byte{0x0F, 0xB1}
	default:
		return errorEncodingUnsupported(n)
	}

	if !isShiftInstruction {
		srcReg3Bits, prefix := register3bits(n.srcReg, registerSpecifierPositionModRMFieldReg)

		rexPrefix |= prefix
		modRM |= srcReg3Bits << 3 // Place the source register on ModRM:reg
	} else {
		// Shift instructions take the count implicitly in CL, so nothing is
		// placed on ModRM:reg (the /digit extension already occupies it).
		if n.srcReg != RegCX {
			return fmt.Errorf("shifting instruction %s require CX register as src but got %s", InstructionName(n.instruction), RegisterName(n.srcReg))
		}
	}

	// Reserve the maximum possible encoding length; the final length is fixed
	// with buf.Truncate below.
	base := buf.Len()
	code := buf.Append(16)[:0]

	// Emit in the order mandated by the x86-64 encoding:
	// prefixes, REX, opcode, ModRM, SIB, displacement, immediate.
	if mandatoryPrefix != 0 {
		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Mandatory_prefix
		code = append(code, mandatoryPrefix)
	}

	if n.isLock() {
		code = append(code, lockPrefix)
	}

	if rexPrefix != rexPrefixNone {
		code = append(code, rexPrefix)
	}

	code = append(code, opcode...)
	code = append(code, modRM)

	if sbiExist {
		code = append(code, sbi)
	}

	if displacementWidth != 0 {
		code = appendConst(code, n.dstConst, displacementWidth)
	}

	if needArg {
		code = append(code, n.arg)
	}

	buf.Truncate(base + len(code))
	return
}
  2045  
  2046  func (a *AssemblerImpl) encodeRegisterToConst(buf asm.Buffer, n *nodeImpl) (err error) {
  2047  	regBits, prefix := register3bits(n.srcReg, registerSpecifierPositionModRMFieldRM)
  2048  
  2049  	base := buf.Len()
  2050  	code := buf.Append(10)[:0]
  2051  
  2052  	switch n.instruction {
  2053  	case CMPL, CMPQ:
  2054  		if n.instruction == CMPQ {
  2055  			prefix |= rexPrefixW
  2056  		}
  2057  		if prefix != rexPrefixNone {
  2058  			code = append(code, prefix)
  2059  		}
  2060  		is8bitConst := fitInSigned8bit(n.dstConst)
  2061  		// https://www.felixcloutier.com/x86/cmp
  2062  		if n.srcReg == RegAX && !is8bitConst {
  2063  			code = append(code, 0x3d)
  2064  		} else {
  2065  			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  2066  			modRM := 0b11_000_000 | // Specifying that opeand is register.
  2067  				0b00_111_000 | // CMP with immediate needs "/7" extension.
  2068  				regBits
  2069  			if is8bitConst {
  2070  				code = append(code, 0x83, modRM)
  2071  			} else {
  2072  				code = append(code, 0x81, modRM)
  2073  			}
  2074  		}
  2075  	default:
  2076  		err = errorEncodingUnsupported(n)
  2077  	}
  2078  
  2079  	if fitInSigned8bit(n.dstConst) {
  2080  		code = append(code, byte(n.dstConst))
  2081  	} else {
  2082  		code = appendUint32(code, uint32(n.dstConst))
  2083  	}
  2084  
  2085  	buf.Truncate(base + len(code))
  2086  	return
  2087  }
  2088  
  2089  func (a *AssemblerImpl) finalizeReadInstructionAddressNode(code []byte, n *nodeImpl) (err error) {
  2090  	// Find the target instruction node.
  2091  	targetNode := n
  2092  	for ; targetNode != nil; targetNode = targetNode.next {
  2093  		if targetNode.instruction == n.readInstructionAddressBeforeTargetInstruction {
  2094  			targetNode = targetNode.next
  2095  			break
  2096  		}
  2097  	}
  2098  
  2099  	if targetNode == nil {
  2100  		return errors.New("BUG: target instruction not found for read instruction address")
  2101  	}
  2102  
  2103  	offset := targetNode.OffsetInBinary() - (n.OffsetInBinary() + 7 /* 7 = the length of the LEAQ instruction */)
  2104  	if offset >= math.MaxInt32 {
  2105  		return errors.New("BUG: too large offset for LEAQ instruction")
  2106  	}
  2107  
  2108  	binary.LittleEndian.PutUint32(code[n.OffsetInBinary()+3:], uint32(int32(offset)))
  2109  	return nil
  2110  }
  2111  
  2112  func (a *AssemblerImpl) encodeReadInstructionAddress(buf asm.Buffer, n *nodeImpl) error {
  2113  	dstReg3Bits, rexPrefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldReg)
  2114  
  2115  	a.readInstructionAddressNodes = append(a.readInstructionAddressNodes, n)
  2116  
  2117  	// https://www.felixcloutier.com/x86/lea
  2118  	opcode := byte(0x8d)
  2119  	rexPrefix |= rexPrefixW
  2120  
  2121  	// https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing
  2122  	modRM := 0b00_000_101 | // Indicate "LEAQ [RIP + 32bit displacement], dstReg" encoding.
  2123  		(dstReg3Bits << 3) // Place the dstReg on ModRM:reg.
  2124  
  2125  	code := buf.Append(7)
  2126  	code[0] = rexPrefix
  2127  	code[1] = opcode
  2128  	code[2] = modRM
  2129  	binary.LittleEndian.PutUint32(code[3:], 0) // Preserve
  2130  	return nil
  2131  }
  2132  
// encodeMemoryToRegister encodes instructions of the form
// "instruction [srcReg + srcConst (+ srcMemIndex*srcMemScale)], dstReg",
// i.e. the source operand is a memory location and the destination is a
// register, appending the machine code to buf.
//
// As a special case, LEAQ with readInstructionAddressBeforeTargetInstruction
// set is delegated to encodeReadInstructionAddress.
func (a *AssemblerImpl) encodeMemoryToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	if n.instruction == LEAQ && n.readInstructionAddressBeforeTargetInstruction != NONE {
		return a.encodeReadInstructionAddress(buf, n)
	}

	// Resolve the REX prefix bits, ModRM byte, optional SIB byte and
	// displacement width for the memory (source) operand.
	rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(false)
	if err != nil {
		return err
	}

	dstReg3Bits, prefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldReg)
	rexPrefix |= prefix
	modRM |= dstReg3Bits << 3 // Place the destination register on ModRM:reg

	var mandatoryPrefix byte
	var opcode []byte
	var needArg bool // true if the instruction takes a trailing one-byte immediate (n.arg).

	switch n.instruction {
	case ADDL:
		// https://www.felixcloutier.com/x86/add
		opcode = []byte{0x03}
	case ADDQ:
		// https://www.felixcloutier.com/x86/add
		rexPrefix |= rexPrefixW
		opcode = []byte{0x03}
	case CMPL:
		// https://www.felixcloutier.com/x86/cmp
		opcode = []byte{0x39}
	case CMPQ:
		// https://www.felixcloutier.com/x86/cmp
		rexPrefix |= rexPrefixW
		opcode = []byte{0x39}
	case LEAQ:
		// https://www.felixcloutier.com/x86/lea
		rexPrefix |= rexPrefixW
		opcode = []byte{0x8d}
	case MOVBLSX:
		// https://www.felixcloutier.com/x86/movsx:movsxd
		opcode = []byte{0x0f, 0xbe}
	case MOVBLZX:
		// https://www.felixcloutier.com/x86/movzx
		opcode = []byte{0x0f, 0xb6}
	case MOVBQSX:
		// https://www.felixcloutier.com/x86/movsx:movsxd
		rexPrefix |= rexPrefixW
		opcode = []byte{0x0f, 0xbe}
	case MOVBQZX:
		// https://www.felixcloutier.com/x86/movzx
		rexPrefix |= rexPrefixW
		opcode = []byte{0x0f, 0xb6}
	case MOVLQSX:
		// https://www.felixcloutier.com/x86/movsx:movsxd
		rexPrefix |= rexPrefixW
		opcode = []byte{0x63}
	case MOVLQZX:
		// https://www.felixcloutier.com/x86/mov
		// Note: MOVLQZX means zero extending 32bit reg to 64-bit reg and
		// that is semantically equivalent to MOV 32bit to 32bit.
		opcode = []byte{0x8B}
	case MOVL:
		// https://www.felixcloutier.com/x86/mov
		if isVectorRegister(n.dstReg) {
			// When the destination is a vector register, this is encoded as MOVD.
			// https://www.felixcloutier.com/x86/movd:movq
			opcode = []byte{0x0f, 0x6e}
			mandatoryPrefix = 0x66
		} else {
			// https://www.felixcloutier.com/x86/mov
			opcode = []byte{0x8B}
		}
	case MOVQ:
		if isVectorRegister(n.dstReg) {
			// https://www.felixcloutier.com/x86/movq
			opcode = []byte{0x0f, 0x7e}
			mandatoryPrefix = 0xf3
		} else {
			// https://www.felixcloutier.com/x86/mov
			rexPrefix |= rexPrefixW
			opcode = []byte{0x8B}
		}
	case MOVWLSX:
		// https://www.felixcloutier.com/x86/movsx:movsxd
		opcode = []byte{0x0f, 0xbf}
	case MOVWLZX:
		// https://www.felixcloutier.com/x86/movzx
		opcode = []byte{0x0f, 0xb7}
	case MOVWQSX:
		// https://www.felixcloutier.com/x86/movsx:movsxd
		rexPrefix |= rexPrefixW
		opcode = []byte{0x0f, 0xbf}
	case MOVWQZX:
		// https://www.felixcloutier.com/x86/movzx
		rexPrefix |= rexPrefixW
		opcode = []byte{0x0f, 0xb7}
	case SUBQ:
		// https://www.felixcloutier.com/x86/sub
		rexPrefix |= rexPrefixW
		opcode = []byte{0x2b}
	case SUBSD:
		// https://www.felixcloutier.com/x86/subsd
		opcode = []byte{0x0f, 0x5c}
		mandatoryPrefix = 0xf2
	case SUBSS:
		// https://www.felixcloutier.com/x86/subss
		opcode = []byte{0x0f, 0x5c}
		mandatoryPrefix = 0xf3
	case UCOMISD:
		// https://www.felixcloutier.com/x86/ucomisd
		opcode = []byte{0x0f, 0x2e}
		mandatoryPrefix = 0x66
	case UCOMISS:
		// https://www.felixcloutier.com/x86/ucomiss
		opcode = []byte{0x0f, 0x2e}
	case MOVDQU:
		// https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
		mandatoryPrefix = 0xf3
		opcode = []byte{0x0f, 0x6f}
	case PMOVSXBW: // https://www.felixcloutier.com/x86/pmovsx
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x38, 0x20}
	case PMOVSXWD: // https://www.felixcloutier.com/x86/pmovsx
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x38, 0x23}
	case PMOVSXDQ: // https://www.felixcloutier.com/x86/pmovsx
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x38, 0x25}
	case PMOVZXBW: // https://www.felixcloutier.com/x86/pmovzx
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x38, 0x30}
	case PMOVZXWD: // https://www.felixcloutier.com/x86/pmovzx
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x38, 0x33}
	case PMOVZXDQ: // https://www.felixcloutier.com/x86/pmovzx
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x38, 0x35}
	case PINSRB: // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x3a, 0x20}
		needArg = true
	case PINSRW: // https://www.felixcloutier.com/x86/pinsrw
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0xc4}
		needArg = true
	case PINSRD: // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x3a, 0x22}
		needArg = true
	case PINSRQ: // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
		rexPrefix |= rexPrefixW
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x3a, 0x22}
		needArg = true
	default:
		return errorEncodingUnsupported(n)
	}

	// Reserve the maximum possible length of this encoding, then truncate to
	// the bytes actually written at the end.
	base := buf.Len()
	code := buf.Append(16)[:0]

	if mandatoryPrefix != 0 {
		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Mandatory_prefix
		code = append(code, mandatoryPrefix)
	}

	if rexPrefix != rexPrefixNone {
		code = append(code, rexPrefix)
	}

	code = append(code, opcode...)
	code = append(code, modRM)

	if sbiExist {
		code = append(code, sbi)
	}

	if displacementWidth != 0 {
		code = appendConst(code, n.srcConst, displacementWidth)
	}

	if needArg {
		code = append(code, n.arg)
	}

	buf.Truncate(base + len(code))
	return
}
  2321  
// encodeConstToRegister encodes instructions of the form
// "instruction srcConst, dstReg", i.e. the source operand is an immediate
// constant and the destination is a register, appending the machine code to buf.
func (a *AssemblerImpl) encodeConstToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	regBits, rexPrefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldRM)

	// Validate that the destination register kind matches the instruction:
	// the packed-shift instructions operate on vector registers, everything
	// else here on integer registers.
	isFloatReg := isVectorRegister(n.dstReg)
	switch n.instruction {
	case PSLLD, PSLLQ, PSRLD, PSRLQ, PSRAW, PSRLW, PSLLW, PSRAD:
		if !isFloatReg {
			return fmt.Errorf("%s needs float register but got %s", InstructionName(n.instruction), RegisterName(n.dstReg))
		}
	default:
		if isFloatReg {
			return fmt.Errorf("%s needs int register but got %s", InstructionName(n.instruction), RegisterName(n.dstReg))
		}
	}

	// Validate the immediate's range per instruction: only MOVQ accepts a full
	// 64-bit constant; the shift instructions take an 8-bit count.
	if n.instruction != MOVQ && !fitIn32bit(n.srcConst) {
		return fmt.Errorf("constant must fit in 32-bit integer for %s, but got %d", InstructionName(n.instruction), n.srcConst)
	} else if (n.instruction == SHLQ || n.instruction == SHRQ) && (n.srcConst < 0 || n.srcConst > math.MaxUint8) {
		return fmt.Errorf("constant must fit in positive 8-bit integer for %s, but got %d", InstructionName(n.instruction), n.srcConst)
	} else if (n.instruction == PSLLD ||
		n.instruction == PSLLQ ||
		n.instruction == PSRLD ||
		n.instruction == PSRLQ) && (n.srcConst < math.MinInt8 || n.srcConst > math.MaxInt8) {
		return fmt.Errorf("constant must fit in signed 8-bit integer for %s, but got %d", InstructionName(n.instruction), n.srcConst)
	}

	// Reserve the maximum possible length of this encoding, then truncate to
	// the bytes actually written at the end.
	base := buf.Len()
	code := buf.Append(32)[:0]

	// Several instructions have a shorter encoding when the immediate fits in a signed byte.
	isSigned8bitConst := fitInSigned8bit(n.srcConst)
	switch inst := n.instruction; inst {
	case ADDQ:
		// https://www.felixcloutier.com/x86/add
		rexPrefix |= rexPrefixW
		if n.dstReg == RegAX && !isSigned8bitConst {
			code = append(code, rexPrefix, 0x05)
		} else {
			modRM := 0b11_000_000 | // Specifying that operand is register.
				regBits
			if isSigned8bitConst {
				code = append(code, rexPrefix, 0x83, modRM)
			} else {
				code = append(code, rexPrefix, 0x81, modRM)
			}
		}
		if isSigned8bitConst {
			code = append(code, byte(n.srcConst))
		} else {
			code = appendUint32(code, uint32(n.srcConst))
		}
	case ANDQ:
		// https://www.felixcloutier.com/x86/and
		rexPrefix |= rexPrefixW
		if n.dstReg == RegAX && !isSigned8bitConst {
			code = append(code, rexPrefix, 0x25)
		} else {
			modRM := 0b11_000_000 | // Specifying that operand is register.
				0b00_100_000 | // AND with immediate needs "/4" extension.
				regBits
			if isSigned8bitConst {
				code = append(code, rexPrefix, 0x83, modRM)
			} else {
				code = append(code, rexPrefix, 0x81, modRM)
			}
		}
		if fitInSigned8bit(n.srcConst) {
			code = append(code, byte(n.srcConst))
		} else {
			code = appendUint32(code, uint32(n.srcConst))
		}
	case TESTQ:
		// https://www.felixcloutier.com/x86/test
		rexPrefix |= rexPrefixW
		if n.dstReg == RegAX && !isSigned8bitConst {
			code = append(code, rexPrefix, 0xa9)
		} else {
			modRM := 0b11_000_000 | // Specifying that operand is register
				regBits
			code = append(code, rexPrefix, 0xf7, modRM)
		}
		code = appendUint32(code, uint32(n.srcConst))
	case MOVL:
		// https://www.felixcloutier.com/x86/mov
		if rexPrefix != rexPrefixNone {
			code = append(code, rexPrefix)
		}
		// "MOV r32, imm32" (B8+rd): the register is encoded in the opcode byte.
		code = append(code, 0xb8|regBits)
		code = appendUint32(code, uint32(n.srcConst))
	case MOVQ:
		// https://www.felixcloutier.com/x86/mov
		if fitIn32bit(n.srcConst) {
			if n.srcConst > math.MaxInt32 {
				// The value fits in 32 bits but not as a sign-extended imm32,
				// so use "MOV r32, imm32" (B8+rd) which zero-extends into the
				// full 64-bit register.
				if rexPrefix != rexPrefixNone {
					code = append(code, rexPrefix)
				}
				code = append(code, 0xb8|regBits)
			} else {
				rexPrefix |= rexPrefixW
				modRM := 0b11_000_000 | // Specifying that operand is register.
					regBits
				code = append(code, rexPrefix, 0xc7, modRM)
			}
			code = appendUint32(code, uint32(n.srcConst))
		} else {
			// Full 64-bit immediate: "MOV r64, imm64" (REX.W + B8+rd).
			rexPrefix |= rexPrefixW
			code = append(code, rexPrefix, 0xb8|regBits)
			code = appendUint64(code, uint64(n.srcConst))
		}
	case SHLQ:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		rexPrefix |= rexPrefixW
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_100_000 | // SHL with immediate needs "/4" extension.
			regBits
		if n.srcConst == 1 {
			code = append(code, rexPrefix, 0xd1, modRM)
		} else {
			code = append(code, rexPrefix, 0xc1, modRM, byte(n.srcConst))
		}
	case SHRQ:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		rexPrefix |= rexPrefixW
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_101_000 | // SHR with immediate needs "/5" extension.
			regBits
		if n.srcConst == 1 {
			code = append(code, rexPrefix, 0xd1, modRM)
		} else {
			code = append(code, rexPrefix, 0xc1, modRM, byte(n.srcConst))
		}
	case PSLLD:
		// https://www.felixcloutier.com/x86/psllw:pslld:psllq
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_110_000 | // PSLL with immediate needs "/6" extension.
			regBits
		if rexPrefix != rexPrefixNone {
			code = append(code, 0x66, rexPrefix, 0x0f, 0x72, modRM, byte(n.srcConst))
		} else {
			code = append(code, 0x66, 0x0f, 0x72, modRM, byte(n.srcConst))
		}
	case PSLLQ:
		// https://www.felixcloutier.com/x86/psllw:pslld:psllq
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_110_000 | // PSLL with immediate needs "/6" extension.
			regBits
		if rexPrefix != rexPrefixNone {
			code = append(code, 0x66, rexPrefix, 0x0f, 0x73, modRM, byte(n.srcConst))
		} else {
			code = append(code, 0x66, 0x0f, 0x73, modRM, byte(n.srcConst))
		}
	case PSRLD:
		// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_010_000 | // PSRL with immediate needs "/2" extension.
			regBits
		if rexPrefix != rexPrefixNone {
			code = append(code, 0x66, rexPrefix, 0x0f, 0x72, modRM, byte(n.srcConst))
		} else {
			code = append(code, 0x66, 0x0f, 0x72, modRM, byte(n.srcConst))
		}
	case PSRLQ:
		// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_010_000 | // PSRL with immediate needs "/2" extension.
			regBits
		if rexPrefix != rexPrefixNone {
			code = append(code, 0x66, rexPrefix, 0x0f, 0x73, modRM, byte(n.srcConst))
		} else {
			code = append(code, 0x66, 0x0f, 0x73, modRM, byte(n.srcConst))
		}
	case PSRAW, PSRAD:
		// https://www.felixcloutier.com/x86/psraw:psrad:psraq
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_100_000 | // PSRAW with immediate needs "/4" extension.
			regBits
		code = append(code, 0x66)
		if rexPrefix != rexPrefixNone {
			code = append(code, rexPrefix)
		}

		// PSRAD (doubleword) and PSRAW (word) share the extension but differ in opcode.
		var op byte
		if inst == PSRAD {
			op = 0x72
		} else { // PSRAW
			op = 0x71
		}

		code = append(code, 0x0f, op, modRM, byte(n.srcConst))
	case PSRLW:
		// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_010_000 | // PSRLW with immediate needs "/2" extension.
			regBits
		code = append(code, 0x66)
		if rexPrefix != rexPrefixNone {
			code = append(code, rexPrefix)
		}
		code = append(code, 0x0f, 0x71, modRM, byte(n.srcConst))
	case PSLLW:
		// https://www.felixcloutier.com/x86/psllw:pslld:psllq
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_110_000 | // PSLLW with immediate needs "/6" extension.
			regBits
		code = append(code, 0x66)
		if rexPrefix != rexPrefixNone {
			code = append(code, rexPrefix)
		}
		code = append(code, 0x0f, 0x71, modRM, byte(n.srcConst))
	case XORL, XORQ:
		// https://www.felixcloutier.com/x86/xor
		if inst == XORQ {
			rexPrefix |= rexPrefixW
		}
		if rexPrefix != rexPrefixNone {
			code = append(code, rexPrefix)
		}
		if n.dstReg == RegAX && !isSigned8bitConst {
			code = append(code, 0x35)
		} else {
			modRM := 0b11_000_000 | // Specifying that operand is register.
				0b00_110_000 | // XOR with immediate needs "/6" extension.
				regBits
			if isSigned8bitConst {
				code = append(code, 0x83, modRM)
			} else {
				code = append(code, 0x81, modRM)
			}
		}
		if fitInSigned8bit(n.srcConst) {
			code = append(code, byte(n.srcConst))
		} else {
			code = appendUint32(code, uint32(n.srcConst))
		}
	default:
		err = errorEncodingUnsupported(n)
	}

	buf.Truncate(base + len(code))
	return
}
  2563  
  2564  func (a *AssemblerImpl) encodeMemoryToConst(buf asm.Buffer, n *nodeImpl) (err error) {
  2565  	if !fitIn32bit(n.dstConst) {
  2566  		return fmt.Errorf("too large target const %d for %s", n.dstConst, InstructionName(n.instruction))
  2567  	}
  2568  
  2569  	rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(false)
  2570  	if err != nil {
  2571  		return err
  2572  	}
  2573  
  2574  	// Alias for readability.
  2575  	c := n.dstConst
  2576  
  2577  	var opcode, constWidth byte
  2578  	switch n.instruction {
  2579  	case CMPL:
  2580  		// https://www.felixcloutier.com/x86/cmp
  2581  		if fitInSigned8bit(c) {
  2582  			opcode = 0x83
  2583  			constWidth = 8
  2584  		} else {
  2585  			opcode = 0x81
  2586  			constWidth = 32
  2587  		}
  2588  		modRM |= 0b00_111_000
  2589  	default:
  2590  		return errorEncodingUnsupported(n)
  2591  	}
  2592  
  2593  	base := buf.Len()
  2594  	code := buf.Append(20)[:0]
  2595  
  2596  	if rexPrefix != rexPrefixNone {
  2597  		code = append(code, rexPrefix)
  2598  	}
  2599  
  2600  	code = append(code, opcode, modRM)
  2601  
  2602  	if sbiExist {
  2603  		code = append(code, sbi)
  2604  	}
  2605  
  2606  	if displacementWidth != 0 {
  2607  		code = appendConst(code, n.srcConst, displacementWidth)
  2608  	}
  2609  
  2610  	code = appendConst(code, c, constWidth)
  2611  	buf.Truncate(base + len(code))
  2612  	return
  2613  }
  2614  
  2615  func (a *AssemblerImpl) encodeConstToMemory(buf asm.Buffer, n *nodeImpl) (err error) {
  2616  	rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(true)
  2617  	if err != nil {
  2618  		return err
  2619  	}
  2620  
  2621  	// Alias for readability.
  2622  	inst := n.instruction
  2623  	c := n.srcConst
  2624  
  2625  	if inst == MOVB && !fitInSigned8bit(c) {
  2626  		return fmt.Errorf("too large load target const %d for MOVB", c)
  2627  	} else if !fitIn32bit(c) {
  2628  		return fmt.Errorf("too large load target const %d for %s", c, InstructionName(n.instruction))
  2629  	}
  2630  
  2631  	var constWidth, opcode byte
  2632  	switch inst {
  2633  	case MOVB:
  2634  		opcode = 0xc6
  2635  		constWidth = 8
  2636  	case MOVL:
  2637  		opcode = 0xc7
  2638  		constWidth = 32
  2639  	case MOVQ:
  2640  		rexPrefix |= rexPrefixW
  2641  		opcode = 0xc7
  2642  		constWidth = 32
  2643  	default:
  2644  		return errorEncodingUnsupported(n)
  2645  	}
  2646  
  2647  	base := buf.Len()
  2648  	code := buf.Append(20)[:0]
  2649  
  2650  	if rexPrefix != rexPrefixNone {
  2651  		code = append(code, rexPrefix)
  2652  	}
  2653  
  2654  	code = append(code, opcode, modRM)
  2655  
  2656  	if sbiExist {
  2657  		code = append(code, sbi)
  2658  	}
  2659  
  2660  	if displacementWidth != 0 {
  2661  		code = appendConst(code, n.dstConst, displacementWidth)
  2662  	}
  2663  
  2664  	code = appendConst(code, c, constWidth)
  2665  
  2666  	buf.Truncate(base + len(code))
  2667  	return
  2668  }
  2669  
  2670  func appendUint32(code []byte, v uint32) []byte {
  2671  	b := [4]byte{}
  2672  	binary.LittleEndian.PutUint32(b[:], uint32(v))
  2673  	return append(code, b[:]...)
  2674  }
  2675  
  2676  func appendUint64(code []byte, v uint64) []byte {
  2677  	b := [8]byte{}
  2678  	binary.LittleEndian.PutUint64(b[:], uint64(v))
  2679  	return append(code, b[:]...)
  2680  }
  2681  
  2682  func appendConst(code []byte, v int64, length byte) []byte {
  2683  	switch length {
  2684  	case 8:
  2685  		return append(code, byte(v))
  2686  	case 32:
  2687  		return appendUint32(code, uint32(v))
  2688  	default:
  2689  		return appendUint64(code, uint64(v))
  2690  	}
  2691  }
  2692  
// getMemoryLocation resolves this node's memory operand into its
// instruction-encoding components:
//
//   - p: REX prefix bits required by the base/index registers (rexPrefixNone if none),
//   - modRM: the ModR/M byte with the mod and r/m fields filled in (the reg field is left zero for the caller),
//   - sbi, sbiExist: the SIB byte and whether one must be emitted,
//   - displacementWidth: 0, 8 or 32 — the number of displacement bits the caller must append.
//
// When dstMem is true the destination operand fields (dstReg/dstConst/dstMemIndex/dstMemScale)
// describe the memory location; otherwise the source operand fields do.
func (n *nodeImpl) getMemoryLocation(dstMem bool) (p rexPrefix, modRM byte, sbi byte, sbiExist bool, displacementWidth byte, err error) {
	var baseReg, indexReg asm.Register
	var offset asm.ConstantValue
	var scale byte
	if dstMem {
		baseReg, offset, indexReg, scale = n.dstReg, n.dstConst, n.dstMemIndex, n.dstMemScale
	} else {
		baseReg, offset, indexReg, scale = n.srcReg, n.srcConst, n.srcMemIndex, n.srcMemScale
	}

	if !fitIn32bit(offset) {
		err = errors.New("offset does not fit in 32-bit integer")
		return
	}

	if baseReg == asm.NilRegister && indexReg != asm.NilRegister {
		// [(index*scale) + displacement] addressing is possible, but we haven't used it for now.
		// (Falls through to the final return with err set.)
		err = errors.New("addressing without base register but with index is not implemented")
	} else if baseReg == asm.NilRegister {
		// Absolute [displacement] addressing via a SIB byte with no base and no index.
		modRM = 0b00_000_100 // Indicate that the memory location is specified by SIB.
		sbi, sbiExist = byte(0b00_100_101), true
		displacementWidth = 32
	} else if indexReg == asm.NilRegister {
		modRM, p = register3bits(baseReg, registerSpecifierPositionModRMFieldRM)

		// Create ModR/M byte so that this instruction takes [R/M + displacement] operand if displacement !=0
		// and otherwise [R/M].
		withoutDisplacement := offset == 0 &&
			// If the target register is R13 or BP, we have to keep [R/M + displacement] even if the value
			// is zero, since the plain [R/M] form (mod=0b00) is not defined for these two registers
			// (their encodings mean RIP-relative / absolute addressing instead).
			// https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing
			baseReg != RegR13 && baseReg != RegBP
		if withoutDisplacement {
			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
			modRM |= 0b00_000_000 // Specifying that operand is memory without displacement
			displacementWidth = 0
		} else if fitInSigned8bit(offset) {
			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
			modRM |= 0b01_000_000 // Specifying that operand is memory + 8bit displacement.
			displacementWidth = 8
		} else {
			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
			modRM |= 0b10_000_000 // Specifying that operand is memory + 32bit displacement.
			displacementWidth = 32
		}

		// For SP and R12 register, we have [SIB + displacement] if the const is non-zero, otherwise [SIB].
		// https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing
		//
		// Therefore we emit the SIB byte before the const so that [SIB + displacement] ends up [register + displacement].
		// https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing_2
		if baseReg == RegSP || baseReg == RegR12 {
			sbi, sbiExist = byte(0b00_100_100), true
		}
	} else {
		// Base + scaled-index addressing: [base + index*scale + displacement].
		if indexReg == RegSP {
			err = errors.New("SP cannot be used for SIB index")
			return
		}

		modRM = 0b00_000_100 // Indicate that the memory location is specified by SIB.

		withoutDisplacement := offset == 0 &&
			// For R13 and BP, base registers cannot be encoded "without displacement" mod (i.e. 0b00 mod).
			baseReg != RegR13 && baseReg != RegBP
		if withoutDisplacement {
			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
			modRM |= 0b00_000_000 // Specifying that operand is SIB without displacement
			displacementWidth = 0
		} else if fitInSigned8bit(offset) {
			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
			modRM |= 0b01_000_000 // Specifying that operand is SIB + 8bit displacement.
			displacementWidth = 8
		} else {
			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
			modRM |= 0b10_000_000 // Specifying that operand is SIB + 32bit displacement.
			displacementWidth = 32
		}

		var baseRegBits byte
		baseRegBits, p = register3bits(baseReg, registerSpecifierPositionModRMFieldRM)

		var indexRegBits byte
		var indexRegPrefix rexPrefix
		indexRegBits, indexRegPrefix = register3bits(indexReg, registerSpecifierPositionSIBIndex)
		p |= indexRegPrefix

		// SIB layout: scale (2 bits) | index (3 bits) | base (3 bits).
		sbi, sbiExist = baseRegBits|(indexRegBits<<3), true
		switch scale {
		case 1:
			sbi |= 0b00_000_000
		case 2:
			sbi |= 0b01_000_000
		case 4:
			sbi |= 0b10_000_000
		case 8:
			sbi |= 0b11_000_000
		default:
			err = fmt.Errorf("scale in SIB must be one of 1, 2, 4, 8 but got %d", scale)
			return
		}

	}
	return
}
  2798  
  2799  // getRegisterToRegisterModRM does XXXX
  2800  //
  2801  // TODO: srcOnModRMReg can be deleted after golang-asm removal. This is necessary to match our implementation
  2802  // with golang-asm, but in practice, there are equivalent opcodes to always have src on ModRM:reg without ambiguity.
  2803  func (n *nodeImpl) getRegisterToRegisterModRM(srcOnModRMReg bool) (rexPrefix, modRM byte, err error) {
  2804  	var reg3bits, rm3bits byte
  2805  	if srcOnModRMReg {
  2806  		reg3bits, rexPrefix = register3bits(n.srcReg,
  2807  			// Indicate that srcReg will be specified by ModRM:reg.
  2808  			registerSpecifierPositionModRMFieldReg)
  2809  
  2810  		var dstRexPrefix byte
  2811  		rm3bits, dstRexPrefix = register3bits(n.dstReg,
  2812  			// Indicate that dstReg will be specified by ModRM:r/m.
  2813  			registerSpecifierPositionModRMFieldRM)
  2814  		rexPrefix |= dstRexPrefix
  2815  	} else {
  2816  		rm3bits, rexPrefix = register3bits(n.srcReg,
  2817  			// Indicate that srcReg will be specified by ModRM:r/m.
  2818  			registerSpecifierPositionModRMFieldRM)
  2819  
  2820  		var dstRexPrefix byte
  2821  		reg3bits, dstRexPrefix = register3bits(n.dstReg,
  2822  			// Indicate that dstReg will be specified by ModRM:reg.
  2823  			registerSpecifierPositionModRMFieldReg)
  2824  		rexPrefix |= dstRexPrefix
  2825  	}
  2826  
  2827  	// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  2828  	modRM = 0b11_000_000 | // Specifying that dst operand is register.
  2829  		(reg3bits << 3) |
  2830  		rm3bits
  2831  
  2832  	return
  2833  }
  2834  
  2835  // RexPrefix represents REX prefix https://wiki.osdev.org/X86-64_Instruction_Encoding#REX_prefix
  2836  type rexPrefix = byte
  2837  
  2838  // REX prefixes are independent of each other and can be combined with OR.
  2839  const (
  2840  	rexPrefixNone    rexPrefix = 0x0000_0000 // Indicates that the instruction doesn't need RexPrefix.
  2841  	rexPrefixDefault rexPrefix = 0b0100_0000
  2842  	rexPrefixW                 = 0b0000_1000 | rexPrefixDefault // REX.W
  2843  	rexPrefixR                 = 0b0000_0100 | rexPrefixDefault // REX.R
  2844  	rexPrefixX                 = 0b0000_0010 | rexPrefixDefault // REX.X
  2845  	rexPrefixB                 = 0b0000_0001 | rexPrefixDefault // REX.B
  2846  )
  2847  
  2848  // lockPrefix represents the LOCK prefix https://wiki.osdev.org/X86-64_Instruction_Encoding#Legacy_Prefixes
  2849  const lockPrefix = 0xF0
  2850  
  2851  // registerSpecifierPosition represents the position in the instruction bytes where an operand register is placed.
  2852  type registerSpecifierPosition byte
  2853  
  2854  const (
  2855  	registerSpecifierPositionModRMFieldReg registerSpecifierPosition = iota
  2856  	registerSpecifierPositionModRMFieldRM
  2857  	registerSpecifierPositionSIBIndex
  2858  )
  2859  
// regInfo maps each general-purpose (RegAX..RegR15) and vector (RegX0..RegX15)
// register to its 3-bit instruction encoding, plus whether referencing it
// requires a REX prefix (true for the extended registers R8-R15 / X8-X15).
//
// NOTE(review): the array is indexed directly by asm.Register values, so it
// relies on the Reg* constants being small contiguous integers — confirm
// against their declaration.
var regInfo = [...]struct {
	bits    byte
	needRex bool
}{
	RegAX:  {bits: 0b000},
	RegCX:  {bits: 0b001},
	RegDX:  {bits: 0b010},
	RegBX:  {bits: 0b011},
	RegSP:  {bits: 0b100},
	RegBP:  {bits: 0b101},
	RegSI:  {bits: 0b110},
	RegDI:  {bits: 0b111},
	RegR8:  {bits: 0b000, needRex: true},
	RegR9:  {bits: 0b001, needRex: true},
	RegR10: {bits: 0b010, needRex: true},
	RegR11: {bits: 0b011, needRex: true},
	RegR12: {bits: 0b100, needRex: true},
	RegR13: {bits: 0b101, needRex: true},
	RegR14: {bits: 0b110, needRex: true},
	RegR15: {bits: 0b111, needRex: true},
	RegX0:  {bits: 0b000},
	RegX1:  {bits: 0b001},
	RegX2:  {bits: 0b010},
	RegX3:  {bits: 0b011},
	RegX4:  {bits: 0b100},
	RegX5:  {bits: 0b101},
	RegX6:  {bits: 0b110},
	RegX7:  {bits: 0b111},
	RegX8:  {bits: 0b000, needRex: true},
	RegX9:  {bits: 0b001, needRex: true},
	RegX10: {bits: 0b010, needRex: true},
	RegX11: {bits: 0b011, needRex: true},
	RegX12: {bits: 0b100, needRex: true},
	RegX13: {bits: 0b101, needRex: true},
	RegX14: {bits: 0b110, needRex: true},
	RegX15: {bits: 0b111, needRex: true},
}
  2897  
  2898  func register3bits(
  2899  	reg asm.Register,
  2900  	registerSpecifierPosition registerSpecifierPosition,
  2901  ) (bits byte, prefix rexPrefix) {
  2902  	info := regInfo[reg]
  2903  	bits = info.bits
  2904  	if info.needRex {
  2905  		// https://wiki.osdev.org/X86-64_Instruction_Encoding#REX_prefix
  2906  		switch registerSpecifierPosition {
  2907  		case registerSpecifierPositionModRMFieldReg:
  2908  			prefix = rexPrefixR
  2909  		case registerSpecifierPositionModRMFieldRM:
  2910  			prefix = rexPrefixB
  2911  		case registerSpecifierPositionSIBIndex:
  2912  			prefix = rexPrefixX
  2913  		}
  2914  	}
  2915  	return
  2916  }
  2917  
  2918  func fitIn32bit(v int64) bool {
  2919  	return math.MinInt32 <= v && v <= math.MaxUint32
  2920  }
  2921  
  2922  func fitInSigned8bit(v int64) bool {
  2923  	return math.MinInt8 <= v && v <= math.MaxInt8
  2924  }
  2925  
  2926  func isVectorRegister(r asm.Register) bool {
  2927  	return RegX0 <= r && r <= RegX15
  2928  }