github.com/bananabytelabs/wazero@v0.0.0-20240105073314-54b22a776da8/internal/asm/amd64/impl.go

github.com/bananabytelabs/wazero@v0.0.0-20240105073314-54b22a776da8/internal/asm/amd64/impl.go (about)

     1  package amd64
     2  
     3  import (
     4  	"encoding/binary"
     5  	"errors"
     6  	"fmt"
     7  	"math"
     8  
     9  	"github.com/bananabytelabs/wazero/internal/asm"
    10  )
    11  
    12  // nodeImpl implements asm.Node for amd64.
    13  type nodeImpl struct {
    14  	// jumpTarget holds the target node in the linked for the jump-kind instruction.
    15  	jumpTarget *nodeImpl
    16  
    17  	// prev and next hold the prev/next node from this node in the assembled linked list.
    18  	prev, next *nodeImpl
    19  
    20  	// forwardJumpOrigins hold all the nodes trying to jump into this node as a
    21  	// singly linked list. In other words, all the nodes with .jumpTarget == this.
    22  	forwardJumpOrigins *nodeImpl
    23  
    24  	staticConst *asm.StaticConst
    25  
    26  	dstConst       asm.ConstantValue
    27  	offsetInBinary asm.NodeOffsetInBinary
    28  	srcConst       asm.ConstantValue
    29  	instruction    asm.Instruction
    30  
    31  	// readInstructionAddressBeforeTargetInstruction holds the instruction right before the target of
    32  	// read instruction address instruction. See asm.assemblerBase.CompileReadInstructionAddress.
    33  	readInstructionAddressBeforeTargetInstruction asm.Instruction
    34  	flag                                          nodeFlag
    35  	types                                         operandTypes
    36  	srcReg, dstReg                                asm.Register
    37  	srcMemIndex, dstMemIndex                      asm.Register
    38  	srcMemScale, dstMemScale                      byte
    39  	arg                                           byte
    40  
    41  	// staticConstReferrersAdded true if this node is already added into AssemblerImpl.staticConstReferrers.
    42  	// Only used when staticConst is not nil. Through re-assembly, we might end up adding multiple times which causes unnecessary
    43  	// allocations, so we use this flag to do it once.
    44  	staticConstReferrersAdded bool
    45  }
    46  
    47  type nodeFlag byte
    48  
    49  const (
    50  	// nodeFlagInitializedForEncoding is always set to indicate that node is already initialized. Notably, this is used to judge
    51  	// whether a jump is backward or forward before encoding.
    52  	nodeFlagInitializedForEncoding nodeFlag = 1 << iota
    53  	nodeFlagBackwardJump
    54  	// nodeFlagShortForwardJump is set to false by default and only used by forward branch jumps, which means .jumpTarget != nil and
    55  	// the target node is encoded after this node. False by default means that we Encode all the jumps with jumpTarget
    56  	// as short jump (i.e. relative signed 8-bit integer offset jump) and try to Encode as small as possible.
    57  	nodeFlagShortForwardJump
    58  )
    59  
    60  func (n *nodeImpl) isInitializedForEncoding() bool {
    61  	return n.flag&nodeFlagInitializedForEncoding != 0
    62  }
    63  
    64  func (n *nodeImpl) isJumpNode() bool {
    65  	return n.jumpTarget != nil
    66  }
    67  
    68  func (n *nodeImpl) isBackwardJump() bool {
    69  	return n.isJumpNode() && (n.flag&nodeFlagBackwardJump != 0)
    70  }
    71  
    72  func (n *nodeImpl) isForwardJump() bool {
    73  	return n.isJumpNode() && (n.flag&nodeFlagBackwardJump == 0)
    74  }
    75  
    76  func (n *nodeImpl) isForwardShortJump() bool {
    77  	return n.isForwardJump() && n.flag&nodeFlagShortForwardJump != 0
    78  }
    79  
    80  // AssignJumpTarget implements asm.Node.AssignJumpTarget.
    81  func (n *nodeImpl) AssignJumpTarget(target asm.Node) {
    82  	n.jumpTarget = target.(*nodeImpl)
    83  }
    84  
    85  // AssignDestinationConstant implements asm.Node.AssignDestinationConstant.
    86  func (n *nodeImpl) AssignDestinationConstant(value asm.ConstantValue) {
    87  	n.dstConst = value
    88  }
    89  
    90  // AssignSourceConstant implements asm.Node.AssignSourceConstant.
    91  func (n *nodeImpl) AssignSourceConstant(value asm.ConstantValue) {
    92  	n.srcConst = value
    93  }
    94  
    95  // OffsetInBinary implements asm.Node.OffsetInBinary.
    96  func (n *nodeImpl) OffsetInBinary() asm.NodeOffsetInBinary {
    97  	return n.offsetInBinary
    98  }
    99  
   100  // String implements fmt.Stringer.
   101  //
   102  // This is for debugging purpose, and the format is almost same as the AT&T assembly syntax,
   103  // meaning that this should look like "INSTRUCTION ${from}, ${to}" where each operand
   104  // might be embraced by '[]' to represent the memory location.
   105  func (n *nodeImpl) String() (ret string) {
   106  	instName := InstructionName(n.instruction)
   107  	switch n.types {
   108  	case operandTypesNoneToNone:
   109  		ret = instName
   110  	case operandTypesNoneToRegister:
   111  		ret = fmt.Sprintf("%s %s", instName, RegisterName(n.dstReg))
   112  	case operandTypesNoneToMemory:
   113  		if n.dstMemIndex != asm.NilRegister {
   114  			ret = fmt.Sprintf("%s [%s + 0x%x + %s*0x%x]", instName,
   115  				RegisterName(n.dstReg), n.dstConst, RegisterName(n.dstMemIndex), n.dstMemScale)
   116  		} else {
   117  			ret = fmt.Sprintf("%s [%s + 0x%x]", instName, RegisterName(n.dstReg), n.dstConst)
   118  		}
   119  	case operandTypesNoneToBranch:
   120  		ret = fmt.Sprintf("%s {%v}", instName, n.jumpTarget)
   121  	case operandTypesRegisterToNone:
   122  		ret = fmt.Sprintf("%s %s", instName, RegisterName(n.srcReg))
   123  	case operandTypesRegisterToRegister:
   124  		ret = fmt.Sprintf("%s %s, %s", instName, RegisterName(n.srcReg), RegisterName(n.dstReg))
   125  	case operandTypesRegisterToMemory:
   126  		if n.dstMemIndex != asm.NilRegister {
   127  			ret = fmt.Sprintf("%s %s, [%s + 0x%x + %s*0x%x]", instName, RegisterName(n.srcReg),
   128  				RegisterName(n.dstReg), n.dstConst, RegisterName(n.dstMemIndex), n.dstMemScale)
   129  		} else {
   130  			ret = fmt.Sprintf("%s %s, [%s + 0x%x]", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.dstConst)
   131  		}
   132  	case operandTypesRegisterToConst:
   133  		ret = fmt.Sprintf("%s %s, 0x%x", instName, RegisterName(n.srcReg), n.dstConst)
   134  	case operandTypesMemoryToRegister:
   135  		if n.srcMemIndex != asm.NilRegister {
   136  			ret = fmt.Sprintf("%s [%s + %#x + %s*%#x], %s", instName,
   137  				RegisterName(n.srcReg), n.srcConst, RegisterName(n.srcMemIndex), n.srcMemScale, RegisterName(n.dstReg))
   138  		} else {
   139  			ret = fmt.Sprintf("%s [%s + 0x%x], %s", instName, RegisterName(n.srcReg), n.srcConst, RegisterName(n.dstReg))
   140  		}
   141  	case operandTypesMemoryToConst:
   142  		if n.srcMemIndex != asm.NilRegister {
   143  			ret = fmt.Sprintf("%s [%s + %#x + %s*0x%x], 0x%x", instName,
   144  				RegisterName(n.srcReg), n.srcConst, RegisterName(n.srcMemIndex), n.srcMemScale, n.dstConst)
   145  		} else {
   146  			ret = fmt.Sprintf("%s [%s + %#x], 0x%x", instName, RegisterName(n.srcReg), n.srcConst, n.dstConst)
   147  		}
   148  	case operandTypesConstToMemory:
   149  		if n.dstMemIndex != asm.NilRegister {
   150  			ret = fmt.Sprintf("%s 0x%x, [%s + 0x%x + %s*0x%x]", instName, n.srcConst,
   151  				RegisterName(n.dstReg), n.dstConst, RegisterName(n.dstMemIndex), n.dstMemScale)
   152  		} else {
   153  			ret = fmt.Sprintf("%s 0x%x, [%s + 0x%x]", instName, n.srcConst, RegisterName(n.dstReg), n.dstConst)
   154  		}
   155  	case operandTypesConstToRegister:
   156  		ret = fmt.Sprintf("%s 0x%x, %s", instName, n.srcConst, RegisterName(n.dstReg))
   157  	case operandTypesStaticConstToRegister:
   158  		ret = fmt.Sprintf("%s $%#x, %s", instName, n.staticConst.Raw, RegisterName(n.dstReg))
   159  	case operandTypesRegisterToStaticConst:
   160  		ret = fmt.Sprintf("%s %s, $%#x", instName, RegisterName(n.srcReg), n.staticConst.Raw)
   161  	}
   162  	return
   163  }
   164  
   165  type operandTypes byte
   166  
   167  const (
   168  	operandTypesNoneToNone operandTypes = iota
   169  	operandTypesNoneToRegister
   170  	operandTypesNoneToMemory
   171  	operandTypesNoneToBranch
   172  	operandTypesRegisterToNone
   173  	operandTypesRegisterToRegister
   174  	operandTypesRegisterToMemory
   175  	operandTypesRegisterToConst
   176  	operandTypesMemoryToRegister
   177  	operandTypesMemoryToConst
   178  	operandTypesConstToRegister
   179  	operandTypesConstToMemory
   180  	operandTypesStaticConstToRegister
   181  	operandTypesRegisterToStaticConst
   182  )
   183  
   184  // String implements fmt.Stringer
   185  func (o operandTypes) String() (ret string) {
   186  	switch o {
   187  	case operandTypesNoneToNone:
   188  		ret = "NoneToNone"
   189  	case operandTypesNoneToRegister:
   190  		ret = "NoneToRegister"
   191  	case operandTypesNoneToMemory:
   192  		ret = "NoneToMemory"
   193  	case operandTypesNoneToBranch:
   194  		ret = "NoneToBranch"
   195  	case operandTypesRegisterToNone:
   196  		ret = "RegisterToNone"
   197  	case operandTypesRegisterToRegister:
   198  		ret = "RegisterToRegister"
   199  	case operandTypesRegisterToMemory:
   200  		ret = "RegisterToMemory"
   201  	case operandTypesRegisterToConst:
   202  		ret = "RegisterToConst"
   203  	case operandTypesMemoryToRegister:
   204  		ret = "MemoryToRegister"
   205  	case operandTypesMemoryToConst:
   206  		ret = "MemoryToConst"
   207  	case operandTypesConstToRegister:
   208  		ret = "ConstToRegister"
   209  	case operandTypesConstToMemory:
   210  		ret = "ConstToMemory"
   211  	case operandTypesStaticConstToRegister:
   212  		ret = "StaticConstToRegister"
   213  	case operandTypesRegisterToStaticConst:
   214  		ret = "RegisterToStaticConst"
   215  	}
   216  	return
   217  }
   218  
   219  type (
   220  	// AssemblerImpl implements Assembler.
   221  	AssemblerImpl struct {
   222  		root    *nodeImpl
   223  		current *nodeImpl
   224  		asm.BaseAssemblerImpl
   225  		readInstructionAddressNodes []*nodeImpl
   226  
   227  		// staticConstReferrers maintains the list of static const referrers which requires the
   228  		// offset resolution after finalizing the binary layout.
   229  		staticConstReferrers []staticConstReferrer
   230  
   231  		nodePool nodePool
   232  		pool     asm.StaticConstPool
   233  
   234  		// MaxDisplacementForConstantPool is fixed to defaultMaxDisplacementForConstantPool
   235  		// but have it as an exported field here for testability.
   236  		MaxDisplacementForConstantPool int
   237  
   238  		forceReAssemble bool
   239  	}
   240  
   241  	// staticConstReferrer represents a referrer of a asm.StaticConst.
   242  	staticConstReferrer struct {
   243  		n *nodeImpl
   244  		// instLen is the encoded length of the instruction for `n`.
   245  		instLen int
   246  	}
   247  )
   248  
   249  func NewAssembler() *AssemblerImpl {
   250  	return &AssemblerImpl{
   251  		nodePool:                       nodePool{index: nodePageSize},
   252  		pool:                           asm.NewStaticConstPool(),
   253  		MaxDisplacementForConstantPool: defaultMaxDisplacementForConstantPool,
   254  	}
   255  }
   256  
   257  const nodePageSize = 128
   258  
   259  type nodePage = [nodePageSize]nodeImpl
   260  
   261  // nodePool is the central allocation pool for nodeImpl used by a single AssemblerImpl.
   262  // This reduces the allocations over compilation by reusing AssemblerImpl.
   263  type nodePool struct {
   264  	pages []*nodePage
   265  	index int
   266  }
   267  
   268  // allocNode allocates a new nodeImpl for use from the pool.
   269  // This expands the pool if there is no space left for it.
   270  func (n *nodePool) allocNode() *nodeImpl {
   271  	if n.index == nodePageSize {
   272  		if len(n.pages) == cap(n.pages) {
   273  			n.pages = append(n.pages, new(nodePage))
   274  		} else {
   275  			i := len(n.pages)
   276  			n.pages = n.pages[:i+1]
   277  			if n.pages[i] == nil {
   278  				n.pages[i] = new(nodePage)
   279  			}
   280  		}
   281  		n.index = 0
   282  	}
   283  	ret := &n.pages[len(n.pages)-1][n.index]
   284  	n.index++
   285  	return ret
   286  }
   287  
   288  func (n *nodePool) reset() {
   289  	for _, ns := range n.pages {
   290  		pages := ns[:]
   291  		for i := range pages {
   292  			pages[i] = nodeImpl{}
   293  		}
   294  	}
   295  	n.pages = n.pages[:0]
   296  	n.index = nodePageSize
   297  }
   298  
   299  // AllocateNOP implements asm.AssemblerBase.
   300  func (a *AssemblerImpl) AllocateNOP() asm.Node {
   301  	n := a.nodePool.allocNode()
   302  	n.instruction = NOP
   303  	n.types = operandTypesNoneToNone
   304  	return n
   305  }
   306  
   307  // Add implements asm.AssemblerBase.
   308  func (a *AssemblerImpl) Add(n asm.Node) {
   309  	a.addNode(n.(*nodeImpl))
   310  }
   311  
   312  // Reset implements asm.AssemblerBase.
   313  func (a *AssemblerImpl) Reset() {
   314  	pool := a.pool
   315  	pool.Reset()
   316  	*a = AssemblerImpl{
   317  		nodePool:                    a.nodePool,
   318  		pool:                        pool,
   319  		readInstructionAddressNodes: a.readInstructionAddressNodes[:0],
   320  		staticConstReferrers:        a.staticConstReferrers[:0],
   321  		BaseAssemblerImpl: asm.BaseAssemblerImpl{
   322  			SetBranchTargetOnNextNodes: a.SetBranchTargetOnNextNodes[:0],
   323  			JumpTableEntries:           a.JumpTableEntries[:0],
   324  		},
   325  	}
   326  	a.nodePool.reset()
   327  }
   328  
   329  // newNode creates a new Node and appends it into the linked list.
   330  func (a *AssemblerImpl) newNode(instruction asm.Instruction, types operandTypes) *nodeImpl {
   331  	n := a.nodePool.allocNode()
   332  	n.instruction = instruction
   333  	n.types = types
   334  	a.addNode(n)
   335  	return n
   336  }
   337  
   338  // addNode appends the new node into the linked list.
   339  func (a *AssemblerImpl) addNode(node *nodeImpl) {
   340  	if a.root == nil {
   341  		a.root = node
   342  		a.current = node
   343  	} else {
   344  		parent := a.current
   345  		parent.next = node
   346  		node.prev = parent
   347  		a.current = node
   348  	}
   349  
   350  	for _, o := range a.SetBranchTargetOnNextNodes {
   351  		origin := o.(*nodeImpl)
   352  		origin.jumpTarget = node
   353  	}
   354  	// Reuse the underlying slice to avoid re-allocations.
   355  	a.SetBranchTargetOnNextNodes = a.SetBranchTargetOnNextNodes[:0]
   356  }
   357  
   358  // encodeNode encodes the given node into writer.
   359  func (a *AssemblerImpl) encodeNode(buf asm.Buffer, n *nodeImpl) (err error) {
   360  	switch n.types {
   361  	case operandTypesNoneToNone:
   362  		err = a.encodeNoneToNone(buf, n)
   363  	case operandTypesNoneToRegister:
   364  		err = a.encodeNoneToRegister(buf, n)
   365  	case operandTypesNoneToMemory:
   366  		err = a.encodeNoneToMemory(buf, n)
   367  	case operandTypesNoneToBranch:
   368  		// Branching operand can be encoded as relative jumps.
   369  		err = a.encodeRelativeJump(buf, n)
   370  	case operandTypesRegisterToNone:
   371  		err = a.encodeRegisterToNone(buf, n)
   372  	case operandTypesRegisterToRegister:
   373  		err = a.encodeRegisterToRegister(buf, n)
   374  	case operandTypesRegisterToMemory:
   375  		err = a.encodeRegisterToMemory(buf, n)
   376  	case operandTypesRegisterToConst:
   377  		err = a.encodeRegisterToConst(buf, n)
   378  	case operandTypesMemoryToRegister:
   379  		err = a.encodeMemoryToRegister(buf, n)
   380  	case operandTypesMemoryToConst:
   381  		err = a.encodeMemoryToConst(buf, n)
   382  	case operandTypesConstToRegister:
   383  		err = a.encodeConstToRegister(buf, n)
   384  	case operandTypesConstToMemory:
   385  		err = a.encodeConstToMemory(buf, n)
   386  	case operandTypesStaticConstToRegister:
   387  		err = a.encodeStaticConstToRegister(buf, n)
   388  	case operandTypesRegisterToStaticConst:
   389  		err = a.encodeRegisterToStaticConst(buf, n)
   390  	default:
   391  		err = fmt.Errorf("encoder undefined for [%s] operand type", n.types)
   392  	}
   393  	if err != nil {
   394  		err = fmt.Errorf("%w: %s", err, n) // Ensure the error is debuggable by including the string value of the node.
   395  	}
   396  	return
   397  }
   398  
   399  // Assemble implements asm.AssemblerBase
   400  func (a *AssemblerImpl) Assemble(buf asm.Buffer) error {
   401  	a.initializeNodesForEncoding()
   402  
   403  	// Continue encoding until we are not forced to re-assemble which happens when
   404  	// a short relative jump ends up the offset larger than 8-bit length.
   405  	for {
   406  		err := a.encode(buf)
   407  		if err != nil {
   408  			return err
   409  		}
   410  
   411  		if !a.forceReAssemble {
   412  			break
   413  		} else {
   414  			// We reset the length of buffer but don't delete the underlying slice since
   415  			// the binary size will roughly the same after reassemble.
   416  			buf.Reset()
   417  			// Reset the re-assemble flag in order to avoid the infinite loop!
   418  			a.forceReAssemble = false
   419  		}
   420  	}
   421  
   422  	code := buf.Bytes()
   423  	for _, n := range a.readInstructionAddressNodes {
   424  		if err := a.finalizeReadInstructionAddressNode(code, n); err != nil {
   425  			return err
   426  		}
   427  	}
   428  
   429  	// Now that we've finished the layout, fill out static consts offsets.
   430  	for i := range a.staticConstReferrers {
   431  		ref := &a.staticConstReferrers[i]
   432  		n, instLen := ref.n, ref.instLen
   433  		// Calculate the displacement between the RIP (the offset _after_ n) and the static constant.
   434  		displacement := int(n.staticConst.OffsetInBinary) - int(n.OffsetInBinary()) - instLen
   435  		// The offset must be stored at the 4 bytes from the tail of this n. See AssemblerImpl.encodeStaticConstImpl for detail.
   436  		displacementOffsetInInstruction := n.OffsetInBinary() + uint64(instLen-4)
   437  		binary.LittleEndian.PutUint32(code[displacementOffsetInInstruction:], uint32(int32(displacement)))
   438  	}
   439  
   440  	return a.FinalizeJumpTableEntry(code)
   441  }
   442  
   443  // initializeNodesForEncoding initializes nodeImpl.flag and determine all the jumps
   444  // are forward or backward jump.
   445  func (a *AssemblerImpl) initializeNodesForEncoding() {
   446  	for n := a.root; n != nil; n = n.next {
   447  		n.flag |= nodeFlagInitializedForEncoding
   448  		if target := n.jumpTarget; target != nil {
   449  			if target.isInitializedForEncoding() {
   450  				// This means the target exists behind.
   451  				n.flag |= nodeFlagBackwardJump
   452  			} else {
   453  				// Otherwise, this is forward jump.
   454  				// We start with assuming that the jump can be short (8-bit displacement).
   455  				// If it doens't fit, we change this flag in resolveRelativeForwardJump.
   456  				n.flag |= nodeFlagShortForwardJump
   457  
   458  				// If the target node is also the branching instruction, we replace the target with the NOP
   459  				// node so that we can avoid the collision of the target.forwardJumpOrigins both as destination and origins.
   460  				if target.types == operandTypesNoneToBranch {
   461  					// Allocate the NOP node from the pool.
   462  					nop := a.nodePool.allocNode()
   463  					nop.instruction = NOP
   464  					nop.types = operandTypesNoneToNone
   465  					// Insert it between target.prev and target: [target.prev, target] -> [target.prev, nop, target]
   466  					prev := target.prev
   467  					nop.prev = prev
   468  					prev.next = nop
   469  					nop.next = target
   470  					target.prev = nop
   471  					n.jumpTarget = nop
   472  					target = nop
   473  				}
   474  
   475  				// We add this node `n` into the end of the linked list (.forwardJumpOrigins) beginning from the `target.forwardJumpOrigins`.
   476  				// Insert the current `n` as the head of the list.
   477  				n.forwardJumpOrigins = target.forwardJumpOrigins
   478  				target.forwardJumpOrigins = n
   479  			}
   480  		}
   481  	}
   482  }
   483  
   484  func (a *AssemblerImpl) encode(buf asm.Buffer) error {
   485  	for n := a.root; n != nil; n = n.next {
   486  		// If an instruction needs NOP padding, we do so before encoding it.
   487  		//
   488  		// This is necessary to avoid Intel's jump erratum; see in Section 2.1
   489  		// in for when we have to pad NOP:
   490  		// https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
   491  		//
   492  		// This logic used to be implemented in a function called maybeNOPPadding,
   493  		// but the complexity of the logic made it impossible for the compiler to
   494  		// inline. Since this function is on a hot code path, we inlined the
   495  		// initial checks to skip the function call when instructions do not need
   496  		// NOP padding.
   497  		switch info := nopPaddingInfo[n.instruction]; {
   498  		case info.jmp:
   499  			if err := a.encodeJmpNOPPadding(buf, n); err != nil {
   500  				return err
   501  			}
   502  		case info.onNextJmp:
   503  			if err := a.encodeOnNextJmpNOPPAdding(buf, n); err != nil {
   504  				return err
   505  			}
   506  		}
   507  
   508  		// After the padding, we can finalize the offset of this instruction in the binary.
   509  		n.offsetInBinary = uint64(buf.Len())
   510  
   511  		if err := a.encodeNode(buf, n); err != nil {
   512  			return err
   513  		}
   514  
   515  		if n.forwardJumpOrigins != nil {
   516  			if err := a.resolveForwardRelativeJumps(buf, n); err != nil {
   517  				return fmt.Errorf("invalid relative forward jumps: %w", err)
   518  			}
   519  		}
   520  
   521  		a.maybeFlushConstants(buf, n.next == nil)
   522  	}
   523  	return nil
   524  }
   525  
   526  var nopPaddingInfo = [instructionEnd]struct {
   527  	jmp, onNextJmp bool
   528  }{
   529  	RET: {jmp: true},
   530  	JMP: {jmp: true},
   531  	JCC: {jmp: true},
   532  	JCS: {jmp: true},
   533  	JEQ: {jmp: true},
   534  	JGE: {jmp: true},
   535  	JGT: {jmp: true},
   536  	JHI: {jmp: true},
   537  	JLE: {jmp: true},
   538  	JLS: {jmp: true},
   539  	JLT: {jmp: true},
   540  	JMI: {jmp: true},
   541  	JNE: {jmp: true},
   542  	JPC: {jmp: true},
   543  	JPS: {jmp: true},
   544  	// The possible fused jump instructions if the next node is a conditional jump instruction.
   545  	CMPL:  {onNextJmp: true},
   546  	CMPQ:  {onNextJmp: true},
   547  	TESTL: {onNextJmp: true},
   548  	TESTQ: {onNextJmp: true},
   549  	ADDL:  {onNextJmp: true},
   550  	ADDQ:  {onNextJmp: true},
   551  	SUBL:  {onNextJmp: true},
   552  	SUBQ:  {onNextJmp: true},
   553  	ANDL:  {onNextJmp: true},
   554  	ANDQ:  {onNextJmp: true},
   555  	INCQ:  {onNextJmp: true},
   556  	DECQ:  {onNextJmp: true},
   557  }
   558  
   559  func (a *AssemblerImpl) encodeJmpNOPPadding(buf asm.Buffer, n *nodeImpl) error {
   560  	// In order to know the instruction length before writing into the binary,
   561  	// we try encoding it.
   562  	prevLen := buf.Len()
   563  
   564  	// Assign the temporary offset which may or may not be correct depending on the padding decision.
   565  	n.offsetInBinary = uint64(prevLen)
   566  
   567  	// Encode the node and get the instruction length.
   568  	if err := a.encodeNode(buf, n); err != nil {
   569  		return err
   570  	}
   571  	instructionLen := int32(buf.Len() - prevLen)
   572  
   573  	// Revert the written bytes.
   574  	buf.Truncate(prevLen)
   575  	return a.encodeNOPPadding(buf, instructionLen)
   576  }
   577  
   578  func (a *AssemblerImpl) encodeOnNextJmpNOPPAdding(buf asm.Buffer, n *nodeImpl) error {
   579  	instructionLen, err := a.fusedInstructionLength(buf, n)
   580  	if err != nil {
   581  		return err
   582  	}
   583  	return a.encodeNOPPadding(buf, instructionLen)
   584  }
   585  
   586  // encodeNOPPadding maybe appends NOP instructions before the node `n`.
   587  // This is necessary to avoid Intel's jump erratum:
   588  // https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
   589  func (a *AssemblerImpl) encodeNOPPadding(buf asm.Buffer, instructionLen int32) error {
   590  	const boundaryInBytes int32 = 32
   591  	const mask = boundaryInBytes - 1
   592  	var padNum int
   593  	currentPos := int32(buf.Len())
   594  	if used := currentPos & mask; used+instructionLen >= boundaryInBytes {
   595  		padNum = int(boundaryInBytes - used)
   596  	}
   597  	a.padNOP(buf, padNum)
   598  	return nil
   599  }
   600  
   601  // fusedInstructionLength returns the length of "macro fused instruction" if the
   602  // instruction sequence starting from `n` can be fused by processor. Otherwise,
   603  // returns zero.
   604  func (a *AssemblerImpl) fusedInstructionLength(buf asm.Buffer, n *nodeImpl) (ret int32, err error) {
   605  	// Find the next non-NOP instruction.
   606  	next := n.next
   607  	for ; next != nil && next.instruction == NOP; next = next.next {
   608  	}
   609  
   610  	if next == nil {
   611  		return
   612  	}
   613  
   614  	inst, jmpInst := n.instruction, next.instruction
   615  
   616  	if !nopPaddingInfo[jmpInst].jmp {
   617  		// If the next instruction is not jump kind, the instruction will not be fused.
   618  		return
   619  	}
   620  
   621  	// How to determine whether the instruction can be fused is described in
   622  	// Section 3.4.2.2 of "Intel Optimization Manual":
   623  	// https://www.intel.com/content/dam/doc/manual/64-ia-32-architectures-optimization-manual.pdf
   624  	isTest := inst == TESTL || inst == TESTQ
   625  	isCmp := inst == CMPQ || inst == CMPL
   626  	isTestCmp := isTest || isCmp
   627  	if isTestCmp && (n.types == operandTypesMemoryToConst || n.types == operandTypesConstToMemory) {
   628  		// The manual says: "CMP and TEST can not be fused when comparing MEM-IMM".
   629  		return
   630  	}
   631  
   632  	// Implement the decision according to the table 3-1 in the manual.
   633  	isAnd := inst == ANDL || inst == ANDQ
   634  	if !isTest && !isAnd {
   635  		if jmpInst == JMI || jmpInst == JPL || jmpInst == JPS || jmpInst == JPC {
   636  			// These jumps are only fused for TEST or AND.
   637  			return
   638  		}
   639  		isAdd := inst == ADDL || inst == ADDQ
   640  		isSub := inst == SUBL || inst == SUBQ
   641  		if !isCmp && !isAdd && !isSub {
   642  			if jmpInst == JCS || jmpInst == JCC || jmpInst == JHI || jmpInst == JLS {
   643  				// Thses jumpst are only fused for TEST, AND, CMP, ADD, or SUB.
   644  				return
   645  			}
   646  		}
   647  	}
   648  
   649  	// Now the instruction is ensured to be fused by the processor.
   650  	// In order to know the fused instruction length before writing into the binary,
   651  	// we try encoding it.
   652  	savedLen := uint64(buf.Len())
   653  
   654  	// Encode the nodes into the buffer.
   655  	if err = a.encodeNode(buf, n); err != nil {
   656  		return
   657  	}
   658  	if err = a.encodeNode(buf, next); err != nil {
   659  		return
   660  	}
   661  
   662  	ret = int32(uint64(buf.Len()) - savedLen)
   663  
   664  	// Revert the written bytes.
   665  	buf.Truncate(int(savedLen))
   666  	return
   667  }
   668  
   669  // nopOpcodes is the multi byte NOP instructions table derived from section 5.8 "Code Padding with Operand-Size Override and Multibyte NOP"
   670  // in "AMD Software Optimization Guide for AMD Family 15h Processors" https://www.amd.com/system/files/TechDocs/47414_15h_sw_opt_guide.pdf
   671  var nopOpcodes = [][11]byte{
   672  	{0x90},
   673  	{0x66, 0x90},
   674  	{0x0f, 0x1f, 0x00},
   675  	{0x0f, 0x1f, 0x40, 0x00},
   676  	{0x0f, 0x1f, 0x44, 0x00, 0x00},
   677  	{0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
   678  	{0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
   679  	{0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
   680  	{0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
   681  	{0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
   682  	{0x66, 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
   683  }
   684  
   685  func (a *AssemblerImpl) padNOP(buf asm.Buffer, num int) {
   686  	for num > 0 {
   687  		singleNopNum := num
   688  		if singleNopNum > len(nopOpcodes) {
   689  			singleNopNum = len(nopOpcodes)
   690  		}
   691  		buf.AppendBytes(nopOpcodes[singleNopNum-1][:singleNopNum])
   692  		num -= singleNopNum
   693  	}
   694  }
   695  
   696  // CompileStandAlone implements the same method as documented on asm.AssemblerBase.
   697  func (a *AssemblerImpl) CompileStandAlone(instruction asm.Instruction) asm.Node {
   698  	return a.newNode(instruction, operandTypesNoneToNone)
   699  }
   700  
   701  // CompileConstToRegister implements the same method as documented on asm.AssemblerBase.
   702  func (a *AssemblerImpl) CompileConstToRegister(
   703  	instruction asm.Instruction,
   704  	value asm.ConstantValue,
   705  	destinationReg asm.Register,
   706  ) (inst asm.Node) {
   707  	n := a.newNode(instruction, operandTypesConstToRegister)
   708  	n.srcConst = value
   709  	n.dstReg = destinationReg
   710  	return n
   711  }
   712  
   713  // CompileRegisterToRegister implements the same method as documented on asm.AssemblerBase.
   714  func (a *AssemblerImpl) CompileRegisterToRegister(instruction asm.Instruction, from, to asm.Register) {
   715  	n := a.newNode(instruction, operandTypesRegisterToRegister)
   716  	n.srcReg = from
   717  	n.dstReg = to
   718  }
   719  
   720  // CompileMemoryToRegister implements the same method as documented on asm.AssemblerBase.
   721  func (a *AssemblerImpl) CompileMemoryToRegister(
   722  	instruction asm.Instruction,
   723  	sourceBaseReg asm.Register,
   724  	sourceOffsetConst asm.ConstantValue,
   725  	destinationReg asm.Register,
   726  ) {
   727  	n := a.newNode(instruction, operandTypesMemoryToRegister)
   728  	n.srcReg = sourceBaseReg
   729  	n.srcConst = sourceOffsetConst
   730  	n.dstReg = destinationReg
   731  }
   732  
   733  // CompileRegisterToMemory implements the same method as documented on asm.AssemblerBase.
   734  func (a *AssemblerImpl) CompileRegisterToMemory(
   735  	instruction asm.Instruction,
   736  	sourceRegister, destinationBaseRegister asm.Register,
   737  	destinationOffsetConst asm.ConstantValue,
   738  ) {
   739  	n := a.newNode(instruction, operandTypesRegisterToMemory)
   740  	n.srcReg = sourceRegister
   741  	n.dstReg = destinationBaseRegister
   742  	n.dstConst = destinationOffsetConst
   743  }
   744  
   745  // CompileJump implements the same method as documented on asm.AssemblerBase.
   746  func (a *AssemblerImpl) CompileJump(jmpInstruction asm.Instruction) asm.Node {
   747  	return a.newNode(jmpInstruction, operandTypesNoneToBranch)
   748  }
   749  
   750  // CompileJumpToMemory implements the same method as documented on asm.AssemblerBase.
   751  func (a *AssemblerImpl) CompileJumpToMemory(
   752  	jmpInstruction asm.Instruction,
   753  	baseReg asm.Register,
   754  	offset asm.ConstantValue,
   755  ) {
   756  	n := a.newNode(jmpInstruction, operandTypesNoneToMemory)
   757  	n.dstReg = baseReg
   758  	n.dstConst = offset
   759  }
   760  
   761  // CompileJumpToRegister implements the same method as documented on asm.AssemblerBase.
   762  func (a *AssemblerImpl) CompileJumpToRegister(jmpInstruction asm.Instruction, reg asm.Register) {
   763  	n := a.newNode(jmpInstruction, operandTypesNoneToRegister)
   764  	n.dstReg = reg
   765  }
   766  
   767  // CompileReadInstructionAddress implements the same method as documented on asm.AssemblerBase.
   768  func (a *AssemblerImpl) CompileReadInstructionAddress(
   769  	destinationRegister asm.Register,
   770  	beforeAcquisitionTargetInstruction asm.Instruction,
   771  ) {
   772  	n := a.newNode(LEAQ, operandTypesMemoryToRegister)
   773  	n.dstReg = destinationRegister
   774  	n.readInstructionAddressBeforeTargetInstruction = beforeAcquisitionTargetInstruction
   775  }
   776  
   777  // CompileRegisterToRegisterWithArg implements the same method as documented on amd64.Assembler.
   778  func (a *AssemblerImpl) CompileRegisterToRegisterWithArg(
   779  	instruction asm.Instruction,
   780  	from, to asm.Register,
   781  	arg byte,
   782  ) {
   783  	n := a.newNode(instruction, operandTypesRegisterToRegister)
   784  	n.srcReg = from
   785  	n.dstReg = to
   786  	n.arg = arg
   787  }
   788  
   789  // CompileMemoryWithIndexToRegister implements the same method as documented on amd64.Assembler.
   790  func (a *AssemblerImpl) CompileMemoryWithIndexToRegister(
   791  	instruction asm.Instruction,
   792  	srcBaseReg asm.Register,
   793  	srcOffsetConst asm.ConstantValue,
   794  	srcIndex asm.Register,
   795  	srcScale int16,
   796  	dstReg asm.Register,
   797  ) {
   798  	n := a.newNode(instruction, operandTypesMemoryToRegister)
   799  	n.srcReg = srcBaseReg
   800  	n.srcConst = srcOffsetConst
   801  	n.srcMemIndex = srcIndex
   802  	n.srcMemScale = byte(srcScale)
   803  	n.dstReg = dstReg
   804  }
   805  
   806  // CompileMemoryWithIndexAndArgToRegister implements the same method as documented on amd64.Assembler.
   807  func (a *AssemblerImpl) CompileMemoryWithIndexAndArgToRegister(
   808  	instruction asm.Instruction,
   809  	srcBaseReg asm.Register,
   810  	srcOffsetConst asm.ConstantValue,
   811  	srcIndex asm.Register,
   812  	srcScale int16,
   813  	dstReg asm.Register,
   814  	arg byte,
   815  ) {
   816  	n := a.newNode(instruction, operandTypesMemoryToRegister)
   817  	n.srcReg = srcBaseReg
   818  	n.srcConst = srcOffsetConst
   819  	n.srcMemIndex = srcIndex
   820  	n.srcMemScale = byte(srcScale)
   821  	n.dstReg = dstReg
   822  	n.arg = arg
   823  }
   824  
   825  // CompileRegisterToMemoryWithIndex implements the same method as documented on amd64.Assembler.
   826  func (a *AssemblerImpl) CompileRegisterToMemoryWithIndex(
   827  	instruction asm.Instruction,
   828  	srcReg, dstBaseReg asm.Register,
   829  	dstOffsetConst asm.ConstantValue,
   830  	dstIndex asm.Register,
   831  	dstScale int16,
   832  ) {
   833  	n := a.newNode(instruction, operandTypesRegisterToMemory)
   834  	n.srcReg = srcReg
   835  	n.dstReg = dstBaseReg
   836  	n.dstConst = dstOffsetConst
   837  	n.dstMemIndex = dstIndex
   838  	n.dstMemScale = byte(dstScale)
   839  }
   840  
   841  // CompileRegisterToMemoryWithIndexAndArg implements the same method as documented on amd64.Assembler.
   842  func (a *AssemblerImpl) CompileRegisterToMemoryWithIndexAndArg(
   843  	instruction asm.Instruction,
   844  	srcReg, dstBaseReg asm.Register,
   845  	dstOffsetConst asm.ConstantValue,
   846  	dstIndex asm.Register,
   847  	dstScale int16,
   848  	arg byte,
   849  ) {
   850  	n := a.newNode(instruction, operandTypesRegisterToMemory)
   851  	n.srcReg = srcReg
   852  	n.dstReg = dstBaseReg
   853  	n.dstConst = dstOffsetConst
   854  	n.dstMemIndex = dstIndex
   855  	n.dstMemScale = byte(dstScale)
   856  	n.arg = arg
   857  }
   858  
   859  // CompileRegisterToConst implements the same method as documented on amd64.Assembler.
   860  func (a *AssemblerImpl) CompileRegisterToConst(
   861  	instruction asm.Instruction,
   862  	srcRegister asm.Register,
   863  	value asm.ConstantValue,
   864  ) asm.Node {
   865  	n := a.newNode(instruction, operandTypesRegisterToConst)
   866  	n.srcReg = srcRegister
   867  	n.dstConst = value
   868  	return n
   869  }
   870  
   871  // CompileRegisterToNone implements the same method as documented on amd64.Assembler.
   872  func (a *AssemblerImpl) CompileRegisterToNone(instruction asm.Instruction, register asm.Register) {
   873  	n := a.newNode(instruction, operandTypesRegisterToNone)
   874  	n.srcReg = register
   875  }
   876  
   877  // CompileNoneToRegister implements the same method as documented on amd64.Assembler.
   878  func (a *AssemblerImpl) CompileNoneToRegister(instruction asm.Instruction, register asm.Register) {
   879  	n := a.newNode(instruction, operandTypesNoneToRegister)
   880  	n.dstReg = register
   881  }
   882  
   883  // CompileNoneToMemory implements the same method as documented on amd64.Assembler.
   884  func (a *AssemblerImpl) CompileNoneToMemory(
   885  	instruction asm.Instruction,
   886  	baseReg asm.Register,
   887  	offset asm.ConstantValue,
   888  ) {
   889  	n := a.newNode(instruction, operandTypesNoneToMemory)
   890  	n.dstReg = baseReg
   891  	n.dstConst = offset
   892  }
   893  
   894  // CompileConstToMemory implements the same method as documented on amd64.Assembler.
   895  func (a *AssemblerImpl) CompileConstToMemory(
   896  	instruction asm.Instruction,
   897  	value asm.ConstantValue,
   898  	dstbaseReg asm.Register,
   899  	dstOffset asm.ConstantValue,
   900  ) asm.Node {
   901  	n := a.newNode(instruction, operandTypesConstToMemory)
   902  	n.srcConst = value
   903  	n.dstReg = dstbaseReg
   904  	n.dstConst = dstOffset
   905  	return n
   906  }
   907  
   908  // CompileMemoryToConst implements the same method as documented on amd64.Assembler.
   909  func (a *AssemblerImpl) CompileMemoryToConst(
   910  	instruction asm.Instruction,
   911  	srcBaseReg asm.Register,
   912  	srcOffset, value asm.ConstantValue,
   913  ) asm.Node {
   914  	n := a.newNode(instruction, operandTypesMemoryToConst)
   915  	n.srcReg = srcBaseReg
   916  	n.srcConst = srcOffset
   917  	n.dstConst = value
   918  	return n
   919  }
   920  
   921  func errorEncodingUnsupported(n *nodeImpl) error {
   922  	return fmt.Errorf("%s is unsupported for %s type", InstructionName(n.instruction), n.types)
   923  }
   924  
   925  func (a *AssemblerImpl) encodeNoneToNone(buf asm.Buffer, n *nodeImpl) (err error) {
   926  	// Throughout the encoding methods, we use this pair of base offset and
   927  	// code buffer to write instructions.
   928  	//
   929  	// The code buffer is allocated at the end of the current buffer to a size
   930  	// large enough to hold all the bytes that may be written by the method.
   931  	//
   932  	// We use Go's append builtin to write to the buffer because it allows the
   933  	// compiler to generate much better code than if we made calls to write
   934  	// methods to mutate an encapsulated byte slice.
   935  	//
   936  	// At the end of the method, we truncate the buffer size back to the base
   937  	// plus the length of the code buffer so the end of the buffer points right
   938  	// after the last byte that was written.
   939  	base := buf.Len()
   940  	code := buf.Append(4)[:0]
   941  
   942  	switch n.instruction {
   943  	case CDQ:
   944  		// https://www.felixcloutier.com/x86/cwd:cdq:cqo
   945  		code = append(code, 0x99)
   946  	case CQO:
   947  		// https://www.felixcloutier.com/x86/cwd:cdq:cqo
   948  		code = append(code, rexPrefixW, 0x99)
   949  	case NOP:
   950  		// Simply optimize out the NOP instructions.
   951  	case RET:
   952  		// https://www.felixcloutier.com/x86/ret
   953  		code = append(code, 0xc3)
   954  	case UD2:
   955  		// https://mudongliang.github.io/x86/html/file_module_x86_id_318.html
   956  		code = append(code, 0x0f, 0x0b)
   957  	case REPMOVSQ:
   958  		code = append(code, 0xf3, rexPrefixW, 0xa5)
   959  	case REPSTOSQ:
   960  		code = append(code, 0xf3, rexPrefixW, 0xab)
   961  	case STD:
   962  		code = append(code, 0xfd)
   963  	case CLD:
   964  		code = append(code, 0xfc)
   965  	default:
   966  		err = errorEncodingUnsupported(n)
   967  	}
   968  
   969  	buf.Truncate(base + len(code))
   970  	return
   971  }
   972  
   973  func (a *AssemblerImpl) encodeNoneToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
   974  	regBits, prefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldRM)
   975  
   976  	// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
   977  	modRM := 0b11_000_000 | // Specifying that opeand is register.
   978  		regBits
   979  	if n.instruction == JMP {
   980  		// JMP's opcode is defined as "FF /4" meaning that we have to have "4"
   981  		// in 4-6th bits in the ModRM byte. https://www.felixcloutier.com/x86/jmp
   982  		modRM |= 0b00_100_000
   983  	} else if n.instruction == NEGQ {
   984  		prefix |= rexPrefixW
   985  		modRM |= 0b00_011_000
   986  	} else if n.instruction == INCQ {
   987  		prefix |= rexPrefixW
   988  	} else if n.instruction == DECQ {
   989  		prefix |= rexPrefixW
   990  		modRM |= 0b00_001_000
   991  	} else {
   992  		if RegSP <= n.dstReg && n.dstReg <= RegDI {
   993  			// If the destination is one byte length register, we need to have the default prefix.
   994  			// https: //wiki.osdev.org/X86-64_Instruction_Encoding#Registers
   995  			prefix |= rexPrefixDefault
   996  		}
   997  	}
   998  
   999  	base := buf.Len()
  1000  	code := buf.Append(4)[:0]
  1001  
  1002  	if prefix != rexPrefixNone {
  1003  		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Encoding
  1004  		code = append(code, prefix)
  1005  	}
  1006  
  1007  	switch n.instruction {
  1008  	case JMP:
  1009  		// https://www.felixcloutier.com/x86/jmp
  1010  		code = append(code, 0xff, modRM)
  1011  	case SETCC:
  1012  		// https://www.felixcloutier.com/x86/setcc
  1013  		code = append(code, 0x0f, 0x93, modRM)
  1014  	case SETCS:
  1015  		// https://www.felixcloutier.com/x86/setcc
  1016  		code = append(code, 0x0f, 0x92, modRM)
  1017  	case SETEQ:
  1018  		// https://www.felixcloutier.com/x86/setcc
  1019  		code = append(code, 0x0f, 0x94, modRM)
  1020  	case SETGE:
  1021  		// https://www.felixcloutier.com/x86/setcc
  1022  		code = append(code, 0x0f, 0x9d, modRM)
  1023  	case SETGT:
  1024  		// https://www.felixcloutier.com/x86/setcc
  1025  		code = append(code, 0x0f, 0x9f, modRM)
  1026  	case SETHI:
  1027  		// https://www.felixcloutier.com/x86/setcc
  1028  		code = append(code, 0x0f, 0x97, modRM)
  1029  	case SETLE:
  1030  		// https://www.felixcloutier.com/x86/setcc
  1031  		code = append(code, 0x0f, 0x9e, modRM)
  1032  	case SETLS:
  1033  		// https://www.felixcloutier.com/x86/setcc
  1034  		code = append(code, 0x0f, 0x96, modRM)
  1035  	case SETLT:
  1036  		// https://www.felixcloutier.com/x86/setcc
  1037  		code = append(code, 0x0f, 0x9c, modRM)
  1038  	case SETNE:
  1039  		// https://www.felixcloutier.com/x86/setcc
  1040  		code = append(code, 0x0f, 0x95, modRM)
  1041  	case SETPC:
  1042  		// https://www.felixcloutier.com/x86/setcc
  1043  		code = append(code, 0x0f, 0x9b, modRM)
  1044  	case SETPS:
  1045  		// https://www.felixcloutier.com/x86/setcc
  1046  		code = append(code, 0x0f, 0x9a, modRM)
  1047  	case NEGQ:
  1048  		// https://www.felixcloutier.com/x86/neg
  1049  		code = append(code, 0xf7, modRM)
  1050  	case INCQ:
  1051  		// https://www.felixcloutier.com/x86/inc
  1052  		code = append(code, 0xff, modRM)
  1053  	case DECQ:
  1054  		// https://www.felixcloutier.com/x86/dec
  1055  		code = append(code, 0xff, modRM)
  1056  	default:
  1057  		err = errorEncodingUnsupported(n)
  1058  	}
  1059  
  1060  	buf.Truncate(base + len(code))
  1061  	return
  1062  }
  1063  
  1064  func (a *AssemblerImpl) encodeNoneToMemory(buf asm.Buffer, n *nodeImpl) (err error) {
  1065  	rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(true)
  1066  	if err != nil {
  1067  		return err
  1068  	}
  1069  
  1070  	var opcode byte
  1071  	switch n.instruction {
  1072  	case INCQ:
  1073  		// https://www.felixcloutier.com/x86/inc
  1074  		rexPrefix |= rexPrefixW
  1075  		opcode = 0xff
  1076  	case DECQ:
  1077  		// https://www.felixcloutier.com/x86/dec
  1078  		rexPrefix |= rexPrefixW
  1079  		modRM |= 0b00_001_000 // DEC needs "/1" extension in ModRM.
  1080  		opcode = 0xff
  1081  	case JMP:
  1082  		// https://www.felixcloutier.com/x86/jmp
  1083  		modRM |= 0b00_100_000 // JMP needs "/4" extension in ModRM.
  1084  		opcode = 0xff
  1085  	default:
  1086  		return errorEncodingUnsupported(n)
  1087  	}
  1088  
  1089  	base := buf.Len()
  1090  	code := buf.Append(12)[:0]
  1091  
  1092  	if rexPrefix != rexPrefixNone {
  1093  		code = append(code, rexPrefix)
  1094  	}
  1095  
  1096  	code = append(code, opcode, modRM)
  1097  
  1098  	if sbiExist {
  1099  		code = append(code, sbi)
  1100  	}
  1101  
  1102  	if displacementWidth != 0 {
  1103  		code = appendConst(code, n.dstConst, displacementWidth)
  1104  	}
  1105  
  1106  	buf.Truncate(base + len(code))
  1107  	return
  1108  }
  1109  
  1110  type relativeJumpOpcode struct{ short, long []byte }
  1111  
  1112  func (o relativeJumpOpcode) instructionLen(short bool) int64 {
  1113  	if short {
  1114  		return int64(len(o.short)) + 1 // 1 byte = 8 bit offset
  1115  	} else {
  1116  		return int64(len(o.long)) + 4 // 4 byte = 32 bit offset
  1117  	}
  1118  }
  1119  
  1120  var relativeJumpOpcodes = [...]relativeJumpOpcode{
  1121  	// https://www.felixcloutier.com/x86/jcc
  1122  	JCC: {short: []byte{0x73}, long: []byte{0x0f, 0x83}},
  1123  	JCS: {short: []byte{0x72}, long: []byte{0x0f, 0x82}},
  1124  	JEQ: {short: []byte{0x74}, long: []byte{0x0f, 0x84}},
  1125  	JGE: {short: []byte{0x7d}, long: []byte{0x0f, 0x8d}},
  1126  	JGT: {short: []byte{0x7f}, long: []byte{0x0f, 0x8f}},
  1127  	JHI: {short: []byte{0x77}, long: []byte{0x0f, 0x87}},
  1128  	JLE: {short: []byte{0x7e}, long: []byte{0x0f, 0x8e}},
  1129  	JLS: {short: []byte{0x76}, long: []byte{0x0f, 0x86}},
  1130  	JLT: {short: []byte{0x7c}, long: []byte{0x0f, 0x8c}},
  1131  	JMI: {short: []byte{0x78}, long: []byte{0x0f, 0x88}},
  1132  	JPL: {short: []byte{0x79}, long: []byte{0x0f, 0x89}},
  1133  	JNE: {short: []byte{0x75}, long: []byte{0x0f, 0x85}},
  1134  	JPC: {short: []byte{0x7b}, long: []byte{0x0f, 0x8b}},
  1135  	JPS: {short: []byte{0x7a}, long: []byte{0x0f, 0x8a}},
  1136  	// https://www.felixcloutier.com/x86/jmp
  1137  	JMP: {short: []byte{0xeb}, long: []byte{0xe9}},
  1138  }
  1139  
  1140  func (a *AssemblerImpl) resolveForwardRelativeJumps(buf asm.Buffer, target *nodeImpl) (err error) {
  1141  	offsetInBinary := int64(target.OffsetInBinary())
  1142  	origin := target.forwardJumpOrigins
  1143  	for ; origin != nil; origin = origin.forwardJumpOrigins {
  1144  		shortJump := origin.isForwardShortJump()
  1145  		op := relativeJumpOpcodes[origin.instruction]
  1146  		instructionLen := op.instructionLen(shortJump)
  1147  
  1148  		// Calculate the offset from the EIP (at the time of executing this jump instruction)
  1149  		// to the target instruction. This value is always >= 0 as here we only handle forward jumps.
  1150  		offset := offsetInBinary - (int64(origin.OffsetInBinary()) + instructionLen)
  1151  		if shortJump {
  1152  			if offset > math.MaxInt8 {
  1153  				// This forces reassemble in the outer loop inside AssemblerImpl.Assemble().
  1154  				a.forceReAssemble = true
  1155  				// From the next reAssemble phases, this forward jump will be encoded long jump and
  1156  				// allocate 32-bit offset bytes by default. This means that this `origin` node
  1157  				// will always enter the "long jump offset encoding" block below
  1158  				origin.flag ^= nodeFlagShortForwardJump
  1159  			} else {
  1160  				buf.Bytes()[origin.OffsetInBinary()+uint64(instructionLen)-1] = byte(offset)
  1161  			}
  1162  		} else { // long jump offset encoding.
  1163  			if offset > math.MaxInt32 {
  1164  				return fmt.Errorf("too large jump offset %d for encoding %s", offset, InstructionName(origin.instruction))
  1165  			}
  1166  			binary.LittleEndian.PutUint32(buf.Bytes()[origin.OffsetInBinary()+uint64(instructionLen)-4:], uint32(offset))
  1167  		}
  1168  	}
  1169  	return nil
  1170  }
  1171  
  1172  func (a *AssemblerImpl) encodeRelativeJump(buf asm.Buffer, n *nodeImpl) (err error) {
  1173  	if n.jumpTarget == nil {
  1174  		err = fmt.Errorf("jump target must not be nil for relative %s", InstructionName(n.instruction))
  1175  		return
  1176  	}
  1177  
  1178  	op := relativeJumpOpcodes[n.instruction]
  1179  	var isShortJump bool
  1180  	// offsetOfEIP means the offset of EIP register at the time of executing this jump instruction.
  1181  	// Relative jump instructions can be encoded with the signed 8-bit or 32-bit integer offsets from the EIP.
  1182  	var offsetOfEIP int64 = 0 // We set zero and resolve later once the target instruction is encoded for forward jumps
  1183  	if n.isBackwardJump() {
  1184  		// If this is the backward jump, we can calculate the exact offset now.
  1185  		offsetOfJumpInstruction := int64(n.jumpTarget.OffsetInBinary()) - int64(n.OffsetInBinary())
  1186  		isShortJump = offsetOfJumpInstruction-2 >= math.MinInt8
  1187  		offsetOfEIP = offsetOfJumpInstruction - op.instructionLen(isShortJump)
  1188  	} else {
  1189  		// For forward jumps, we resolve the offset when we Encode the target node. See AssemblerImpl.ResolveForwardRelativeJumps.
  1190  		isShortJump = n.isForwardShortJump()
  1191  	}
  1192  
  1193  	if offsetOfEIP < math.MinInt32 { // offsetOfEIP is always <= 0 as we don't calculate it for forward jump here.
  1194  		return fmt.Errorf("too large jump offset %d for encoding %s", offsetOfEIP, InstructionName(n.instruction))
  1195  	}
  1196  
  1197  	base := buf.Len()
  1198  	code := buf.Append(6)[:0]
  1199  
  1200  	if isShortJump {
  1201  		code = append(code, op.short...)
  1202  		code = append(code, byte(offsetOfEIP))
  1203  	} else {
  1204  		code = append(code, op.long...)
  1205  		code = appendUint32(code, uint32(offsetOfEIP))
  1206  	}
  1207  
  1208  	buf.Truncate(base + len(code))
  1209  	return
  1210  }
  1211  
  1212  func (a *AssemblerImpl) encodeRegisterToNone(buf asm.Buffer, n *nodeImpl) (err error) {
  1213  	regBits, prefix := register3bits(n.srcReg, registerSpecifierPositionModRMFieldRM)
  1214  
  1215  	// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  1216  	modRM := 0b11_000_000 | // Specifying that opeand is register.
  1217  		regBits
  1218  
  1219  	var opcode byte
  1220  	switch n.instruction {
  1221  	case DIVL:
  1222  		// https://www.felixcloutier.com/x86/div
  1223  		modRM |= 0b00_110_000
  1224  		opcode = 0xf7
  1225  	case DIVQ:
  1226  		// https://www.felixcloutier.com/x86/div
  1227  		prefix |= rexPrefixW
  1228  		modRM |= 0b00_110_000
  1229  		opcode = 0xf7
  1230  	case IDIVL:
  1231  		// https://www.felixcloutier.com/x86/idiv
  1232  		modRM |= 0b00_111_000
  1233  		opcode = 0xf7
  1234  	case IDIVQ:
  1235  		// https://www.felixcloutier.com/x86/idiv
  1236  		prefix |= rexPrefixW
  1237  		modRM |= 0b00_111_000
  1238  		opcode = 0xf7
  1239  	case MULL:
  1240  		// https://www.felixcloutier.com/x86/mul
  1241  		modRM |= 0b00_100_000
  1242  		opcode = 0xf7
  1243  	case MULQ:
  1244  		// https://www.felixcloutier.com/x86/mul
  1245  		prefix |= rexPrefixW
  1246  		modRM |= 0b00_100_000
  1247  		opcode = 0xf7
  1248  	default:
  1249  		err = errorEncodingUnsupported(n)
  1250  	}
  1251  
  1252  	base := buf.Len()
  1253  	code := buf.Append(3)[:0]
  1254  
  1255  	if prefix != rexPrefixNone {
  1256  		code = append(code, prefix)
  1257  	}
  1258  
  1259  	code = append(code, opcode, modRM)
  1260  
  1261  	buf.Truncate(base + len(code))
  1262  	return
  1263  }
  1264  
  1265  var registerToRegisterOpcode = [instructionEnd]*struct {
  1266  	opcode          []byte
  1267  	rPrefix         rexPrefix
  1268  	mandatoryPrefix byte
  1269  	srcOnModRMReg   bool
  1270  	isSrc8bit       bool
  1271  	needArg         bool
  1272  }{
  1273  	// https://www.felixcloutier.com/x86/add
  1274  	ADDL: {opcode: []byte{0x1}, srcOnModRMReg: true},
  1275  	ADDQ: {opcode: []byte{0x1}, rPrefix: rexPrefixW, srcOnModRMReg: true},
  1276  	// https://www.felixcloutier.com/x86/and
  1277  	ANDL: {opcode: []byte{0x21}, srcOnModRMReg: true},
  1278  	ANDQ: {opcode: []byte{0x21}, rPrefix: rexPrefixW, srcOnModRMReg: true},
  1279  	// https://www.felixcloutier.com/x86/cmp
  1280  	CMPL: {opcode: []byte{0x39}},
  1281  	CMPQ: {opcode: []byte{0x39}, rPrefix: rexPrefixW},
  1282  	// https://www.felixcloutier.com/x86/cmovcc
  1283  	CMOVQCS: {opcode: []byte{0x0f, 0x42}, rPrefix: rexPrefixW},
  1284  	// https://www.felixcloutier.com/x86/addsd
  1285  	ADDSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x58}},
  1286  	// https://www.felixcloutier.com/x86/addss
  1287  	ADDSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x58}},
  1288  	// https://www.felixcloutier.com/x86/addpd
  1289  	ANDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x54}},
  1290  	// https://www.felixcloutier.com/x86/addps
  1291  	ANDPS: {opcode: []byte{0x0f, 0x54}},
  1292  	// https://www.felixcloutier.com/x86/bsr
  1293  	BSRL: {opcode: []byte{0xf, 0xbd}},
  1294  	BSRQ: {opcode: []byte{0xf, 0xbd}, rPrefix: rexPrefixW},
  1295  	// https://www.felixcloutier.com/x86/comisd
  1296  	COMISD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x2f}},
  1297  	// https://www.felixcloutier.com/x86/comiss
  1298  	COMISS: {opcode: []byte{0x0f, 0x2f}},
  1299  	// https://www.felixcloutier.com/x86/cvtsd2ss
  1300  	CVTSD2SS: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5a}},
  1301  	// https://www.felixcloutier.com/x86/cvtsi2sd
  1302  	CVTSL2SD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2a}},
  1303  	// https://www.felixcloutier.com/x86/cvtsi2sd
  1304  	CVTSQ2SD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2a}, rPrefix: rexPrefixW},
  1305  	// https://www.felixcloutier.com/x86/cvtsi2ss
  1306  	CVTSL2SS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2a}},
  1307  	// https://www.felixcloutier.com/x86/cvtsi2ss
  1308  	CVTSQ2SS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2a}, rPrefix: rexPrefixW},
  1309  	// https://www.felixcloutier.com/x86/cvtss2sd
  1310  	CVTSS2SD: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5a}},
  1311  	// https://www.felixcloutier.com/x86/cvttsd2si
  1312  	CVTTSD2SL: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2c}},
  1313  	CVTTSD2SQ: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2c}, rPrefix: rexPrefixW},
  1314  	// https://www.felixcloutier.com/x86/cvttss2si
  1315  	CVTTSS2SL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2c}},
  1316  	CVTTSS2SQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2c}, rPrefix: rexPrefixW},
  1317  	// https://www.felixcloutier.com/x86/divsd
  1318  	DIVSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5e}},
  1319  	// https://www.felixcloutier.com/x86/divss
  1320  	DIVSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5e}},
  1321  	// https://www.felixcloutier.com/x86/lzcnt
  1322  	LZCNTL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbd}},
  1323  	LZCNTQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbd}, rPrefix: rexPrefixW},
  1324  	// https://www.felixcloutier.com/x86/maxsd
  1325  	MAXSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5f}},
  1326  	// https://www.felixcloutier.com/x86/maxss
  1327  	MAXSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5f}},
  1328  	// https://www.felixcloutier.com/x86/minsd
  1329  	MINSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5d}},
  1330  	// https://www.felixcloutier.com/x86/minss
  1331  	MINSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5d}},
  1332  	// https://www.felixcloutier.com/x86/movsx:movsxd
  1333  	MOVBLSX: {opcode: []byte{0x0f, 0xbe}, isSrc8bit: true},
  1334  	// https://www.felixcloutier.com/x86/movzx
  1335  	MOVBLZX: {opcode: []byte{0x0f, 0xb6}, isSrc8bit: true},
  1336  	// https://www.felixcloutier.com/x86/movzx
  1337  	MOVWLZX: {opcode: []byte{0x0f, 0xb7}, isSrc8bit: true},
  1338  	// https://www.felixcloutier.com/x86/movsx:movsxd
  1339  	MOVBQSX: {opcode: []byte{0x0f, 0xbe}, rPrefix: rexPrefixW, isSrc8bit: true},
  1340  	// https://www.felixcloutier.com/x86/movsx:movsxd
  1341  	MOVLQSX: {opcode: []byte{0x63}, rPrefix: rexPrefixW},
  1342  	// https://www.felixcloutier.com/x86/movsx:movsxd
  1343  	MOVWQSX: {opcode: []byte{0x0f, 0xbf}, rPrefix: rexPrefixW},
  1344  	// https://www.felixcloutier.com/x86/movsx:movsxd
  1345  	MOVWLSX: {opcode: []byte{0x0f, 0xbf}},
  1346  	// https://www.felixcloutier.com/x86/imul
  1347  	IMULQ: {opcode: []byte{0x0f, 0xaf}, rPrefix: rexPrefixW},
  1348  	// https://www.felixcloutier.com/x86/mulss
  1349  	MULSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x59}},
  1350  	// https://www.felixcloutier.com/x86/mulsd
  1351  	MULSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x59}},
  1352  	// https://www.felixcloutier.com/x86/or
  1353  	ORL: {opcode: []byte{0x09}, srcOnModRMReg: true},
  1354  	ORQ: {opcode: []byte{0x09}, rPrefix: rexPrefixW, srcOnModRMReg: true},
  1355  	// https://www.felixcloutier.com/x86/orpd
  1356  	ORPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x56}},
  1357  	// https://www.felixcloutier.com/x86/orps
  1358  	ORPS: {opcode: []byte{0x0f, 0x56}},
  1359  	// https://www.felixcloutier.com/x86/popcnt
  1360  	POPCNTL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xb8}},
  1361  	POPCNTQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xb8}, rPrefix: rexPrefixW},
  1362  	// https://www.felixcloutier.com/x86/roundss
  1363  	ROUNDSS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0a}, needArg: true},
  1364  	// https://www.felixcloutier.com/x86/roundsd
  1365  	ROUNDSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0b}, needArg: true},
  1366  	// https://www.felixcloutier.com/x86/sqrtss
  1367  	SQRTSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x51}},
  1368  	// https://www.felixcloutier.com/x86/sqrtsd
  1369  	SQRTSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x51}},
  1370  	// https://www.felixcloutier.com/x86/sub
  1371  	SUBL: {opcode: []byte{0x29}, srcOnModRMReg: true},
  1372  	SUBQ: {opcode: []byte{0x29}, rPrefix: rexPrefixW, srcOnModRMReg: true},
  1373  	// https://www.felixcloutier.com/x86/subss
  1374  	SUBSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5c}},
  1375  	// https://www.felixcloutier.com/x86/subsd
  1376  	SUBSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5c}},
  1377  	// https://www.felixcloutier.com/x86/test
  1378  	TESTL: {opcode: []byte{0x85}, srcOnModRMReg: true},
  1379  	TESTQ: {opcode: []byte{0x85}, rPrefix: rexPrefixW, srcOnModRMReg: true},
  1380  	// https://www.felixcloutier.com/x86/tzcnt
  1381  	TZCNTL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbc}},
  1382  	TZCNTQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbc}, rPrefix: rexPrefixW},
  1383  	// https://www.felixcloutier.com/x86/ucomisd
  1384  	UCOMISD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x2e}},
  1385  	// https://www.felixcloutier.com/x86/ucomiss
  1386  	UCOMISS: {opcode: []byte{0x0f, 0x2e}},
  1387  	// https://www.felixcloutier.com/x86/xchg
  1388  	XCHGQ: {opcode: []byte{0x87}, rPrefix: rexPrefixW, srcOnModRMReg: true},
  1389  	// https://www.felixcloutier.com/x86/xor
  1390  	XORL: {opcode: []byte{0x31}, srcOnModRMReg: true},
  1391  	XORQ: {opcode: []byte{0x31}, rPrefix: rexPrefixW, srcOnModRMReg: true},
  1392  	// https://www.felixcloutier.com/x86/xorpd
  1393  	XORPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x57}},
  1394  	XORPS: {opcode: []byte{0x0f, 0x57}},
  1395  	// https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
  1396  	PINSRB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x20}, needArg: true},
  1397  	// https://www.felixcloutier.com/x86/pinsrw
  1398  	PINSRW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc4}, needArg: true},
  1399  	// https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
  1400  	PINSRD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x22}, needArg: true},
  1401  	// https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
  1402  	PINSRQ: {mandatoryPrefix: 0x66, rPrefix: rexPrefixW, opcode: []byte{0x0f, 0x3a, 0x22}, needArg: true},
  1403  	// https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
  1404  	MOVDQU: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x6f}},
  1405  	// https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
  1406  	MOVDQA: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x6f}},
  1407  	// https://www.felixcloutier.com/x86/paddb:paddw:paddd:paddq
  1408  	PADDB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfc}},
  1409  	PADDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfd}},
  1410  	PADDD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfe}},
  1411  	PADDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd4}},
  1412  	// https://www.felixcloutier.com/x86/psubb:psubw:psubd
  1413  	PSUBB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf8}},
  1414  	PSUBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf9}},
  1415  	PSUBD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfa}},
  1416  	// https://www.felixcloutier.com/x86/psubq
  1417  	PSUBQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfb}},
  1418  	// https://www.felixcloutier.com/x86/addps
  1419  	ADDPS: {opcode: []byte{0x0f, 0x58}},
  1420  	// https://www.felixcloutier.com/x86/addpd
  1421  	ADDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x58}},
  1422  	// https://www.felixcloutier.com/x86/subps
  1423  	SUBPS: {opcode: []byte{0x0f, 0x5c}},
  1424  	// https://www.felixcloutier.com/x86/subpd
  1425  	SUBPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5c}},
  1426  	// https://www.felixcloutier.com/x86/pxor
  1427  	PXOR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xef}},
  1428  	// https://www.felixcloutier.com/x86/pand
  1429  	PAND: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdb}},
  1430  	// https://www.felixcloutier.com/x86/por
  1431  	POR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xeb}},
  1432  	// https://www.felixcloutier.com/x86/pandn
  1433  	PANDN: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdf}},
  1434  	// https://www.felixcloutier.com/x86/pshufb
  1435  	PSHUFB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x0}},
  1436  	// https://www.felixcloutier.com/x86/pshufd
  1437  	PSHUFD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x70}, needArg: true},
  1438  	// https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
  1439  	PEXTRB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x14}, needArg: true, srcOnModRMReg: true},
  1440  	// https://www.felixcloutier.com/x86/pextrw
  1441  	PEXTRW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc5}, needArg: true},
  1442  	// https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
  1443  	PEXTRD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x16}, needArg: true, srcOnModRMReg: true},
  1444  	// https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
  1445  	PEXTRQ: {rPrefix: rexPrefixW, mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x16}, needArg: true, srcOnModRMReg: true},
  1446  	// https://www.felixcloutier.com/x86/insertps
  1447  	INSERTPS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x21}, needArg: true},
  1448  	// https://www.felixcloutier.com/x86/movlhps
  1449  	MOVLHPS: {opcode: []byte{0x0f, 0x16}},
  1450  	// https://www.felixcloutier.com/x86/ptest
  1451  	PTEST: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x17}},
  1452  	// https://www.felixcloutier.com/x86/pcmpeqb:pcmpeqw:pcmpeqd
  1453  	PCMPEQB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x74}},
  1454  	PCMPEQW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x75}},
  1455  	PCMPEQD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x76}},
  1456  	// https://www.felixcloutier.com/x86/pcmpeqq
  1457  	PCMPEQQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x29}},
  1458  	// https://www.felixcloutier.com/x86/paddusb:paddusw
  1459  	PADDUSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdc}},
  1460  	// https://www.felixcloutier.com/x86/movsd
  1461  	MOVSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x10}},
  1462  	// https://www.felixcloutier.com/x86/packsswb:packssdw
  1463  	PACKSSWB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x63}},
  1464  	// https://www.felixcloutier.com/x86/pmovmskb
  1465  	PMOVMSKB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd7}},
  1466  	// https://www.felixcloutier.com/x86/movmskps
  1467  	MOVMSKPS: {opcode: []byte{0x0f, 0x50}},
  1468  	// https://www.felixcloutier.com/x86/movmskpd
  1469  	MOVMSKPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x50}},
  1470  	// https://www.felixcloutier.com/x86/psraw:psrad:psraq
  1471  	PSRAD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe2}},
  1472  	// https://www.felixcloutier.com/x86/psraw:psrad:psraq
  1473  	PSRAW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe1}},
  1474  	// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
  1475  	PSRLQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd3}},
  1476  	// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
  1477  	PSRLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd2}},
  1478  	// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
  1479  	PSRLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd1}},
  1480  	// https://www.felixcloutier.com/x86/psllw:pslld:psllq
  1481  	PSLLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf1}},
  1482  	// https://www.felixcloutier.com/x86/psllw:pslld:psllq
  1483  	PSLLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf2}},
  1484  	// https://www.felixcloutier.com/x86/psllw:pslld:psllq
  1485  	PSLLQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf3}},
  1486  	// https://www.felixcloutier.com/x86/punpcklbw:punpcklwd:punpckldq:punpcklqdq
  1487  	PUNPCKLBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x60}},
  1488  	// https://www.felixcloutier.com/x86/punpckhbw:punpckhwd:punpckhdq:punpckhqdq
  1489  	PUNPCKHBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x68}},
  1490  	// https://www.felixcloutier.com/x86/cmpps
  1491  	CMPPS: {opcode: []byte{0x0f, 0xc2}, needArg: true},
  1492  	// https://www.felixcloutier.com/x86/cmppd
  1493  	CMPPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc2}, needArg: true},
  1494  	// https://www.felixcloutier.com/x86/pcmpgtq
  1495  	PCMPGTQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x37}},
  1496  	// https://www.felixcloutier.com/x86/pcmpgtb:pcmpgtw:pcmpgtd
  1497  	PCMPGTD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x66}},
  1498  	// https://www.felixcloutier.com/x86/pcmpgtb:pcmpgtw:pcmpgtd
  1499  	PCMPGTW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x65}},
  1500  	// https://www.felixcloutier.com/x86/pcmpgtb:pcmpgtw:pcmpgtd
  1501  	PCMPGTB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x64}},
  1502  	// https://www.felixcloutier.com/x86/pminsd:pminsq
  1503  	PMINSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x39}},
  1504  	// https://www.felixcloutier.com/x86/pmaxsb:pmaxsw:pmaxsd:pmaxsq
  1505  	PMAXSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3d}},
  1506  	// https://www.felixcloutier.com/x86/pmaxsb:pmaxsw:pmaxsd:pmaxsq
  1507  	PMAXSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xee}},
  1508  	// https://www.felixcloutier.com/x86/pmaxsb:pmaxsw:pmaxsd:pmaxsq
  1509  	PMAXSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3c}},
  1510  	// https://www.felixcloutier.com/x86/pminsb:pminsw
  1511  	PMINSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xea}},
  1512  	// https://www.felixcloutier.com/x86/pminsb:pminsw
  1513  	PMINSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x38}},
  1514  	// https://www.felixcloutier.com/x86/pminud:pminuq
  1515  	PMINUD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3b}},
  1516  	// https://www.felixcloutier.com/x86/pminub:pminuw
  1517  	PMINUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3a}},
  1518  	// https://www.felixcloutier.com/x86/pminub:pminuw
  1519  	PMINUB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xda}},
  1520  	// https://www.felixcloutier.com/x86/pmaxud:pmaxuq
  1521  	PMAXUD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3f}},
  1522  	// https://www.felixcloutier.com/x86/pmaxub:pmaxuw
  1523  	PMAXUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3e}},
  1524  	// https://www.felixcloutier.com/x86/pmaxub:pmaxuw
  1525  	PMAXUB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xde}},
  1526  	// https://www.felixcloutier.com/x86/pmullw
  1527  	PMULLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd5}},
  1528  	// https://www.felixcloutier.com/x86/pmulld:pmullq
  1529  	PMULLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x40}},
  1530  	// https://www.felixcloutier.com/x86/pmuludq
  1531  	PMULUDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf4}},
  1532  	// https://www.felixcloutier.com/x86/psubsb:psubsw
  1533  	PSUBSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe8}},
  1534  	// https://www.felixcloutier.com/x86/psubsb:psubsw
  1535  	PSUBSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe9}},
  1536  	// https://www.felixcloutier.com/x86/psubusb:psubusw
  1537  	PSUBUSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd8}},
  1538  	// https://www.felixcloutier.com/x86/psubusb:psubusw
  1539  	PSUBUSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd9}},
  1540  	// https://www.felixcloutier.com/x86/paddsb:paddsw
  1541  	PADDSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xed}},
  1542  	// https://www.felixcloutier.com/x86/paddsb:paddsw
  1543  	PADDSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xec}},
  1544  	// https://www.felixcloutier.com/x86/paddusb:paddusw
  1545  	PADDUSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdd}},
  1546  	// https://www.felixcloutier.com/x86/pavgb:pavgw
  1547  	PAVGB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe0}},
  1548  	// https://www.felixcloutier.com/x86/pavgb:pavgw
  1549  	PAVGW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe3}},
  1550  	// https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq
  1551  	PABSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1c}},
  1552  	// https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq
  1553  	PABSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1d}},
  1554  	// https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq
  1555  	PABSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1e}},
  1556  	// https://www.felixcloutier.com/x86/blendvpd
  1557  	BLENDVPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x15}},
  1558  	// https://www.felixcloutier.com/x86/maxpd
  1559  	MAXPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5f}},
  1560  	// https://www.felixcloutier.com/x86/maxps
  1561  	MAXPS: {opcode: []byte{0x0f, 0x5f}},
  1562  	// https://www.felixcloutier.com/x86/minpd
  1563  	MINPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5d}},
  1564  	// https://www.felixcloutier.com/x86/minps
  1565  	MINPS: {opcode: []byte{0x0f, 0x5d}},
  1566  	// https://www.felixcloutier.com/x86/andnpd
  1567  	ANDNPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x55}},
  1568  	// https://www.felixcloutier.com/x86/andnps
  1569  	ANDNPS: {opcode: []byte{0x0f, 0x55}},
  1570  	// https://www.felixcloutier.com/x86/mulps
  1571  	MULPS: {opcode: []byte{0x0f, 0x59}},
  1572  	// https://www.felixcloutier.com/x86/mulpd
  1573  	MULPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x59}},
  1574  	// https://www.felixcloutier.com/x86/divps
  1575  	DIVPS: {opcode: []byte{0x0f, 0x5e}},
  1576  	// https://www.felixcloutier.com/x86/divpd
  1577  	DIVPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5e}},
  1578  	// https://www.felixcloutier.com/x86/sqrtps
  1579  	SQRTPS: {opcode: []byte{0x0f, 0x51}},
  1580  	// https://www.felixcloutier.com/x86/sqrtpd
  1581  	SQRTPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x51}},
  1582  	// https://www.felixcloutier.com/x86/roundps
  1583  	ROUNDPS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x08}, needArg: true},
  1584  	// https://www.felixcloutier.com/x86/roundpd
  1585  	ROUNDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x09}, needArg: true},
  1586  	// https://www.felixcloutier.com/x86/palignr
  1587  	PALIGNR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0f}, needArg: true},
  1588  	// https://www.felixcloutier.com/x86/punpcklbw:punpcklwd:punpckldq:punpcklqdq
  1589  	PUNPCKLWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x61}},
  1590  	// https://www.felixcloutier.com/x86/punpckhbw:punpckhwd:punpckhdq:punpckhqdq
  1591  	PUNPCKHWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x69}},
  1592  	// https://www.felixcloutier.com/x86/pmulhuw
  1593  	PMULHUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe4}},
  1594  	// https://www.felixcloutier.com/x86/pmuldq
  1595  	PMULDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x28}},
  1596  	// https://www.felixcloutier.com/x86/pmulhrsw
  1597  	PMULHRSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x0b}},
  1598  	// https://www.felixcloutier.com/x86/pmovsx
  1599  	PMOVSXBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x20}},
  1600  	// https://www.felixcloutier.com/x86/pmovsx
  1601  	PMOVSXWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x23}},
  1602  	// https://www.felixcloutier.com/x86/pmovsx
  1603  	PMOVSXDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x25}},
  1604  	// https://www.felixcloutier.com/x86/pmovzx
  1605  	PMOVZXBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x30}},
  1606  	// https://www.felixcloutier.com/x86/pmovzx
  1607  	PMOVZXWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x33}},
  1608  	// https://www.felixcloutier.com/x86/pmovzx
  1609  	PMOVZXDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x35}},
  1610  	// https://www.felixcloutier.com/x86/pmulhw
  1611  	PMULHW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe5}},
  1612  	// https://www.felixcloutier.com/x86/cmpps
  1613  	CMPEQPS: {opcode: []byte{0x0f, 0xc2}, needArg: true},
  1614  	// https://www.felixcloutier.com/x86/cmppd
  1615  	CMPEQPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc2}, needArg: true},
  1616  	// https://www.felixcloutier.com/x86/cvttps2dq
  1617  	CVTTPS2DQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5b}},
  1618  	// https://www.felixcloutier.com/x86/cvtdq2ps
  1619  	CVTDQ2PS: {opcode: []byte{0x0f, 0x5b}},
  1620  	// https://www.felixcloutier.com/x86/cvtdq2pd
  1621  	CVTDQ2PD: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xe6}},
  1622  	// https://www.felixcloutier.com/x86/cvtpd2ps
  1623  	CVTPD2PS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5a}},
  1624  	// https://www.felixcloutier.com/x86/cvtps2pd
  1625  	CVTPS2PD: {opcode: []byte{0x0f, 0x5a}},
  1626  	// https://www.felixcloutier.com/x86/movupd
  1627  	MOVUPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x10}},
  1628  	// https://www.felixcloutier.com/x86/shufps
  1629  	SHUFPS: {opcode: []byte{0x0f, 0xc6}, needArg: true},
  1630  	// https://www.felixcloutier.com/x86/pmaddwd
  1631  	PMADDWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf5}},
  1632  	// https://www.felixcloutier.com/x86/unpcklps
  1633  	UNPCKLPS: {opcode: []byte{0x0f, 0x14}},
  1634  	// https://www.felixcloutier.com/x86/packuswb
  1635  	PACKUSWB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x67}},
  1636  	// https://www.felixcloutier.com/x86/packsswb:packssdw
  1637  	PACKSSDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x6b}},
  1638  	// https://www.felixcloutier.com/x86/packusdw
  1639  	PACKUSDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x2b}},
  1640  	// https://www.felixcloutier.com/x86/pmaddubsw
  1641  	PMADDUBSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x04}},
  1642  	// https://www.felixcloutier.com/x86/cvttpd2dq
  1643  	CVTTPD2DQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe6}},
  1644  }
  1645  
  1646  var registerToRegisterShiftOpcode = [instructionEnd]*struct {
  1647  	opcode         []byte
  1648  	rPrefix        rexPrefix
  1649  	modRMExtension byte
  1650  }{
  1651  	// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
  1652  	ROLL: {opcode: []byte{0xd3}},
  1653  	ROLQ: {opcode: []byte{0xd3}, rPrefix: rexPrefixW},
  1654  	RORL: {opcode: []byte{0xd3}, modRMExtension: 0b00_001_000},
  1655  	RORQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_001_000, rPrefix: rexPrefixW},
  1656  	// https://www.felixcloutier.com/x86/sal:sar:shl:shr
  1657  	SARL: {opcode: []byte{0xd3}, modRMExtension: 0b00_111_000},
  1658  	SARQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_111_000, rPrefix: rexPrefixW},
  1659  	SHLL: {opcode: []byte{0xd3}, modRMExtension: 0b00_100_000},
  1660  	SHLQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_100_000, rPrefix: rexPrefixW},
  1661  	SHRL: {opcode: []byte{0xd3}, modRMExtension: 0b00_101_000},
  1662  	SHRQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_101_000, rPrefix: rexPrefixW},
  1663  }
  1664  
  1665  func (a *AssemblerImpl) encodeRegisterToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
  1666  	// Alias for readability
  1667  	inst := n.instruction
  1668  	base := buf.Len()
  1669  	code := buf.Append(8)[:0]
  1670  
  1671  	switch inst {
  1672  	case MOVL, MOVQ:
  1673  		var (
  1674  			opcode          []byte
  1675  			mandatoryPrefix byte
  1676  			srcOnModRMReg   bool
  1677  			rPrefix         rexPrefix
  1678  		)
  1679  		srcIsFloat, dstIsFloat := isVectorRegister(n.srcReg), isVectorRegister(n.dstReg)
  1680  		f2f := srcIsFloat && dstIsFloat
  1681  		if f2f {
  1682  			// https://www.felixcloutier.com/x86/movq
  1683  			opcode, mandatoryPrefix = []byte{0x0f, 0x7e}, 0xf3
  1684  		} else if srcIsFloat && !dstIsFloat {
  1685  			// https://www.felixcloutier.com/x86/movd:movq
  1686  			opcode, mandatoryPrefix, srcOnModRMReg = []byte{0x0f, 0x7e}, 0x66, true
  1687  		} else if !srcIsFloat && dstIsFloat {
  1688  			// https://www.felixcloutier.com/x86/movd:movq
  1689  			opcode, mandatoryPrefix, srcOnModRMReg = []byte{0x0f, 0x6e}, 0x66, false
  1690  		} else {
  1691  			// https://www.felixcloutier.com/x86/mov
  1692  			opcode, srcOnModRMReg = []byte{0x89}, true
  1693  		}
  1694  
  1695  		rexPrefix, modRM, err := n.getRegisterToRegisterModRM(srcOnModRMReg)
  1696  		if err != nil {
  1697  			return err
  1698  		}
  1699  		rexPrefix |= rPrefix
  1700  
  1701  		if inst == MOVQ && !f2f {
  1702  			rexPrefix |= rexPrefixW
  1703  		}
  1704  		if mandatoryPrefix != 0 {
  1705  			code = append(code, mandatoryPrefix)
  1706  		}
  1707  		if rexPrefix != rexPrefixNone {
  1708  			code = append(code, rexPrefix)
  1709  		}
  1710  		code = append(code, opcode...)
  1711  		code = append(code, modRM)
  1712  		buf.Truncate(base + len(code))
  1713  		return nil
  1714  	}
  1715  
  1716  	if op := registerToRegisterOpcode[inst]; op != nil {
  1717  		rexPrefix, modRM, err := n.getRegisterToRegisterModRM(op.srcOnModRMReg)
  1718  		if err != nil {
  1719  			return err
  1720  		}
  1721  		rexPrefix |= op.rPrefix
  1722  
  1723  		if op.isSrc8bit && RegSP <= n.srcReg && n.srcReg <= RegDI {
  1724  			// If an operand register is 8-bit length of SP, BP, DI, or SI register, we need to have the default prefix.
  1725  			// https://wiki.osdev.org/X86-64_Instruction_Encoding#Registers
  1726  			rexPrefix |= rexPrefixDefault
  1727  		}
  1728  
  1729  		if op.mandatoryPrefix != 0 {
  1730  			code = append(code, op.mandatoryPrefix)
  1731  		}
  1732  
  1733  		if rexPrefix != rexPrefixNone {
  1734  			code = append(code, rexPrefix)
  1735  		}
  1736  		code = append(code, op.opcode...)
  1737  		code = append(code, modRM)
  1738  
  1739  		if op.needArg {
  1740  			code = append(code, n.arg)
  1741  		}
  1742  	} else if op := registerToRegisterShiftOpcode[inst]; op != nil {
  1743  		reg3bits, rexPrefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldRM)
  1744  		rexPrefix |= op.rPrefix
  1745  		if rexPrefix != rexPrefixNone {
  1746  			code = append(code, rexPrefix)
  1747  		}
  1748  
  1749  		// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  1750  		modRM := 0b11_000_000 |
  1751  			(op.modRMExtension) |
  1752  			reg3bits
  1753  		code = append(code, op.opcode...)
  1754  		code = append(code, modRM)
  1755  	} else {
  1756  		return errorEncodingUnsupported(n)
  1757  	}
  1758  
  1759  	buf.Truncate(base + len(code))
  1760  	return nil
  1761  }
  1762  
  1763  func (a *AssemblerImpl) encodeRegisterToMemory(buf asm.Buffer, n *nodeImpl) (err error) {
  1764  	rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(true)
  1765  	if err != nil {
  1766  		return err
  1767  	}
  1768  
  1769  	var opcode []byte
  1770  	var mandatoryPrefix byte
  1771  	var isShiftInstruction bool
  1772  	var needArg bool
  1773  	switch n.instruction {
  1774  	case CMPL:
  1775  		// https://www.felixcloutier.com/x86/cmp
  1776  		opcode = []byte{0x3b}
  1777  	case CMPQ:
  1778  		// https://www.felixcloutier.com/x86/cmp
  1779  		rexPrefix |= rexPrefixW
  1780  		opcode = []byte{0x3b}
  1781  	case MOVB:
  1782  		// https://www.felixcloutier.com/x86/mov
  1783  		opcode = []byte{0x88}
  1784  		// 1 byte register operands need default prefix for the following registers.
  1785  		if n.srcReg >= RegSP && n.srcReg <= RegDI {
  1786  			rexPrefix |= rexPrefixDefault
  1787  		}
  1788  	case MOVL:
  1789  		if isVectorRegister(n.srcReg) {
  1790  			// https://www.felixcloutier.com/x86/movd:movq
  1791  			opcode = []byte{0x0f, 0x7e}
  1792  			mandatoryPrefix = 0x66
  1793  		} else {
  1794  			// https://www.felixcloutier.com/x86/mov
  1795  			opcode = []byte{0x89}
  1796  		}
  1797  	case MOVQ:
  1798  		if isVectorRegister(n.srcReg) {
  1799  			// https://www.felixcloutier.com/x86/movq
  1800  			opcode = []byte{0x0f, 0xd6}
  1801  			mandatoryPrefix = 0x66
  1802  		} else {
  1803  			// https://www.felixcloutier.com/x86/mov
  1804  			rexPrefix |= rexPrefixW
  1805  			opcode = []byte{0x89}
  1806  		}
  1807  	case MOVW:
  1808  		// https://www.felixcloutier.com/x86/mov
  1809  		// Note: Need 0x66 to indicate that the operand size is 16-bit.
  1810  		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Operand-size_and_address-size_override_prefix
  1811  		mandatoryPrefix = 0x66
  1812  		opcode = []byte{0x89}
  1813  	case SARL:
  1814  		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
  1815  		modRM |= 0b00_111_000
  1816  		opcode = []byte{0xd3}
  1817  		isShiftInstruction = true
  1818  	case SARQ:
  1819  		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
  1820  		rexPrefix |= rexPrefixW
  1821  		modRM |= 0b00_111_000
  1822  		opcode = []byte{0xd3}
  1823  		isShiftInstruction = true
  1824  	case SHLL:
  1825  		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
  1826  		modRM |= 0b00_100_000
  1827  		opcode = []byte{0xd3}
  1828  		isShiftInstruction = true
  1829  	case SHLQ:
  1830  		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
  1831  		rexPrefix |= rexPrefixW
  1832  		modRM |= 0b00_100_000
  1833  		opcode = []byte{0xd3}
  1834  		isShiftInstruction = true
  1835  	case SHRL:
  1836  		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
  1837  		modRM |= 0b00_101_000
  1838  		opcode = []byte{0xd3}
  1839  		isShiftInstruction = true
  1840  	case SHRQ:
  1841  		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
  1842  		rexPrefix |= rexPrefixW
  1843  		modRM |= 0b00_101_000
  1844  		opcode = []byte{0xd3}
  1845  		isShiftInstruction = true
  1846  	case ROLL:
  1847  		// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
  1848  		opcode = []byte{0xd3}
  1849  		isShiftInstruction = true
  1850  	case ROLQ:
  1851  		// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
  1852  		rexPrefix |= rexPrefixW
  1853  		opcode = []byte{0xd3}
  1854  		isShiftInstruction = true
  1855  	case RORL:
  1856  		// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
  1857  		modRM |= 0b00_001_000
  1858  		opcode = []byte{0xd3}
  1859  		isShiftInstruction = true
  1860  	case RORQ:
  1861  		// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
  1862  		rexPrefix |= rexPrefixW
  1863  		opcode = []byte{0xd3}
  1864  		modRM |= 0b00_001_000
  1865  		isShiftInstruction = true
  1866  	case MOVDQU:
  1867  		// https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
  1868  		mandatoryPrefix = 0xf3
  1869  		opcode = []byte{0x0f, 0x7f}
  1870  	case PEXTRB: // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
  1871  		mandatoryPrefix = 0x66
  1872  		opcode = []byte{0x0f, 0x3a, 0x14}
  1873  		needArg = true
  1874  	case PEXTRW: // https://www.felixcloutier.com/x86/pextrw
  1875  		mandatoryPrefix = 0x66
  1876  		opcode = []byte{0x0f, 0x3a, 0x15}
  1877  		needArg = true
  1878  	case PEXTRD: // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
  1879  		mandatoryPrefix = 0x66
  1880  		opcode = []byte{0x0f, 0x3a, 0x16}
  1881  		needArg = true
  1882  	case PEXTRQ: // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
  1883  		mandatoryPrefix = 0x66
  1884  		rexPrefix |= rexPrefixW // REX.W
  1885  		opcode = []byte{0x0f, 0x3a, 0x16}
  1886  		needArg = true
  1887  	default:
  1888  		return errorEncodingUnsupported(n)
  1889  	}
  1890  
  1891  	if !isShiftInstruction {
  1892  		srcReg3Bits, prefix := register3bits(n.srcReg, registerSpecifierPositionModRMFieldReg)
  1893  
  1894  		rexPrefix |= prefix
  1895  		modRM |= srcReg3Bits << 3 // Place the source register on ModRM:reg
  1896  	} else {
  1897  		if n.srcReg != RegCX {
  1898  			return fmt.Errorf("shifting instruction %s require CX register as src but got %s", InstructionName(n.instruction), RegisterName(n.srcReg))
  1899  		}
  1900  	}
  1901  
  1902  	base := buf.Len()
  1903  	code := buf.Append(16)[:0]
  1904  
  1905  	if mandatoryPrefix != 0 {
  1906  		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Mandatory_prefix
  1907  		code = append(code, mandatoryPrefix)
  1908  	}
  1909  
  1910  	if rexPrefix != rexPrefixNone {
  1911  		code = append(code, rexPrefix)
  1912  	}
  1913  
  1914  	code = append(code, opcode...)
  1915  	code = append(code, modRM)
  1916  
  1917  	if sbiExist {
  1918  		code = append(code, sbi)
  1919  	}
  1920  
  1921  	if displacementWidth != 0 {
  1922  		code = appendConst(code, n.dstConst, displacementWidth)
  1923  	}
  1924  
  1925  	if needArg {
  1926  		code = append(code, n.arg)
  1927  	}
  1928  
  1929  	buf.Truncate(base + len(code))
  1930  	return
  1931  }
  1932  
  1933  func (a *AssemblerImpl) encodeRegisterToConst(buf asm.Buffer, n *nodeImpl) (err error) {
  1934  	regBits, prefix := register3bits(n.srcReg, registerSpecifierPositionModRMFieldRM)
  1935  
  1936  	base := buf.Len()
  1937  	code := buf.Append(10)[:0]
  1938  
  1939  	switch n.instruction {
  1940  	case CMPL, CMPQ:
  1941  		if n.instruction == CMPQ {
  1942  			prefix |= rexPrefixW
  1943  		}
  1944  		if prefix != rexPrefixNone {
  1945  			code = append(code, prefix)
  1946  		}
  1947  		is8bitConst := fitInSigned8bit(n.dstConst)
  1948  		// https://www.felixcloutier.com/x86/cmp
  1949  		if n.srcReg == RegAX && !is8bitConst {
  1950  			code = append(code, 0x3d)
  1951  		} else {
  1952  			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  1953  			modRM := 0b11_000_000 | // Specifying that opeand is register.
  1954  				0b00_111_000 | // CMP with immediate needs "/7" extension.
  1955  				regBits
  1956  			if is8bitConst {
  1957  				code = append(code, 0x83, modRM)
  1958  			} else {
  1959  				code = append(code, 0x81, modRM)
  1960  			}
  1961  		}
  1962  	default:
  1963  		err = errorEncodingUnsupported(n)
  1964  	}
  1965  
  1966  	if fitInSigned8bit(n.dstConst) {
  1967  		code = append(code, byte(n.dstConst))
  1968  	} else {
  1969  		code = appendUint32(code, uint32(n.dstConst))
  1970  	}
  1971  
  1972  	buf.Truncate(base + len(code))
  1973  	return
  1974  }
  1975  
  1976  func (a *AssemblerImpl) finalizeReadInstructionAddressNode(code []byte, n *nodeImpl) (err error) {
  1977  	// Find the target instruction node.
  1978  	targetNode := n
  1979  	for ; targetNode != nil; targetNode = targetNode.next {
  1980  		if targetNode.instruction == n.readInstructionAddressBeforeTargetInstruction {
  1981  			targetNode = targetNode.next
  1982  			break
  1983  		}
  1984  	}
  1985  
  1986  	if targetNode == nil {
  1987  		return errors.New("BUG: target instruction not found for read instruction address")
  1988  	}
  1989  
  1990  	offset := targetNode.OffsetInBinary() - (n.OffsetInBinary() + 7 /* 7 = the length of the LEAQ instruction */)
  1991  	if offset >= math.MaxInt32 {
  1992  		return errors.New("BUG: too large offset for LEAQ instruction")
  1993  	}
  1994  
  1995  	binary.LittleEndian.PutUint32(code[n.OffsetInBinary()+3:], uint32(int32(offset)))
  1996  	return nil
  1997  }
  1998  
  1999  func (a *AssemblerImpl) encodeReadInstructionAddress(buf asm.Buffer, n *nodeImpl) error {
  2000  	dstReg3Bits, rexPrefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldReg)
  2001  
  2002  	a.readInstructionAddressNodes = append(a.readInstructionAddressNodes, n)
  2003  
  2004  	// https://www.felixcloutier.com/x86/lea
  2005  	opcode := byte(0x8d)
  2006  	rexPrefix |= rexPrefixW
  2007  
  2008  	// https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing
  2009  	modRM := 0b00_000_101 | // Indicate "LEAQ [RIP + 32bit displacement], dstReg" encoding.
  2010  		(dstReg3Bits << 3) // Place the dstReg on ModRM:reg.
  2011  
  2012  	code := buf.Append(7)
  2013  	code[0] = rexPrefix
  2014  	code[1] = opcode
  2015  	code[2] = modRM
  2016  	binary.LittleEndian.PutUint32(code[3:], 0) // Preserve
  2017  	return nil
  2018  }
  2019  
  2020  func (a *AssemblerImpl) encodeMemoryToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
  2021  	if n.instruction == LEAQ && n.readInstructionAddressBeforeTargetInstruction != NONE {
  2022  		return a.encodeReadInstructionAddress(buf, n)
  2023  	}
  2024  
  2025  	rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(false)
  2026  	if err != nil {
  2027  		return err
  2028  	}
  2029  
  2030  	dstReg3Bits, prefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldReg)
  2031  	rexPrefix |= prefix
  2032  	modRM |= dstReg3Bits << 3 // Place the destination register on ModRM:reg
  2033  
  2034  	var mandatoryPrefix byte
  2035  	var opcode []byte
  2036  	var needArg bool
  2037  
  2038  	switch n.instruction {
  2039  	case ADDL:
  2040  		// https://www.felixcloutier.com/x86/add
  2041  		opcode = []byte{0x03}
  2042  	case ADDQ:
  2043  		// https://www.felixcloutier.com/x86/add
  2044  		rexPrefix |= rexPrefixW
  2045  		opcode = []byte{0x03}
  2046  	case CMPL:
  2047  		// https://www.felixcloutier.com/x86/cmp
  2048  		opcode = []byte{0x39}
  2049  	case CMPQ:
  2050  		// https://www.felixcloutier.com/x86/cmp
  2051  		rexPrefix |= rexPrefixW
  2052  		opcode = []byte{0x39}
  2053  	case LEAQ:
  2054  		// https://www.felixcloutier.com/x86/lea
  2055  		rexPrefix |= rexPrefixW
  2056  		opcode = []byte{0x8d}
  2057  	case MOVBLSX:
  2058  		// https://www.felixcloutier.com/x86/movsx:movsxd
  2059  		opcode = []byte{0x0f, 0xbe}
  2060  	case MOVBLZX:
  2061  		// https://www.felixcloutier.com/x86/movzx
  2062  		opcode = []byte{0x0f, 0xb6}
  2063  	case MOVBQSX:
  2064  		// https://www.felixcloutier.com/x86/movsx:movsxd
  2065  		rexPrefix |= rexPrefixW
  2066  		opcode = []byte{0x0f, 0xbe}
  2067  	case MOVBQZX:
  2068  		// https://www.felixcloutier.com/x86/movzx
  2069  		rexPrefix |= rexPrefixW
  2070  		opcode = []byte{0x0f, 0xb6}
  2071  	case MOVLQSX:
  2072  		// https://www.felixcloutier.com/x86/movsx:movsxd
  2073  		rexPrefix |= rexPrefixW
  2074  		opcode = []byte{0x63}
  2075  	case MOVLQZX:
  2076  		// https://www.felixcloutier.com/x86/mov
  2077  		// Note: MOVLQZX means zero extending 32bit reg to 64-bit reg and
  2078  		// that is semantically equivalent to MOV 32bit to 32bit.
  2079  		opcode = []byte{0x8B}
  2080  	case MOVL:
  2081  		// https://www.felixcloutier.com/x86/mov
  2082  		// Note: MOVLQZX means zero extending 32bit reg to 64-bit reg and
  2083  		// that is semantically equivalent to MOV 32bit to 32bit.
  2084  		if isVectorRegister(n.dstReg) {
  2085  			// https://www.felixcloutier.com/x86/movd:movq
  2086  			opcode = []byte{0x0f, 0x6e}
  2087  			mandatoryPrefix = 0x66
  2088  		} else {
  2089  			// https://www.felixcloutier.com/x86/mov
  2090  			opcode = []byte{0x8B}
  2091  		}
  2092  	case MOVQ:
  2093  		if isVectorRegister(n.dstReg) {
  2094  			// https://www.felixcloutier.com/x86/movq
  2095  			opcode = []byte{0x0f, 0x7e}
  2096  			mandatoryPrefix = 0xf3
  2097  		} else {
  2098  			// https://www.felixcloutier.com/x86/mov
  2099  			rexPrefix |= rexPrefixW
  2100  			opcode = []byte{0x8B}
  2101  		}
  2102  	case MOVWLSX:
  2103  		// https://www.felixcloutier.com/x86/movsx:movsxd
  2104  		opcode = []byte{0x0f, 0xbf}
  2105  	case MOVWLZX:
  2106  		// https://www.felixcloutier.com/x86/movzx
  2107  		opcode = []byte{0x0f, 0xb7}
  2108  	case MOVWQSX:
  2109  		// https://www.felixcloutier.com/x86/movsx:movsxd
  2110  		rexPrefix |= rexPrefixW
  2111  		opcode = []byte{0x0f, 0xbf}
  2112  	case MOVWQZX:
  2113  		// https://www.felixcloutier.com/x86/movzx
  2114  		rexPrefix |= rexPrefixW
  2115  		opcode = []byte{0x0f, 0xb7}
  2116  	case SUBQ:
  2117  		// https://www.felixcloutier.com/x86/sub
  2118  		rexPrefix |= rexPrefixW
  2119  		opcode = []byte{0x2b}
  2120  	case SUBSD:
  2121  		// https://www.felixcloutier.com/x86/subsd
  2122  		opcode = []byte{0x0f, 0x5c}
  2123  		mandatoryPrefix = 0xf2
  2124  	case SUBSS:
  2125  		// https://www.felixcloutier.com/x86/subss
  2126  		opcode = []byte{0x0f, 0x5c}
  2127  		mandatoryPrefix = 0xf3
  2128  	case UCOMISD:
  2129  		// https://www.felixcloutier.com/x86/ucomisd
  2130  		opcode = []byte{0x0f, 0x2e}
  2131  		mandatoryPrefix = 0x66
  2132  	case UCOMISS:
  2133  		// https://www.felixcloutier.com/x86/ucomiss
  2134  		opcode = []byte{0x0f, 0x2e}
  2135  	case MOVDQU:
  2136  		// https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
  2137  		mandatoryPrefix = 0xf3
  2138  		opcode = []byte{0x0f, 0x6f}
  2139  	case PMOVSXBW: // https://www.felixcloutier.com/x86/pmovsx
  2140  		mandatoryPrefix = 0x66
  2141  		opcode = []byte{0x0f, 0x38, 0x20}
  2142  	case PMOVSXWD: // https://www.felixcloutier.com/x86/pmovsx
  2143  		mandatoryPrefix = 0x66
  2144  		opcode = []byte{0x0f, 0x38, 0x23}
  2145  	case PMOVSXDQ: // https://www.felixcloutier.com/x86/pmovsx
  2146  		mandatoryPrefix = 0x66
  2147  		opcode = []byte{0x0f, 0x38, 0x25}
  2148  	case PMOVZXBW: // https://www.felixcloutier.com/x86/pmovzx
  2149  		mandatoryPrefix = 0x66
  2150  		opcode = []byte{0x0f, 0x38, 0x30}
  2151  	case PMOVZXWD: // https://www.felixcloutier.com/x86/pmovzx
  2152  		mandatoryPrefix = 0x66
  2153  		opcode = []byte{0x0f, 0x38, 0x33}
  2154  	case PMOVZXDQ: // https://www.felixcloutier.com/x86/pmovzx
  2155  		mandatoryPrefix = 0x66
  2156  		opcode = []byte{0x0f, 0x38, 0x35}
  2157  	case PINSRB: // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
  2158  		mandatoryPrefix = 0x66
  2159  		opcode = []byte{0x0f, 0x3a, 0x20}
  2160  		needArg = true
  2161  	case PINSRW: // https://www.felixcloutier.com/x86/pinsrw
  2162  		mandatoryPrefix = 0x66
  2163  		opcode = []byte{0x0f, 0xc4}
  2164  		needArg = true
  2165  	case PINSRD: // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
  2166  		mandatoryPrefix = 0x66
  2167  		opcode = []byte{0x0f, 0x3a, 0x22}
  2168  		needArg = true
  2169  	case PINSRQ: // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
  2170  		rexPrefix |= rexPrefixW
  2171  		mandatoryPrefix = 0x66
  2172  		opcode = []byte{0x0f, 0x3a, 0x22}
  2173  		needArg = true
  2174  	default:
  2175  		return errorEncodingUnsupported(n)
  2176  	}
  2177  
  2178  	base := buf.Len()
  2179  	code := buf.Append(16)[:0]
  2180  
  2181  	if mandatoryPrefix != 0 {
  2182  		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Mandatory_prefix
  2183  		code = append(code, mandatoryPrefix)
  2184  	}
  2185  
  2186  	if rexPrefix != rexPrefixNone {
  2187  		code = append(code, rexPrefix)
  2188  	}
  2189  
  2190  	code = append(code, opcode...)
  2191  	code = append(code, modRM)
  2192  
  2193  	if sbiExist {
  2194  		code = append(code, sbi)
  2195  	}
  2196  
  2197  	if displacementWidth != 0 {
  2198  		code = appendConst(code, n.srcConst, displacementWidth)
  2199  	}
  2200  
  2201  	if needArg {
  2202  		code = append(code, n.arg)
  2203  	}
  2204  
  2205  	buf.Truncate(base + len(code))
  2206  	return
  2207  }
  2208  
  2209  func (a *AssemblerImpl) encodeConstToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
  2210  	regBits, rexPrefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldRM)
  2211  
  2212  	isFloatReg := isVectorRegister(n.dstReg)
  2213  	switch n.instruction {
  2214  	case PSLLD, PSLLQ, PSRLD, PSRLQ, PSRAW, PSRLW, PSLLW, PSRAD:
  2215  		if !isFloatReg {
  2216  			return fmt.Errorf("%s needs float register but got %s", InstructionName(n.instruction), RegisterName(n.dstReg))
  2217  		}
  2218  	default:
  2219  		if isFloatReg {
  2220  			return fmt.Errorf("%s needs int register but got %s", InstructionName(n.instruction), RegisterName(n.dstReg))
  2221  		}
  2222  	}
  2223  
  2224  	if n.instruction != MOVQ && !fitIn32bit(n.srcConst) {
  2225  		return fmt.Errorf("constant must fit in 32-bit integer for %s, but got %d", InstructionName(n.instruction), n.srcConst)
  2226  	} else if (n.instruction == SHLQ || n.instruction == SHRQ) && (n.srcConst < 0 || n.srcConst > math.MaxUint8) {
  2227  		return fmt.Errorf("constant must fit in positive 8-bit integer for %s, but got %d", InstructionName(n.instruction), n.srcConst)
  2228  	} else if (n.instruction == PSLLD ||
  2229  		n.instruction == PSLLQ ||
  2230  		n.instruction == PSRLD ||
  2231  		n.instruction == PSRLQ) && (n.srcConst < math.MinInt8 || n.srcConst > math.MaxInt8) {
  2232  		return fmt.Errorf("constant must fit in signed 8-bit integer for %s, but got %d", InstructionName(n.instruction), n.srcConst)
  2233  	}
  2234  
  2235  	base := buf.Len()
  2236  	code := buf.Append(32)[:0]
  2237  
  2238  	isSigned8bitConst := fitInSigned8bit(n.srcConst)
  2239  	switch inst := n.instruction; inst {
  2240  	case ADDQ:
  2241  		// https://www.felixcloutier.com/x86/add
  2242  		rexPrefix |= rexPrefixW
  2243  		if n.dstReg == RegAX && !isSigned8bitConst {
  2244  			code = append(code, rexPrefix, 0x05)
  2245  		} else {
  2246  			modRM := 0b11_000_000 | // Specifying that opeand is register.
  2247  				regBits
  2248  			if isSigned8bitConst {
  2249  				code = append(code, rexPrefix, 0x83, modRM)
  2250  			} else {
  2251  				code = append(code, rexPrefix, 0x81, modRM)
  2252  			}
  2253  		}
  2254  		if isSigned8bitConst {
  2255  			code = append(code, byte(n.srcConst))
  2256  		} else {
  2257  			code = appendUint32(code, uint32(n.srcConst))
  2258  		}
  2259  	case ANDQ:
  2260  		// https://www.felixcloutier.com/x86/and
  2261  		rexPrefix |= rexPrefixW
  2262  		if n.dstReg == RegAX && !isSigned8bitConst {
  2263  			code = append(code, rexPrefix, 0x25)
  2264  		} else {
  2265  			modRM := 0b11_000_000 | // Specifying that opeand is register.
  2266  				0b00_100_000 | // AND with immediate needs "/4" extension.
  2267  				regBits
  2268  			if isSigned8bitConst {
  2269  				code = append(code, rexPrefix, 0x83, modRM)
  2270  			} else {
  2271  				code = append(code, rexPrefix, 0x81, modRM)
  2272  			}
  2273  		}
  2274  		if fitInSigned8bit(n.srcConst) {
  2275  			code = append(code, byte(n.srcConst))
  2276  		} else {
  2277  			code = appendUint32(code, uint32(n.srcConst))
  2278  		}
  2279  	case TESTQ:
  2280  		// https://www.felixcloutier.com/x86/test
  2281  		rexPrefix |= rexPrefixW
  2282  		if n.dstReg == RegAX && !isSigned8bitConst {
  2283  			code = append(code, rexPrefix, 0xa9)
  2284  		} else {
  2285  			modRM := 0b11_000_000 | // Specifying that operand is register
  2286  				regBits
  2287  			code = append(code, rexPrefix, 0xf7, modRM)
  2288  		}
  2289  		code = appendUint32(code, uint32(n.srcConst))
  2290  	case MOVL:
  2291  		// https://www.felixcloutier.com/x86/mov
  2292  		if rexPrefix != rexPrefixNone {
  2293  			code = append(code, rexPrefix)
  2294  		}
  2295  		code = append(code, 0xb8|regBits)
  2296  		code = appendUint32(code, uint32(n.srcConst))
  2297  	case MOVQ:
  2298  		// https://www.felixcloutier.com/x86/mov
  2299  		if fitIn32bit(n.srcConst) {
  2300  			if n.srcConst > math.MaxInt32 {
  2301  				if rexPrefix != rexPrefixNone {
  2302  					code = append(code, rexPrefix)
  2303  				}
  2304  				code = append(code, 0xb8|regBits)
  2305  			} else {
  2306  				rexPrefix |= rexPrefixW
  2307  				modRM := 0b11_000_000 | // Specifying that opeand is register.
  2308  					regBits
  2309  				code = append(code, rexPrefix, 0xc7, modRM)
  2310  			}
  2311  			code = appendUint32(code, uint32(n.srcConst))
  2312  		} else {
  2313  			rexPrefix |= rexPrefixW
  2314  			code = append(code, rexPrefix, 0xb8|regBits)
  2315  			code = appendUint64(code, uint64(n.srcConst))
  2316  		}
  2317  	case SHLQ:
  2318  		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
  2319  		rexPrefix |= rexPrefixW
  2320  		modRM := 0b11_000_000 | // Specifying that opeand is register.
  2321  			0b00_100_000 | // SHL with immediate needs "/4" extension.
  2322  			regBits
  2323  		if n.srcConst == 1 {
  2324  			code = append(code, rexPrefix, 0xd1, modRM)
  2325  		} else {
  2326  			code = append(code, rexPrefix, 0xc1, modRM, byte(n.srcConst))
  2327  		}
  2328  	case SHRQ:
  2329  		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
  2330  		rexPrefix |= rexPrefixW
  2331  		modRM := 0b11_000_000 | // Specifying that opeand is register.
  2332  			0b00_101_000 | // SHR with immediate needs "/5" extension.
  2333  			regBits
  2334  		if n.srcConst == 1 {
  2335  			code = append(code, rexPrefix, 0xd1, modRM)
  2336  		} else {
  2337  			code = append(code, rexPrefix, 0xc1, modRM, byte(n.srcConst))
  2338  		}
  2339  	case PSLLD:
  2340  		// https://www.felixcloutier.com/x86/psllw:pslld:psllq
  2341  		modRM := 0b11_000_000 | // Specifying that opeand is register.
  2342  			0b00_110_000 | // PSLL with immediate needs "/6" extension.
  2343  			regBits
  2344  		if rexPrefix != rexPrefixNone {
  2345  			code = append(code, 0x66, rexPrefix, 0x0f, 0x72, modRM, byte(n.srcConst))
  2346  		} else {
  2347  			code = append(code, 0x66, 0x0f, 0x72, modRM, byte(n.srcConst))
  2348  		}
  2349  	case PSLLQ:
  2350  		// https://www.felixcloutier.com/x86/psllw:pslld:psllq
  2351  		modRM := 0b11_000_000 | // Specifying that opeand is register.
  2352  			0b00_110_000 | // PSLL with immediate needs "/6" extension.
  2353  			regBits
  2354  		if rexPrefix != rexPrefixNone {
  2355  			code = append(code, 0x66, rexPrefix, 0x0f, 0x73, modRM, byte(n.srcConst))
  2356  		} else {
  2357  			code = append(code, 0x66, 0x0f, 0x73, modRM, byte(n.srcConst))
  2358  		}
  2359  	case PSRLD:
  2360  		// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
  2361  		// https://www.felixcloutier.com/x86/psllw:pslld:psllq
  2362  		modRM := 0b11_000_000 | // Specifying that operand is register.
  2363  			0b00_010_000 | // PSRL with immediate needs "/2" extension.
  2364  			regBits
  2365  		if rexPrefix != rexPrefixNone {
  2366  			code = append(code, 0x66, rexPrefix, 0x0f, 0x72, modRM, byte(n.srcConst))
  2367  		} else {
  2368  			code = append(code, 0x66, 0x0f, 0x72, modRM, byte(n.srcConst))
  2369  		}
  2370  	case PSRLQ:
  2371  		// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
  2372  		modRM := 0b11_000_000 | // Specifying that operand is register.
  2373  			0b00_010_000 | // PSRL with immediate needs "/2" extension.
  2374  			regBits
  2375  		if rexPrefix != rexPrefixNone {
  2376  			code = append(code, 0x66, rexPrefix, 0x0f, 0x73, modRM, byte(n.srcConst))
  2377  		} else {
  2378  			code = append(code, 0x66, 0x0f, 0x73, modRM, byte(n.srcConst))
  2379  		}
  2380  	case PSRAW, PSRAD:
  2381  		// https://www.felixcloutier.com/x86/psraw:psrad:psraq
  2382  		modRM := 0b11_000_000 | // Specifying that operand is register.
  2383  			0b00_100_000 | // PSRAW with immediate needs "/4" extension.
  2384  			regBits
  2385  		code = append(code, 0x66)
  2386  		if rexPrefix != rexPrefixNone {
  2387  			code = append(code, rexPrefix)
  2388  		}
  2389  
  2390  		var op byte
  2391  		if inst == PSRAD {
  2392  			op = 0x72
  2393  		} else { // PSRAW
  2394  			op = 0x71
  2395  		}
  2396  
  2397  		code = append(code, 0x0f, op, modRM, byte(n.srcConst))
  2398  	case PSRLW:
  2399  		// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
  2400  		modRM := 0b11_000_000 | // Specifying that operand is register.
  2401  			0b00_010_000 | // PSRLW with immediate needs "/2" extension.
  2402  			regBits
  2403  		code = append(code, 0x66)
  2404  		if rexPrefix != rexPrefixNone {
  2405  			code = append(code, rexPrefix)
  2406  		}
  2407  		code = append(code, 0x0f, 0x71, modRM, byte(n.srcConst))
  2408  	case PSLLW:
  2409  		// https://www.felixcloutier.com/x86/psllw:pslld:psllq
  2410  		modRM := 0b11_000_000 | // Specifying that operand is register.
  2411  			0b00_110_000 | // PSLLW with immediate needs "/6" extension.
  2412  			regBits
  2413  		code = append(code, 0x66)
  2414  		if rexPrefix != rexPrefixNone {
  2415  			code = append(code, rexPrefix)
  2416  		}
  2417  		code = append(code, 0x0f, 0x71, modRM, byte(n.srcConst))
  2418  	case XORL, XORQ:
  2419  		// https://www.felixcloutier.com/x86/xor
  2420  		if inst == XORQ {
  2421  			rexPrefix |= rexPrefixW
  2422  		}
  2423  		if rexPrefix != rexPrefixNone {
  2424  			code = append(code, rexPrefix)
  2425  		}
  2426  		if n.dstReg == RegAX && !isSigned8bitConst {
  2427  			code = append(code, 0x35)
  2428  		} else {
  2429  			modRM := 0b11_000_000 | // Specifying that opeand is register.
  2430  				0b00_110_000 | // XOR with immediate needs "/6" extension.
  2431  				regBits
  2432  			if isSigned8bitConst {
  2433  				code = append(code, 0x83, modRM)
  2434  			} else {
  2435  				code = append(code, 0x81, modRM)
  2436  			}
  2437  		}
  2438  		if fitInSigned8bit(n.srcConst) {
  2439  			code = append(code, byte(n.srcConst))
  2440  		} else {
  2441  			code = appendUint32(code, uint32(n.srcConst))
  2442  		}
  2443  	default:
  2444  		err = errorEncodingUnsupported(n)
  2445  	}
  2446  
  2447  	buf.Truncate(base + len(code))
  2448  	return
  2449  }
  2450  
  2451  func (a *AssemblerImpl) encodeMemoryToConst(buf asm.Buffer, n *nodeImpl) (err error) {
  2452  	if !fitIn32bit(n.dstConst) {
  2453  		return fmt.Errorf("too large target const %d for %s", n.dstConst, InstructionName(n.instruction))
  2454  	}
  2455  
  2456  	rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(false)
  2457  	if err != nil {
  2458  		return err
  2459  	}
  2460  
  2461  	// Alias for readability.
  2462  	c := n.dstConst
  2463  
  2464  	var opcode, constWidth byte
  2465  	switch n.instruction {
  2466  	case CMPL:
  2467  		// https://www.felixcloutier.com/x86/cmp
  2468  		if fitInSigned8bit(c) {
  2469  			opcode = 0x83
  2470  			constWidth = 8
  2471  		} else {
  2472  			opcode = 0x81
  2473  			constWidth = 32
  2474  		}
  2475  		modRM |= 0b00_111_000
  2476  	default:
  2477  		return errorEncodingUnsupported(n)
  2478  	}
  2479  
  2480  	base := buf.Len()
  2481  	code := buf.Append(20)[:0]
  2482  
  2483  	if rexPrefix != rexPrefixNone {
  2484  		code = append(code, rexPrefix)
  2485  	}
  2486  
  2487  	code = append(code, opcode, modRM)
  2488  
  2489  	if sbiExist {
  2490  		code = append(code, sbi)
  2491  	}
  2492  
  2493  	if displacementWidth != 0 {
  2494  		code = appendConst(code, n.srcConst, displacementWidth)
  2495  	}
  2496  
  2497  	code = appendConst(code, c, constWidth)
  2498  	buf.Truncate(base + len(code))
  2499  	return
  2500  }
  2501  
  2502  func (a *AssemblerImpl) encodeConstToMemory(buf asm.Buffer, n *nodeImpl) (err error) {
  2503  	rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(true)
  2504  	if err != nil {
  2505  		return err
  2506  	}
  2507  
  2508  	// Alias for readability.
  2509  	inst := n.instruction
  2510  	c := n.srcConst
  2511  
  2512  	if inst == MOVB && !fitInSigned8bit(c) {
  2513  		return fmt.Errorf("too large load target const %d for MOVB", c)
  2514  	} else if !fitIn32bit(c) {
  2515  		return fmt.Errorf("too large load target const %d for %s", c, InstructionName(n.instruction))
  2516  	}
  2517  
  2518  	var constWidth, opcode byte
  2519  	switch inst {
  2520  	case MOVB:
  2521  		opcode = 0xc6
  2522  		constWidth = 8
  2523  	case MOVL:
  2524  		opcode = 0xc7
  2525  		constWidth = 32
  2526  	case MOVQ:
  2527  		rexPrefix |= rexPrefixW
  2528  		opcode = 0xc7
  2529  		constWidth = 32
  2530  	default:
  2531  		return errorEncodingUnsupported(n)
  2532  	}
  2533  
  2534  	base := buf.Len()
  2535  	code := buf.Append(20)[:0]
  2536  
  2537  	if rexPrefix != rexPrefixNone {
  2538  		code = append(code, rexPrefix)
  2539  	}
  2540  
  2541  	code = append(code, opcode, modRM)
  2542  
  2543  	if sbiExist {
  2544  		code = append(code, sbi)
  2545  	}
  2546  
  2547  	if displacementWidth != 0 {
  2548  		code = appendConst(code, n.dstConst, displacementWidth)
  2549  	}
  2550  
  2551  	code = appendConst(code, c, constWidth)
  2552  
  2553  	buf.Truncate(base + len(code))
  2554  	return
  2555  }
  2556  
  2557  func appendUint32(code []byte, v uint32) []byte {
  2558  	b := [4]byte{}
  2559  	binary.LittleEndian.PutUint32(b[:], uint32(v))
  2560  	return append(code, b[:]...)
  2561  }
  2562  
  2563  func appendUint64(code []byte, v uint64) []byte {
  2564  	b := [8]byte{}
  2565  	binary.LittleEndian.PutUint64(b[:], uint64(v))
  2566  	return append(code, b[:]...)
  2567  }
  2568  
  2569  func appendConst(code []byte, v int64, length byte) []byte {
  2570  	switch length {
  2571  	case 8:
  2572  		return append(code, byte(v))
  2573  	case 32:
  2574  		return appendUint32(code, uint32(v))
  2575  	default:
  2576  		return appendUint64(code, uint64(v))
  2577  	}
  2578  }
  2579  
  2580  func (n *nodeImpl) getMemoryLocation(dstMem bool) (p rexPrefix, modRM byte, sbi byte, sbiExist bool, displacementWidth byte, err error) {
  2581  	var baseReg, indexReg asm.Register
  2582  	var offset asm.ConstantValue
  2583  	var scale byte
  2584  	if dstMem {
  2585  		baseReg, offset, indexReg, scale = n.dstReg, n.dstConst, n.dstMemIndex, n.dstMemScale
  2586  	} else {
  2587  		baseReg, offset, indexReg, scale = n.srcReg, n.srcConst, n.srcMemIndex, n.srcMemScale
  2588  	}
  2589  
  2590  	if !fitIn32bit(offset) {
  2591  		err = errors.New("offset does not fit in 32-bit integer")
  2592  		return
  2593  	}
  2594  
  2595  	if baseReg == asm.NilRegister && indexReg != asm.NilRegister {
  2596  		// [(index*scale) + displacement] addressing is possible, but we haven't used it for now.
  2597  		err = errors.New("addressing without base register but with index is not implemented")
  2598  	} else if baseReg == asm.NilRegister {
  2599  		modRM = 0b00_000_100 // Indicate that the memory location is specified by SIB.
  2600  		sbi, sbiExist = byte(0b00_100_101), true
  2601  		displacementWidth = 32
  2602  	} else if indexReg == asm.NilRegister {
  2603  		modRM, p = register3bits(baseReg, registerSpecifierPositionModRMFieldRM)
  2604  
  2605  		// Create ModR/M byte so that this instruction takes [R/M + displacement] operand if displacement !=0
  2606  		// and otherwise [R/M].
  2607  		withoutDisplacement := offset == 0 &&
  2608  			// If the target register is R13 or BP, we have to keep [R/M + displacement] even if the value
  2609  			// is zero since it's not [R/M] operand is not defined for these two registers.
  2610  			// https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing
  2611  			baseReg != RegR13 && baseReg != RegBP
  2612  		if withoutDisplacement {
  2613  			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  2614  			modRM |= 0b00_000_000 // Specifying that operand is memory without displacement
  2615  			displacementWidth = 0
  2616  		} else if fitInSigned8bit(offset) {
  2617  			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  2618  			modRM |= 0b01_000_000 // Specifying that operand is memory + 8bit displacement.
  2619  			displacementWidth = 8
  2620  		} else {
  2621  			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  2622  			modRM |= 0b10_000_000 // Specifying that operand is memory + 32bit displacement.
  2623  			displacementWidth = 32
  2624  		}
  2625  
  2626  		// For SP and R12 register, we have [SIB + displacement] if the const is non-zero, otherwise [SIP].
  2627  		// https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing
  2628  		//
  2629  		// Thefore we emit the SIB byte before the const so that [SIB + displacement] ends up [register + displacement].
  2630  		// https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing_2
  2631  		if baseReg == RegSP || baseReg == RegR12 {
  2632  			sbi, sbiExist = byte(0b00_100_100), true
  2633  		}
  2634  	} else {
  2635  		if indexReg == RegSP {
  2636  			err = errors.New("SP cannot be used for SIB index")
  2637  			return
  2638  		}
  2639  
  2640  		modRM = 0b00_000_100 // Indicate that the memory location is specified by SIB.
  2641  
  2642  		withoutDisplacement := offset == 0 &&
  2643  			// For R13 and BP, base registers cannot be encoded "without displacement" mod (i.e. 0b00 mod).
  2644  			baseReg != RegR13 && baseReg != RegBP
  2645  		if withoutDisplacement {
  2646  			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  2647  			modRM |= 0b00_000_000 // Specifying that operand is SIB without displacement
  2648  			displacementWidth = 0
  2649  		} else if fitInSigned8bit(offset) {
  2650  			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  2651  			modRM |= 0b01_000_000 // Specifying that operand is SIB + 8bit displacement.
  2652  			displacementWidth = 8
  2653  		} else {
  2654  			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  2655  			modRM |= 0b10_000_000 // Specifying that operand is SIB + 32bit displacement.
  2656  			displacementWidth = 32
  2657  		}
  2658  
  2659  		var baseRegBits byte
  2660  		baseRegBits, p = register3bits(baseReg, registerSpecifierPositionModRMFieldRM)
  2661  
  2662  		var indexRegBits byte
  2663  		var indexRegPrefix rexPrefix
  2664  		indexRegBits, indexRegPrefix = register3bits(indexReg, registerSpecifierPositionSIBIndex)
  2665  		p |= indexRegPrefix
  2666  
  2667  		sbi, sbiExist = baseRegBits|(indexRegBits<<3), true
  2668  		switch scale {
  2669  		case 1:
  2670  			sbi |= 0b00_000_000
  2671  		case 2:
  2672  			sbi |= 0b01_000_000
  2673  		case 4:
  2674  			sbi |= 0b10_000_000
  2675  		case 8:
  2676  			sbi |= 0b11_000_000
  2677  		default:
  2678  			err = fmt.Errorf("scale in SIB must be one of 1, 2, 4, 8 but got %d", scale)
  2679  			return
  2680  		}
  2681  
  2682  	}
  2683  	return
  2684  }
  2685  
  2686  // getRegisterToRegisterModRM does XXXX
  2687  //
  2688  // TODO: srcOnModRMReg can be deleted after golang-asm removal. This is necessary to match our implementation
  2689  // with golang-asm, but in practice, there are equivalent opcodes to always have src on ModRM:reg without ambiguity.
  2690  func (n *nodeImpl) getRegisterToRegisterModRM(srcOnModRMReg bool) (rexPrefix, modRM byte, err error) {
  2691  	var reg3bits, rm3bits byte
  2692  	if srcOnModRMReg {
  2693  		reg3bits, rexPrefix = register3bits(n.srcReg,
  2694  			// Indicate that srcReg will be specified by ModRM:reg.
  2695  			registerSpecifierPositionModRMFieldReg)
  2696  
  2697  		var dstRexPrefix byte
  2698  		rm3bits, dstRexPrefix = register3bits(n.dstReg,
  2699  			// Indicate that dstReg will be specified by ModRM:r/m.
  2700  			registerSpecifierPositionModRMFieldRM)
  2701  		rexPrefix |= dstRexPrefix
  2702  	} else {
  2703  		rm3bits, rexPrefix = register3bits(n.srcReg,
  2704  			// Indicate that srcReg will be specified by ModRM:r/m.
  2705  			registerSpecifierPositionModRMFieldRM)
  2706  
  2707  		var dstRexPrefix byte
  2708  		reg3bits, dstRexPrefix = register3bits(n.dstReg,
  2709  			// Indicate that dstReg will be specified by ModRM:reg.
  2710  			registerSpecifierPositionModRMFieldReg)
  2711  		rexPrefix |= dstRexPrefix
  2712  	}
  2713  
  2714  	// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
  2715  	modRM = 0b11_000_000 | // Specifying that dst operand is register.
  2716  		(reg3bits << 3) |
  2717  		rm3bits
  2718  
  2719  	return
  2720  }
  2721  
  2722  // RexPrefix represents REX prefix https://wiki.osdev.org/X86-64_Instruction_Encoding#REX_prefix
  2723  type rexPrefix = byte
  2724  
  2725  // REX prefixes are independent of each other and can be combined with OR.
  2726  const (
  2727  	rexPrefixNone    rexPrefix = 0x0000_0000 // Indicates that the instruction doesn't need RexPrefix.
  2728  	rexPrefixDefault rexPrefix = 0b0100_0000
  2729  	rexPrefixW                 = 0b0000_1000 | rexPrefixDefault // REX.W
  2730  	rexPrefixR                 = 0b0000_0100 | rexPrefixDefault // REX.R
  2731  	rexPrefixX                 = 0b0000_0010 | rexPrefixDefault // REX.X
  2732  	rexPrefixB                 = 0b0000_0001 | rexPrefixDefault // REX.B
  2733  )
  2734  
  2735  // registerSpecifierPosition represents the position in the instruction bytes where an operand register is placed.
  2736  type registerSpecifierPosition byte
  2737  
  2738  const (
  2739  	registerSpecifierPositionModRMFieldReg registerSpecifierPosition = iota
  2740  	registerSpecifierPositionModRMFieldRM
  2741  	registerSpecifierPositionSIBIndex
  2742  )
  2743  
  2744  var regInfo = [...]struct {
  2745  	bits    byte
  2746  	needRex bool
  2747  }{
  2748  	RegAX:  {bits: 0b000},
  2749  	RegCX:  {bits: 0b001},
  2750  	RegDX:  {bits: 0b010},
  2751  	RegBX:  {bits: 0b011},
  2752  	RegSP:  {bits: 0b100},
  2753  	RegBP:  {bits: 0b101},
  2754  	RegSI:  {bits: 0b110},
  2755  	RegDI:  {bits: 0b111},
  2756  	RegR8:  {bits: 0b000, needRex: true},
  2757  	RegR9:  {bits: 0b001, needRex: true},
  2758  	RegR10: {bits: 0b010, needRex: true},
  2759  	RegR11: {bits: 0b011, needRex: true},
  2760  	RegR12: {bits: 0b100, needRex: true},
  2761  	RegR13: {bits: 0b101, needRex: true},
  2762  	RegR14: {bits: 0b110, needRex: true},
  2763  	RegR15: {bits: 0b111, needRex: true},
  2764  	RegX0:  {bits: 0b000},
  2765  	RegX1:  {bits: 0b001},
  2766  	RegX2:  {bits: 0b010},
  2767  	RegX3:  {bits: 0b011},
  2768  	RegX4:  {bits: 0b100},
  2769  	RegX5:  {bits: 0b101},
  2770  	RegX6:  {bits: 0b110},
  2771  	RegX7:  {bits: 0b111},
  2772  	RegX8:  {bits: 0b000, needRex: true},
  2773  	RegX9:  {bits: 0b001, needRex: true},
  2774  	RegX10: {bits: 0b010, needRex: true},
  2775  	RegX11: {bits: 0b011, needRex: true},
  2776  	RegX12: {bits: 0b100, needRex: true},
  2777  	RegX13: {bits: 0b101, needRex: true},
  2778  	RegX14: {bits: 0b110, needRex: true},
  2779  	RegX15: {bits: 0b111, needRex: true},
  2780  }
  2781  
  2782  func register3bits(
  2783  	reg asm.Register,
  2784  	registerSpecifierPosition registerSpecifierPosition,
  2785  ) (bits byte, prefix rexPrefix) {
  2786  	info := regInfo[reg]
  2787  	bits = info.bits
  2788  	if info.needRex {
  2789  		// https://wiki.osdev.org/X86-64_Instruction_Encoding#REX_prefix
  2790  		switch registerSpecifierPosition {
  2791  		case registerSpecifierPositionModRMFieldReg:
  2792  			prefix = rexPrefixR
  2793  		case registerSpecifierPositionModRMFieldRM:
  2794  			prefix = rexPrefixB
  2795  		case registerSpecifierPositionSIBIndex:
  2796  			prefix = rexPrefixX
  2797  		}
  2798  	}
  2799  	return
  2800  }
  2801  
  2802  func fitIn32bit(v int64) bool {
  2803  	return math.MinInt32 <= v && v <= math.MaxUint32
  2804  }
  2805  
  2806  func fitInSigned8bit(v int64) bool {
  2807  	return math.MinInt8 <= v && v <= math.MaxInt8
  2808  }
  2809  
  2810  func isVectorRegister(r asm.Register) bool {
  2811  	return RegX0 <= r && r <= RegX15
  2812  }