github.com/wasilibs/wazerox@v0.0.0-20240124024944-4923be63ab5f/internal/asm/arm64/impl.go

     1  package arm64
     2  
     3  import (
     4  	"encoding/binary"
     5  	"errors"
     6  	"fmt"
     7  
     8  	"github.com/wasilibs/wazerox/internal/asm"
     9  )
    10  
    11  type nodeImpl struct {
     12  	// jumpTarget holds the target node in the linked list for the jump-kind instruction.
    13  	jumpTarget *nodeImpl
    14  	// next holds the next node from this node in the assembled linked list.
    15  	next        *nodeImpl
    16  	staticConst *asm.StaticConst
    17  
    18  	instruction                      asm.Instruction
    19  	types                            operandTypes
    20  	srcReg, srcReg2, dstReg, dstReg2 asm.Register
    21  	srcConst, dstConst               asm.ConstantValue
    22  
    23  	offsetInBinary asm.NodeOffsetInBinary
    24  
     25  	// readInstructionAddressBeforeTargetInstruction holds the instruction right before the target of
     26  	// the read-instruction-address instruction. See asm.assemblerBase.CompileReadInstructionAddress.
    27  	readInstructionAddressBeforeTargetInstruction asm.Instruction
    28  
    29  	vectorArrangement              VectorArrangement
    30  	srcVectorIndex, dstVectorIndex VectorIndex
    31  }
    32  
    33  // AssignJumpTarget implements the same method as documented on asm.Node.
    34  func (n *nodeImpl) AssignJumpTarget(target asm.Node) {
    35  	n.jumpTarget = target.(*nodeImpl)
    36  }
    37  
    38  // AssignDestinationConstant implements the same method as documented on asm.Node.
    39  func (n *nodeImpl) AssignDestinationConstant(value asm.ConstantValue) {
    40  	n.dstConst = value
    41  }
    42  
    43  // AssignSourceConstant implements the same method as documented on asm.Node.
    44  func (n *nodeImpl) AssignSourceConstant(value asm.ConstantValue) {
    45  	n.srcConst = value
    46  }
    47  
    48  // OffsetInBinary implements the same method as documented on asm.Node.
    49  func (n *nodeImpl) OffsetInBinary() asm.NodeOffsetInBinary {
    50  	return n.offsetInBinary
    51  }
    52  
    53  // String implements fmt.Stringer.
    54  //
     55  // This is for debugging purposes, and the format is similar to AT&T assembly syntax,
     56  // meaning that this should look like "INSTRUCTION ${from}, ${to}", where an operand
     57  // may be enclosed in '[]' to represent a memory location, and multiple operands
     58  // are enclosed in '()'.
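         //
         // For example, a MemoryToRegister node might render as "MOVD [R0 + 0x8], R1" (illustrative).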
    59  func (n *nodeImpl) String() (ret string) {
    60  	instName := InstructionName(n.instruction)
    61  	switch n.types {
    62  	case operandTypesNoneToNone:
    63  		ret = instName
    64  	case operandTypesNoneToRegister:
    65  		ret = fmt.Sprintf("%s %s", instName, RegisterName(n.dstReg))
    66  	case operandTypesNoneToBranch:
    67  		ret = fmt.Sprintf("%s {%v}", instName, n.jumpTarget)
    68  	case operandTypesRegisterToRegister:
    69  		ret = fmt.Sprintf("%s %s, %s", instName, RegisterName(n.srcReg), RegisterName(n.dstReg))
    70  	case operandTypesLeftShiftedRegisterToRegister:
    71  		ret = fmt.Sprintf("%s (%s, %s << %d), %s", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2), n.srcConst, RegisterName(n.dstReg))
    72  	case operandTypesTwoRegistersToRegister:
    73  		ret = fmt.Sprintf("%s (%s, %s), %s", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2), RegisterName(n.dstReg))
    74  	case operandTypesThreeRegistersToRegister:
     75  		ret = fmt.Sprintf("%s (%s, %s, %s), %s", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2), RegisterName(n.dstReg), RegisterName(n.dstReg2))
    76  	case operandTypesTwoRegistersToNone:
    77  		ret = fmt.Sprintf("%s (%s, %s)", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2))
    78  	case operandTypesRegisterAndConstToNone:
    79  		ret = fmt.Sprintf("%s (%s, 0x%x)", instName, RegisterName(n.srcReg), n.srcConst)
    80  	case operandTypesRegisterAndConstToRegister:
    81  		ret = fmt.Sprintf("%s (%s, 0x%x), %s", instName, RegisterName(n.srcReg), n.srcConst, RegisterName(n.dstReg))
    82  	case operandTypesRegisterToMemory:
    83  		if n.dstReg2 != asm.NilRegister {
    84  			ret = fmt.Sprintf("%s %s, [%s + %s]", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), RegisterName(n.dstReg2))
    85  		} else {
    86  			ret = fmt.Sprintf("%s %s, [%s + 0x%x]", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.dstConst)
    87  		}
    88  	case operandTypesMemoryToRegister:
    89  		if n.srcReg2 != asm.NilRegister {
    90  			ret = fmt.Sprintf("%s [%s + %s], %s", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2), RegisterName(n.dstReg))
    91  		} else {
    92  			ret = fmt.Sprintf("%s [%s + 0x%x], %s", instName, RegisterName(n.srcReg), n.srcConst, RegisterName(n.dstReg))
    93  		}
    94  	case operandTypesConstToRegister:
    95  		ret = fmt.Sprintf("%s 0x%x, %s", instName, n.srcConst, RegisterName(n.dstReg))
    96  	case operandTypesRegisterToVectorRegister:
    97  		ret = fmt.Sprintf("%s %s, %s.%s[%d]", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.vectorArrangement, n.dstVectorIndex)
    98  	case operandTypesVectorRegisterToRegister:
    99  		ret = fmt.Sprintf("%s %s.%s[%d], %s", instName, RegisterName(n.srcReg), n.vectorArrangement, n.srcVectorIndex, RegisterName(n.dstReg))
   100  	case operandTypesVectorRegisterToMemory:
   101  		if n.dstReg2 != asm.NilRegister {
   102  			ret = fmt.Sprintf("%s %s.%s, [%s + %s]", instName, RegisterName(n.srcReg), n.vectorArrangement, RegisterName(n.dstReg), RegisterName(n.dstReg2))
   103  		} else {
   104  			ret = fmt.Sprintf("%s %s.%s, [%s + 0x%x]", instName, RegisterName(n.srcReg), n.vectorArrangement, RegisterName(n.dstReg), n.dstConst)
   105  		}
   106  	case operandTypesMemoryToVectorRegister:
   107  		ret = fmt.Sprintf("%s [%s], %s.%s", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.vectorArrangement)
   108  	case operandTypesVectorRegisterToVectorRegister:
   109  		ret = fmt.Sprintf("%s %[2]s.%[4]s, %[3]s.%[4]s", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.vectorArrangement)
   110  	case operandTypesStaticConstToVectorRegister:
    111  		ret = fmt.Sprintf("%s $%#x, %s.%s", instName, n.staticConst.Raw, RegisterName(n.dstReg), n.vectorArrangement)
   112  	case operandTypesTwoVectorRegistersToVectorRegister:
   113  		ret = fmt.Sprintf("%s (%s.%[5]s, %[3]s.%[5]s), %[4]s.%[5]s", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2), RegisterName(n.dstReg), n.vectorArrangement)
   114  	}
   115  	return
   116  }
   117  
   118  // operandTypes represents types of operands of a node.
   119  type operandTypes byte
   120  
   121  const (
   122  	operandTypesNoneToNone operandTypes = iota
   123  	operandTypesNoneToRegister
   124  	operandTypesNoneToBranch
   125  	operandTypesRegisterToRegister
   126  	operandTypesLeftShiftedRegisterToRegister
   127  	operandTypesTwoRegistersToRegister
   128  	operandTypesThreeRegistersToRegister
   129  	operandTypesTwoRegistersToNone
   130  	operandTypesRegisterAndConstToNone
   131  	operandTypesRegisterAndConstToRegister
   132  	operandTypesRegisterToMemory
   133  	operandTypesMemoryToRegister
   134  	operandTypesConstToRegister
   135  	operandTypesRegisterToVectorRegister
   136  	operandTypesVectorRegisterToRegister
   137  	operandTypesMemoryToVectorRegister
   138  	operandTypesVectorRegisterToMemory
   139  	operandTypesVectorRegisterToVectorRegister
   140  	operandTypesTwoVectorRegistersToVectorRegister
   141  	operandTypesStaticConstToVectorRegister
   142  )
   143  
   144  // String implements fmt.Stringer
   145  func (o operandTypes) String() (ret string) {
   146  	switch o {
   147  	case operandTypesNoneToNone:
   148  		ret = "NoneToNone"
   149  	case operandTypesNoneToRegister:
   150  		ret = "NoneToRegister"
   151  	case operandTypesNoneToBranch:
   152  		ret = "NoneToBranch"
   153  	case operandTypesRegisterToRegister:
   154  		ret = "RegisterToRegister"
   155  	case operandTypesLeftShiftedRegisterToRegister:
   156  		ret = "LeftShiftedRegisterToRegister"
   157  	case operandTypesTwoRegistersToRegister:
   158  		ret = "TwoRegistersToRegister"
   159  	case operandTypesThreeRegistersToRegister:
   160  		ret = "ThreeRegistersToRegister"
   161  	case operandTypesTwoRegistersToNone:
   162  		ret = "TwoRegistersToNone"
   163  	case operandTypesRegisterAndConstToNone:
   164  		ret = "RegisterAndConstToNone"
   165  	case operandTypesRegisterAndConstToRegister:
   166  		ret = "RegisterAndConstToRegister"
   167  	case operandTypesRegisterToMemory:
   168  		ret = "RegisterToMemory"
   169  	case operandTypesMemoryToRegister:
   170  		ret = "MemoryToRegister"
   171  	case operandTypesConstToRegister:
   172  		ret = "ConstToRegister"
   173  	case operandTypesRegisterToVectorRegister:
   174  		ret = "RegisterToVectorRegister"
   175  	case operandTypesVectorRegisterToRegister:
   176  		ret = "VectorRegisterToRegister"
   177  	case operandTypesMemoryToVectorRegister:
   178  		ret = "MemoryToVectorRegister"
   179  	case operandTypesVectorRegisterToMemory:
   180  		ret = "VectorRegisterToMemory"
   181  	case operandTypesVectorRegisterToVectorRegister:
   182  		ret = "VectorRegisterToVectorRegister"
   183  	case operandTypesTwoVectorRegistersToVectorRegister:
   184  		ret = "TwoVectorRegistersToVectorRegister"
   185  	case operandTypesStaticConstToVectorRegister:
   186  		ret = "StaticConstToVectorRegister"
   187  	}
   188  	return
   189  }
   190  
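         // Branch immediates below are PC-relative and scaled by 4 bytes: the signed
         // 26-bit immediate of B spans roughly +/-128MiB, and the signed 19-bit
         // immediate of B.cond spans roughly +/-1MiB.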
   191  const (
   192  	maxSignedInt26 int64 = 1<<25 - 1
   193  	minSignedInt26 int64 = -(1 << 25)
   194  
    195  	maxSignedInt19 int64 = 1<<18 - 1
    196  	minSignedInt19 int64 = -(1 << 18)
   197  )
   198  
   199  // AssemblerImpl implements Assembler.
   200  type AssemblerImpl struct {
   201  	root    *nodeImpl
   202  	current *nodeImpl
   203  	asm.BaseAssemblerImpl
   204  	relativeJumpNodes   []*nodeImpl
   205  	adrInstructionNodes []*nodeImpl
   206  	nodePool            nodePool
   207  	pool                asm.StaticConstPool
   208  	nodeCount           int
   209  
    210  	// MaxDisplacementForConstantPool is fixed to defaultMaxDisplacementForConstPool,
    211  	// but we keep it as a field here for testability.
   212  	MaxDisplacementForConstantPool int
   213  
   214  	temporaryRegister asm.Register
   215  }
   216  
   217  const nodePageSize = 128
   218  
   219  type nodePage = [nodePageSize]nodeImpl
   220  
    221  // nodePool is the central allocation pool for nodeImpl used by a single AssemblerImpl.
    222  // This reduces allocations across compilations when the AssemblerImpl is reused.
   223  type nodePool struct {
   224  	pages []*nodePage
   225  	index int
   226  }
   227  
   228  // allocNode allocates a new nodeImpl for use from the pool.
   229  // This expands the pool if there is no space left for it.
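         // After reset(), pages is truncated to length zero but keeps its capacity,
         // so pages allocated by a previous compilation are reused rather than reallocated.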
   230  func (n *nodePool) allocNode() *nodeImpl {
   231  	if n.index == nodePageSize {
   232  		if len(n.pages) == cap(n.pages) {
   233  			n.pages = append(n.pages, new(nodePage))
   234  		} else {
   235  			i := len(n.pages)
   236  			n.pages = n.pages[:i+1]
   237  			if n.pages[i] == nil {
   238  				n.pages[i] = new(nodePage)
   239  			}
   240  		}
   241  		n.index = 0
   242  	}
   243  	ret := &n.pages[len(n.pages)-1][n.index]
   244  	n.index++
   245  	return ret
   246  }
   247  
   248  func (n *nodePool) reset() {
   249  	for _, ns := range n.pages {
    250  		nodes := ns[:]
    251  		for i := range nodes {
    252  			nodes[i] = nodeImpl{}
    253  		}
   254  	}
   255  	n.pages = n.pages[:0]
   256  	n.index = nodePageSize
   257  }
   258  
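         // NewAssembler returns an AssemblerImpl that uses temporaryRegister as its
         // scratch register. A minimal usage sketch (the instructions and registers
         // below are illustrative choices, not requirements):
         //
         //	a := NewAssembler(RegR27)
         //	a.CompileConstToRegister(MOVD, 42, RegR0)
         //	a.CompileRegisterToRegister(ADD, RegR0, RegR1)
         //	// a.Assemble(buf) then writes the encoded instructions into buf.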
   259  func NewAssembler(temporaryRegister asm.Register) *AssemblerImpl {
   260  	return &AssemblerImpl{
   261  		nodePool:                       nodePool{index: nodePageSize},
   262  		temporaryRegister:              temporaryRegister,
   263  		pool:                           asm.NewStaticConstPool(),
   264  		MaxDisplacementForConstantPool: defaultMaxDisplacementForConstPool,
   265  	}
   266  }
   267  
   268  // AllocateNOP implements asm.AssemblerBase.
   269  func (a *AssemblerImpl) AllocateNOP() asm.Node {
   270  	n := a.nodePool.allocNode()
   271  	n.instruction = NOP
   272  	n.types = operandTypesNoneToNone
   273  	return n
   274  }
   275  
   276  // Add implements asm.AssemblerBase.
   277  func (a *AssemblerImpl) Add(n asm.Node) {
   278  	a.addNode(n.(*nodeImpl))
   279  }
   280  
   281  // Reset implements asm.AssemblerBase.
   282  func (a *AssemblerImpl) Reset() {
   283  	pool := a.pool
   284  	pool.Reset()
   285  	*a = AssemblerImpl{
   286  		nodePool:            a.nodePool,
   287  		pool:                pool,
   288  		temporaryRegister:   a.temporaryRegister,
   289  		adrInstructionNodes: a.adrInstructionNodes[:0],
   290  		relativeJumpNodes:   a.relativeJumpNodes[:0],
   291  		BaseAssemblerImpl: asm.BaseAssemblerImpl{
   292  			SetBranchTargetOnNextNodes: a.SetBranchTargetOnNextNodes[:0],
   293  			JumpTableEntries:           a.JumpTableEntries[:0],
   294  		},
   295  	}
   296  	a.nodePool.reset()
   297  }
   298  
   299  // newNode creates a new Node and appends it into the linked list.
   300  func (a *AssemblerImpl) newNode(instruction asm.Instruction, types operandTypes) *nodeImpl {
   301  	n := a.nodePool.allocNode()
   302  	n.instruction = instruction
   303  	n.types = types
   304  
   305  	a.addNode(n)
   306  	return n
   307  }
   308  
   309  // addNode appends the new node into the linked list.
   310  func (a *AssemblerImpl) addNode(node *nodeImpl) {
   311  	a.nodeCount++
   312  
   313  	if a.root == nil {
   314  		a.root = node
   315  		a.current = node
   316  	} else {
   317  		parent := a.current
   318  		parent.next = node
   319  		a.current = node
   320  	}
   321  
   322  	for _, o := range a.SetBranchTargetOnNextNodes {
   323  		origin := o.(*nodeImpl)
   324  		origin.jumpTarget = node
   325  	}
   326  	// Reuse the underlying slice to avoid re-allocations.
   327  	a.SetBranchTargetOnNextNodes = a.SetBranchTargetOnNextNodes[:0]
   328  }
   329  
   330  // Assemble implements asm.AssemblerBase
   331  func (a *AssemblerImpl) Assemble(buf asm.Buffer) error {
    332  	// arm64 instructions are a fixed 4 bytes (32 bits) long, but some nodes are
    333  	// encoded as multiple instructions, so count*8 below is only a heuristic for
    334  	// the initial buffer size, not the exact size of the resulting binary.
   335  	buf.Grow(a.nodeCount * 8)
   336  
   337  	for n := a.root; n != nil; n = n.next {
   338  		n.offsetInBinary = uint64(buf.Len())
   339  		if err := a.encodeNode(buf, n); err != nil {
   340  			return err
   341  		}
   342  		a.maybeFlushConstPool(buf, n.next == nil)
   343  	}
   344  
   345  	code := buf.Bytes()
   346  
   347  	if err := a.FinalizeJumpTableEntry(code); err != nil {
   348  		return err
   349  	}
   350  
   351  	for _, rel := range a.relativeJumpNodes {
   352  		if err := a.relativeBranchFinalize(code, rel); err != nil {
   353  			return err
   354  		}
   355  	}
   356  
   357  	for _, adr := range a.adrInstructionNodes {
   358  		if err := a.finalizeADRInstructionNode(code, adr); err != nil {
   359  			return err
   360  		}
   361  	}
   362  	return nil
   363  }
   364  
   365  const defaultMaxDisplacementForConstPool = (1 << 20) - 1 - 4 // -4 for unconditional branch to skip the constants.
   366  
    367  // maybeFlushConstPool flushes the constant pool when endOfBinary is true or the displacement limit would be exceeded.
   368  func (a *AssemblerImpl) maybeFlushConstPool(buf asm.Buffer, endOfBinary bool) {
   369  	if a.pool.Empty() {
   370  		return
   371  	}
   372  
   373  	// If endOfBinary = true, we no longer need to emit the instructions, therefore
   374  	// flush all the constants.
   375  	if endOfBinary ||
    376  		// Also, if the offset between the first usage of the constant pool and
    377  		// the first constant would exceed 2^20-1 (= 1MiB-1), which is the maximum
    378  		// offset for the LDR(literal)/ADR instructions, flush all the constants in the pool.
   379  		(buf.Len()+a.pool.PoolSizeInBytes-int(a.pool.FirstUseOffsetInBinary)) >= a.MaxDisplacementForConstantPool {
   380  
    381  		// Before emitting the consts, we have to add a B instruction to skip over the const pool.
   382  		// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1123-L1129
   383  		skipOffset := a.pool.PoolSizeInBytes/4 + 1
   384  		if a.pool.PoolSizeInBytes%4 != 0 {
   385  			skipOffset++
   386  		}
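         		// For example, a 10-byte pool yields 10/4+1 = 3, plus 1 for the two
         		// trailing bytes, so skipOffset = 4: the B below jumps 16 bytes ahead,
         		// over itself (4 bytes) and the pool padded to 12 bytes.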
   387  		if endOfBinary {
    388  			// If this is the end of the binary, the skip branch is never executed,
    389  			// so the offset can be zero (which matches the behavior of Go's assembler).
   390  			skipOffset = 0
   391  		}
   392  
   393  		buf.Append4Bytes(
   394  			byte(skipOffset),
   395  			byte(skipOffset>>8),
   396  			byte(skipOffset>>16),
   397  			0x14,
   398  		)
   399  
   400  		// Then adding the consts into the binary.
   401  		for _, c := range a.pool.Consts {
   402  			c.SetOffsetInBinary(uint64(buf.Len()))
   403  			buf.AppendBytes(c.Raw)
   404  		}
   405  
    406  		// arm64 instructions must be 4-byte (32-bit) aligned, so pad with zero bytes here.
   407  		if pad := buf.Len() % 4; pad != 0 {
   408  			buf.AppendBytes(make([]byte, 4-pad))
   409  		}
   410  
   411  		// After the flush, reset the constant pool.
   412  		a.pool.Reset()
   413  	}
   414  }
   415  
    416  // encodeNode encodes the given node into buf.
   417  func (a *AssemblerImpl) encodeNode(buf asm.Buffer, n *nodeImpl) (err error) {
   418  	switch n.types {
   419  	case operandTypesNoneToNone:
   420  		err = a.encodeNoneToNone(buf, n)
   421  	case operandTypesNoneToRegister:
   422  		err = a.encodeJumpToRegister(buf, n)
   423  	case operandTypesNoneToBranch:
   424  		err = a.encodeRelativeBranch(buf, n)
   425  	case operandTypesRegisterToRegister:
   426  		err = a.encodeRegisterToRegister(buf, n)
   427  	case operandTypesLeftShiftedRegisterToRegister:
   428  		err = a.encodeLeftShiftedRegisterToRegister(buf, n)
   429  	case operandTypesTwoRegistersToRegister:
   430  		err = a.encodeTwoRegistersToRegister(buf, n)
   431  	case operandTypesThreeRegistersToRegister:
   432  		err = a.encodeThreeRegistersToRegister(buf, n)
   433  	case operandTypesTwoRegistersToNone:
   434  		err = a.encodeTwoRegistersToNone(buf, n)
   435  	case operandTypesRegisterAndConstToNone:
   436  		err = a.encodeRegisterAndConstToNone(buf, n)
   437  	case operandTypesRegisterToMemory:
   438  		err = a.encodeRegisterToMemory(buf, n)
   439  	case operandTypesMemoryToRegister:
   440  		err = a.encodeMemoryToRegister(buf, n)
   441  	case operandTypesRegisterAndConstToRegister, operandTypesConstToRegister:
   442  		err = a.encodeConstToRegister(buf, n)
   443  	case operandTypesRegisterToVectorRegister:
   444  		err = a.encodeRegisterToVectorRegister(buf, n)
   445  	case operandTypesVectorRegisterToRegister:
   446  		err = a.encodeVectorRegisterToRegister(buf, n)
   447  	case operandTypesMemoryToVectorRegister:
   448  		err = a.encodeMemoryToVectorRegister(buf, n)
   449  	case operandTypesVectorRegisterToMemory:
   450  		err = a.encodeVectorRegisterToMemory(buf, n)
   451  	case operandTypesVectorRegisterToVectorRegister:
   452  		err = a.encodeVectorRegisterToVectorRegister(buf, n)
   453  	case operandTypesStaticConstToVectorRegister:
   454  		err = a.encodeStaticConstToVectorRegister(buf, n)
   455  	case operandTypesTwoVectorRegistersToVectorRegister:
   456  		err = a.encodeTwoVectorRegistersToVectorRegister(buf, n)
   457  	default:
   458  		err = fmt.Errorf("encoder undefined for [%s] operand type", n.types)
   459  	}
   460  	if err != nil {
   461  		err = fmt.Errorf("%w: %s", err, n) // Ensure the error is debuggable by including the string value of the node.
   462  	}
   463  	return
   464  }
   465  
   466  // CompileStandAlone implements the same method as documented on asm.AssemblerBase.
   467  func (a *AssemblerImpl) CompileStandAlone(instruction asm.Instruction) asm.Node {
   468  	return a.newNode(instruction, operandTypesNoneToNone)
   469  }
   470  
   471  // CompileConstToRegister implements the same method as documented on asm.AssemblerBase.
   472  func (a *AssemblerImpl) CompileConstToRegister(
   473  	instruction asm.Instruction,
   474  	value asm.ConstantValue,
   475  	destinationReg asm.Register,
   476  ) (inst asm.Node) {
   477  	n := a.newNode(instruction, operandTypesConstToRegister)
   478  	n.srcConst = value
   479  	n.dstReg = destinationReg
   480  	return n
   481  }
   482  
   483  // CompileRegisterToRegister implements the same method as documented on asm.AssemblerBase.
   484  func (a *AssemblerImpl) CompileRegisterToRegister(instruction asm.Instruction, from, to asm.Register) {
   485  	n := a.newNode(instruction, operandTypesRegisterToRegister)
   486  	n.srcReg = from
   487  	n.dstReg = to
   488  }
   489  
   490  // CompileMemoryToRegister implements the same method as documented on asm.AssemblerBase.
   491  func (a *AssemblerImpl) CompileMemoryToRegister(
   492  	instruction asm.Instruction,
   493  	sourceBaseReg asm.Register,
   494  	sourceOffsetConst asm.ConstantValue,
   495  	destinationReg asm.Register,
   496  ) {
   497  	n := a.newNode(instruction, operandTypesMemoryToRegister)
   498  	n.srcReg = sourceBaseReg
   499  	n.srcConst = sourceOffsetConst
   500  	n.dstReg = destinationReg
   501  }
   502  
   503  // CompileRegisterToMemory implements the same method as documented on asm.AssemblerBase.
   504  func (a *AssemblerImpl) CompileRegisterToMemory(
   505  	instruction asm.Instruction,
   506  	sourceRegister, destinationBaseRegister asm.Register,
   507  	destinationOffsetConst asm.ConstantValue,
   508  ) {
   509  	n := a.newNode(instruction, operandTypesRegisterToMemory)
   510  	n.srcReg = sourceRegister
   511  	n.dstReg = destinationBaseRegister
   512  	n.dstConst = destinationOffsetConst
   513  }
   514  
   515  // CompileJump implements the same method as documented on asm.AssemblerBase.
   516  func (a *AssemblerImpl) CompileJump(jmpInstruction asm.Instruction) asm.Node {
   517  	return a.newNode(jmpInstruction, operandTypesNoneToBranch)
   518  }
   519  
   520  // CompileJumpToRegister implements the same method as documented on asm.AssemblerBase.
   521  func (a *AssemblerImpl) CompileJumpToRegister(jmpInstruction asm.Instruction, reg asm.Register) {
   522  	n := a.newNode(jmpInstruction, operandTypesNoneToRegister)
   523  	n.dstReg = reg
   524  }
   525  
   526  // CompileReadInstructionAddress implements the same method as documented on asm.AssemblerBase.
   527  func (a *AssemblerImpl) CompileReadInstructionAddress(
   528  	destinationRegister asm.Register,
   529  	beforeAcquisitionTargetInstruction asm.Instruction,
   530  ) {
   531  	n := a.newNode(ADR, operandTypesMemoryToRegister)
   532  	n.dstReg = destinationRegister
   533  	n.readInstructionAddressBeforeTargetInstruction = beforeAcquisitionTargetInstruction
   534  }
   535  
   536  // CompileMemoryWithRegisterOffsetToRegister implements Assembler.CompileMemoryWithRegisterOffsetToRegister
   537  func (a *AssemblerImpl) CompileMemoryWithRegisterOffsetToRegister(
   538  	instruction asm.Instruction,
   539  	srcBaseReg, srcOffsetReg, dstReg asm.Register,
   540  ) {
   541  	n := a.newNode(instruction, operandTypesMemoryToRegister)
   542  	n.dstReg = dstReg
   543  	n.srcReg = srcBaseReg
   544  	n.srcReg2 = srcOffsetReg
   545  }
   546  
   547  // CompileMemoryWithRegisterSourceToRegister implements Assembler.CompileMemoryWithRegisterSourceToRegister
   548  func (a *AssemblerImpl) CompileMemoryWithRegisterSourceToRegister(instruction asm.Instruction, srcReg, dstReg asm.Register) {
   549  	n := a.newNode(instruction, operandTypesMemoryToRegister)
   550  	n.dstReg = dstReg
   551  	n.srcReg = srcReg
   552  }
   553  
   554  // CompileRegisterToMemoryWithRegisterOffset implements Assembler.CompileRegisterToMemoryWithRegisterOffset
   555  func (a *AssemblerImpl) CompileRegisterToMemoryWithRegisterOffset(
   556  	instruction asm.Instruction,
   557  	srcReg, dstBaseReg, dstOffsetReg asm.Register,
   558  ) {
   559  	n := a.newNode(instruction, operandTypesRegisterToMemory)
   560  	n.srcReg = srcReg
   561  	n.dstReg = dstBaseReg
   562  	n.dstReg2 = dstOffsetReg
   563  }
   564  
   565  // CompileRegisterToMemoryWithRegisterDest implements Assembler.CompileRegisterToMemoryWithRegisterDest
   566  func (a *AssemblerImpl) CompileRegisterToMemoryWithRegisterDest(instruction asm.Instruction, srcReg, dstReg asm.Register) {
   567  	n := a.newNode(instruction, operandTypesRegisterToMemory)
   568  	n.srcReg = srcReg
   569  	n.dstReg = dstReg
   570  }
   571  
   572  // CompileTwoRegistersToRegister implements Assembler.CompileTwoRegistersToRegister
   573  func (a *AssemblerImpl) CompileTwoRegistersToRegister(instruction asm.Instruction, src1, src2, dst asm.Register) {
   574  	n := a.newNode(instruction, operandTypesTwoRegistersToRegister)
   575  	n.srcReg = src1
   576  	n.srcReg2 = src2
   577  	n.dstReg = dst
   578  }
   579  
   580  // CompileThreeRegistersToRegister implements Assembler.CompileThreeRegistersToRegister
   581  func (a *AssemblerImpl) CompileThreeRegistersToRegister(
   582  	instruction asm.Instruction,
   583  	src1, src2, src3, dst asm.Register,
   584  ) {
   585  	n := a.newNode(instruction, operandTypesThreeRegistersToRegister)
   586  	n.srcReg = src1
   587  	n.srcReg2 = src2
   588  	n.dstReg = src3 // To minimize the size of nodeImpl struct, we reuse dstReg for the third source operand.
   589  	n.dstReg2 = dst
   590  }
   591  
   592  // CompileTwoRegistersToNone implements Assembler.CompileTwoRegistersToNone
   593  func (a *AssemblerImpl) CompileTwoRegistersToNone(instruction asm.Instruction, src1, src2 asm.Register) {
   594  	n := a.newNode(instruction, operandTypesTwoRegistersToNone)
   595  	n.srcReg = src1
   596  	n.srcReg2 = src2
   597  }
   598  
   599  // CompileRegisterAndConstToNone implements Assembler.CompileRegisterAndConstToNone
   600  func (a *AssemblerImpl) CompileRegisterAndConstToNone(
   601  	instruction asm.Instruction,
   602  	src asm.Register,
   603  	srcConst asm.ConstantValue,
   604  ) {
   605  	n := a.newNode(instruction, operandTypesRegisterAndConstToNone)
   606  	n.srcReg = src
   607  	n.srcConst = srcConst
   608  }
   609  
   610  // CompileRegisterAndConstToRegister implements Assembler.CompileRegisterAndConstToRegister
   611  func (a *AssemblerImpl) CompileRegisterAndConstToRegister(
   612  	instruction asm.Instruction,
   613  	src asm.Register,
   614  	srcConst asm.ConstantValue,
   615  	dst asm.Register,
   616  ) {
   617  	n := a.newNode(instruction, operandTypesRegisterAndConstToRegister)
   618  	n.srcReg = src
   619  	n.srcConst = srcConst
   620  	n.dstReg = dst
   621  }
   622  
   623  // CompileLeftShiftedRegisterToRegister implements Assembler.CompileLeftShiftedRegisterToRegister
   624  func (a *AssemblerImpl) CompileLeftShiftedRegisterToRegister(
   625  	instruction asm.Instruction,
   626  	shiftedSourceReg asm.Register,
   627  	shiftNum asm.ConstantValue,
   628  	srcReg, dstReg asm.Register,
   629  ) {
   630  	n := a.newNode(instruction, operandTypesLeftShiftedRegisterToRegister)
   631  	n.srcReg = srcReg
   632  	n.srcReg2 = shiftedSourceReg
   633  	n.srcConst = shiftNum
   634  	n.dstReg = dstReg
   635  }
   636  
   637  // CompileConditionalRegisterSet implements Assembler.CompileConditionalRegisterSet
   638  func (a *AssemblerImpl) CompileConditionalRegisterSet(cond asm.ConditionalRegisterState, dstReg asm.Register) {
   639  	n := a.newNode(CSET, operandTypesRegisterToRegister)
   640  	n.srcReg = conditionalRegisterStateToRegister(cond)
   641  	n.dstReg = dstReg
   642  }
   643  
   644  // CompileMemoryToVectorRegister implements Assembler.CompileMemoryToVectorRegister
   645  func (a *AssemblerImpl) CompileMemoryToVectorRegister(
   646  	instruction asm.Instruction, srcBaseReg asm.Register, dstOffset asm.ConstantValue, dstReg asm.Register, arrangement VectorArrangement,
   647  ) {
   648  	n := a.newNode(instruction, operandTypesMemoryToVectorRegister)
   649  	n.srcReg = srcBaseReg
   650  	n.srcConst = dstOffset
   651  	n.dstReg = dstReg
   652  	n.vectorArrangement = arrangement
   653  }
   654  
   655  // CompileMemoryWithRegisterOffsetToVectorRegister implements Assembler.CompileMemoryWithRegisterOffsetToVectorRegister
   656  func (a *AssemblerImpl) CompileMemoryWithRegisterOffsetToVectorRegister(instruction asm.Instruction,
   657  	srcBaseReg, srcOffsetRegister asm.Register, dstReg asm.Register, arrangement VectorArrangement,
   658  ) {
   659  	n := a.newNode(instruction, operandTypesMemoryToVectorRegister)
   660  	n.srcReg = srcBaseReg
   661  	n.srcReg2 = srcOffsetRegister
   662  	n.dstReg = dstReg
   663  	n.vectorArrangement = arrangement
   664  }
   665  
   666  // CompileVectorRegisterToMemory implements Assembler.CompileVectorRegisterToMemory
   667  func (a *AssemblerImpl) CompileVectorRegisterToMemory(
   668  	instruction asm.Instruction, srcReg, dstBaseReg asm.Register, dstOffset asm.ConstantValue, arrangement VectorArrangement,
   669  ) {
   670  	n := a.newNode(instruction, operandTypesVectorRegisterToMemory)
   671  	n.srcReg = srcReg
   672  	n.dstReg = dstBaseReg
   673  	n.dstConst = dstOffset
   674  	n.vectorArrangement = arrangement
   675  }
   676  
   677  // CompileVectorRegisterToMemoryWithRegisterOffset implements Assembler.CompileVectorRegisterToMemoryWithRegisterOffset
   678  func (a *AssemblerImpl) CompileVectorRegisterToMemoryWithRegisterOffset(instruction asm.Instruction,
   679  	srcReg, dstBaseReg, dstOffsetRegister asm.Register, arrangement VectorArrangement,
   680  ) {
   681  	n := a.newNode(instruction, operandTypesVectorRegisterToMemory)
   682  	n.srcReg = srcReg
   683  	n.dstReg = dstBaseReg
   684  	n.dstReg2 = dstOffsetRegister
   685  	n.vectorArrangement = arrangement
   686  }
   687  
   688  // CompileRegisterToVectorRegister implements Assembler.CompileRegisterToVectorRegister
   689  func (a *AssemblerImpl) CompileRegisterToVectorRegister(
   690  	instruction asm.Instruction, srcReg, dstReg asm.Register, arrangement VectorArrangement, index VectorIndex,
   691  ) {
   692  	n := a.newNode(instruction, operandTypesRegisterToVectorRegister)
   693  	n.srcReg = srcReg
   694  	n.dstReg = dstReg
   695  	n.vectorArrangement = arrangement
   696  	n.dstVectorIndex = index
   697  }
   698  
   699  // CompileVectorRegisterToRegister implements Assembler.CompileVectorRegisterToRegister
   700  func (a *AssemblerImpl) CompileVectorRegisterToRegister(instruction asm.Instruction, srcReg, dstReg asm.Register,
   701  	arrangement VectorArrangement, index VectorIndex,
   702  ) {
   703  	n := a.newNode(instruction, operandTypesVectorRegisterToRegister)
   704  	n.srcReg = srcReg
   705  	n.dstReg = dstReg
   706  	n.vectorArrangement = arrangement
   707  	n.srcVectorIndex = index
   708  }
   709  
   710  // CompileVectorRegisterToVectorRegister implements Assembler.CompileVectorRegisterToVectorRegister
   711  func (a *AssemblerImpl) CompileVectorRegisterToVectorRegister(
   712  	instruction asm.Instruction, srcReg, dstReg asm.Register, arrangement VectorArrangement, srcIndex, dstIndex VectorIndex,
   713  ) {
   714  	n := a.newNode(instruction, operandTypesVectorRegisterToVectorRegister)
   715  	n.srcReg = srcReg
   716  	n.dstReg = dstReg
   717  	n.vectorArrangement = arrangement
   718  	n.srcVectorIndex = srcIndex
   719  	n.dstVectorIndex = dstIndex
   720  }
   721  
   722  // CompileVectorRegisterToVectorRegisterWithConst implements Assembler.CompileVectorRegisterToVectorRegisterWithConst
   723  func (a *AssemblerImpl) CompileVectorRegisterToVectorRegisterWithConst(instruction asm.Instruction,
   724  	srcReg, dstReg asm.Register, arrangement VectorArrangement, c asm.ConstantValue,
   725  ) {
   726  	n := a.newNode(instruction, operandTypesVectorRegisterToVectorRegister)
   727  	n.srcReg = srcReg
   728  	n.srcConst = c
   729  	n.dstReg = dstReg
   730  	n.vectorArrangement = arrangement
   731  }
   732  
    733  // CompileStaticConstToRegister implements Assembler.CompileStaticConstToRegister
   734  func (a *AssemblerImpl) CompileStaticConstToRegister(instruction asm.Instruction, c *asm.StaticConst, dstReg asm.Register) {
   735  	n := a.newNode(instruction, operandTypesMemoryToRegister)
   736  	n.staticConst = c
   737  	n.dstReg = dstReg
   738  }
   739  
   740  // CompileStaticConstToVectorRegister implements Assembler.CompileStaticConstToVectorRegister
   741  func (a *AssemblerImpl) CompileStaticConstToVectorRegister(instruction asm.Instruction,
   742  	c *asm.StaticConst, dstReg asm.Register, arrangement VectorArrangement,
   743  ) {
   744  	n := a.newNode(instruction, operandTypesStaticConstToVectorRegister)
   745  	n.staticConst = c
   746  	n.dstReg = dstReg
   747  	n.vectorArrangement = arrangement
   748  }
   749  
   750  // CompileTwoVectorRegistersToVectorRegister implements Assembler.CompileTwoVectorRegistersToVectorRegister.
   751  func (a *AssemblerImpl) CompileTwoVectorRegistersToVectorRegister(instruction asm.Instruction, srcReg, srcReg2, dstReg asm.Register,
   752  	arrangement VectorArrangement,
   753  ) {
   754  	n := a.newNode(instruction, operandTypesTwoVectorRegistersToVectorRegister)
   755  	n.srcReg = srcReg
   756  	n.srcReg2 = srcReg2
   757  	n.dstReg = dstReg
   758  	n.vectorArrangement = arrangement
   759  }
   760  
   761  // CompileTwoVectorRegistersToVectorRegisterWithConst implements Assembler.CompileTwoVectorRegistersToVectorRegisterWithConst.
   762  func (a *AssemblerImpl) CompileTwoVectorRegistersToVectorRegisterWithConst(instruction asm.Instruction,
   763  	srcReg, srcReg2, dstReg asm.Register, arrangement VectorArrangement, c asm.ConstantValue,
   764  ) {
   765  	n := a.newNode(instruction, operandTypesTwoVectorRegistersToVectorRegister)
   766  	n.srcReg = srcReg
   767  	n.srcReg2 = srcReg2
   768  	n.srcConst = c
   769  	n.dstReg = dstReg
   770  	n.vectorArrangement = arrangement
   771  }
   772  
   773  func errorEncodingUnsupported(n *nodeImpl) error {
   774  	return fmt.Errorf("%s is unsupported for %s type", InstructionName(n.instruction), n.types)
   775  }
   776  
   777  func (a *AssemblerImpl) encodeNoneToNone(buf asm.Buffer, n *nodeImpl) error {
   778  	switch n.instruction {
   779  	case UDF:
   780  		buf.Append4Bytes(0, 0, 0, 0)
   781  		return nil
   782  	case DMB:
   783  		buf.Append4Bytes(
   784  			0b10111111,
   785  			0b00111011,
   786  			0b00000011,
   787  			0b11010101,
   788  		)
   789  		return nil
   790  	case NOP:
   791  		return nil
   792  	default:
   793  		return errorEncodingUnsupported(n)
   794  	}
   795  }
   796  
   797  func (a *AssemblerImpl) encodeJumpToRegister(buf asm.Buffer, n *nodeImpl) error {
   798  	// "Unconditional branch (register)" in https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions
   799  	var opc byte
   800  	switch n.instruction {
   801  	case RET:
   802  		opc = 0b0010
   803  	case B:
   804  		opc = 0b0000
   805  	default:
   806  		return errorEncodingUnsupported(n)
   807  	}
   808  
   809  	regBits, err := intRegisterBits(n.dstReg)
   810  	if err != nil {
   811  		return fmt.Errorf("invalid destination register: %w", err)
   812  	}
   813  
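         	// For example, RET with R30 (the link register) encodes to 0xd65f03c0
         	// (bytes c0 03 5f d6 in the little-endian order below).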
   814  	buf.Append4Bytes(
   815  		0x00|(regBits<<5),
   816  		0x00|(regBits>>3),
   817  		0b000_11111|(opc<<5),
   818  		0b1101011_0|(opc>>3),
   819  	)
    820  	return nil
   821  }
   822  
   823  func (a *AssemblerImpl) relativeBranchFinalize(code []byte, n *nodeImpl) error {
   824  	var condBits byte
    825  	const condBitsUnconditional = 0xff // Indicates that this is not a conditional jump.
   826  
   827  	// https://developer.arm.com/documentation/den0024/a/CHDEEABE
   828  	switch n.instruction {
   829  	case B:
   830  		condBits = condBitsUnconditional
   831  	case BCONDEQ:
   832  		condBits = 0b0000
   833  	case BCONDGE:
   834  		condBits = 0b1010
   835  	case BCONDGT:
   836  		condBits = 0b1100
   837  	case BCONDHI:
   838  		condBits = 0b1000
   839  	case BCONDHS:
   840  		condBits = 0b0010
   841  	case BCONDLE:
   842  		condBits = 0b1101
   843  	case BCONDLO:
   844  		condBits = 0b0011
   845  	case BCONDLS:
   846  		condBits = 0b1001
   847  	case BCONDLT:
   848  		condBits = 0b1011
   849  	case BCONDMI:
   850  		condBits = 0b0100
   851  	case BCONDPL:
   852  		condBits = 0b0101
   853  	case BCONDNE:
   854  		condBits = 0b0001
   855  	case BCONDVS:
   856  		condBits = 0b0110
   857  	case BCONDVC:
   858  		condBits = 0b0111
   859  	}
   860  
   861  	branchInstOffset := int64(n.OffsetInBinary())
   862  	offset := int64(n.jumpTarget.OffsetInBinary()) - branchInstOffset
   863  	if offset%4 != 0 {
   864  		return errors.New("BUG: relative jump offset must be 4 bytes aligned")
   865  	}
   866  
   867  	branchInst := code[branchInstOffset : branchInstOffset+4]
   868  	if condBits == condBitsUnconditional {
   869  		imm26 := offset >> 2 // divide by 4.
   870  		if imm26 < minSignedInt26 || imm26 > maxSignedInt26 {
    871  			// In theory this could happen if a Wasm binary has a huge single label (more than 128MB for a single block).
    872  			// In that case, we could load the offset into a register and jump via the register, but to avoid the
    873  			// complexity we impose this limit for now, as hitting it is *unlikely* in practice.
   874  			return fmt.Errorf("relative jump offset %d/4 must be within %d and %d", offset, minSignedInt26, maxSignedInt26)
   875  		}
   876  		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B--Branch-?lang=en
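         		// For example, an offset of +8 bytes gives imm26 = 2, encoding 0x14000002
         		// (bytes 02 00 00 14), i.e. B #8.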
   877  		branchInst[0] = byte(imm26)
   878  		branchInst[1] = byte(imm26 >> 8)
   879  		branchInst[2] = byte(imm26 >> 16)
   880  		branchInst[3] = (byte(imm26 >> 24 & 0b000000_11)) | 0b000101_00
   881  	} else {
   882  		imm19 := offset >> 2 // divide by 4.
   883  		if imm19 < minSignedInt19 || imm19 > maxSignedInt19 {
    884  			// This should be a bug in our compiler: conditional jumps are only used for small offsets (~a few bytes),
    885  			// so if this ever happens, the compiler can be fixed.
   886  			return fmt.Errorf("BUG: relative jump offset %d/4(=%d) must be within %d and %d", offset, imm19, minSignedInt19, maxSignedInt19)
   887  		}
   888  		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B-cond--Branch-conditionally-?lang=en
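         		// For example, B.EQ to an offset of +8 bytes gives imm19 = 2 and
         		// condBits = 0b0000, encoding 0x54000040 (bytes 40 00 00 54).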
   889  		branchInst[0] = (byte(imm19<<5) & 0b111_0_0000) | condBits
   890  		branchInst[1] = byte(imm19 >> 3)
   891  		branchInst[2] = byte(imm19 >> 11)
   892  		branchInst[3] = 0b01010100
   893  	}
   894  	return nil
   895  }
   896  
   897  func (a *AssemblerImpl) encodeRelativeBranch(buf asm.Buffer, n *nodeImpl) error {
   898  	switch n.instruction {
   899  	case B, BCONDEQ, BCONDGE, BCONDGT, BCONDHI, BCONDHS, BCONDLE, BCONDLO, BCONDLS, BCONDLT, BCONDMI, BCONDNE, BCONDVS, BCONDVC, BCONDPL:
   900  	default:
   901  		return errorEncodingUnsupported(n)
   902  	}
   903  
   904  	if n.jumpTarget == nil {
   905  		return fmt.Errorf("branch target must be set for %s", InstructionName(n.instruction))
   906  	}
   907  
    908  	// At this point we don't yet know the target's offset, so emit a 4-byte placeholder to be patched later.
   909  	buf.Append4Bytes(0, 0, 0, 0)
   910  	a.relativeJumpNodes = append(a.relativeJumpNodes, n)
   911  	return nil
   912  }
   913  
   914  func checkRegisterToRegisterType(src, dst asm.Register, requireSrcInt, requireDstInt bool) (err error) {
   915  	isSrcInt, isDstInt := isIntRegister(src), isIntRegister(dst)
   916  	if isSrcInt && !requireSrcInt {
   917  		err = fmt.Errorf("src requires float register but got %s", RegisterName(src))
   918  	} else if !isSrcInt && requireSrcInt {
   919  		err = fmt.Errorf("src requires int register but got %s", RegisterName(src))
   920  	} else if isDstInt && !requireDstInt {
   921  		err = fmt.Errorf("dst requires float register but got %s", RegisterName(dst))
   922  	} else if !isDstInt && requireDstInt {
   923  		err = fmt.Errorf("dst requires int register but got %s", RegisterName(dst))
   924  	}
   925  	return
   926  }
   927  
   928  func (a *AssemblerImpl) encodeRegisterToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
   929  	switch inst := n.instruction; inst {
   930  	case ADD, ADDW, SUB:
   931  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
   932  			return
   933  		}
   934  
   935  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_shift
   936  		var sfops byte
   937  		switch inst {
   938  		case ADD:
   939  			sfops = 0b100
   940  		case ADDW:
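         			// sfops stays 0b000: sf=0 selects the 32-bit ADD.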
   941  		case SUB:
   942  			sfops = 0b110
   943  		}
   944  
   945  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
   946  		buf.Append4Bytes(
   947  			(dstRegBits<<5)|dstRegBits,
   948  			dstRegBits>>3,
   949  			srcRegBits,
   950  			(sfops<<5)|0b01011,
   951  		)
   952  	case CLZ, CLZW, RBIT, RBITW:
   953  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
   954  			return
   955  		}
   956  
   957  		var sf, opcode byte
   958  		switch inst {
   959  		case CLZ:
   960  			// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CLZ--Count-Leading-Zeros-?lang=en
   961  			sf, opcode = 0b1, 0b000_100
   962  		case CLZW:
   963  			// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CLZ--Count-Leading-Zeros-?lang=en
   964  			sf, opcode = 0b0, 0b000_100
   965  		case RBIT:
   966  			// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/RBIT--Reverse-Bits-?lang=en
   967  			sf, opcode = 0b1, 0b000_000
   968  		case RBITW:
   969  			// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/RBIT--Reverse-Bits-?lang=en
   970  			sf, opcode = 0b0, 0b000_000
   971  		}
   975  
   976  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
   977  		buf.Append4Bytes(
   978  			(srcRegBits<<5)|dstRegBits,
   979  			opcode<<2|(srcRegBits>>3),
   980  			0b110_00000,
   981  			(sf<<7)|0b0_1011010,
   982  		)
   983  	case CSET:
   984  		if !isConditionalRegister(n.srcReg) {
   985  			return fmt.Errorf("CSET requires conditional register but got %s", RegisterName(n.srcReg))
   986  		}
   987  
   988  		dstRegBits, err := intRegisterBits(n.dstReg)
   989  		if err != nil {
   990  			return err
   991  		}
   992  
    993  		// CSET is an alias of CSINC and encodes the condition with its least significant bit inverted.
   994  		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CSET--Conditional-Set--an-alias-of-CSINC-?lang=en
   995  		//
   996  		// https://developer.arm.com/documentation/den0024/a/CHDEEABE
   997  		var conditionalBits byte
   998  		switch n.srcReg {
   999  		case RegCondEQ:
  1000  			conditionalBits = 0b0001
  1001  		case RegCondNE:
  1002  			conditionalBits = 0b0000
  1003  		case RegCondHS:
  1004  			conditionalBits = 0b0011
  1005  		case RegCondLO:
  1006  			conditionalBits = 0b0010
  1007  		case RegCondMI:
  1008  			conditionalBits = 0b0101
  1009  		case RegCondPL:
  1010  			conditionalBits = 0b0100
  1011  		case RegCondVS:
  1012  			conditionalBits = 0b0111
  1013  		case RegCondVC:
  1014  			conditionalBits = 0b0110
  1015  		case RegCondHI:
  1016  			conditionalBits = 0b1001
  1017  		case RegCondLS:
  1018  			conditionalBits = 0b1000
  1019  		case RegCondGE:
  1020  			conditionalBits = 0b1011
  1021  		case RegCondLT:
  1022  			conditionalBits = 0b1010
  1023  		case RegCondGT:
  1024  			conditionalBits = 0b1101
  1025  		case RegCondLE:
  1026  			conditionalBits = 0b1100
  1027  		case RegCondAL:
  1028  			conditionalBits = 0b1111
  1029  		case RegCondNV:
  1030  			conditionalBits = 0b1110
  1031  		}
  1032  
  1033  		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CSET--Conditional-Set--an-alias-of-CSINC-?lang=en
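         		// For example, CSET X0, EQ (cond bits 0b0001 = NE, i.e. EQ inverted) encodes to 0x9a9f17e0.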
  1034  		buf.Append4Bytes(
  1035  			0b111_00000|dstRegBits,
  1036  			(conditionalBits<<4)|0b0000_0111,
  1037  			0b100_11111,
  1038  			0b10011010,
  1039  		)
  1040  
  1041  	case FABSD, FABSS, FNEGD, FNEGS, FSQRTD, FSQRTS, FCVTSD, FCVTDS, FRINTMD, FRINTMS,
  1042  		FRINTND, FRINTNS, FRINTPD, FRINTPS, FRINTZD, FRINTZS:
  1043  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, false, false); err != nil {
  1044  			return
  1045  		}
  1046  
  1047  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
  1048  
  1049  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#floatdp1
  1050  		var tp, opcode byte
  1051  		switch inst {
  1052  		case FABSD:
  1053  			opcode, tp = 0b000001, 0b01
  1054  		case FABSS:
  1055  			opcode, tp = 0b000001, 0b00
  1056  		case FNEGD:
  1057  			opcode, tp = 0b000010, 0b01
  1058  		case FNEGS:
  1059  			opcode, tp = 0b000010, 0b00
  1060  		case FSQRTD:
  1061  			opcode, tp = 0b000011, 0b01
  1062  		case FSQRTS:
  1063  			opcode, tp = 0b000011, 0b00
  1064  		case FCVTSD:
  1065  			opcode, tp = 0b000101, 0b00
  1066  		case FCVTDS:
  1067  			opcode, tp = 0b000100, 0b01
  1068  		case FRINTMD:
  1069  			opcode, tp = 0b001010, 0b01
  1070  		case FRINTMS:
  1071  			opcode, tp = 0b001010, 0b00
  1072  		case FRINTND:
  1073  			opcode, tp = 0b001000, 0b01
  1074  		case FRINTNS:
  1075  			opcode, tp = 0b001000, 0b00
  1076  		case FRINTPD:
  1077  			opcode, tp = 0b001001, 0b01
  1078  		case FRINTPS:
  1079  			opcode, tp = 0b001001, 0b00
  1080  		case FRINTZD:
  1081  			opcode, tp = 0b001011, 0b01
  1082  		case FRINTZS:
  1083  			opcode, tp = 0b001011, 0b00
  1084  		}
  1085  		buf.Append4Bytes(
  1086  			(srcRegBits<<5)|dstRegBits,
  1087  			(opcode<<7)|0b0_10000_00|(srcRegBits>>3),
  1088  			tp<<6|0b00_1_00000|opcode>>1,
  1089  			0b0_00_11110,
  1090  		)
  1091  
  1092  	case FADDD, FADDS, FDIVS, FDIVD, FMAXD, FMAXS, FMIND, FMINS, FMULS, FMULD:
  1093  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, false, false); err != nil {
  1094  			return
  1095  		}
  1096  
  1097  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
  1098  
  1099  		// "Floating-point data-processing (2 source)" in
  1100  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#floatdp1
  1101  		var tp, opcode byte
  1102  		switch inst {
  1103  		case FADDD:
  1104  			opcode, tp = 0b0010, 0b01
  1105  		case FADDS:
  1106  			opcode, tp = 0b0010, 0b00
  1107  		case FDIVD:
  1108  			opcode, tp = 0b0001, 0b01
  1109  		case FDIVS:
  1110  			opcode, tp = 0b0001, 0b00
  1111  		case FMAXD:
  1112  			opcode, tp = 0b0100, 0b01
  1113  		case FMAXS:
  1114  			opcode, tp = 0b0100, 0b00
  1115  		case FMIND:
  1116  			opcode, tp = 0b0101, 0b01
  1117  		case FMINS:
  1118  			opcode, tp = 0b0101, 0b00
  1119  		case FMULS:
  1120  			opcode, tp = 0b0000, 0b00
  1121  		case FMULD:
  1122  			opcode, tp = 0b0000, 0b01
  1123  		}
  1124  
  1125  		buf.Append4Bytes(
  1126  			(dstRegBits<<5)|dstRegBits,
  1127  			opcode<<4|0b0000_10_00|(dstRegBits>>3),
  1128  			tp<<6|0b00_1_00000|srcRegBits,
  1129  			0b0001_1110,
  1130  		)
  1131  
  1132  	case FCVTZSD, FCVTZSDW, FCVTZSS, FCVTZSSW, FCVTZUD, FCVTZUDW, FCVTZUS, FCVTZUSW:
  1133  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, false, true); err != nil {
  1134  			return
  1135  		}
  1136  
  1137  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
  1138  
  1139  		// "Conversion between floating-point and integer" in
  1140  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#floatdp1
  1141  		var sf, tp, opcode byte
  1142  		switch inst {
  1143  		case FCVTZSD: // Double to signed 64-bit
  1144  			sf, tp, opcode = 0b1, 0b01, 0b000
  1145  		case FCVTZSDW: // Double to signed 32-bit.
  1146  			sf, tp, opcode = 0b0, 0b01, 0b000
  1147  		case FCVTZSS: // Single to signed 64-bit.
  1148  			sf, tp, opcode = 0b1, 0b00, 0b000
  1149  		case FCVTZSSW: // Single to signed 32-bit.
  1150  			sf, tp, opcode = 0b0, 0b00, 0b000
  1151  		case FCVTZUD: // Double to unsigned 64-bit.
  1152  			sf, tp, opcode = 0b1, 0b01, 0b001
  1153  		case FCVTZUDW: // Double to unsigned 32-bit.
  1154  			sf, tp, opcode = 0b0, 0b01, 0b001
  1155  		case FCVTZUS: // Single to unsigned 64-bit.
  1156  			sf, tp, opcode = 0b1, 0b00, 0b001
  1157  		case FCVTZUSW: // Single to unsigned 32-bit.
  1158  			sf, tp, opcode = 0b0, 0b00, 0b001
  1159  		}
  1160  
  1161  		buf.Append4Bytes(
  1162  			(srcRegBits<<5)|dstRegBits,
  1163  			0|(srcRegBits>>3),
  1164  			tp<<6|0b00_1_11_000|opcode,
  1165  			sf<<7|0b0_0_0_11110,
  1166  		)
  1167  
  1168  	case FMOVD, FMOVS:
  1169  		isSrcInt, isDstInt := isIntRegister(n.srcReg), isIntRegister(n.dstReg)
  1170  		if isSrcInt && isDstInt {
   1171  			return errors.New("FMOV needs at least one of the operands to be a float register")
  1172  		}
  1173  
  1174  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
  1175  		// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FMOV--register---Floating-point-Move-register-without-conversion-?lang=en
  1176  		if !isSrcInt && !isDstInt { // Float to float.
  1177  			var tp byte
  1178  			if inst == FMOVD {
  1179  				tp = 0b01
  1180  			}
  1181  			buf.Append4Bytes(
  1182  				(srcRegBits<<5)|dstRegBits,
  1183  				0b0_10000_00|(srcRegBits>>3),
  1184  				tp<<6|0b00_1_00000,
  1185  				0b000_11110,
  1186  			)
  1187  		} else if isSrcInt && !isDstInt { // Int to float.
  1188  			var tp, sf byte
  1189  			if inst == FMOVD {
  1190  				tp, sf = 0b01, 0b1
  1191  			}
  1192  			buf.Append4Bytes(
  1193  				(srcRegBits<<5)|dstRegBits,
  1194  				srcRegBits>>3,
  1195  				tp<<6|0b00_1_00_111,
  1196  				sf<<7|0b0_00_11110,
  1197  			)
  1198  		} else { // Float to int.
  1199  			var tp, sf byte
  1200  			if inst == FMOVD {
  1201  				tp, sf = 0b01, 0b1
  1202  			}
  1203  			buf.Append4Bytes(
  1204  				(srcRegBits<<5)|dstRegBits,
  1205  				srcRegBits>>3,
  1206  				tp<<6|0b00_1_00_110,
  1207  				sf<<7|0b0_00_11110,
  1208  			)
  1209  		}
  1210  
  1211  	case MOVD, MOVW:
  1212  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
  1213  			return
  1214  		}
  1215  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
  1216  
  1217  		if n.srcReg == RegSP || n.dstReg == RegSP {
   1218  			// Move to/from the stack pointer, encoded as ADD (immediate) with #0.
  1219  			// https://developer.arm.com/documentation/ddi0602/2021-12/Base-Instructions/MOV--to-from-SP---Move-between-register-and-stack-pointer--an-alias-of-ADD--immediate--
  1220  			buf.Append4Bytes(
  1221  				(srcRegBits<<5)|dstRegBits,
  1222  				srcRegBits>>3,
  1223  				0x0,
  1224  				0b1001_0001,
  1225  			)
  1226  			return
  1227  		}
  1228  
  1229  		if n.srcReg == RegRZR && inst == MOVD {
   1230  			// If this is a 64-bit mov from the zero register, we encode it as MOVZ with a zero immediate.
  1231  			// See "Move wide (immediate)" in
  1232  			// https://developer.arm.com/documentation/ddi0602/2021-06/Index-by-Encoding/Data-Processing----Immediate
  1233  			buf.Append4Bytes(
  1234  				dstRegBits,
  1235  				0x0,
  1236  				0b1000_0000,
  1237  				0b1_10_10010,
  1238  			)
  1239  		} else {
  1240  			// MOV can be encoded as ORR (shifted register): "ORR Wd, WZR, Wm".
  1241  			// https://developer.arm.com/documentation/100069/0609/A64-General-Instructions/MOV--register-
  1242  			var sf byte
  1243  			if inst == MOVD {
  1244  				sf = 0b1
  1245  			}
  1246  			buf.Append4Bytes(
  1247  				(zeroRegisterBits<<5)|dstRegBits,
  1248  				zeroRegisterBits>>3,
  1249  				0b000_00000|srcRegBits,
  1250  				sf<<7|0b0_01_01010,
  1251  			)
  1252  		}
  1253  
  1254  	case MRS:
  1255  		if n.srcReg != RegFPSR {
   1256  			return fmt.Errorf("MRS only supports the FPSR register as a src but got %s", RegisterName(n.srcReg))
  1257  		}
  1258  
  1259  		// For how to specify FPSR register, see "Accessing FPSR" in:
  1260  		// https://developer.arm.com/documentation/ddi0595/2021-12/AArch64-Registers/FPSR--Floating-point-Status-Register?lang=en
  1261  		dstRegBits := registerBits(n.dstReg)
  1262  		buf.Append4Bytes(
  1263  			0b001<<5|dstRegBits,
  1264  			0b0100<<4|0b0100,
  1265  			0b0011_0000|0b11<<3|0b011,
  1266  			0b1101_0101,
  1267  		)
  1268  
  1269  	case MSR:
  1270  		if n.dstReg != RegFPSR {
   1271  			return fmt.Errorf("MSR only supports the FPSR register as a dst but got %s", RegisterName(n.dstReg))
  1272  		}
  1273  
  1274  		// For how to specify FPSR register, see "Accessing FPSR" in:
  1275  		// https://developer.arm.com/documentation/ddi0595/2021-12/AArch64-Registers/FPSR--Floating-point-Status-Register?lang=en
  1276  		srcRegBits := registerBits(n.srcReg)
  1277  		buf.Append4Bytes(
  1278  			0b001<<5|srcRegBits,
  1279  			0b0100<<4|0b0100,
  1280  			0b0001_0000|0b11<<3|0b011,
  1281  			0b1101_0101,
  1282  		)
  1283  
  1284  	case MUL, MULW:
   1285  		// Multiplication is encoded as MADD with the zero register as the addend: dst = zero + (src * dst) = src * dst.
  1286  		// See "Data-processing (3 source)" in
  1287  		// https://developer.arm.com/documentation/ddi0602/2021-06/Index-by-Encoding/Data-Processing----Register?lang=en
  1288  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
  1289  			return
  1290  		}
  1291  
  1292  		var sf byte
  1293  		if inst == MUL {
  1294  			sf = 0b1
  1295  		}
  1296  
  1297  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
  1298  
  1299  		buf.Append4Bytes(
  1300  			dstRegBits<<5|dstRegBits,
  1301  			zeroRegisterBits<<2|dstRegBits>>3,
  1302  			srcRegBits,
  1303  			sf<<7|0b11011,
  1304  		)
  1305  
  1306  	case NEG, NEGW:
  1307  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
  1308  
  1309  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
  1310  			return
  1311  		}
  1312  
  1313  		// NEG is encoded as "SUB dst, XZR, src" = "dst = 0 - src"
  1314  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_shift
  1315  		var sf byte
  1316  		if inst == NEG {
  1317  			sf = 0b1
  1318  		}
  1319  
  1320  		buf.Append4Bytes(
  1321  			(zeroRegisterBits<<5)|dstRegBits,
  1322  			zeroRegisterBits>>3,
  1323  			srcRegBits,
  1324  			sf<<7|0b0_10_00000|0b0_00_01011,
  1325  		)
  1326  
  1327  	case SDIV, SDIVW, UDIV, UDIVW:
  1328  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
  1329  
  1330  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
  1331  			return
  1332  		}
  1333  
  1334  		// See "Data-processing (2 source)" in
  1335  		// https://developer.arm.com/documentation/ddi0602/2021-06/Index-by-Encoding/Data-Processing----Register?lang=en
  1336  		var sf, opcode byte
  1337  		switch inst {
  1338  		case SDIV:
  1339  			sf, opcode = 0b1, 0b000011
  1340  		case SDIVW:
  1341  			sf, opcode = 0b0, 0b000011
  1342  		case UDIV:
  1343  			sf, opcode = 0b1, 0b000010
  1344  		case UDIVW:
  1345  			sf, opcode = 0b0, 0b000010
  1346  		}
  1347  
  1348  		buf.Append4Bytes(
  1349  			(dstRegBits<<5)|dstRegBits,
  1350  			opcode<<2|(dstRegBits>>3),
  1351  			0b110_00000|srcRegBits,
  1352  			sf<<7|0b0_00_11010,
  1353  		)
  1354  
  1355  	case SCVTFD, SCVTFWD, SCVTFS, SCVTFWS, UCVTFD, UCVTFS, UCVTFWD, UCVTFWS:
  1356  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
  1357  
  1358  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, false); err != nil {
  1359  			return
  1360  		}
  1361  
  1362  		// "Conversion between floating-point and integer" in
  1363  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#floatdp1
  1364  		var sf, tp, opcode byte
  1365  		switch inst {
  1366  		case SCVTFD: // 64-bit integer to double
  1367  			sf, tp, opcode = 0b1, 0b01, 0b010
  1368  		case SCVTFWD: // 32-bit integer to double
  1369  			sf, tp, opcode = 0b0, 0b01, 0b010
  1370  		case SCVTFS: // 64-bit integer to single
  1371  			sf, tp, opcode = 0b1, 0b00, 0b010
  1372  		case SCVTFWS: // 32-bit integer to single
  1373  			sf, tp, opcode = 0b0, 0b00, 0b010
  1374  		case UCVTFD: // 64-bit unsigned integer to double
  1375  			sf, tp, opcode = 0b1, 0b01, 0b011
  1376  		case UCVTFWD: // 32-bit unsigned integer to double
  1377  			sf, tp, opcode = 0b0, 0b01, 0b011
  1378  		case UCVTFS: // 64-bit unsigned integer to single
  1379  			sf, tp, opcode = 0b1, 0b00, 0b011
  1380  		case UCVTFWS: // 32-bit unsigned integer to single
  1381  			sf, tp, opcode = 0b0, 0b00, 0b011
  1382  		}
  1383  
  1384  		buf.Append4Bytes(
  1385  			(srcRegBits<<5)|dstRegBits,
  1386  			srcRegBits>>3,
  1387  			tp<<6|0b00_1_00_000|opcode,
  1388  			sf<<7|0b0_0_0_11110,
  1389  		)
  1390  
  1391  	case SXTB, SXTBW, SXTH, SXTHW, SXTW:
  1392  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
  1393  			return
  1394  		}
  1395  
  1396  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
  1397  		if n.srcReg == RegRZR {
  1398  			// If the source is the zero register, we encode this as a 32-bit
  1399  			// MOV dst, wzr (i.e. ORR dst, wzr, wzr). Writing a W register
  1400  			// clears the upper 32 bits of the destination, so the 32-bit form
  1401  			// is sufficient for every sign-extension variant handled here.
  1402  			buf.Append4Bytes(
  1403  				(zeroRegisterBits<<5)|dstRegBits,
  1404  				zeroRegisterBits>>3,
  1405  				0b000_00000|srcRegBits,
  1406  				0b0_01_01010,
  1407  			)
  1408  			return
  1409  		}
  1411  
  1412  		// SXTB is encoded as "SBFM Wd, Wn, #0, #7"
  1413  		// https://developer.arm.com/documentation/dui0801/g/A64-General-Instructions/SXTB
  1414  		// SXTH is encoded as "SBFM Wd, Wn, #0, #15"
  1415  		// https://developer.arm.com/documentation/dui0801/g/A64-General-Instructions/SXTH
  1416  		// SXTW is encoded as "SBFM Xd, Xn, #0, #31"
  1417  		// https://developer.arm.com/documentation/dui0802/b/A64-General-Instructions/SXTW
  1418  
  1419  		var nBit, sf, imms, opc byte
  1420  		switch inst {
  1421  		case SXTB:
  1422  			nBit, sf, imms = 0b1, 0b1, 0x7
  1423  		case SXTBW:
  1424  			nBit, sf, imms = 0b0, 0b0, 0x7
  1425  		case SXTH:
  1426  			nBit, sf, imms = 0b1, 0b1, 0xf
  1427  		case SXTHW:
  1428  			nBit, sf, imms = 0b0, 0b0, 0xf
  1429  		case SXTW:
  1430  			nBit, sf, imms = 0b1, 0b1, 0x1f
  1431  		}
  1432  
  1433  		buf.Append4Bytes(
  1434  			(srcRegBits<<5)|dstRegBits,
  1435  			imms<<2|(srcRegBits>>3),
  1436  			nBit<<6,
  1437  			sf<<7|opc<<5|0b10011,
  1438  		)
  1439  	default:
  1440  		return errorEncodingUnsupported(n)
  1441  	}
  1442  	return
  1443  }
  1444  
  1445  func (a *AssemblerImpl) encodeLeftShiftedRegisterToRegister(buf asm.Buffer, n *nodeImpl) error {
  1446  	baseRegBits, err := intRegisterBits(n.srcReg)
  1447  	if err != nil {
  1448  		return err
  1449  	}
  1450  	shiftTargetRegBits, err := intRegisterBits(n.srcReg2)
  1451  	if err != nil {
  1452  		return err
  1453  	}
  1454  	dstRegBits, err := intRegisterBits(n.dstReg)
  1455  	if err != nil {
  1456  		return err
  1457  	}
  1458  
  1459  	switch n.instruction {
  1460  	case ADD:
  1461  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_shift
  1462  		const logicalLeftShiftBits = 0b00
  1463  		if n.srcConst < 0 || n.srcConst > 63 {
  1464  			return fmt.Errorf("shift amount must fit in unsigned 6-bit integer (0-63) but got %d", n.srcConst)
  1465  		}
  1466  		shiftByte := byte(n.srcConst)
  1467  		buf.Append4Bytes(
  1468  			(baseRegBits<<5)|dstRegBits,
  1469  			(shiftByte<<2)|(baseRegBits>>3),
  1470  			(logicalLeftShiftBits<<6)|shiftTargetRegBits,
  1471  			0b1000_1011,
  1472  		)
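		// Illustrative example (not part of the original source): with base=1 (X1),
		// shifted register=2 (X2), shift amount 3 and dst=0 (X0), this emits
		// 0x20, 0x0c, 0x02, 0x8b (word 0x8b020c20) = ADD X0, X1, X2, LSL #3.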
  1473  		return nil
  1474  	default:
  1475  		return errorEncodingUnsupported(n)
  1476  	}
  1477  }
  1478  
  1479  func (a *AssemblerImpl) encodeTwoRegistersToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
  1480  	switch inst := n.instruction; inst {
  1481  	case AND, ANDW, ORR, ORRW, ORN, ORNW, EOR, EORW:
  1482  		// See "Logical (shifted register)" in
  1483  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en
  1484  		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)
  1485  		var sf, opc, nBit byte
  1486  		switch inst {
  1487  		case AND:
  1488  			sf, opc = 0b1, 0b00
  1489  		case ANDW:
  1490  			sf, opc = 0b0, 0b00
  1491  		case ORR:
  1492  			sf, opc = 0b1, 0b01
  1493  		case ORRW:
  1494  			sf, opc = 0b0, 0b01
  1495  		case ORN:
  1496  			sf, opc, nBit = 0b1, 0b01, 0b1
  1497  		case ORNW:
  1498  			sf, opc, nBit = 0b0, 0b01, 0b1
  1499  		case EOR:
  1500  			sf, opc = 0b1, 0b10
  1501  		case EORW:
  1502  			sf, opc = 0b0, 0b10
  1503  		}
  1504  		buf.Append4Bytes(
  1505  			(srcReg2Bits<<5)|dstRegBits,
  1506  			srcReg2Bits>>3,
  1507  			(nBit<<5)|srcRegBits,
  1508  			sf<<7|opc<<5|0b01010,
  1509  		)
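		// Illustrative example (not part of the original source): AND with
		// srcReg=1 (X1), srcReg2=2 (X2) and dst=0 (X0) emits 0x40, 0x00, 0x01,
		// 0x8a (word 0x8a010040) = AND X0, X2, X1; note Rn=srcReg2 and Rm=srcReg.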
  1510  	case ASR, ASRW, LSL, LSLW, LSR, LSRW, ROR, RORW:
  1511  		// See "Data-processing (2 source)" in
  1512  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en
  1513  		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)
  1514  
  1515  		var sf, opcode byte
  1516  		switch inst {
  1517  		case ASR:
  1518  			sf, opcode = 0b1, 0b001010
  1519  		case ASRW:
  1520  			sf, opcode = 0b0, 0b001010
  1521  		case LSL:
  1522  			sf, opcode = 0b1, 0b001000
  1523  		case LSLW:
  1524  			sf, opcode = 0b0, 0b001000
  1525  		case LSR:
  1526  			sf, opcode = 0b1, 0b001001
  1527  		case LSRW:
  1528  			sf, opcode = 0b0, 0b001001
  1529  		case ROR:
  1530  			sf, opcode = 0b1, 0b001011
  1531  		case RORW:
  1532  			sf, opcode = 0b0, 0b001011
  1533  		}
  1534  		buf.Append4Bytes(
  1535  			(srcReg2Bits<<5)|dstRegBits,
  1536  			opcode<<2|(srcReg2Bits>>3),
  1537  			0b110_00000|srcRegBits,
  1538  			sf<<7|0b0_00_11010,
  1539  		)
  1540  	case SDIV, SDIVW, UDIV, UDIVW:
  1541  		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)
  1542  
  1543  		// See "Data-processing (2 source)" in
  1544  		// https://developer.arm.com/documentation/ddi0602/2021-06/Index-by-Encoding/Data-Processing----Register?lang=en
  1545  		var sf, opcode byte
  1546  		switch inst {
  1547  		case SDIV:
  1548  			sf, opcode = 0b1, 0b000011
  1549  		case SDIVW:
  1550  			sf, opcode = 0b0, 0b000011
  1551  		case UDIV:
  1552  			sf, opcode = 0b1, 0b000010
  1553  		case UDIVW:
  1554  			sf, opcode = 0b0, 0b000010
  1555  		}
  1556  
  1557  		buf.Append4Bytes(
  1558  			(srcReg2Bits<<5)|dstRegBits,
  1559  			opcode<<2|(srcReg2Bits>>3),
  1560  			0b110_00000|srcRegBits,
  1561  			sf<<7|0b0_00_11010,
  1562  		)
  1563  	case SUB, SUBW:
  1564  		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)
  1565  
  1566  		// See "Add/subtract (shifted register)" in
  1567  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en
  1568  		var sf byte
  1569  		if inst == SUB {
  1570  			sf = 0b1
  1571  		}
  1572  
  1573  		buf.Append4Bytes(
  1574  			(srcReg2Bits<<5)|dstRegBits,
  1575  			srcReg2Bits>>3,
  1576  			srcRegBits,
  1577  			sf<<7|0b0_10_01011,
  1578  		)
  1579  	case FSUBD, FSUBS:
  1580  		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)
  1581  
  1582  		// See "Floating-point data-processing (2 source)" in
  1583  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  1584  		var tp byte
  1585  		if inst == FSUBD {
  1586  			tp = 0b01
  1587  		}
  1588  		buf.Append4Bytes(
  1589  			(srcReg2Bits<<5)|dstRegBits,
  1590  			0b0011_10_00|(srcReg2Bits>>3),
  1591  			tp<<6|0b00_1_00000|srcRegBits,
  1592  			0b0_00_11110,
  1593  		)
  1594  
  1595  	case LDADDALD, LDADDALW, LDADDALH, LDADDALB,
  1596  		LDCLRALD, LDCLRALW, LDCLRALH, LDCLRALB,
  1597  		LDSETALD, LDSETALW, LDSETALH, LDSETALB,
  1598  		LDEORALD, LDEORALW, LDEORALH, LDEORALB,
  1599  		SWPALD, SWPALW, SWPALH, SWPALB:
  1600  		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)
  1601  
  1602  		// While Arm's docs don't group the opcodes for the arithmetic and swap instructions together, they are
  1603  		// actually identical except for the single bit before the opcode, which is 1 for swap, so we encode them together.
  1604  		var size, opcode byte
  1605  		switch n.instruction {
  1606  		case LDADDALD:
  1607  			size, opcode = 0b11, 0b0000
  1608  		case LDADDALW:
  1609  			size, opcode = 0b10, 0b0000
  1610  		case LDADDALH:
  1611  			size, opcode = 0b01, 0b0000
  1612  		case LDADDALB:
  1613  			size, opcode = 0b00, 0b0000
  1614  		case LDCLRALD:
  1615  			size, opcode = 0b11, 0b0001
  1616  		case LDCLRALW:
  1617  			size, opcode = 0b10, 0b0001
  1618  		case LDCLRALH:
  1619  			size, opcode = 0b01, 0b0001
  1620  		case LDCLRALB:
  1621  			size, opcode = 0b00, 0b0001
  1622  		case LDSETALD:
  1623  			size, opcode = 0b11, 0b0011
  1624  		case LDSETALW:
  1625  			size, opcode = 0b10, 0b0011
  1626  		case LDSETALH:
  1627  			size, opcode = 0b01, 0b0011
  1628  		case LDSETALB:
  1629  			size, opcode = 0b00, 0b0011
  1630  		case LDEORALD:
  1631  			size, opcode = 0b11, 0b0010
  1632  		case LDEORALW:
  1633  			size, opcode = 0b10, 0b0010
  1634  		case LDEORALH:
  1635  			size, opcode = 0b01, 0b0010
  1636  		case LDEORALB:
  1637  			size, opcode = 0b00, 0b0010
  1638  		case SWPALD:
  1639  			size, opcode = 0b11, 0b1000
  1640  		case SWPALW:
  1641  			size, opcode = 0b10, 0b1000
  1642  		case SWPALH:
  1643  			size, opcode = 0b01, 0b1000
  1644  		case SWPALB:
  1645  			size, opcode = 0b00, 0b1000
  1646  		}
  1647  
  1648  		buf.Append4Bytes(
  1649  			(srcReg2Bits<<5)|dstRegBits,
  1650  			(opcode<<4)|(srcReg2Bits>>3),
  1651  			0b111_00000|srcRegBits,
  1652  			(size<<6)|0b00_111_000,
  1653  		)
  1654  
  1655  	case CASALD, CASALW, CASALH, CASALB:
  1656  		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)
  1657  
  1658  		var size byte
  1659  		switch n.instruction {
  1660  		case CASALD:
  1661  			size = 0b11
  1662  		case CASALW:
  1663  			size = 0b10
  1664  		case CASALH:
  1665  			size = 0b01
  1666  		case CASALB:
  1667  			size = 0b00
  1668  		}
  1669  
  1670  		buf.Append4Bytes(
  1671  			(srcReg2Bits<<5)|dstRegBits,
  1672  			0b111111_00|(srcReg2Bits>>3),
  1673  			0b111_00000|srcRegBits,
  1674  			(size<<6)|0b00_001_000,
  1675  		)
  1676  
  1677  	default:
  1678  		return errorEncodingUnsupported(n)
  1679  	}
  1680  	return
  1681  }
  1682  
  1683  func (a *AssemblerImpl) encodeThreeRegistersToRegister(buf asm.Buffer, n *nodeImpl) error {
  1684  	switch n.instruction {
  1685  	case MSUB, MSUBW:
  1686  		// Dst = Src2 - (Src1 * Src3)
  1687  		// "Data-processing (3 source)" in:
  1688  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en
  1689  		src1RegBits, err := intRegisterBits(n.srcReg)
  1690  		if err != nil {
  1691  			return err
  1692  		}
  1693  		src2RegBits, err := intRegisterBits(n.srcReg2)
  1694  		if err != nil {
  1695  			return err
  1696  		}
  1697  		src3RegBits, err := intRegisterBits(n.dstReg)
  1698  		if err != nil {
  1699  			return err
  1700  		}
  1701  		dstRegBits, err := intRegisterBits(n.dstReg2)
  1702  		if err != nil {
  1703  			return err
  1704  		}
  1705  
  1706  		var sf byte // is zero for MSUBW (32-bit MSUB).
  1707  		if n.instruction == MSUB {
  1708  			sf = 0b1
  1709  		}
  1710  
  1711  		buf.Append4Bytes(
  1712  			(src3RegBits<<5)|dstRegBits,
  1713  			0b1_0000000|(src2RegBits<<2)|(src3RegBits>>3),
  1714  			src1RegBits,
  1715  			sf<<7|0b00_11011,
  1716  		)
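		// Illustrative example (not part of the original source): MSUB with
		// srcReg=X1, srcReg2=X2, dstReg=X3 and dstReg2=X0 emits the word
		// 0x9b018860 = MSUB X0, X3, X1, X2, i.e. X0 = X2 - (X3 * X1).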
  1717  		return nil
  1718  
  1719  	default:
  1720  		return errorEncodingUnsupported(n)
  1721  	}
  1722  }
  1723  
  1724  func (a *AssemblerImpl) encodeTwoRegistersToNone(buf asm.Buffer, n *nodeImpl) error {
  1725  	switch n.instruction {
  1726  	case CMPW, CMP:
  1727  		// CMP (register) is an alias of "SUBS XZR, src2, src1", i.e. SUBS
  1728  		// (shifted register) with a zero shift amount and XZR as the destination.
  1729  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_shift
  1730  		src1RegBits, err := intRegisterBits(n.srcReg)
  1731  		if err != nil {
  1732  			return err
  1733  		}
  1734  		src2RegBits, err := intRegisterBits(n.srcReg2)
  1735  		if err != nil {
  1736  			return err
  1737  		}
  1738  
  1739  		var op byte
  1740  		if n.instruction == CMP {
  1741  			op = 0b111
  1742  		} else {
  1743  			op = 0b011
  1744  		}
  1745  
  1746  		buf.Append4Bytes(
  1747  			(src2RegBits<<5)|zeroRegisterBits,
  1748  			src2RegBits>>3,
  1749  			src1RegBits,
  1750  			0b01011|(op<<5),
  1751  		)
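		// Illustrative example (not part of the original source): CMP with
		// srcReg=1 (X1) and srcReg2=2 (X2) emits 0x5f, 0x00, 0x01, 0xeb
		// (word 0xeb01005f) = SUBS XZR, X2, X1, i.e. CMP X2, X1.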
  1752  		return nil
  1753  	case FCMPS, FCMPD:
  1754  		// "Floating-point compare" section in:
  1755  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  1756  		src1RegBits, err := vectorRegisterBits(n.srcReg)
  1757  		if err != nil {
  1758  			return err
  1759  		}
  1760  		src2RegBits, err := vectorRegisterBits(n.srcReg2)
  1761  		if err != nil {
  1762  			return err
  1763  		}
  1764  
  1765  		var ftype byte // is zero for FCMPS (single precision float compare).
  1766  		if n.instruction == FCMPD {
  1767  			ftype = 0b01
  1768  		}
  1769  		buf.Append4Bytes(
  1770  			src2RegBits<<5,
  1771  			0b001000_00|(src2RegBits>>3),
  1772  			ftype<<6|0b1_00000|src1RegBits,
  1773  			0b000_11110,
  1774  		)
  1775  		return nil
  1776  	default:
  1777  		return errorEncodingUnsupported(n)
  1778  	}
  1779  }
  1780  
  1781  func (a *AssemblerImpl) encodeRegisterAndConstToNone(buf asm.Buffer, n *nodeImpl) error {
  1782  	if n.instruction != CMP {
  1783  		return errorEncodingUnsupported(n)
  1784  	}
  1785  
  1786  	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CMP--immediate---Compare--immediate---an-alias-of-SUBS--immediate--?lang=en
  1787  	if n.srcConst < 0 || n.srcConst > 4095 {
  1788  		return fmt.Errorf("immediate for CMP must be within 0 to 4095 but got %d", n.srcConst)
  1789  	} else if n.srcReg == RegRZR {
  1790  		return errors.New("zero register is not supported for CMP (immediate)")
  1791  	}
  1792  
  1793  	srcRegBits, err := intRegisterBits(n.srcReg)
  1794  	if err != nil {
  1795  		return err
  1796  	}
  1797  
  1798  	buf.Append4Bytes(
  1799  		(srcRegBits<<5)|zeroRegisterBits,
  1800  		(byte(n.srcConst)<<2)|(srcRegBits>>3),
  1801  		byte(n.srcConst>>6),
  1802  		0b111_10001,
  1803  	)
  1804  	return nil
  1805  }
  1806  
  1807  func fitInSigned9Bits(v int64) bool {
  1808  	return v >= -256 && v <= 255
  1809  }
  1810  
  1811  func (a *AssemblerImpl) encodeLoadOrStoreWithRegisterOffset(
  1812  	buf asm.Buffer, baseRegBits, offsetRegBits, targetRegBits byte, opcode, size, v byte,
  1813  ) {
  1814  	// See "Load/store register (register offset)".
  1815  	// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldst_regoff
  1816  	buf.Append4Bytes(
  1817  		(baseRegBits<<5)|targetRegBits,
  1818  		0b011_010_00|(baseRegBits>>3),
  1819  		opcode<<6|0b00_1_00000|offsetRegBits,
  1820  		size<<6|v<<2|0b00_111_0_00,
  1821  	)
  1822  }
  1823  
  1824  func (a *AssemblerImpl) encodeLoadOrStoreWithAcquireRelease(
  1825  	buf asm.Buffer, baseRegBits, targetRegBits byte, l, size byte,
  1826  ) {
  1827  	buf.Append4Bytes(
  1828  		(baseRegBits<<5)|targetRegBits,
  1829  		0b1_11111_00|(baseRegBits>>3),
  1830  		0b1_0_011111|l<<6,
  1831  		size<<6|0b00_001000,
  1832  	)
  1833  }
  1834  
  1835  // validateMemoryOffset checks whether the given memory offset can be encoded by this assembler.
  1836  // In theory, the offset could be arbitrary, but for the simplicity of our homemade assembler,
  1837  // we limit it to a range that is still large enough for the compiler.
  1838  func validateMemoryOffset(offset int64) error {
  1839  	if offset > 255 && offset%4 != 0 {
  1840  		// This is because we only have large offsets for load/store with Wasm value stack or reading type IDs, and its offset
  1841  		// is always multiplied by 4 or 8 (== the size of uint32 or uint64 == the type of wasm.FunctionTypeID or value stack in Go)
  1842  		return fmt.Errorf("large memory offset (>255) must be a multiple of 4 but got %d", offset)
  1843  	} else if offset < -256 { // 9-bit signed integer's minimum = -2^8.
  1844  		return fmt.Errorf("negative memory offset must be larger than or equal to -256 but got %d", offset)
  1845  	} else if offset > 1<<31-1 {
  1846  		return fmt.Errorf("large memory offset must be less than %d but got %d", 1<<31-1, offset)
  1847  	} else {
  1848  		return nil
  1849  	}
  1850  }
  1851  
  1852  // encodeLoadOrStoreWithConstOffset encodes load/store instructions with the constant offset.
  1853  //
  1854  // Note: Encoding strategy intentionally matches the Go assembler: https://go.dev/doc/asm
  1855  func (a *AssemblerImpl) encodeLoadOrStoreWithConstOffset(
  1856  	buf asm.Buffer,
  1857  	baseRegBits, targetRegBits byte,
  1858  	offset int64,
  1859  	opcode, size, v byte,
  1860  	datasize, datasizeLog2 int64,
  1861  ) (err error) {
  1862  	if err = validateMemoryOffset(offset); err != nil {
  1863  		return
  1864  	}
  1865  
  1866  	if fitInSigned9Bits(offset) {
  1867  		// See "Load/store register (unscaled immediate)"
  1868  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldapstl_unscaled
  1869  		if offset < 0 || offset%datasize != 0 {
  1870  			// This case is encoded as a single "unscaled signed-offset" load/store (LDUR/STUR class).
  1871  			buf.Append4Bytes(
  1872  				(baseRegBits<<5)|targetRegBits,
  1873  				byte(offset<<4)|(baseRegBits>>3),
  1874  				opcode<<6|(0b00_00_11111&byte(offset>>4)),
  1875  				size<<6|v<<2|0b00_1_11_0_00,
  1876  			)
  1877  			return
  1878  		}
  1879  	}
  1880  
  1881  	// At this point the offset is guaranteed to be non-negative, since negative offsets always fit
  1882  	// in signed 9 bits and were handled above. If it is also a multiple of datasize, it can be encoded as a single "unsigned immediate".
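	// Illustrative example (not part of the original source): a 64-bit load at
	// offset 32 (datasize=8) takes this path with m=4, i.e. the offset is stored
	// as the scaled unsigned immediate imm12=4.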
  1883  	if offset%datasize == 0 &&
  1884  		offset < (1<<12)<<datasizeLog2 {
  1885  		m := offset / datasize
  1886  		buf.Append4Bytes(
  1887  			(baseRegBits<<5)|targetRegBits,
  1888  			(byte(m<<2))|(baseRegBits>>3),
  1889  			opcode<<6|0b00_111111&byte(m>>6),
  1890  			size<<6|v<<2|0b00_1_11_0_01,
  1891  		)
  1892  		return
  1893  	}
  1894  
  1895  	// Otherwise, we need multiple instructions.
  1896  	tmpRegBits := registerBits(a.temporaryRegister)
  1897  	offset32 := int32(offset)
  1898  
  1899  	// Go's assembler adds a const into the const pool at this point,
  1900  	// regardless of its usage; e.g. if we enter the then block of the following if statement,
  1901  	// the const is not used but it is added into the const pool.
  1902  	c := asm.NewStaticConst(make([]byte, 4))
  1903  	binary.LittleEndian.PutUint32(c.Raw, uint32(offset))
  1904  	a.pool.AddConst(c, uint64(buf.Len()))
  1905  
  1906  	// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L3529-L3532
  1907  	// If the offset fits in 24 bits, one ADD of the high 12 bits (LSL #12) plus the load/store's scaled unsigned immediate suffices.
  1908  	hi := offset32 - (offset32 & (0xfff << uint(datasizeLog2)))
  1909  	if hi&^0xfff000 == 0 {
  1910  		var sfops byte = 0b100
  1911  		m := ((offset32 - hi) >> datasizeLog2) & 0xfff
  1912  		hi >>= 12
  1913  
  1914  		// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L3534-L3535
  1915  		buf.Append4Bytes(
  1916  			(baseRegBits<<5)|tmpRegBits,
  1917  			(byte(hi)<<2)|(baseRegBits>>3),
  1918  			0b01<<6 /* shift by 12 */ |byte(hi>>6),
  1919  			sfops<<5|0b10001,
  1920  		)
  1921  
  1922  		buf.Append4Bytes(
  1923  			(tmpRegBits<<5)|targetRegBits,
  1924  			(byte(m<<2))|(tmpRegBits>>3),
  1925  			opcode<<6|0b00_111111&byte(m>>6),
  1926  			size<<6|v<<2|0b00_1_11_0_01,
  1927  		)
  1928  	} else {
  1929  		// In this case we load the const via ldr(literal) into the temporary register,
  1930  		// and the target const is placed after this instruction below.
  1931  		loadLiteralOffsetInBinary := uint64(buf.Len())
  1932  
  1933  		// First we emit the ldr(literal) with offset zero as we don't yet know the const's placement in the binary.
  1934  		// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDR--literal---Load-Register--literal--
  1935  		buf.Append4Bytes(tmpRegBits, 0x0, 0x0, 0b00_011_0_00)
  1936  
  1937  		// Set the callback for the constant so that the offset is fixed up
  1938  		// once the constant's placement in the binary is finalized.
  1939  		c.AddOffsetFinalizedCallback(func(offsetOfConst uint64) {
  1940  			// ldr(literal) encodes offset divided by 4.
  1941  			offset := (int(offsetOfConst) - int(loadLiteralOffsetInBinary)) / 4
  1942  			bin := buf.Bytes()
  1943  			bin[loadLiteralOffsetInBinary] |= byte(offset << 5)
  1944  			bin[loadLiteralOffsetInBinary+1] |= byte(offset >> 3)
  1945  			bin[loadLiteralOffsetInBinary+2] |= byte(offset >> 11)
  1946  		})
  1947  
  1948  		// Then, load the constant with the register offset.
  1949  		// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDR--register---Load-Register--register--
  1950  		buf.Append4Bytes(
  1951  			(baseRegBits<<5)|targetRegBits,
  1952  			0b011_010_00|(baseRegBits>>3),
  1953  			opcode<<6|0b00_1_00000|tmpRegBits,
  1954  			size<<6|v<<2|0b00_111_0_00,
  1955  		)
  1956  	}
  1957  	return
  1958  }
  1959  
  1960  func (a *AssemblerImpl) encodeRegisterToMemory(buf asm.Buffer, n *nodeImpl) (err error) {
  1961  	// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldst_regoff
  1962  	var (
  1963  		size, v                byte
  1964  		datasize, datasizeLog2 int64
  1965  		isTargetFloat          bool
  1966  		isRelease              bool
  1967  	)
  1968  	switch n.instruction {
  1969  	case STRD:
  1970  		size, v, datasize, datasizeLog2 = 0b11, 0x0, 8, 3
  1971  	case STRW:
  1972  		size, v, datasize, datasizeLog2 = 0b10, 0x0, 4, 2
  1973  	case STRH:
  1974  		size, v, datasize, datasizeLog2 = 0b01, 0x0, 2, 1
  1975  	case STRB:
  1976  		size, v, datasize, datasizeLog2 = 0b00, 0x0, 1, 0
  1977  	case FSTRD:
  1978  		size, v, datasize, datasizeLog2, isTargetFloat = 0b11, 0x1, 8, 3, true
  1979  	case FSTRS:
  1980  		size, v, datasize, datasizeLog2, isTargetFloat = 0b10, 0x1, 4, 2, true
  1981  	case STLRD:
  1982  		size, isRelease = 0b11, true
  1983  	case STLRW:
  1984  		size, isRelease = 0b10, true
  1985  	case STLRH:
  1986  		size, isRelease = 0b01, true
  1987  	case STLRB:
  1988  		size, isRelease = 0b00, true
  1989  	default:
  1990  		return errorEncodingUnsupported(n)
  1991  	}
  1992  
  1993  	var srcRegBits byte
  1994  	if isTargetFloat {
  1995  		srcRegBits, err = vectorRegisterBits(n.srcReg)
  1996  	} else {
  1997  		srcRegBits, err = intRegisterBits(n.srcReg)
  1998  	}
  1999  	if err != nil {
  2000  		return
  2001  	}
  2002  
  2003  	baseRegBits, err := intRegisterBits(n.dstReg)
  2004  	if err != nil {
  2005  		return err
  2006  	}
  2007  
  2008  	if isRelease {
  2009  		a.encodeLoadOrStoreWithAcquireRelease(buf, baseRegBits, srcRegBits, 0, size)
  2010  		return nil
  2011  	}
  2012  
  2013  	const opcode = 0x00 // opcode for store instructions.
  2014  	if n.dstReg2 != asm.NilRegister {
  2015  		offsetRegBits, err := intRegisterBits(n.dstReg2)
  2016  		if err != nil {
  2017  			return err
  2018  		}
  2019  		a.encodeLoadOrStoreWithRegisterOffset(buf, baseRegBits, offsetRegBits, srcRegBits, opcode, size, v)
  2020  	} else {
  2021  		err = a.encodeLoadOrStoreWithConstOffset(buf, baseRegBits, srcRegBits, n.dstConst, opcode, size, v, datasize, datasizeLog2)
  2022  	}
  2023  	return
  2024  }
  2025  
  2026  func (a *AssemblerImpl) encodeADR(buf asm.Buffer, n *nodeImpl) (err error) {
  2027  	dstRegBits, err := intRegisterBits(n.dstReg)
  2028  	if err != nil {
  2029  		return err
  2030  	}
  2031  
  2032  	adrInstructionOffsetInBinary := uint64(buf.Len())
  2033  
  2034  	// At this point, we don't yet know the target offset to read from,
  2035  	// so we emit the ADR instruction with offset 0 and patch it later in the callback.
  2036  	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADR--Form-PC-relative-address-?lang=en
  2037  	buf.Append4Bytes(dstRegBits, 0x0, 0x0, 0b10000)
  2038  
  2039  	// In this case, the ADR's target offset is the staticConst's eventual address.
  2040  	if sc := n.staticConst; sc != nil {
  2041  		a.pool.AddConst(sc, adrInstructionOffsetInBinary)
  2042  		sc.AddOffsetFinalizedCallback(func(offsetOfConst uint64) {
  2043  			adrInstructionBytes := buf.Bytes()[adrInstructionOffsetInBinary : adrInstructionOffsetInBinary+4]
  2044  			offset := int(offsetOfConst) - int(adrInstructionOffsetInBinary)
  2045  
  2046  			// See https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADR--Form-PC-relative-address-?lang=en
  2047  			adrInstructionBytes[3] |= byte(offset & 0b00000011 << 5)
  2048  			offset >>= 2
  2049  			adrInstructionBytes[0] |= byte(offset << 5)
  2050  			offset >>= 3
  2051  			adrInstructionBytes[1] |= byte(offset)
  2052  			offset >>= 8
  2053  			adrInstructionBytes[2] |= byte(offset)
  2054  		})
  2055  		return
  2056  	}
  2057  	a.adrInstructionNodes = append(a.adrInstructionNodes, n)
  2058  	return
  2060  }
  2061  
  2062  func (a *AssemblerImpl) finalizeADRInstructionNode(code []byte, n *nodeImpl) (err error) {
  2063  	// Find the target instruction node.
  2064  	targetNode := n
  2065  	for ; targetNode != nil; targetNode = targetNode.next {
  2066  		if targetNode.instruction == n.readInstructionAddressBeforeTargetInstruction {
  2067  			targetNode = targetNode.next
  2068  			break
  2069  		}
  2070  	}
  2071  
  2072  	if targetNode == nil {
  2073  		return fmt.Errorf("BUG: target instruction %s not found for ADR", InstructionName(n.readInstructionAddressBeforeTargetInstruction))
  2074  	}
  2075  
  2076  	offset := targetNode.OffsetInBinary() - n.OffsetInBinary()
  2077  	if i64 := int64(offset); i64 >= 1<<20 || i64 < -1<<20 {
  2078  		// We could special-case offsets beyond the signed 21-bit (+/-1MiB) ADR range here,
  2079  		// but that range should be enough for our impl; if the necessity ever comes up,
  2080  		// we can add the special casing to support arbitrarily large offsets.
  2081  		return fmt.Errorf("BUG: too large offset for ADR: %#x", offset)
  2082  	}
  2083  
  2084  	adrInstructionBytes := code[n.OffsetInBinary() : n.OffsetInBinary()+4]
  2085  	// According to the binary format of ADR instruction:
  2086  	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADR--Form-PC-relative-address-?lang=en
  2087  	adrInstructionBytes[3] |= byte(offset & 0b00000011 << 5)
  2088  	offset >>= 2
  2089  	adrInstructionBytes[0] |= byte(offset << 5)
  2090  	offset >>= 3
  2091  	adrInstructionBytes[1] |= byte(offset)
  2092  	offset >>= 8
  2093  	adrInstructionBytes[2] |= byte(offset)
  2094  	return nil
  2095  }
  2096  
  2097  func (a *AssemblerImpl) encodeMemoryToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
  2098  	// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldst_regoff
  2099  	var (
  2100  		size, v, opcode        byte
  2101  		datasize, datasizeLog2 int64
  2102  		isTargetFloat          bool
  2103  		isAcquire              bool
  2104  	)
  2105  	switch n.instruction {
  2106  	case ADR:
  2107  		return a.encodeADR(buf, n)
  2108  	case FLDRD:
  2109  		size, v, datasize, datasizeLog2, opcode, isTargetFloat = 0b11, 0x1, 8, 3, 0b01, true
  2110  	case FLDRS:
  2111  		size, v, datasize, datasizeLog2, opcode, isTargetFloat = 0b10, 0x1, 4, 2, 0b01, true
  2112  	case LDRD:
  2113  		size, v, datasize, datasizeLog2, opcode = 0b11, 0x0, 8, 3, 0b01
  2114  	case LDRW:
  2115  		size, v, datasize, datasizeLog2, opcode = 0b10, 0x0, 4, 2, 0b01
  2116  	case LDRSHD:
  2117  		size, v, datasize, datasizeLog2, opcode = 0b01, 0x0, 2, 1, 0b10
  2118  	case LDRSHW:
  2119  		size, v, datasize, datasizeLog2, opcode = 0b01, 0x0, 2, 1, 0b11
  2120  	case LDRH:
  2121  		size, v, datasize, datasizeLog2, opcode = 0b01, 0x0, 2, 1, 0b01
  2122  	case LDRSBD:
  2123  		size, v, datasize, datasizeLog2, opcode = 0b00, 0x0, 1, 0, 0b10
  2124  	case LDRSBW:
  2125  		size, v, datasize, datasizeLog2, opcode = 0b00, 0x0, 1, 0, 0b11
  2126  	case LDRB:
  2127  		size, v, datasize, datasizeLog2, opcode = 0b00, 0x0, 1, 0, 0b01
  2128  	case LDRSW:
  2129  		size, v, datasize, datasizeLog2, opcode = 0b10, 0x0, 4, 2, 0b10
  2130  	case LDARD:
  2131  		size, isAcquire = 0b11, true
  2132  	case LDARW:
  2133  		size, isAcquire = 0b10, true
  2134  	case LDARH:
  2135  		size, isAcquire = 0b01, true
  2136  	case LDARB:
  2137  		size, isAcquire = 0b00, true
  2138  	default:
  2139  		return errorEncodingUnsupported(n)
  2140  	}
  2141  
  2142  	var dstRegBits byte
  2143  	if isTargetFloat {
  2144  		dstRegBits, err = vectorRegisterBits(n.dstReg)
  2145  	} else {
  2146  		dstRegBits, err = intRegisterBits(n.dstReg)
  2147  	}
  2148  	if err != nil {
  2149  		return
  2150  	}
  2151  	baseRegBits, err := intRegisterBits(n.srcReg)
  2152  	if err != nil {
  2153  		return err
  2154  	}
  2155  
  2156  	if isAcquire {
  2157  		a.encodeLoadOrStoreWithAcquireRelease(buf, baseRegBits, dstRegBits, 1, size)
  2158  		return nil
  2159  	}
  2160  
  2161  	if n.srcReg2 != asm.NilRegister {
  2162  		offsetRegBits, err := intRegisterBits(n.srcReg2)
  2163  		if err != nil {
  2164  			return err
  2165  		}
  2166  		a.encodeLoadOrStoreWithRegisterOffset(buf, baseRegBits, offsetRegBits, dstRegBits, opcode,
  2167  			size, v)
  2168  	} else {
  2169  		err = a.encodeLoadOrStoreWithConstOffset(buf, baseRegBits, dstRegBits, n.srcConst, opcode,
  2170  			size, v, datasize, datasizeLog2)
  2171  	}
  2172  	return
  2173  }
  2174  
  2175  // const16bitAligned checks whether the value consists of a single 16-bit chunk placed at
  2176  // a 16-bit-aligned position. If so, it returns the shift amount divided by 16; otherwise -1.
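// Illustrative examples (not in the original source): const16bitAligned(0xffff_0000)
// returns 1, const16bitAligned(0x1_0000_0000_0000) returns 3, and
// const16bitAligned(0x1_0001) returns -1 because the value spans two chunks.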
  2177  func const16bitAligned(v int64) (ret int) {
  2178  	ret = -1
  2179  	for s := 0; s < 64; s += 16 {
  2180  		if (uint64(v) &^ (uint64(0xffff) << uint(s))) == 0 {
  2181  			ret = s / 16
  2182  			break
  2183  		}
  2184  	}
  2185  	return
  2186  }
  2187  
  2188  // isBitMaskImmediate determines if the value can be encoded as "bitmask immediate".
  2189  //
  2190  //	Such an immediate is a 32-bit or 64-bit pattern viewed as a vector of identical elements of size e = 2, 4, 8, 16, 32, or 64 bits.
  2191  //	Each element contains the same sub-pattern: a single run of 1 to e-1 non-zero bits, rotated by 0 to e-1 bits.
  2192  //
  2193  // See https://developer.arm.com/documentation/dui0802/b/A64-General-Instructions/MOV--bitmask-immediate-
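// Illustrative examples (not in the original source): 0x0000_ffff_0000_ffff is a valid
// bitmask immediate (a 16-bit run of ones repeated in each 32-bit element), while
// 0b1010 is not, since its set bits do not form one contiguous (rotated) run.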
  2194  func isBitMaskImmediate(x uint64) bool {
  2195  	// All zeros and all ones are not "bitmask immediate" by definition.
  2196  	if x == 0 || x == 0xffff_ffff_ffff_ffff {
  2197  		return false
  2198  	}
  2199  
  2200  	switch {
  2201  	case x != x>>32|x<<32:
  2202  		// e = 64
  2203  	case x != x>>16|x<<48:
  2204  		// e = 32 (x == x>>32|x<<32).
  2205  		// e.g. 0x00ff_ff00_00ff_ff00
  2206  		x = uint64(int32(x))
  2207  	case x != x>>8|x<<56:
  2208  		// e = 16 (x == x>>16|x<<48).
  2209  		// e.g. 0x00ff_00ff_00ff_00ff
  2210  		x = uint64(int16(x))
  2211  	case x != x>>4|x<<60:
  2212  		// e = 8 (x == x>>8|x<<56).
  2213  		// e.g. 0x0f0f_0f0f_0f0f_0f0f
  2214  		x = uint64(int8(x))
  2215  	default:
  2216  		// e = 4 or 2.
  2217  		return true
  2218  	}
  2219  	return sequenceOfSetbits(x) || sequenceOfSetbits(^x)
  2220  }
  2221  
  2222  // sequenceOfSetbits returns true if the number's binary representation is a single
  2223  // contiguous sequence of set bits (1s). For example: 0b1110 -> true, 0b1010 -> false
  2224  func sequenceOfSetbits(x uint64) bool {
  2225  	y := getLowestBit(x)
  2226  	// If x is a single sequence of set bits, adding its lowest bit results in a
  2227  	// number with only one set bit (i.e. a power of two).
  2228  	y += x
  2229  	return (y-1)&y == 0
  2230  }
  2231  
  2232  func getLowestBit(x uint64) uint64 {
  2233  	// See https://stackoverflow.com/questions/12247186/find-the-lowest-set-bit
  2234  	return x & (^x + 1)
  2235  }
  2236  
  2237  func (a *AssemblerImpl) addOrSub64BitRegisters(buf asm.Buffer, sfops byte, sp bool, dstRegBits, src1RegBits, src2RegBits byte) {
  2238  	// dstReg = src1Reg +/- src2Reg
  2239  	if sp {
  2240  		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADD--extended-register---Add--extended-register--?lang=en
  2241  		buf.Append4Bytes(
  2242  			(src1RegBits<<5)|dstRegBits,
  2243  			0b011<<5|src1RegBits>>3,
  2244  			1<<5|src2RegBits,
  2245  			sfops<<5|0b01011,
  2246  		)
  2247  	} else {
  2248  		buf.Append4Bytes(
  2249  			(src1RegBits<<5)|dstRegBits,
  2250  			src1RegBits>>3,
  2251  			src2RegBits,
  2252  			sfops<<5|0b01011,
  2253  		)
  2254  	}
  2255  }
  2256  
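// bitmaskImmediate computes the immr/imms/N fields for a value already known to be a
// valid "bitmask immediate" (see isBitMaskImmediate). Illustrative example (not in the
// original source): bitmaskImmediate(0xff00, true) returns immr=56, imms=7, N=1, i.e.
// an 8-bit run of ones rotated right by 56 within a 64-bit element, which is 0xff00.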
  2257  func bitmaskImmediate(c uint64, is64bit bool) (immr, imms, N byte) {
  2258  	var size uint32
  2259  	switch {
  2260  	case c != c>>32|c<<32:
  2261  		size = 64
  2262  	case c != c>>16|c<<48:
  2263  		size = 32
  2264  		c = uint64(int32(c))
  2265  	case c != c>>8|c<<56:
  2266  		size = 16
  2267  		c = uint64(int16(c))
  2268  	case c != c>>4|c<<60:
  2269  		size = 8
  2270  		c = uint64(int8(c))
  2271  	case c != c>>2|c<<62:
  2272  		size = 4
  2273  		c = uint64(int64(c<<60) >> 60)
  2274  	default:
  2275  		size = 2
  2276  		c = uint64(int64(c<<62) >> 62)
  2277  	}
  2278  
  2279  	neg := false
  2280  	if int64(c) < 0 {
  2281  		c = ^c
  2282  		neg = true
  2283  	}
  2284  
  2285  	onesSize, nonZeroPos := getOnesSequenceSize(c)
  2286  	if neg {
  2287  		nonZeroPos = onesSize + nonZeroPos
  2288  		onesSize = size - onesSize
  2289  	}
  2290  
  2291  	var mode byte = 32
  2292  	if is64bit {
  2293  		N, mode = 0b1, 64
  2294  	}
  2295  
  2296  	immr = byte((size - nonZeroPos) & (size - 1) & uint32(mode-1))
  2297  	imms = byte((onesSize - 1) | 63&^(size<<1-1))
  2298  	return
  2299  }
  2300  
  2301  func (a *AssemblerImpl) encodeConstToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
  2302  	// Alias for readability.
  2303  	c := n.srcConst
  2304  
  2305  	dstRegBits, err := intRegisterBits(n.dstReg)
  2306  	if err != nil {
  2307  		return err
  2308  	}
  2309  
  2310  	// See "Logical (immediate)" in
  2311  	// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Immediate
  2312  	switch n.instruction {
  2313  	case ANDIMM32, ANDIMM64, ANDSW, ANDS:
  2314  		if !isBitMaskImmediate(uint64(c)) {
  2315  			err = fmt.Errorf("const %d must be valid bitmask immediate for %s", c, InstructionName(n.instruction))
  2316  			return
  2317  		}
  2318  		srcRegBits := dstRegBits
  2319  		var sf, opc, immr, imms, N byte
  2320  		switch n.instruction {
  2321  		case ANDIMM32:
  2322  			sf, opc = 0b0, 0b00
  2323  			immr, imms, N = bitmaskImmediate(uint64(c), false)
  2324  		case ANDIMM64:
  2325  			sf, opc = 0b1, 0b00
  2326  			immr, imms, N = bitmaskImmediate(uint64(c), true)
  2327  		case ANDSW:
  2328  			srcRegBits, err = intRegisterBits(n.srcReg)
  2329  			if err != nil {
  2330  				return err
  2331  			}
  2332  			sf, opc = 0b0, 0b11
  2333  			immr, imms, N = bitmaskImmediate(uint64(c), false)
  2334  		case ANDS:
  2335  			srcRegBits, err = intRegisterBits(n.srcReg)
  2336  			if err != nil {
  2337  				return err
  2338  			}
  2339  			sf, opc = 0b1, 0b11
  2340  			immr, imms, N = bitmaskImmediate(uint64(c), true)
  2341  		}
  2342  		buf.Append4Bytes(
  2343  			(srcRegBits<<5)|dstRegBits,
  2344  			imms<<2|srcRegBits>>3,
  2345  			N<<6|immr,
  2346  			sf<<7|opc<<5|0b10010,
  2347  		)
  2348  		return
  2349  	}
  2350  
  2351  	switch inst := n.instruction; inst {
  2352  	case ADD, ADDS, SUB, SUBS:
  2353  		srcRegBits := dstRegBits
  2354  		if n.srcReg != asm.NilRegister {
  2355  			srcRegBits, err = intRegisterBits(n.srcReg)
  2356  			if err != nil {
  2357  				return err
  2358  			}
  2359  		}
  2360  
  2361  		var sfops byte
  2362  		if inst == ADD {
  2363  			sfops = 0b100
  2364  		} else if inst == ADDS {
  2365  			sfops = 0b101
  2366  		} else if inst == SUB {
  2367  			sfops = 0b110
  2368  		} else if inst == SUBS {
  2369  			sfops = 0b111
  2370  		}
  2371  
  2372  		isSP := n.srcReg == RegSP || n.dstReg == RegSP
  2373  		if c == 0 {
  2374  			// If the constant equals zero, we encode it as ADD (register) with zero register.
  2375  			a.addOrSub64BitRegisters(buf, sfops, isSP, dstRegBits, srcRegBits, zeroRegisterBits)
  2376  			return
  2377  		}
  2378  
  2379  		if c >= 0 && (c <= 0xfff || (c&0xfff) == 0 && (uint64(c>>12) <= 0xfff)) {
  2380  			// If the const can be represented as "imm12" or "imm12 << 12": one instruction
  2381  			// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L2992
  2382  
  2383  			if c <= 0xfff {
  2384  				buf.Append4Bytes(
  2385  					(srcRegBits<<5)|dstRegBits,
  2386  					(byte(c)<<2)|(srcRegBits>>3),
  2387  					byte(c>>6),
  2388  					sfops<<5|0b10001,
  2389  				)
  2390  			} else {
  2391  				c >>= 12
  2392  				buf.Append4Bytes(
  2393  					(srcRegBits<<5)|dstRegBits,
  2394  					(byte(c)<<2)|(srcRegBits>>3),
  2395  					0b01<<6 /* shift by 12 */ |byte(c>>6),
  2396  					sfops<<5|0b10001,
  2397  				)
  2398  			}
  2399  			return
  2400  		}
  2401  
  2402  		if t := const16bitAligned(c); t >= 0 {
  2403  			// If the const fits in a single aligned 16-bit chunk, e.g. 0xffff, 0xffff_0000
  2404  			// or 0xffff_0000_0000_0000, we can load it into the temporary register with a single MOVZ.
  2405  			// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L4029
  2406  			tmpRegBits := registerBits(a.temporaryRegister)
  2407  
  2408  			// MOVZ $c, tmpReg with shifting.
  2409  			a.load16bitAlignedConst(buf, c>>(16*t), byte(t), tmpRegBits, false, true)
  2410  
  2411  			// ADD/SUB tmpReg, dstReg
  2412  			a.addOrSub64BitRegisters(buf, sfops, isSP, dstRegBits, srcRegBits, tmpRegBits)
  2413  			return
  2414  		} else if t := const16bitAligned(^c); t >= 0 {
  2415  			// Likewise, if the bitwise complement of the const fits in a single aligned 16-bit chunk, load it with MOVN.
  2416  			// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L4029
  2417  			tmpRegBits := registerBits(a.temporaryRegister)
  2418  
  2419  			// MOVN $c, tmpReg with shifting.
  2420  			a.load16bitAlignedConst(buf, ^c>>(16*t), byte(t), tmpRegBits, true, true)
  2421  
  2422  			// ADD/SUB tmpReg, dstReg
  2423  			a.addOrSub64BitRegisters(buf, sfops, isSP, dstRegBits, srcRegBits, tmpRegBits)
  2424  			return
  2425  		}
  2426  
  2427  		if uc := uint64(c); isBitMaskImmediate(uc) {
  2428  			// If the const can be represented as "bitmask immediate", we load it via ORR into temp register.
  2429  			// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L6570-L6583
  2430  			tmpRegBits := registerBits(a.temporaryRegister)
  2431  			// ORR $c, tmpReg
  2432  			a.loadConstViaBitMaskImmediate(buf, uc, tmpRegBits, true)
  2433  
  2434  			// ADD/SUB tmpReg, dstReg
  2435  			a.addOrSub64BitRegisters(buf, sfops, isSP, dstRegBits, srcRegBits, tmpRegBits)
  2436  			return
  2437  		}
  2438  
  2439  		// If the value fits within 24 bits, we emit two ADD/SUB (immediate) instructions.
  2440  		if 0 <= c && c <= 0xffffff && inst != SUBS && inst != ADDS {
  2441  			// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L3849-L3862
  2442  			buf.Append4Bytes(
  2443  				(dstRegBits<<5)|dstRegBits,
  2444  				(byte(c)<<2)|(dstRegBits>>3),
  2445  				byte(c&0xfff>>6),
  2446  				sfops<<5|0b10001,
  2447  			)
  2448  			c = c >> 12
  2449  			buf.Append4Bytes(
  2450  				(dstRegBits<<5)|dstRegBits,
  2451  				(byte(c)<<2)|(dstRegBits>>3),
  2452  				0b01_000000 /* shift by 12 */ |byte(c>>6),
  2453  				sfops<<5|0b10001,
  2454  			)
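			// Illustrative example (not part of the original source): c=0x12345
			// becomes ADD dst, dst, #0x345 followed by ADD dst, dst, #0x12, LSL #12.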
  2455  			return
  2456  		}
  2457  
  2458  		// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L3163-L3203
  2459  		// Otherwise we use MOVZ and MOVNs for loading const into tmpRegister.
  2460  		tmpRegBits := registerBits(a.temporaryRegister)
  2461  		a.load64bitConst(buf, c, tmpRegBits)
  2462  		a.addOrSub64BitRegisters(buf, sfops, isSP, dstRegBits, srcRegBits, tmpRegBits)
  2463  	case MOVW:
  2464  		if c == 0 {
  2465  			buf.Append4Bytes(
  2466  				(zeroRegisterBits<<5)|dstRegBits,
  2467  				zeroRegisterBits>>3,
  2468  				0b000_00000|zeroRegisterBits,
  2469  				0b0_01_01010,
  2470  			)
  2471  			return
  2472  		}
  2473  
  2474  		// Following the logic here:
  2475  		// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1637
  2476  		c32 := uint32(c)
  2477  		ic := int64(c32)
  2478  		if ic >= 0 && (ic <= 0xfff || (ic&0xfff) == 0 && (uint64(ic>>12) <= 0xfff)) {
  2479  			if isBitMaskImmediate(uint64(c)) {
  2480  				a.loadConstViaBitMaskImmediate(buf, uint64(c), dstRegBits, false)
  2481  				return
  2482  			}
  2483  		}
  2484  
  2485  		if t := const16bitAligned(int64(c32)); t >= 0 {
  2486  			// If the const fits in a single aligned 16-bit chunk, e.g. 0xffff or 0xffff_0000,
  2487  			// we can load it with a single MOVZ.
  2488  			a.load16bitAlignedConst(buf, int64(c32)>>(16*t), byte(t), dstRegBits, false, false)
  2489  		} else if t := const16bitAligned(int64(^c32)); t >= 0 {
  2490  			// Likewise, if the bitwise complement of the const fits in a single aligned 16-bit chunk, load it with MOVN.
  2491  			a.load16bitAlignedConst(buf, int64(^c32)>>(16*t), byte(t), dstRegBits, true, false)
  2492  		} else if isBitMaskImmediate(uint64(c)) {
  2493  			a.loadConstViaBitMaskImmediate(buf, uint64(c), dstRegBits, false)
  2494  		} else {
  2495  			// Otherwise, we use MOVZ and MOVK to load it.
  2496  			// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L6623-L6630
  2497  			c16 := uint16(c32)
  2498  			// MOVZ: https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ
  2499  			buf.Append4Bytes(
  2500  				(byte(c16)<<5)|dstRegBits,
  2501  				byte(c16>>3),
  2502  				1<<7|byte(c16>>11),
  2503  				0b0_10_10010,
  2504  			)
  2505  			// MOVK: https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVK
  2506  			c16 = uint16(c32 >> 16)
  2507  			if c16 != 0 {
  2508  				buf.Append4Bytes(
  2509  					(byte(c16)<<5)|dstRegBits,
  2510  					byte(c16>>3),
  2511  					1<<7|0b0_01_00000 /* shift by 16 */ |byte(c16>>11),
  2512  					0b0_11_10010,
  2513  				)
  2514  			}
  2515  		}
  2516  	case MOVD:
  2517  		// Following the logic here:
  2518  		// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1798-L1852
  2519  		if c >= 0 && (c <= 0xfff || (c&0xfff) == 0 && (uint64(c>>12) <= 0xfff)) {
  2520  			if isBitMaskImmediate(uint64(c)) {
  2521  				a.loadConstViaBitMaskImmediate(buf, uint64(c), dstRegBits, true)
  2522  				return
  2523  			}
  2524  		}
  2525  
  2526  		if t := const16bitAligned(c); t >= 0 {
  2527  			// If the const fits in a single aligned 16-bit chunk, e.g. 0xffff, 0xffff_0000
  2528  			// or 0xffff_0000_0000_0000, we can load it with a single MOVZ.
  2529  			a.load16bitAlignedConst(buf, c>>(16*t), byte(t), dstRegBits, false, true)
  2530  		} else if t := const16bitAligned(^c); t >= 0 {
  2531  			// Likewise, if the bitwise complement of the const fits in a single aligned 16-bit chunk, load it with MOVN.
  2532  			a.load16bitAlignedConst(buf, (^c)>>(16*t), byte(t), dstRegBits, true, true)
  2533  		} else if isBitMaskImmediate(uint64(c)) {
  2534  			a.loadConstViaBitMaskImmediate(buf, uint64(c), dstRegBits, true)
  2535  		} else {
  2536  			a.load64bitConst(buf, c, dstRegBits)
  2537  		}
  2538  	case LSR:
  2539  		if c == 0 {
  2540  			err = errors.New("LSR with zero constant should be optimized out")
  2541  			return
  2542  		} else if c < 0 || c > 63 {
  2543  			err = fmt.Errorf("LSR requires immediate to be within 0 to 63, but got %d", c)
  2544  			return
  2545  		}
  2546  
  2547  		// LSR(immediate) is an alias of UBFM
  2548  		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LSR--immediate---Logical-Shift-Right--immediate---an-alias-of-UBFM-?lang=en
  2549  		buf.Append4Bytes(
  2550  			(dstRegBits<<5)|dstRegBits,
  2551  			0b111111_00|dstRegBits>>3,
  2552  			0b01_000000|byte(c),
  2553  			0b110_10011,
  2554  		)
  2555  	case LSL:
  2556  		if c == 0 {
  2557  			err = errors.New("LSL with zero constant should be optimized out")
  2558  			return
  2559  		} else if c < 0 || c > 63 {
  2560  			err = fmt.Errorf("LSL requires immediate to be within 0 to 63, but got %d", c)
  2561  			return
  2562  		}
  2563  
  2564  		// LSL(immediate) is an alias of UBFM
  2565  		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LSL--immediate---Logical-Shift-Left--immediate---an-alias-of-UBFM-
  2566  		cb := byte(c)
  2567  		buf.Append4Bytes(
  2568  			(dstRegBits<<5)|dstRegBits,
  2569  			(0b111111-cb)<<2|dstRegBits>>3,
  2570  			0b01_000000|(64-cb),
  2571  			0b110_10011,
  2572  		)
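		// Illustrative example (not part of the original source): LSL with c=3 and
		// dst=0 (X0) emits the word 0xd37df000 = UBFM X0, X0, #61, #60, i.e. LSL X0, X0, #3.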
  2573  
  2574  	default:
  2575  		return errorEncodingUnsupported(n)
  2576  	}
  2577  	return
  2578  }
  2579  
  2580  func (a *AssemblerImpl) movk(buf asm.Buffer, v uint64, shiftNum int, dstRegBits byte) {
  2581  	// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVK
  2582  	buf.Append4Bytes(
  2583  		(byte(v)<<5)|dstRegBits,
  2584  		byte(v>>3),
  2585  		1<<7|byte(shiftNum)<<5|(0b000_11111&byte(v>>11)),
  2586  		0b1_11_10010,
  2587  	)
  2588  }
  2589  
  2590  func (a *AssemblerImpl) movz(buf asm.Buffer, v uint64, shiftNum int, dstRegBits byte) {
  2591  	// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ
  2592  	buf.Append4Bytes(
  2593  		(byte(v)<<5)|dstRegBits,
  2594  		byte(v>>3),
  2595  		1<<7|byte(shiftNum)<<5|(0b000_11111&byte(v>>11)),
  2596  		0b1_10_10010,
  2597  	)
  2598  }
  2599  
  2600  func (a *AssemblerImpl) movn(buf asm.Buffer, v uint64, shiftNum int, dstRegBits byte) {
  2601  	// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN
  2602  	buf.Append4Bytes(
  2603  		(byte(v)<<5)|dstRegBits,
  2604  		byte(v>>3),
  2605  		1<<7|byte(shiftNum)<<5|(0b000_11111&byte(v>>11)),
  2606  		0b1_00_10010,
  2607  	)
  2608  }
  2609  
  2610  // load64bitConst loads a 64-bit constant into the register, following the same logic to decide how to load large 64-bit
  2611  // consts as in the Go assembler.
  2612  //
  2613  // See https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L6632-L6759
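//
// Illustrative example (not in the original source): c=0x1234_0000_5678 has two zero
// 16-bit chunks, so it is loaded as MOVZ dst, #0x5678 followed by MOVK dst, #0x1234, LSL #32.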
  2614  func (a *AssemblerImpl) load64bitConst(buf asm.Buffer, c int64, dstRegBits byte) {
  2615  	var bits [4]uint64
  2616  	var zeros, negs int
  2617  	for i := 0; i < 4; i++ {
  2618  		bits[i] = uint64((c >> uint(i*16)) & 0xffff)
  2619  		if v := bits[i]; v == 0 {
  2620  			zeros++
  2621  		} else if v == 0xffff {
  2622  			negs++
  2623  		}
  2624  	}
  2625  
  2626  	if zeros == 3 {
  2627  		// one MOVZ instruction.
  2628  		for i, v := range bits {
  2629  			if v != 0 {
  2630  				a.movz(buf, v, i, dstRegBits)
  2631  			}
  2632  		}
  2633  	} else if negs == 3 {
  2634  		// one MOVN instruction.
  2635  		for i, v := range bits {
  2636  			if v != 0xffff {
  2637  				v = ^v
  2638  				a.movn(buf, v, i, dstRegBits)
  2639  			}
  2640  		}
  2641  	} else if zeros == 2 {
  2642  		// one MOVZ then one MOVK.
  2643  		var movz bool
  2644  		for i, v := range bits {
  2645  			if !movz && v != 0 { // MOVZ.
  2646  				// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ
  2647  				a.movz(buf, v, i, dstRegBits)
  2648  				movz = true
  2649  			} else if v != 0 {
  2650  				a.movk(buf, v, i, dstRegBits)
  2651  			}
  2652  		}
  2653  
  2654  	} else if negs == 2 {
  2655  		// one MOVN then one MOVK.
  2656  		var movn bool
  2657  		for i, v := range bits { // Emit MOVN.
  2658  			if !movn && v != 0xffff {
  2659  				v = ^v
  2660  				// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN
  2661  				a.movn(buf, v, i, dstRegBits)
  2662  				movn = true
  2663  			} else if v != 0xffff {
  2664  				a.movk(buf, v, i, dstRegBits)
  2665  			}
  2666  		}
  2667  
  2668  	} else if zeros == 1 {
  2669  		// one MOVZ then two MOVK.
  2670  		var movz bool
  2671  		for i, v := range bits {
  2672  			if !movz && v != 0 { // MOVZ.
  2673  				// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ
  2674  				a.movz(buf, v, i, dstRegBits)
  2675  				movz = true
  2676  			} else if v != 0 {
  2677  				a.movk(buf, v, i, dstRegBits)
  2678  			}
  2679  		}
  2680  
  2681  	} else if negs == 1 {
  2682  		// one MOVN then two MOVK.
  2683  		var movn bool
  2684  		for i, v := range bits { // Emit MOVN.
  2685  			if !movn && v != 0xffff {
  2686  				v = ^v
  2687  				// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN
  2688  				a.movn(buf, v, i, dstRegBits)
  2689  				movn = true
  2690  			} else if v != 0xffff {
  2691  				a.movk(buf, v, i, dstRegBits)
  2692  			}
  2693  		}
  2694  
  2695  	} else {
  2696  		// one MOVZ then three MOVK.
  2697  		var movz bool
  2698  		for i, v := range bits {
  2699  			if !movz && v != 0 { // MOVZ.
  2700  				// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ
  2701  				a.movz(buf, v, i, dstRegBits)
  2702  				movz = true
  2703  			} else if v != 0 {
  2704  				a.movk(buf, v, i, dstRegBits)
  2705  			}
  2706  		}
  2707  
  2708  	}
  2709  }
  2710  
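// load16bitAlignedConst emits a single MOVZ (or MOVN when reverse is true) loading the
// 16-bit value c shifted left by 16*shiftNum bits. Illustrative example (not in the
// original source): load16bitAlignedConst(buf, 0xffff, 1, regBits, false, true) emits
// MOVZ Xd, #0xffff, LSL #16.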
  2711  func (a *AssemblerImpl) load16bitAlignedConst(buf asm.Buffer, c int64, shiftNum byte, regBits byte, reverse bool, dst64bit bool) {
  2712  	var lastByte byte
  2713  	if reverse {
  2714  		// MOVN: https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN
  2715  		lastByte = 0b0_00_10010
  2716  	} else {
  2717  		// MOVZ: https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ
  2718  		lastByte = 0b0_10_10010
  2719  	}
  2720  	if dst64bit {
  2721  		lastByte |= 0b1 << 7
  2722  	}
  2723  	buf.Append4Bytes(
  2724  		(byte(c)<<5)|regBits,
  2725  		byte(c>>3),
  2726  		1<<7|(shiftNum<<5)|byte(c>>11),
  2727  		lastByte,
  2728  	)
  2729  }
  2730  
  2731  // loadConstViaBitMaskImmediate loads the constant with ORR (bitmask immediate).
  2732  // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ORR--immediate---Bitwise-OR--immediate--?lang=en
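//
// Illustrative example (not in the original source): c=0x0000_ffff_0000_ffff with
// dst64bit=true is emitted as a single ORR dst, xzr, #0x0000ffff0000ffff (a 16-bit
// run of ones repeated in each 32-bit element).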
  2733  func (a *AssemblerImpl) loadConstViaBitMaskImmediate(buf asm.Buffer, c uint64, regBits byte, dst64bit bool) {
  2734  	var size uint32
  2735  	switch {
  2736  	case c != c>>32|c<<32:
  2737  		size = 64
  2738  	case c != c>>16|c<<48:
  2739  		size = 32
  2740  		c = uint64(int32(c))
  2741  	case c != c>>8|c<<56:
  2742  		size = 16
  2743  		c = uint64(int16(c))
  2744  	case c != c>>4|c<<60:
  2745  		size = 8
  2746  		c = uint64(int8(c))
  2747  	case c != c>>2|c<<62:
  2748  		size = 4
  2749  		c = uint64(int64(c<<60) >> 60)
  2750  	default:
  2751  		size = 2
  2752  		c = uint64(int64(c<<62) >> 62)
  2753  	}
  2754  
  2755  	neg := false
  2756  	if int64(c) < 0 {
  2757  		c = ^c
  2758  		neg = true
  2759  	}
  2760  
  2761  	onesSize, nonZeroPos := getOnesSequenceSize(c)
  2762  	if neg {
  2763  		nonZeroPos = onesSize + nonZeroPos
  2764  		onesSize = size - onesSize
  2765  	}
  2766  
  2767  	// See the following article for understanding the encoding.
  2768  	// https://dinfuehr.github.io/blog/encoding-of-immediate-values-on-aarch64/
  2769  	var n byte
  2770  	mode := 32
  2771  	if dst64bit && size == 64 {
  2772  		n = 0b1
  2773  		mode = 64
  2774  	}
  2775  
  2776  	r := byte((size - nonZeroPos) & (size - 1) & uint32(mode-1))
  2777  	s := byte((onesSize - 1) | 63&^(size<<1-1))
  2778  
  2779  	var sf byte
  2780  	if dst64bit {
  2781  		sf = 0b1
  2782  	}
  2783  	buf.Append4Bytes(
  2784  		(zeroRegisterBits<<5)|regBits,
  2785  		s<<2|(zeroRegisterBits>>3),
  2786  		n<<6|r,
  2787  		sf<<7|0b0_01_10010,
  2788  	)
  2789  }
  2790  
  2791  func getOnesSequenceSize(x uint64) (size, nonZeroPos uint32) {
  2792  	// Take 0b00111000 for example:
  2793  	y := getLowestBit(x)               // = 0b0000100
  2794  	nonZeroPos = setBitPos(y)          // = 2
  2795  	size = setBitPos(x+y) - nonZeroPos // = setBitPos(0b0100000) - 2 = 5 - 2 = 3
  2796  	return
  2797  }
  2798  
  2799  func setBitPos(x uint64) (ret uint32) {
  2800  	for ; ; ret++ {
  2801  		if x == 0b1 {
  2802  			break
  2803  		}
  2804  		x = x >> 1
  2805  	}
  2806  	return
  2807  }
  2808  
  2809  func checkArrangementIndexPair(arr VectorArrangement, index VectorIndex) (err error) {
  2810  	if arr == VectorArrangementNone {
  2811  		return nil
  2812  	}
  2813  	var valid bool
  2814  	switch arr {
  2815  	case VectorArrangement8B:
  2816  		valid = index < 8
  2817  	case VectorArrangement16B:
  2818  		valid = index < 16
  2819  	case VectorArrangement4H:
  2820  		valid = index < 4
  2821  	case VectorArrangement8H:
  2822  		valid = index < 8
  2823  	case VectorArrangement2S:
  2824  		valid = index < 2
  2825  	case VectorArrangement4S:
  2826  		valid = index < 4
  2827  	case VectorArrangement1D:
  2828  		valid = index < 1
  2829  	case VectorArrangement2D:
  2830  		valid = index < 2
  2831  	case VectorArrangementB:
  2832  		valid = index < 16
  2833  	case VectorArrangementH:
  2834  		valid = index < 8
  2835  	case VectorArrangementS:
  2836  		valid = index < 4
  2837  	case VectorArrangementD:
  2838  		valid = index < 2
  2839  	}
  2840  	if !valid {
  2841  		err = fmt.Errorf("invalid arrangement and index pair: %s[%d]", arr, index)
  2842  	}
  2843  	return
  2844  }
  2845  
  2846  func (a *AssemblerImpl) encodeMemoryToVectorRegister(buf asm.Buffer, n *nodeImpl) (err error) {
  2847  	srcBaseRegBits, err := intRegisterBits(n.srcReg)
  2848  	if err != nil {
  2849  		return err
  2850  	}
  2851  
  2852  	dstVectorRegBits, err := vectorRegisterBits(n.dstReg)
  2853  	if err != nil {
  2854  		return err
  2855  	}
  2856  
  2857  	switch n.instruction {
  2858  	case VMOV: // translated as LDR(immediate,SIMD&FP)
  2859  		// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/LDR--immediate--SIMD-FP---Load-SIMD-FP-Register--immediate-offset--?lang=en
  2860  		var size, opcode byte
  2861  		var dataSize, dataSizeLog2 int64
  2862  		switch n.vectorArrangement {
  2863  		case VectorArrangementB:
  2864  			size, opcode, dataSize, dataSizeLog2 = 0b00, 0b01, 1, 0
  2865  		case VectorArrangementH:
  2866  			size, opcode, dataSize, dataSizeLog2 = 0b01, 0b01, 2, 1
  2867  		case VectorArrangementS:
  2868  			size, opcode, dataSize, dataSizeLog2 = 0b10, 0b01, 4, 2
  2869  		case VectorArrangementD:
  2870  			size, opcode, dataSize, dataSizeLog2 = 0b11, 0b01, 8, 3
  2871  		case VectorArrangementQ:
  2872  			size, opcode, dataSize, dataSizeLog2 = 0b00, 0b11, 16, 4
  2873  		}
  2874  		const v = 1 // v as in https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldst_pos
  2875  		if n.srcReg2 != asm.NilRegister {
  2876  			offsetRegBits, err := intRegisterBits(n.srcReg2)
  2877  			if err != nil {
  2878  				return err
  2879  			}
  2880  			a.encodeLoadOrStoreWithRegisterOffset(buf, srcBaseRegBits, offsetRegBits, dstVectorRegBits, opcode, size, v)
  2881  		} else {
  2882  			err = a.encodeLoadOrStoreWithConstOffset(buf, srcBaseRegBits, dstVectorRegBits,
  2883  				n.srcConst, opcode, size, v, dataSize, dataSizeLog2)
  2884  		}
  2885  	case LD1R:
  2886  		if n.srcReg2 != asm.NilRegister || n.srcConst != 0 {
  2887  			return fmt.Errorf("offset for %s is not implemented", InstructionName(LD1R))
  2888  		}
  2889  
  2890  		var size, q byte
  2891  		switch n.vectorArrangement {
  2892  		case VectorArrangement8B:
  2893  			size, q = 0b00, 0b0
  2894  		case VectorArrangement16B:
  2895  			size, q = 0b00, 0b1
  2896  		case VectorArrangement4H:
  2897  			size, q = 0b01, 0b0
  2898  		case VectorArrangement8H:
  2899  			size, q = 0b01, 0b1
  2900  		case VectorArrangement2S:
  2901  			size, q = 0b10, 0b0
  2902  		case VectorArrangement4S:
  2903  			size, q = 0b10, 0b1
  2904  		case VectorArrangement1D:
  2905  			size, q = 0b11, 0b0
  2906  		case VectorArrangement2D:
  2907  			size, q = 0b11, 0b1
  2908  		}
  2909  
  2910  		// No offset encoding.
  2911  		// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/LD1R--Load-one-single-element-structure-and-Replicate-to-all-lanes--of-one-register--?lang=en#iclass_as_post_index
  2912  		buf.Append4Bytes(
  2913  			(srcBaseRegBits<<5)|dstVectorRegBits,
  2914  			0b11_000000|size<<2|srcBaseRegBits>>3,
  2915  			0b01_000000,
  2916  			q<<6|0b1101,
  2917  		)
  2918  	default:
  2919  		return errorEncodingUnsupported(n)
  2920  	}
  2921  	return
  2922  }
  2923  
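        // arrangementSizeQ returns the "size" and "Q" fields shared by many Advanced SIMD
        // encodings for the given vector arrangement, e.g. VectorArrangement4S yields
        // size=0b10, q=1.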
  2924  func arrangementSizeQ(arr VectorArrangement) (size, q byte) {
  2925  	switch arr {
  2926  	case VectorArrangement8B:
  2927  		size, q = 0b00, 0
  2928  	case VectorArrangement16B:
  2929  		size, q = 0b00, 1
  2930  	case VectorArrangement4H:
  2931  		size, q = 0b01, 0
  2932  	case VectorArrangement8H:
  2933  		size, q = 0b01, 1
  2934  	case VectorArrangement2S:
  2935  		size, q = 0b10, 0
  2936  	case VectorArrangement4S:
  2937  		size, q = 0b10, 1
  2938  	case VectorArrangement1D:
  2939  		size, q = 0b11, 0
  2940  	case VectorArrangement2D:
  2941  		size, q = 0b11, 1
  2942  	}
  2943  	return
  2944  }
  2945  
  2946  func (a *AssemblerImpl) encodeVectorRegisterToMemory(buf asm.Buffer, n *nodeImpl) (err error) {
  2947  	srcVectorRegBits, err := vectorRegisterBits(n.srcReg)
  2948  	if err != nil {
  2949  		return err
  2950  	}
  2951  
  2952  	dstBaseRegBits, err := intRegisterBits(n.dstReg)
  2953  	if err != nil {
  2954  		return err
  2955  	}
  2956  
  2957  	switch n.instruction {
  2958  	case VMOV: // translated as STR (immediate, SIMD&FP)
  2959  		// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/STR--immediate--SIMD-FP---Store-SIMD-FP-register--immediate-offset--
  2960  		var size, opcode byte
  2961  		var dataSize, dataSizeLog2 int64
  2962  		switch n.vectorArrangement {
  2963  		case VectorArrangementB:
  2964  			size, opcode, dataSize, dataSizeLog2 = 0b00, 0b00, 1, 0
  2965  		case VectorArrangementH:
  2966  			size, opcode, dataSize, dataSizeLog2 = 0b01, 0b00, 2, 1
  2967  		case VectorArrangementS:
  2968  			size, opcode, dataSize, dataSizeLog2 = 0b10, 0b00, 4, 2
  2969  		case VectorArrangementD:
  2970  			size, opcode, dataSize, dataSizeLog2 = 0b11, 0b00, 8, 3
  2971  		case VectorArrangementQ:
  2972  			size, opcode, dataSize, dataSizeLog2 = 0b00, 0b10, 16, 4
  2973  		}
  2974  		const v = 1 // v as in https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldst_pos
  2975  
  2976  		if n.dstReg2 != asm.NilRegister {
  2977  			offsetRegBits, err := intRegisterBits(n.dstReg2)
  2978  			if err != nil {
  2979  				return err
  2980  			}
  2981  			a.encodeLoadOrStoreWithRegisterOffset(buf, dstBaseRegBits, offsetRegBits, srcVectorRegBits, opcode, size, v)
  2982  		} else {
  2983  			err = a.encodeLoadOrStoreWithConstOffset(buf, dstBaseRegBits, srcVectorRegBits,
  2984  				n.dstConst, opcode, size, v, dataSize, dataSizeLog2)
  2985  		}
  2986  	default:
  2987  		return errorEncodingUnsupported(n)
  2988  	}
  2989  	return
  2990  }
  2991  
  2992  func (a *AssemblerImpl) encodeStaticConstToVectorRegister(buf asm.Buffer, n *nodeImpl) (err error) {
  2993  	if n.instruction != VMOV {
  2994  		return errorEncodingUnsupported(n)
  2995  	}
  2996  
  2997  	dstRegBits, err := vectorRegisterBits(n.dstReg)
  2998  	if err != nil {
  2999  		return err
  3000  	}
  3001  
  3002  	// LDR (literal, SIMD&FP)
  3003  	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/LDR--literal--SIMD-FP---Load-SIMD-FP-Register--PC-relative-literal--
  3004  	var opc byte
  3005  	var constLength int
  3006  	switch n.vectorArrangement {
  3007  	case VectorArrangementS:
  3008  		opc, constLength = 0b00, 4
  3009  	case VectorArrangementD:
  3010  		opc, constLength = 0b01, 8
  3011  	case VectorArrangementQ:
  3012  		opc, constLength = 0b10, 16
  3013  	}
  3014  
  3015  	loadLiteralOffsetInBinary := uint64(buf.Len())
  3016  	a.pool.AddConst(n.staticConst, loadLiteralOffsetInBinary)
  3017  
  3018  	if len(n.staticConst.Raw) != constLength {
  3019  		return fmt.Errorf("invalid const length for %s: want %d but was %d",
  3020  			n.vectorArrangement, constLength, len(n.staticConst.Raw))
  3021  	}
  3022  
  3023  	buf.Append4Bytes(dstRegBits, 0x0, 0x0, opc<<6|0b11100)
  3024  	n.staticConst.AddOffsetFinalizedCallback(func(offsetOfConst uint64) {
  3025  		// LDR (literal, SIMD&FP) encodes offset divided by 4.
  3026  		offset := (int(offsetOfConst) - int(loadLiteralOffsetInBinary)) / 4
  3027  		bin := buf.Bytes()
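        		// The 19-bit immediate occupies bits 23:5, so patch it across the low three bytes.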
  3028  		bin[loadLiteralOffsetInBinary] |= byte(offset << 5)
  3029  		bin[loadLiteralOffsetInBinary+1] |= byte(offset >> 3)
  3030  		bin[loadLiteralOffsetInBinary+2] |= byte(offset >> 11)
  3031  	})
  3032  	return
  3033  }
  3034  
  3035  // advancedSIMDTwoRegisterMisc holds information to encode instructions as "Advanced SIMD two-register miscellaneous" in
  3036  // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3037  var advancedSIMDTwoRegisterMisc = map[asm.Instruction]struct {
  3038  	qAndSize  map[VectorArrangement]qAndSize
  3039  	u, opcode byte
  3040  }{
  3041  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/NOT--Bitwise-NOT--vector--?lang=en
  3042  	NOT: {
  3043  		u: 0b1, opcode: 0b00101,
  3044  		qAndSize: map[VectorArrangement]qAndSize{
  3045  			VectorArrangement16B: {size: 0b00, q: 0b1},
  3046  			VectorArrangement8B:  {size: 0b00, q: 0b0},
  3047  		},
  3048  	},
  3049  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FNEG--vector---Floating-point-Negate--vector--?lang=en
  3050  	VFNEG: {
  3051  		u: 0b1, opcode: 0b01111,
  3052  		qAndSize: map[VectorArrangement]qAndSize{
  3053  			VectorArrangement4S: {size: 0b10, q: 0b1},
  3054  			VectorArrangement2S: {size: 0b10, q: 0b0},
  3055  			VectorArrangement2D: {size: 0b11, q: 0b1},
  3056  		},
  3057  	},
  3058  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FABS--vector---Floating-point-Absolute-value--vector--?lang=en
  3059  	VFABS: {u: 0, opcode: 0b01111, qAndSize: map[VectorArrangement]qAndSize{
  3060  		VectorArrangement2D: {size: 0b11, q: 0b1},
  3061  		VectorArrangement4S: {size: 0b10, q: 0b1},
  3062  		VectorArrangement2S: {size: 0b10, q: 0b0},
  3063  	}},
  3064  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FSQRT--vector---Floating-point-Square-Root--vector--?lang=en
  3065  	VFSQRT: {u: 1, opcode: 0b11111, qAndSize: map[VectorArrangement]qAndSize{
  3066  		VectorArrangement2D: {size: 0b11, q: 0b1},
  3067  		VectorArrangement4S: {size: 0b10, q: 0b1},
  3068  		VectorArrangement2S: {size: 0b10, q: 0b0},
  3069  	}},
  3070  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTM--vector---Floating-point-Round-to-Integral--toward-Minus-infinity--vector--?lang=en
  3071  	VFRINTM: {u: 0, opcode: 0b11001, qAndSize: map[VectorArrangement]qAndSize{
  3072  		VectorArrangement2D: {size: 0b01, q: 0b1},
  3073  		VectorArrangement4S: {size: 0b00, q: 0b1},
  3074  		VectorArrangement2S: {size: 0b00, q: 0b0},
  3075  	}},
  3076  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTN--vector---Floating-point-Round-to-Integral--to-nearest-with-ties-to-even--vector--?lang=en
  3077  	VFRINTN: {u: 0, opcode: 0b11000, qAndSize: map[VectorArrangement]qAndSize{
  3078  		VectorArrangement2D: {size: 0b01, q: 0b1},
  3079  		VectorArrangement4S: {size: 0b00, q: 0b1},
  3080  		VectorArrangement2S: {size: 0b00, q: 0b0},
  3081  	}},
  3082  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTP--vector---Floating-point-Round-to-Integral--toward-Plus-infinity--vector--?lang=en
  3083  	VFRINTP: {u: 0, opcode: 0b11000, qAndSize: map[VectorArrangement]qAndSize{
  3084  		VectorArrangement2D: {size: 0b11, q: 0b1},
  3085  		VectorArrangement4S: {size: 0b10, q: 0b1},
  3086  		VectorArrangement2S: {size: 0b10, q: 0b0},
  3087  	}},
  3088  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTZ--vector---Floating-point-Round-to-Integral--toward-Zero--vector--?lang=en
  3089  	VFRINTZ: {u: 0, opcode: 0b11001, qAndSize: map[VectorArrangement]qAndSize{
  3090  		VectorArrangement2D: {size: 0b11, q: 0b1},
  3091  		VectorArrangement4S: {size: 0b10, q: 0b1},
  3092  		VectorArrangement2S: {size: 0b10, q: 0b0},
  3093  	}},
  3094  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CNT--Population-Count-per-byte-?lang=en
  3095  	VCNT: {u: 0b0, opcode: 0b00101, qAndSize: map[VectorArrangement]qAndSize{
  3096  		VectorArrangement8B:  {size: 0b00, q: 0b0},
  3097  		VectorArrangement16B: {size: 0b00, q: 0b1},
  3098  	}},
  3099  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/NEG--vector---Negate--vector--?lang=en
  3100  	VNEG: {u: 0b1, opcode: 0b01011, qAndSize: defaultQAndSize},
  3101  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ABS--Absolute-value--vector--?lang=en
  3102  	VABS: {u: 0b0, opcode: 0b01011, qAndSize: defaultQAndSize},
  3103  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/REV64--Reverse-elements-in-64-bit-doublewords--vector--?lang=en
  3104  	REV64: {u: 0b0, opcode: 0b00000, qAndSize: defaultQAndSize},
  3105  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/XTN--XTN2--Extract-Narrow-?lang=en
  3106  	XTN: {u: 0b0, opcode: 0b10010, qAndSize: map[VectorArrangement]qAndSize{
  3107  		VectorArrangement2D: {q: 0, size: 0b10},
  3108  		VectorArrangement4S: {q: 0, size: 0b01},
  3109  		VectorArrangement8H: {q: 0, size: 0b00},
  3110  	}},
  3111  	SHLL: {u: 0b1, opcode: 0b10011, qAndSize: map[VectorArrangement]qAndSize{
  3112  		VectorArrangement8B: {q: 0b00, size: 0b00},
  3113  		VectorArrangement4H: {q: 0b00, size: 0b01},
  3114  		VectorArrangement2S: {q: 0b00, size: 0b10},
  3115  	}},
  3116  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMEQ--zero---Compare-bitwise-Equal-to-zero--vector--?lang=en
  3117  	CMEQZERO: {u: 0b0, opcode: 0b01001, qAndSize: defaultQAndSize},
  3118  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SADDLP--Signed-Add-Long-Pairwise-?lang=en
  3119  	SADDLP: {u: 0b0, opcode: 0b00010, qAndSize: defaultQAndSize},
  3120  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UADDLP--Unsigned-Add-Long-Pairwise-?lang=en
  3121  	UADDLP: {u: 0b1, opcode: 0b00010, qAndSize: defaultQAndSize},
  3122  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTZS--vector--integer---Floating-point-Convert-to-Signed-integer--rounding-toward-Zero--vector--?lang=en
  3123  	VFCVTZS: {u: 0b0, opcode: 0b11011, qAndSize: map[VectorArrangement]qAndSize{
  3124  		VectorArrangement4S: {size: 0b10, q: 0b1},
  3125  		VectorArrangement2S: {size: 0b10, q: 0b0},
  3126  		VectorArrangement2D: {size: 0b11, q: 0b1},
  3127  	}},
  3128  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTZU--vector--integer---Floating-point-Convert-to-Unsigned-integer--rounding-toward-Zero--vector--?lang=en
  3129  	VFCVTZU: {u: 0b1, opcode: 0b11011, qAndSize: map[VectorArrangement]qAndSize{
  3130  		VectorArrangement4S: {size: 0b10, q: 0b1},
  3131  		VectorArrangement2S: {size: 0b10, q: 0b0},
  3132  		VectorArrangement2D: {size: 0b11, q: 0b1},
  3133  	}},
  3134  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQXTN--SQXTN2--Signed-saturating-extract-Narrow-?lang=en
  3135  	SQXTN: {u: 0b0, opcode: 0b10100, qAndSize: map[VectorArrangement]qAndSize{
  3136  		VectorArrangement8B: {q: 0b0, size: 0b00},
  3137  		VectorArrangement4H: {q: 0b0, size: 0b01},
  3138  		VectorArrangement2S: {q: 0b0, size: 0b10},
  3139  	}},
  3140  
  3141  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQXTN--SQXTN2--Signed-saturating-extract-Narrow-?lang=en
  3142  	SQXTN2: {u: 0b0, opcode: 0b10100, qAndSize: map[VectorArrangement]qAndSize{
  3143  		VectorArrangement16B: {q: 0b1, size: 0b00},
  3144  		VectorArrangement8H:  {q: 0b1, size: 0b01},
  3145  		VectorArrangement4S:  {q: 0b1, size: 0b10},
  3146  	}},
  3147  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UQXTN--UQXTN2--Unsigned-saturating-extract-Narrow-?lang=en
  3148  	UQXTN: {u: 0b1, opcode: 0b10100, qAndSize: defaultQAndSize},
  3149  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQXTUN--SQXTUN2--Signed-saturating-extract-Unsigned-Narrow-?lang=en
  3150  	SQXTUN: {u: 0b1, opcode: 0b10010, qAndSize: map[VectorArrangement]qAndSize{
  3151  		VectorArrangement8B: {q: 0b0, size: 0b00},
  3152  		VectorArrangement4H: {q: 0b0, size: 0b01},
  3153  		VectorArrangement2S: {q: 0b0, size: 0b10},
  3154  	}},
  3155  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQXTUN--SQXTUN2--Signed-saturating-extract-Unsigned-Narrow-?lang=en
  3156  	SQXTUN2: {u: 0b1, opcode: 0b10010, qAndSize: map[VectorArrangement]qAndSize{
  3157  		VectorArrangement16B: {q: 0b1, size: 0b00},
  3158  		VectorArrangement8H:  {q: 0b1, size: 0b01},
  3159  		VectorArrangement4S:  {q: 0b1, size: 0b10},
  3160  	}},
  3161  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SCVTF--vector--integer---Signed-integer-Convert-to-Floating-point--vector--?lang=en
  3162  	VSCVTF: {u: 0b0, opcode: 0b11101, qAndSize: map[VectorArrangement]qAndSize{
  3163  		VectorArrangement2D: {q: 0b1, size: 0b01},
  3164  		VectorArrangement4S: {q: 0b1, size: 0b00},
  3165  		VectorArrangement2S: {q: 0b0, size: 0b00},
  3166  	}},
  3167  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UCVTF--vector--integer---Unsigned-integer-Convert-to-Floating-point--vector--?lang=en
  3168  	VUCVTF: {u: 0b1, opcode: 0b11101, qAndSize: map[VectorArrangement]qAndSize{
  3169  		VectorArrangement2D: {q: 0b1, size: 0b01},
  3170  		VectorArrangement4S: {q: 0b1, size: 0b00},
  3171  		VectorArrangement2S: {q: 0b0, size: 0b00},
  3172  	}},
  3173  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTL--FCVTL2--Floating-point-Convert-to-higher-precision-Long--vector--?lang=en
  3174  	FCVTL: {u: 0b0, opcode: 0b10111, qAndSize: map[VectorArrangement]qAndSize{
  3175  		VectorArrangement2S: {size: 0b01, q: 0b0},
  3176  		VectorArrangement4H: {size: 0b00, q: 0b0},
  3177  	}},
  3178  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTN--FCVTN2--Floating-point-Convert-to-lower-precision-Narrow--vector--?lang=en
  3179  	FCVTN: {u: 0b0, opcode: 0b10110, qAndSize: map[VectorArrangement]qAndSize{
  3180  		VectorArrangement2S: {size: 0b01, q: 0b0},
  3181  		VectorArrangement4H: {size: 0b00, q: 0b0},
  3182  	}},
  3183  }
  3184  
  3185  // advancedSIMDThreeDifferent holds information to encode instructions as "Advanced SIMD three different" in
  3186  // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3187  var advancedSIMDThreeDifferent = map[asm.Instruction]struct {
  3188  	qAndSize  map[VectorArrangement]qAndSize
  3189  	u, opcode byte
  3190  }{
  3191  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMLAL--UMLAL2--vector---Unsigned-Multiply-Add-Long--vector--?lang=en
  3192  	VUMLAL: {u: 0b1, opcode: 0b1000, qAndSize: map[VectorArrangement]qAndSize{
  3193  		VectorArrangement2S: {q: 0b0, size: 0b10},
  3194  		VectorArrangement4H: {q: 0b0, size: 0b01},
  3195  		VectorArrangement8B: {q: 0b0, size: 0b00},
  3196  	}},
  3197  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMULL--SMULL2--vector---Signed-Multiply-Long--vector--?lang=en
  3198  	SMULL: {u: 0b0, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{
  3199  		VectorArrangement8B: {q: 0b0, size: 0b00},
  3200  		VectorArrangement4H: {q: 0b0, size: 0b01},
  3201  		VectorArrangement2S: {q: 0b0, size: 0b10},
  3202  	}},
  3203  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMULL--SMULL2--vector---Signed-Multiply-Long--vector--?lang=en
  3204  	SMULL2: {u: 0b0, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{
  3205  		VectorArrangement16B: {q: 0b1, size: 0b00},
  3206  		VectorArrangement8H:  {q: 0b1, size: 0b01},
  3207  		VectorArrangement4S:  {q: 0b1, size: 0b10},
  3208  	}},
  3209  	// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3210  	UMULL: {u: 0b1, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{
  3211  		VectorArrangement8B: {q: 0b0, size: 0b00},
  3212  		VectorArrangement4H: {q: 0b0, size: 0b01},
  3213  		VectorArrangement2S: {q: 0b0, size: 0b10},
  3214  	}},
  3215  	// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3216  	UMULL2: {u: 0b1, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{
  3217  		VectorArrangement16B: {q: 0b1, size: 0b00},
  3218  		VectorArrangement8H:  {q: 0b1, size: 0b01},
  3219  		VectorArrangement4S:  {q: 0b1, size: 0b10},
  3220  	}},
  3221  }
  3222  
  3223  // advancedSIMDThreeSame holds information to encode instructions as "Advanced SIMD three same" in
  3224  // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3225  var advancedSIMDThreeSame = map[asm.Instruction]struct {
  3226  	qAndSize  map[VectorArrangement]qAndSize
  3227  	u, opcode byte
  3228  }{
  3229  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/AND--vector---Bitwise-AND--vector--?lang=en
  3230  	VAND: {
  3231  		u: 0b0, opcode: 0b00011,
  3232  		qAndSize: map[VectorArrangement]qAndSize{
  3233  			VectorArrangement16B: {size: 0b00, q: 0b1},
  3234  			VectorArrangement8B:  {size: 0b00, q: 0b0},
  3235  		},
  3236  	},
  3237  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/BSL--Bitwise-Select-?lang=en
  3238  	BSL: {
  3239  		u: 0b1, opcode: 0b00011,
  3240  		qAndSize: map[VectorArrangement]qAndSize{
  3241  			VectorArrangement16B: {size: 0b01, q: 0b1},
  3242  			VectorArrangement8B:  {size: 0b01, q: 0b0},
  3243  		},
  3244  	},
  3245  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/EOR--vector---Bitwise-Exclusive-OR--vector--?lang=en
  3246  	EOR: {
  3247  		u: 0b1, opcode: 0b00011,
  3248  		qAndSize: map[VectorArrangement]qAndSize{
  3249  			VectorArrangement16B: {size: 0b00, q: 0b1},
  3250  			VectorArrangement8B:  {size: 0b00, q: 0b0},
  3251  		},
  3252  	},
  3253  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ORR--vector--register---Bitwise-inclusive-OR--vector--register--?lang=en
  3254  	VORR: {
  3255  		u: 0b0, opcode: 0b00011,
  3256  		qAndSize: map[VectorArrangement]qAndSize{
  3257  			VectorArrangement16B: {size: 0b10, q: 0b1},
  3258  			VectorArrangement8B:  {size: 0b10, q: 0b0},
  3259  		},
  3260  	},
  3261  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/BIC--vector--register---Bitwise-bit-Clear--vector--register--?lang=en
  3262  	BIC: {
  3263  		u: 0b0, opcode: 0b00011,
  3264  		qAndSize: map[VectorArrangement]qAndSize{
  3265  			VectorArrangement16B: {size: 0b01, q: 0b1},
  3266  			VectorArrangement8B:  {size: 0b01, q: 0b0},
  3267  		},
  3268  	},
  3269  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FADD--vector---Floating-point-Add--vector--?lang=en
  3270  	VFADDS: {
  3271  		u: 0b0, opcode: 0b11010,
  3272  		qAndSize: map[VectorArrangement]qAndSize{
  3273  			VectorArrangement4S: {size: 0b00, q: 0b1},
  3274  			VectorArrangement2S: {size: 0b00, q: 0b0},
  3275  		},
  3276  	},
  3277  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FADD--vector---Floating-point-Add--vector--?lang=en
  3278  	VFADDD: {
  3279  		u: 0b0, opcode: 0b11010,
  3280  		qAndSize: map[VectorArrangement]qAndSize{
  3281  			VectorArrangement2D: {size: 0b01, q: 0b1},
  3282  		},
  3283  	},
  3284  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FSUB--vector---Floating-point-Subtract--vector--?lang=en
  3285  	VFSUBS: {
  3286  		u: 0b0, opcode: 0b11010,
  3287  		qAndSize: map[VectorArrangement]qAndSize{
  3288  			VectorArrangement4S: {size: 0b10, q: 0b1},
  3289  			VectorArrangement2S: {size: 0b10, q: 0b0},
  3290  		},
  3291  	},
  3292  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FSUB--vector---Floating-point-Subtract--vector--?lang=en
  3293  	VFSUBD: {
  3294  		u: 0b0, opcode: 0b11010,
  3295  		qAndSize: map[VectorArrangement]qAndSize{
  3296  			VectorArrangement2D: {size: 0b11, q: 0b1},
  3297  		},
  3298  	},
  3299  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMAXP--Unsigned-Maximum-Pairwise-?lang=en
  3300  	UMAXP: {u: 0b1, opcode: 0b10100, qAndSize: defaultQAndSize},
  3301  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMEQ--register---Compare-bitwise-Equal--vector--?lang=en
  3302  	CMEQ: {u: 0b1, opcode: 0b10001, qAndSize: defaultQAndSize},
  3303  	// https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/ADDP--vector-
  3304  	VADDP: {u: 0b0, opcode: 0b10111, qAndSize: defaultQAndSize},
  3305  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADD--vector---Add--vector--?lang=en
  3306  	VADD: {u: 0, opcode: 0b10000, qAndSize: defaultQAndSize},
  3307  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SUB--vector---Subtract--vector--?lang=en
  3308  	VSUB: {u: 1, opcode: 0b10000, qAndSize: defaultQAndSize},
  3309  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHL--Signed-Shift-Left--register--?lang=en
  3310  	SSHL: {u: 0, opcode: 0b01000, qAndSize: defaultQAndSize},
  3311  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHL--Signed-Shift-Left--register--?lang=en
  3312  	USHL: {u: 0b1, opcode: 0b01000, qAndSize: defaultQAndSize},
  3313  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMGT--register---Compare-signed-Greater-than--vector--?lang=en
  3314  	CMGT: {u: 0b0, opcode: 0b00110, qAndSize: defaultQAndSize},
  3315  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMHI--register---Compare-unsigned-Higher--vector--?lang=en
  3316  	CMHI: {u: 0b1, opcode: 0b00110, qAndSize: defaultQAndSize},
  3317  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMGE--register---Compare-signed-Greater-than-or-Equal--vector--?lang=en
  3318  	CMGE: {u: 0b0, opcode: 0b00111, qAndSize: defaultQAndSize},
  3319  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMHS--register---Compare-unsigned-Higher-or-Same--vector--?lang=en
  3320  	CMHS: {u: 0b1, opcode: 0b00111, qAndSize: defaultQAndSize},
  3321  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCMEQ--register---Floating-point-Compare-Equal--vector--?lang=en
  3322  	FCMEQ: {
  3323  		u: 0b0, opcode: 0b11100,
  3324  		qAndSize: map[VectorArrangement]qAndSize{
  3325  			VectorArrangement4S: {size: 0b00, q: 0b1},
  3326  			VectorArrangement2S: {size: 0b00, q: 0b0},
  3327  			VectorArrangement2D: {size: 0b01, q: 0b1},
  3328  		},
  3329  	},
  3330  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCMGT--register---Floating-point-Compare-Greater-than--vector--?lang=en
  3331  	FCMGT: {
  3332  		u: 0b1, opcode: 0b11100,
  3333  		qAndSize: map[VectorArrangement]qAndSize{
  3334  			VectorArrangement4S: {size: 0b10, q: 0b1},
  3335  			VectorArrangement2S: {size: 0b10, q: 0b0},
  3336  			VectorArrangement2D: {size: 0b11, q: 0b1},
  3337  		},
  3338  	},
  3339  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCMGE--register---Floating-point-Compare-Greater-than-or-Equal--vector--?lang=en
  3340  	FCMGE: {
  3341  		u: 0b1, opcode: 0b11100,
  3342  		qAndSize: map[VectorArrangement]qAndSize{
  3343  			VectorArrangement4S: {size: 0b00, q: 0b1},
  3344  			VectorArrangement2S: {size: 0b00, q: 0b0},
  3345  			VectorArrangement2D: {size: 0b01, q: 0b1},
  3346  		},
  3347  	},
  3348  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FMIN--vector---Floating-point-minimum--vector--?lang=en
  3349  	VFMIN: {
  3350  		u: 0b0, opcode: 0b11110,
  3351  		qAndSize: map[VectorArrangement]qAndSize{
  3352  			VectorArrangement4S: {size: 0b10, q: 0b1},
  3353  			VectorArrangement2S: {size: 0b10, q: 0b0},
  3354  			VectorArrangement2D: {size: 0b11, q: 0b1},
  3355  		},
  3356  	},
  3357  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FMAX--vector---Floating-point-Maximum--vector--?lang=en
  3358  	VFMAX: {
  3359  		u: 0b0, opcode: 0b11110,
  3360  		qAndSize: map[VectorArrangement]qAndSize{
  3361  			VectorArrangement4S: {size: 0b00, q: 0b1},
  3362  			VectorArrangement2S: {size: 0b00, q: 0b0},
  3363  			VectorArrangement2D: {size: 0b01, q: 0b1},
  3364  		},
  3365  	},
  3366  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FMUL--vector---Floating-point-Multiply--vector--?lang=en
  3367  	VFMUL: {
  3368  		u: 0b1, opcode: 0b11011,
  3369  		qAndSize: map[VectorArrangement]qAndSize{
  3370  			VectorArrangement4S: {size: 0b00, q: 0b1},
  3371  			VectorArrangement2S: {size: 0b00, q: 0b0},
  3372  			VectorArrangement2D: {size: 0b01, q: 0b1},
  3373  		},
  3374  	},
  3375  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FDIV--vector---Floating-point-Divide--vector--?lang=en
  3376  	VFDIV: {
  3377  		u: 0b1, opcode: 0b11111,
  3378  		qAndSize: map[VectorArrangement]qAndSize{
  3379  			VectorArrangement4S: {size: 0b00, q: 0b1},
  3380  			VectorArrangement2S: {size: 0b00, q: 0b0},
  3381  			VectorArrangement2D: {size: 0b01, q: 0b1},
  3382  		},
  3383  	},
  3384  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/MUL--vector---Multiply--vector--?lang=en
  3385  	VMUL: {u: 0b0, opcode: 0b10011, qAndSize: defaultQAndSize},
  3386  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQADD--Signed-saturating-Add-?lang=en
  3387  	VSQADD: {u: 0b0, opcode: 0b00001, qAndSize: defaultQAndSize},
  3388  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UQADD--Unsigned-saturating-Add-?lang=en
  3389  	VUQADD: {u: 0b1, opcode: 0b00001, qAndSize: defaultQAndSize},
  3390  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMIN--Signed-Minimum--vector--?lang=en
  3391  	SMIN: {u: 0b0, opcode: 0b01101, qAndSize: defaultQAndSize},
  3392  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMAX--Signed-Maximum--vector--?lang=en
  3393  	SMAX: {u: 0b0, opcode: 0b01100, qAndSize: defaultQAndSize},
  3394  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMIN--Unsigned-Minimum--vector--?lang=en
  3395  	UMIN: {u: 0b1, opcode: 0b01101, qAndSize: defaultQAndSize},
  3396  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMAX--Unsigned-Maximum--vector--?lang=en
  3397  	UMAX: {u: 0b1, opcode: 0b01100, qAndSize: defaultQAndSize},
  3398  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/URHADD--Unsigned-Rounding-Halving-Add-?lang=en
  3399  	URHADD: {u: 0b1, opcode: 0b00010, qAndSize: defaultQAndSize},
  3400  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQSUB--Signed-saturating-Subtract-?lang=en
  3401  	VSQSUB: {u: 0b0, opcode: 0b00101, qAndSize: defaultQAndSize},
  3402  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UQSUB--Unsigned-saturating-Subtract-?lang=en
  3403  	VUQSUB: {u: 0b1, opcode: 0b00101, qAndSize: defaultQAndSize},
  3404  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/BIT--Bitwise-Insert-if-True-?lang=en
  3405  	VBIT: {u: 0b1, opcode: 0b00011, qAndSize: map[VectorArrangement]qAndSize{
  3406  		VectorArrangement8B:  {q: 0b0, size: 0b10},
  3407  		VectorArrangement16B: {q: 0b1, size: 0b10},
  3408  	}},
  3409  	SQRDMULH: {u: 0b1, opcode: 0b10110, qAndSize: map[VectorArrangement]qAndSize{
  3410  		VectorArrangement4H: {q: 0b0, size: 0b01},
  3411  		VectorArrangement8H: {q: 0b1, size: 0b01},
  3412  		VectorArrangement2S: {q: 0b0, size: 0b10},
  3413  		VectorArrangement4S: {q: 0b1, size: 0b10},
  3414  	}},
  3415  }
  3416  
  3417  // qAndSize is a pair of "Q" and "size" that appear in https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3418  type qAndSize struct{ q, size byte }
  3419  
  3420  // defaultQAndSize maps a vector arrangement to the default qAndSize pair used in the encoding of many instructions.
  3421  var defaultQAndSize = map[VectorArrangement]qAndSize{
  3422  	VectorArrangement8B:  {size: 0b00, q: 0b0},
  3423  	VectorArrangement16B: {size: 0b00, q: 0b1},
  3424  	VectorArrangement4H:  {size: 0b01, q: 0b0},
  3425  	VectorArrangement8H:  {size: 0b01, q: 0b1},
  3426  	VectorArrangement2S:  {size: 0b10, q: 0b0},
  3427  	VectorArrangement4S:  {size: 0b10, q: 0b1},
  3428  	VectorArrangement1D:  {size: 0b11, q: 0b0},
  3429  	VectorArrangement2D:  {size: 0b11, q: 0b1},
  3430  }
  3431  
  3432  // advancedSIMDAcrossLanes holds information to encode instructions as "Advanced SIMD across lanes" in
  3433  // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3434  var advancedSIMDAcrossLanes = map[asm.Instruction]struct {
  3435  	qAndSize  map[VectorArrangement]qAndSize
  3436  	u, opcode byte
  3437  }{
  3438  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADDV--Add-across-Vector-?lang=en
  3439  	ADDV: {
  3440  		u: 0b0, opcode: 0b11011,
  3441  		qAndSize: map[VectorArrangement]qAndSize{
  3442  			VectorArrangement16B: {size: 0b00, q: 0b1},
  3443  			VectorArrangement8B:  {size: 0b00, q: 0b0},
  3444  			VectorArrangement8H:  {size: 0b01, q: 0b1},
  3445  			VectorArrangement4H:  {size: 0b01, q: 0b0},
  3446  			VectorArrangement4S:  {size: 0b10, q: 0b1},
  3447  		},
  3448  	},
  3449  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMINV--Unsigned-Minimum-across-Vector-?lang=en
  3450  	UMINV: {
  3451  		u: 0b1, opcode: 0b11010,
  3452  		qAndSize: map[VectorArrangement]qAndSize{
  3453  			VectorArrangement16B: {size: 0b00, q: 0b1},
  3454  			VectorArrangement8B:  {size: 0b00, q: 0b0},
  3455  			VectorArrangement8H:  {size: 0b01, q: 0b1},
  3456  			VectorArrangement4H:  {size: 0b01, q: 0b0},
  3457  			VectorArrangement4S:  {size: 0b10, q: 0b1},
  3458  		},
  3459  	},
  3460  	UADDLV: {u: 0b1, opcode: 0b00011, qAndSize: map[VectorArrangement]qAndSize{
  3461  		VectorArrangement16B: {size: 0b00, q: 0b1},
  3462  		VectorArrangement8B:  {size: 0b00, q: 0b0},
  3463  		VectorArrangement8H:  {size: 0b01, q: 0b1},
  3464  		VectorArrangement4H:  {size: 0b01, q: 0b0},
  3465  		VectorArrangement4S:  {size: 0b10, q: 0b1},
  3466  	}},
  3467  }
  3468  
  3469  // advancedSIMDScalarPairwise holds information to encode instructions as "Advanced SIMD scalar pairwise" in
  3470  // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3471  var advancedSIMDScalarPairwise = map[asm.Instruction]struct {
  3472  	size      map[VectorArrangement]byte
  3473  	u, opcode byte
  3474  }{
  3475  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADDP--scalar---Add-Pair-of-elements--scalar--?lang=en
  3476  	ADDP: {u: 0b0, opcode: 0b11011, size: map[VectorArrangement]byte{VectorArrangement2D: 0b11}},
  3477  }
  3478  
  3479  // advancedSIMDCopy holds information to encode instructions as "Advanced SIMD copy" in
  3480  // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3481  var advancedSIMDCopy = map[asm.Instruction]struct {
  3482  	// TODO: extract common implementation of resolver.
  3483  	resolver func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error)
  3484  	op       byte
  3485  }{
  3486  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/DUP--element---Duplicate-vector-element-to-vector-or-scalar-?lang=en
  3487  	DUPELEM: {op: 0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
  3488  		imm4 = 0b0000
  3489  		q = 0b1
  3490  
  3491  		switch arr {
  3492  		case VectorArrangementB:
  3493  			imm5 |= 0b1
  3494  			imm5 |= byte(srcIndex) << 1
  3495  		case VectorArrangementH:
  3496  			imm5 |= 0b10
  3497  			imm5 |= byte(srcIndex) << 2
  3498  		case VectorArrangementS:
  3499  			imm5 |= 0b100
  3500  			imm5 |= byte(srcIndex) << 3
  3501  		case VectorArrangementD:
  3502  			imm5 |= 0b1000
  3503  			imm5 |= byte(srcIndex) << 4
  3504  		default:
  3505  			err = fmt.Errorf("unsupported arrangement for DUPELEM: %s", arr)
  3506  		}
  3507  
  3508  		return
  3509  	}},
  3510  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/DUP--general---Duplicate-general-purpose-register-to-vector-?lang=en
  3511  	DUPGEN: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
  3512  		imm4 = 0b0001
  3513  		switch arr {
  3514  		case VectorArrangement8B:
  3515  			imm5 = 0b1
  3516  		case VectorArrangement16B:
  3517  			imm5 = 0b1
  3518  			q = 0b1
  3519  		case VectorArrangement4H:
  3520  			imm5 = 0b10
  3521  		case VectorArrangement8H:
  3522  			imm5 = 0b10
  3523  			q = 0b1
  3524  		case VectorArrangement2S:
  3525  			imm5 = 0b100
  3526  		case VectorArrangement4S:
  3527  			imm5 = 0b100
  3528  			q = 0b1
  3529  		case VectorArrangement2D:
  3530  			imm5 = 0b1000
  3531  			q = 0b1
  3532  		default:
  3533  			err = fmt.Errorf("unsupported arrangement for DUPGEN: %s", arr)
  3534  		}
  3535  		return
  3536  	}},
  3537  	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/INS--general---Insert-vector-element-from-general-purpose-register-?lang=en
  3538  	INSGEN: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
  3539  		imm4, q = 0b0011, 0b1
  3540  		switch arr {
  3541  		case VectorArrangementB:
  3542  			imm5 |= 0b1
  3543  			imm5 |= byte(dstIndex) << 1
  3544  		case VectorArrangementH:
  3545  			imm5 |= 0b10
  3546  			imm5 |= byte(dstIndex) << 2
  3547  		case VectorArrangementS:
  3548  			imm5 |= 0b100
  3549  			imm5 |= byte(dstIndex) << 3
  3550  		case VectorArrangementD:
  3551  			imm5 |= 0b1000
  3552  			imm5 |= byte(dstIndex) << 4
  3553  		default:
  3554  			err = fmt.Errorf("unsupported arrangement for INSGEN: %s", arr)
  3555  		}
  3556  		return
  3557  	}},
  3558  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMOV--Unsigned-Move-vector-element-to-general-purpose-register-?lang=en
  3559  	UMOV: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
  3560  		imm4 = 0b0111
  3561  		switch arr {
  3562  		case VectorArrangementB:
  3563  			imm5 |= 0b1
  3564  			imm5 |= byte(srcIndex) << 1
  3565  		case VectorArrangementH:
  3566  			imm5 |= 0b10
  3567  			imm5 |= byte(srcIndex) << 2
  3568  		case VectorArrangementS:
  3569  			imm5 |= 0b100
  3570  			imm5 |= byte(srcIndex) << 3
  3571  		case VectorArrangementD:
  3572  			imm5 |= 0b1000
  3573  			imm5 |= byte(srcIndex) << 4
  3574  			q = 0b1
  3575  		default:
  3576  			err = fmt.Errorf("unsupported arrangement for UMOV: %s", arr)
  3577  		}
  3578  		return
  3579  	}},
  3580  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMOV--Signed-Move-vector-element-to-general-purpose-register-?lang=en
  3581  	SMOV32: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
  3582  		imm4 = 0b0101
  3583  		switch arr {
  3584  		case VectorArrangementB:
  3585  			imm5 |= 0b1
  3586  			imm5 |= byte(srcIndex) << 1
  3587  		case VectorArrangementH:
  3588  			imm5 |= 0b10
  3589  			imm5 |= byte(srcIndex) << 2
  3590  		default:
  3591  			err = fmt.Errorf("unsupported arrangement for SMOV32: %s", arr)
  3592  		}
  3593  		return
  3594  	}},
  3595  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/INS--element---Insert-vector-element-from-another-vector-element-?lang=en
  3596  	INSELEM: {op: 0b1, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
  3597  		q = 0b1
  3598  		switch arr {
  3599  		case VectorArrangementB:
  3600  			imm5 |= 0b1
  3601  			imm5 |= byte(dstIndex) << 1
  3602  			imm4 = byte(srcIndex)
  3603  		case VectorArrangementH:
  3604  			imm5 |= 0b10
  3605  			imm5 |= byte(dstIndex) << 2
  3606  			imm4 = byte(srcIndex) << 1
  3607  		case VectorArrangementS:
  3608  			imm5 |= 0b100
  3609  			imm5 |= byte(dstIndex) << 3
  3610  			imm4 = byte(srcIndex) << 2
  3611  		case VectorArrangementD:
  3612  			imm5 |= 0b1000
  3613  			imm5 |= byte(dstIndex) << 4
  3614  			imm4 = byte(srcIndex) << 3
  3615  		default:
  3616  			err = fmt.Errorf("unsupported arrangement for INSELEM: %s", arr)
  3617  		}
  3618  		return
  3619  	}},
  3620  }
  3621  
  3622  // advancedSIMDTableLookup holds information to encode instructions as "Advanced SIMD table lookup" in
  3623  // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3624  var advancedSIMDTableLookup = map[asm.Instruction]struct {
  3625  	q            map[VectorArrangement]byte
  3626  	op, op2, Len byte
  3627  }{
  3628  	TBL1: {op: 0, op2: 0, Len: 0b00, q: map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8B: 0b0}},
  3629  	TBL2: {op: 0, op2: 0, Len: 0b01, q: map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8B: 0b0}},
  3630  }
  3631  
  3632  // advancedSIMDShiftByImmediate holds information to encode instructions as "Advanced SIMD shift by immediate" in
  3633  // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3634  var advancedSIMDShiftByImmediate = map[asm.Instruction]struct {
  3635  	q           map[VectorArrangement]byte
  3636  	immResolver func(shiftAmount int64, arr VectorArrangement) (immh, immb byte, err error)
  3637  	U, opcode   byte
  3638  }{
  3639  	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SSHLL--SSHLL2--Signed-Shift-Left-Long--immediate--
  3640  	SSHLL: {
  3641  		U: 0b0, opcode: 0b10100,
  3642  		q:           map[VectorArrangement]byte{VectorArrangement8B: 0b0, VectorArrangement4H: 0b0, VectorArrangement2S: 0b0},
  3643  		immResolver: immResolverForSIMDShiftLeftByImmediate,
  3644  	},
  3645  	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SSHLL--SSHLL2--Signed-Shift-Left-Long--immediate--
  3646  	SSHLL2: {
  3647  		U: 0b0, opcode: 0b10100,
  3648  		q:           map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8H: 0b1, VectorArrangement4S: 0b1},
  3649  		immResolver: immResolverForSIMDShiftLeftByImmediate,
  3650  	},
  3651  	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/USHLL--USHLL2--Unsigned-Shift-Left-Long--immediate--
  3652  	USHLL: {
  3653  		U: 0b1, opcode: 0b10100,
  3654  		q:           map[VectorArrangement]byte{VectorArrangement8B: 0b0, VectorArrangement4H: 0b0, VectorArrangement2S: 0b0},
  3655  		immResolver: immResolverForSIMDShiftLeftByImmediate,
  3656  	},
  3657  	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/USHLL--USHLL2--Unsigned-Shift-Left-Long--immediate--
  3658  	USHLL2: {
  3659  		U: 0b1, opcode: 0b10100,
  3660  		q:           map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8H: 0b1, VectorArrangement4S: 0b1},
  3661  		immResolver: immResolverForSIMDShiftLeftByImmediate,
  3662  	},
  3663  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHR--Signed-Shift-Right--immediate--?lang=en
  3664  	SSHR: {
  3665  		U: 0b0, opcode: 0b00000,
  3666  		q: map[VectorArrangement]byte{
  3667  			VectorArrangement16B: 0b1, VectorArrangement8H: 0b1, VectorArrangement4S: 0b1, VectorArrangement2D: 0b1,
  3668  			VectorArrangement8B: 0b0, VectorArrangement4H: 0b0, VectorArrangement2S: 0b0,
  3669  		},
  3670  		immResolver: func(shiftAmount int64, arr VectorArrangement) (immh, immb byte, err error) {
  3671  			switch arr {
  3672  			case VectorArrangement16B, VectorArrangement8B:
  3673  				immh = 0b0001
  3674  				immb = 8 - byte(shiftAmount&0b111)
  3675  			case VectorArrangement8H, VectorArrangement4H:
  3676  				v := 16 - byte(shiftAmount&0b1111)
  3677  				immb = v & 0b111
  3678  				immh = 0b0010 | (v >> 3)
  3679  			case VectorArrangement4S, VectorArrangement2S:
  3680  				v := 32 - byte(shiftAmount&0b11111)
  3681  				immb = v & 0b111
  3682  				immh = 0b0100 | (v >> 3)
  3683  			case VectorArrangement2D:
  3684  				v := 64 - byte(shiftAmount&0b111111)
  3685  				immb = v & 0b111
  3686  				immh = 0b1000 | (v >> 3)
  3687  			default:
  3688  				err = fmt.Errorf("unsupported arrangement %s", arr)
  3689  			}
  3690  			return
  3691  		},
  3692  	},
  3693  }
  3694  
  3695  // advancedSIMDPermute holds information to encode instructions as "Advanced SIMD permute" in
  3696  // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3697  var advancedSIMDPermute = map[asm.Instruction]struct {
  3698  	opcode byte
  3699  }{
  3700  	ZIP1: {opcode: 0b011},
  3701  }
  3702  
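        // immResolverForSIMDShiftLeftByImmediate computes the immh:immb fields for the
        // shift-left-long family of instructions; the concatenation immh:immb encodes the
        // element size in bits plus the shift amount.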
  3703  func immResolverForSIMDShiftLeftByImmediate(shiftAmount int64, arr VectorArrangement) (immh, immb byte, err error) {
  3704  	switch arr {
  3705  	case VectorArrangement16B, VectorArrangement8B:
  3706  		immb = byte(shiftAmount)
  3707  		immh = 0b0001
  3708  	case VectorArrangement8H, VectorArrangement4H:
  3709  		immb = byte(shiftAmount) & 0b111
  3710  		immh = 0b0010 | byte(shiftAmount>>3)
  3711  	case VectorArrangement4S, VectorArrangement2S:
  3712  		immb = byte(shiftAmount) & 0b111
  3713  		immh = 0b0100 | byte(shiftAmount>>3)
  3714  	default:
  3715  		err = fmt.Errorf("unsupported arrangement %s", arr)
  3716  	}
  3717  	return
  3718  }
  3719  
  3720  // encodeAdvancedSIMDCopy encodes instruction as "Advanced SIMD copy" in
  3721  // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3722  func (a *AssemblerImpl) encodeAdvancedSIMDCopy(buf asm.Buffer, srcRegBits, dstRegBits, op, imm5, imm4, q byte) {
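        	// Bit layout, from bit 31 down to 0: 0|Q|op|0111_0000|imm5|0|imm4|1|Rn|Rd.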
  3723  	buf.Append4Bytes(
  3724  		(srcRegBits<<5)|dstRegBits,
  3725  		imm4<<3|0b1<<2|srcRegBits>>3,
  3726  		imm5,
  3727  		q<<6|op<<5|0b1110,
  3728  	)
  3729  }
  3730  
  3731  // encodeAdvancedSIMDThreeSame encodes instruction as "Advanced SIMD three same" in
  3732  // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3733  func (a *AssemblerImpl) encodeAdvancedSIMDThreeSame(buf asm.Buffer, src1, src2, dst, opcode, size, q, u byte) {
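        	// Bit layout, from bit 31 down to 0: 0|Q|U|01110|size|1|Rm(src1)|opcode|1|Rn(src2)|Rd(dst).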
  3734  	buf.Append4Bytes(
  3735  		(src2<<5)|dst,
  3736  		opcode<<3|1<<2|src2>>3,
  3737  		size<<6|0b1<<5|src1,
  3738  		q<<6|u<<5|0b1110,
  3739  	)
  3740  }
  3741  
  3742  // encodeAdvancedSIMDThreeDifferent encodes instruction as "Advanced SIMD three different" in
  3743  // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3744  func (a *AssemblerImpl) encodeAdvancedSIMDThreeDifferent(buf asm.Buffer, src1, src2, dst, opcode, size, q, u byte) {
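        	// Bit layout, from bit 31 down to 0: 0|Q|U|01110|size|1|Rm(src1)|opcode|00|Rn(src2)|Rd(dst).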
  3745  	buf.Append4Bytes(
  3746  		(src2<<5)|dst,
  3747  		opcode<<4|src2>>3,
  3748  		size<<6|0b1<<5|src1,
  3749  		q<<6|u<<5|0b1110,
  3750  	)
  3751  }
  3752  
  3753  // encodeAdvancedSIMDPermute encodes instruction as "Advanced SIMD permute" in
  3754  // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3755  func (a *AssemblerImpl) encodeAdvancedSIMDPermute(buf asm.Buffer, src1, src2, dst, opcode, size, q byte) {
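        	// Bit layout, from bit 31 down to 0: 0|Q|001110|size|0|Rm(src1)|0|opcode|10|Rn(src2)|Rd(dst).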
  3756  	buf.Append4Bytes(
  3757  		(src2<<5)|dst,
  3758  		opcode<<4|0b1<<3|src2>>3,
  3759  		size<<6|src1,
  3760  		q<<6|0b1110,
  3761  	)
  3762  }
  3763  
  3764  func (a *AssemblerImpl) encodeVectorRegisterToVectorRegister(buf asm.Buffer, n *nodeImpl) (err error) {
  3765  	var srcVectorRegBits byte
  3766  	if n.srcReg != RegRZR {
  3767  		srcVectorRegBits, err = vectorRegisterBits(n.srcReg)
  3768  	} else if n.instruction == CMEQZERO {
  3769  		// CMEQZERO is passed RegRZR as its source; we encode the destination register as the source operand, so it is compared against zero.
  3770  		srcVectorRegBits, err = vectorRegisterBits(n.dstReg)
  3771  	}
  3772  
  3773  	if err != nil {
  3774  		return err
  3775  	}
  3776  
  3777  	dstVectorRegBits, err := vectorRegisterBits(n.dstReg)
  3778  	if err != nil {
  3779  		return err
  3780  	}
  3781  
  3782  	if simdCopy, ok := advancedSIMDCopy[n.instruction]; ok {
  3783  		imm5, imm4, q, err := simdCopy.resolver(n.srcVectorIndex, n.dstVectorIndex, n.vectorArrangement)
  3784  		if err != nil {
  3785  			return err
  3786  		}
  3787  		a.encodeAdvancedSIMDCopy(buf, srcVectorRegBits, dstVectorRegBits, simdCopy.op, imm5, imm4, q)
  3788  		return nil
  3789  	}
  3790  
  3791  	if scalarPairwise, ok := advancedSIMDScalarPairwise[n.instruction]; ok {
  3792  		// See "Advanced SIMD scalar pairwise" in
  3793  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3794  		size, ok := scalarPairwise.size[n.vectorArrangement]
  3795  		if !ok {
  3796  			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
  3797  		}
  3798  		buf.Append4Bytes(
  3799  			(srcVectorRegBits<<5)|dstVectorRegBits,
  3800  			scalarPairwise.opcode<<4|1<<3|srcVectorRegBits>>3,
  3801  			size<<6|0b11<<4|scalarPairwise.opcode>>4,
  3802  			0b1<<6|scalarPairwise.u<<5|0b11110,
  3803  		)
  3804  		return
  3805  	}
  3806  
  3807  	if twoRegMisc, ok := advancedSIMDTwoRegisterMisc[n.instruction]; ok {
  3808  		// See "Advanced SIMD two-register miscellaneous" in
  3809  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3810  		qs, ok := twoRegMisc.qAndSize[n.vectorArrangement]
  3811  		if !ok {
  3812  			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
  3813  		}
  3814  		buf.Append4Bytes(
  3815  			(srcVectorRegBits<<5)|dstVectorRegBits,
  3816  			twoRegMisc.opcode<<4|0b1<<3|srcVectorRegBits>>3,
  3817  			qs.size<<6|0b1<<5|twoRegMisc.opcode>>4,
  3818  			qs.q<<6|twoRegMisc.u<<5|0b01110,
  3819  		)
  3820  		return nil
  3821  	}
  3822  
  3823  	if threeSame, ok := advancedSIMDThreeSame[n.instruction]; ok {
  3824  		qs, ok := threeSame.qAndSize[n.vectorArrangement]
  3825  		if !ok {
  3826  			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
  3827  		}
  3828  		a.encodeAdvancedSIMDThreeSame(buf, srcVectorRegBits, dstVectorRegBits, dstVectorRegBits, threeSame.opcode, qs.size, qs.q, threeSame.u)
  3829  		return nil
  3830  	}
  3831  
  3832  	if threeDifferent, ok := advancedSIMDThreeDifferent[n.instruction]; ok {
  3833  		qs, ok := threeDifferent.qAndSize[n.vectorArrangement]
  3834  		if !ok {
  3835  			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
  3836  		}
  3837  		a.encodeAdvancedSIMDThreeDifferent(buf, srcVectorRegBits, dstVectorRegBits, dstVectorRegBits, threeDifferent.opcode, qs.size, qs.q, threeDifferent.u)
  3838  		return nil
  3839  	}
  3840  
  3841  	if acrossLanes, ok := advancedSIMDAcrossLanes[n.instruction]; ok {
  3842  		// See "Advanced SIMD across lanes" in
  3843  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3844  		qs, ok := acrossLanes.qAndSize[n.vectorArrangement]
  3845  		if !ok {
  3846  			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
  3847  		}
  3848  		buf.Append4Bytes(
  3849  			(srcVectorRegBits<<5)|dstVectorRegBits,
  3850  			acrossLanes.opcode<<4|0b1<<3|srcVectorRegBits>>3,
  3851  			qs.size<<6|0b11000<<1|acrossLanes.opcode>>4,
  3852  			qs.q<<6|acrossLanes.u<<5|0b01110,
  3853  		)
  3854  		return nil
  3855  	}
  3856  
  3857  	if lookup, ok := advancedSIMDTableLookup[n.instruction]; ok {
  3858  		q, ok := lookup.q[n.vectorArrangement]
  3859  		if !ok {
  3860  			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
  3861  		}
  3862  		buf.Append4Bytes(
  3863  			(srcVectorRegBits<<5)|dstVectorRegBits,
  3864  			lookup.Len<<5|lookup.op<<4|srcVectorRegBits>>3,
  3865  			lookup.op2<<6|dstVectorRegBits,
  3866  			q<<6|0b1110,
  3867  		)
  3868  		return
  3869  	}
  3870  
  3871  	if shiftByImmediate, ok := advancedSIMDShiftByImmediate[n.instruction]; ok {
  3872  		immh, immb, err := shiftByImmediate.immResolver(n.srcConst, n.vectorArrangement)
  3873  		if err != nil {
  3874  			return err
  3875  		}
  3876  
  3877  		q, ok := shiftByImmediate.q[n.vectorArrangement]
  3878  		if !ok {
  3879  			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
  3880  		}
  3881  
  3882  		buf.Append4Bytes(
  3883  			(srcVectorRegBits<<5)|dstVectorRegBits,
  3884  			shiftByImmediate.opcode<<3|0b1<<2|srcVectorRegBits>>3,
  3885  			immh<<3|immb,
  3886  			q<<6|shiftByImmediate.U<<5|0b1111,
  3887  		)
  3888  		return nil
  3889  	}
  3890  
  3891  	if permute, ok := advancedSIMDPermute[n.instruction]; ok {
  3892  		size, q := arrangementSizeQ(n.vectorArrangement)
  3893  		a.encodeAdvancedSIMDPermute(buf, srcVectorRegBits, dstVectorRegBits, dstVectorRegBits, permute.opcode, size, q)
  3894  		return
  3895  	}
  3896  	return errorEncodingUnsupported(n)
  3897  }
  3898  
  3899  func (a *AssemblerImpl) encodeTwoVectorRegistersToVectorRegister(buf asm.Buffer, n *nodeImpl) (err error) {
  3900  	var srcRegBits, srcRegBits2, dstRegBits byte
  3901  	srcRegBits, err = vectorRegisterBits(n.srcReg)
  3902  	if err != nil {
  3903  		return err
  3904  	}
  3905  
  3906  	srcRegBits2, err = vectorRegisterBits(n.srcReg2)
  3907  	if err != nil {
  3908  		return err
  3909  	}
  3910  
  3911  	dstRegBits, err = vectorRegisterBits(n.dstReg)
  3912  	if err != nil {
  3913  		return err
  3914  	}
  3915  
  3916  	if threeSame, ok := advancedSIMDThreeSame[n.instruction]; ok {
  3917  		qs, ok := threeSame.qAndSize[n.vectorArrangement]
  3918  		if !ok {
  3919  			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
  3920  		}
  3921  		a.encodeAdvancedSIMDThreeSame(buf, srcRegBits, srcRegBits2, dstRegBits, threeSame.opcode, qs.size, qs.q, threeSame.u)
  3922  		return nil
  3923  	}
  3924  
  3925  	if threeDifferent, ok := advancedSIMDThreeDifferent[n.instruction]; ok {
  3926  		qs, ok := threeDifferent.qAndSize[n.vectorArrangement]
  3927  		if !ok {
  3928  			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
  3929  		}
  3930  		a.encodeAdvancedSIMDThreeDifferent(buf, srcRegBits, srcRegBits2, dstRegBits, threeDifferent.opcode, qs.size, qs.q, threeDifferent.u)
  3931  		return nil
  3932  	}
  3933  
  3934  	if permute, ok := advancedSIMDPermute[n.instruction]; ok {
  3935  		size, q := arrangementSizeQ(n.vectorArrangement)
  3936  		a.encodeAdvancedSIMDPermute(buf, srcRegBits, srcRegBits2, dstRegBits, permute.opcode, size, q)
  3937  		return
  3938  	}
  3939  
  3940  	if n.instruction == EXT {
  3941  		// EXT is the only instruction in "Advanced SIMD extract", so inline the encoding here.
  3942  		// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/EXT--Extract-vector-from-pair-of-vectors-?lang=en
  3943  		var q, imm4 byte
  3944  		switch n.vectorArrangement {
  3945  		case VectorArrangement16B:
  3946  			imm4 = 0b1111 & byte(n.srcConst)
  3947  			q = 0b1
  3948  		case VectorArrangement8B:
  3949  			imm4 = 0b111 & byte(n.srcConst)
  3950  		default:
  3951  			return fmt.Errorf("invalid arrangement %s for EXT", n.vectorArrangement)
  3952  		}
  3953  		buf.Append4Bytes(
  3954  			(srcRegBits2<<5)|dstRegBits,
  3955  			imm4<<3|srcRegBits2>>3,
  3956  			srcRegBits,
  3957  			q<<6|0b101110,
  3958  		)
  3959  		return
  3960  	}
  3961  	return errorEncodingUnsupported(n)
  3962  }
  3963  
  3964  func (a *AssemblerImpl) encodeVectorRegisterToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
  3965  	if err = checkArrangementIndexPair(n.vectorArrangement, n.srcVectorIndex); err != nil {
  3966  		return
  3967  	}
  3968  
  3969  	srcVecRegBits, err := vectorRegisterBits(n.srcReg)
  3970  	if err != nil {
  3971  		return err
  3972  	}
  3973  
  3974  	dstRegBits, err := intRegisterBits(n.dstReg)
  3975  	if err != nil {
  3976  		return err
  3977  	}
  3978  
  3979  	if simdCopy, ok := advancedSIMDCopy[n.instruction]; ok {
  3980  		imm5, imm4, q, err := simdCopy.resolver(n.srcVectorIndex, n.dstVectorIndex, n.vectorArrangement)
  3981  		if err != nil {
  3982  			return err
  3983  		}
  3984  		a.encodeAdvancedSIMDCopy(buf, srcVecRegBits, dstRegBits, simdCopy.op, imm5, imm4, q)
  3985  		return nil
  3986  	}
  3987  	return errorEncodingUnsupported(n)
  3988  }
  3989  
  3990  func (a *AssemblerImpl) encodeRegisterToVectorRegister(buf asm.Buffer, n *nodeImpl) (err error) {
  3991  	srcRegBits, err := intRegisterBits(n.srcReg)
  3992  	if err != nil {
  3993  		return err
  3994  	}
  3995  
  3996  	dstVectorRegBits, err := vectorRegisterBits(n.dstReg)
  3997  	if err != nil {
  3998  		return err
  3999  	}
  4000  
  4001  	if simdCopy, ok := advancedSIMDCopy[n.instruction]; ok {
  4002  		imm5, imm4, q, err := simdCopy.resolver(n.srcVectorIndex, n.dstVectorIndex, n.vectorArrangement)
  4003  		if err != nil {
  4004  			return err
  4005  		}
  4006  		a.encodeAdvancedSIMDCopy(buf, srcRegBits, dstVectorRegBits, simdCopy.op, imm5, imm4, q)
  4007  		return nil
  4008  	}
  4009  	return errorEncodingUnsupported(n)
  4010  }
  4011  
  4012  var zeroRegisterBits byte = 0b11111
  4013  
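        // isIntRegister reports whether r is a general-purpose register (RegR0 through RegSP).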
  4014  func isIntRegister(r asm.Register) bool {
  4015  	return RegR0 <= r && r <= RegSP
  4016  }
  4017  
  4018  func isVectorRegister(r asm.Register) bool {
  4019  	return RegV0 <= r && r <= RegV31
  4020  }
  4021  
  4022  func isConditionalRegister(r asm.Register) bool {
  4023  	return RegCondEQ <= r && r <= RegCondNV
  4024  }
  4025  
  4026  func intRegisterBits(r asm.Register) (ret byte, err error) {
  4027  	if !isIntRegister(r) {
  4028  		err = fmt.Errorf("%s is not an integer register", RegisterName(r))
  4029  	} else if r == RegSP {
  4030  		// SP has the same bit representation as RegRZR.
  4031  		r = RegRZR
  4032  	}
  4033  	ret = byte(r - RegR0)
  4034  	return
  4035  }
  4036  
  4037  func vectorRegisterBits(r asm.Register) (ret byte, err error) {
  4038  	if !isVectorRegister(r) {
  4039  		err = fmt.Errorf("%s is not a vector register", RegisterName(r))
  4040  	} else {
  4041  		ret = byte(r - RegV0)
  4042  	}
  4043  	return
  4044  }
  4045  
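        // registerBits returns the 5-bit encoding of r without validation; r must be either an
        // integer or a vector register.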
  4046  func registerBits(r asm.Register) (ret byte) {
  4047  	if isIntRegister(r) {
  4048  		if r == RegSP {
  4049  			// SP has the same bit representation as RegRZR.
  4050  			r = RegRZR
  4051  		}
  4052  		ret = byte(r - RegR0)
  4053  	} else {
  4054  		ret = byte(r - RegV0)
  4055  	}
  4056  	return
  4057  }