wa-lang.org/wazero@v1.0.2/internal/asm/arm64/impl.go (about)

     1  package arm64
     2  
     3  import (
     4  	"bytes"
     5  	"encoding/binary"
     6  	"errors"
     7  	"fmt"
     8  
     9  	"wa-lang.org/wazero/internal/asm"
    10  )
    11  
// nodeImpl is one node in the linked list of pending instructions that
// AssemblerImpl accumulates. Each node records a single instruction together
// with its operands, and is encoded into machine code by encodeNode during
// Assemble.
type nodeImpl struct {
	// instruction identifies which instruction (e.g. RET, B, ADR) this node encodes.
	instruction asm.Instruction

	offsetInBinaryField asm.NodeOffsetInBinary // Field suffix to dodge conflict with OffsetInBinary

	// jumpTarget holds the target node in the linked list for the jump-kind instruction.
	jumpTarget *nodeImpl
	// next holds the next node from this node in the assembled linked list.
	next *nodeImpl

	// types selects the operand combination; encodeNode dispatches on it, and
	// the register/constant fields below are interpreted according to it.
	types                            operandTypes
	srcReg, srcReg2, dstReg, dstReg2 asm.Register
	srcConst, dstConst               asm.ConstantValue

	// vectorArrangement and the vector indexes describe SIMD (vector) operands.
	vectorArrangement              VectorArrangement
	srcVectorIndex, dstVectorIndex VectorIndex

	// readInstructionAddressBeforeTargetInstruction holds the instruction right before the target of
	// read instruction address instruction. See asm.assemblerBase.CompileReadInstructionAddress.
	readInstructionAddressBeforeTargetInstruction asm.Instruction

	// jumpOrigins hold all the nodes trying to jump into this node. In other words, all the nodes with .jumpTarget == this.
	jumpOrigins map[*nodeImpl]struct{}

	// staticConst is the constant-pool entry referenced by this node, if any.
	staticConst *asm.StaticConst
}
    38  
    39  // AssignJumpTarget implements the same method as documented on asm.Node.
    40  func (n *nodeImpl) AssignJumpTarget(target asm.Node) {
    41  	n.jumpTarget = target.(*nodeImpl)
    42  }
    43  
// AssignDestinationConstant implements the same method as documented on asm.Node.
// It overwrites the destination constant operand of this node.
func (n *nodeImpl) AssignDestinationConstant(value asm.ConstantValue) {
	n.dstConst = value
}
    48  
// AssignSourceConstant implements the same method as documented on asm.Node.
// It overwrites the source constant operand of this node.
func (n *nodeImpl) AssignSourceConstant(value asm.ConstantValue) {
	n.srcConst = value
}
    53  
// OffsetInBinary implements the same method as documented on asm.Node.
// The value is assigned by Assemble just before the node is encoded.
func (n *nodeImpl) OffsetInBinary() asm.NodeOffsetInBinary {
	return n.offsetInBinaryField
}
    58  
    59  // String implements fmt.Stringer.
    60  //
    61  // This is for debugging purpose, and the format is similar to the AT&T assembly syntax,
    62  // meaning that this should look like "INSTRUCTION ${from}, ${to}" where each operand
    63  // might be embraced by '[]' to represent the memory location, and multiple operands
    64  // are embraced by `()`.
    65  func (n *nodeImpl) String() (ret string) {
    66  	instName := InstructionName(n.instruction)
    67  	switch n.types {
    68  	case operandTypesNoneToNone:
    69  		ret = instName
    70  	case operandTypesNoneToRegister:
    71  		ret = fmt.Sprintf("%s %s", instName, RegisterName(n.dstReg))
    72  	case operandTypesNoneToBranch:
    73  		ret = fmt.Sprintf("%s {%v}", instName, n.jumpTarget)
    74  	case operandTypesRegisterToRegister:
    75  		ret = fmt.Sprintf("%s %s, %s", instName, RegisterName(n.srcReg), RegisterName(n.dstReg))
    76  	case operandTypesLeftShiftedRegisterToRegister:
    77  		ret = fmt.Sprintf("%s (%s, %s << %d), %s", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2), n.srcConst, RegisterName(n.dstReg))
    78  	case operandTypesTwoRegistersToRegister:
    79  		ret = fmt.Sprintf("%s (%s, %s), %s", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2), RegisterName(n.dstReg))
    80  	case operandTypesThreeRegistersToRegister:
    81  		ret = fmt.Sprintf("%s (%s, %s, %s), %s)", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2), RegisterName(n.dstReg), RegisterName(n.dstReg2))
    82  	case operandTypesTwoRegistersToNone:
    83  		ret = fmt.Sprintf("%s (%s, %s)", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2))
    84  	case operandTypesRegisterAndConstToNone:
    85  		ret = fmt.Sprintf("%s (%s, 0x%x)", instName, RegisterName(n.srcReg), n.srcConst)
    86  	case operandTypesRegisterToMemory:
    87  		if n.dstReg2 != asm.NilRegister {
    88  			ret = fmt.Sprintf("%s %s, [%s + %s]", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), RegisterName(n.dstReg2))
    89  		} else {
    90  			ret = fmt.Sprintf("%s %s, [%s + 0x%x]", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.dstConst)
    91  		}
    92  	case operandTypesMemoryToRegister:
    93  		if n.srcReg2 != asm.NilRegister {
    94  			ret = fmt.Sprintf("%s [%s + %s], %s", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2), RegisterName(n.dstReg))
    95  		} else {
    96  			ret = fmt.Sprintf("%s [%s + 0x%x], %s", instName, RegisterName(n.srcReg), n.srcConst, RegisterName(n.dstReg))
    97  		}
    98  	case operandTypesConstToRegister:
    99  		ret = fmt.Sprintf("%s 0x%x, %s", instName, n.srcConst, RegisterName(n.dstReg))
   100  	case operandTypesRegisterToVectorRegister:
   101  		ret = fmt.Sprintf("%s %s, %s.%s[%d]", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.vectorArrangement, n.dstVectorIndex)
   102  	case operandTypesVectorRegisterToRegister:
   103  		ret = fmt.Sprintf("%s %s.%s[%d], %s", instName, RegisterName(n.srcReg), n.vectorArrangement, n.srcVectorIndex, RegisterName(n.dstReg))
   104  	case operandTypesVectorRegisterToMemory:
   105  		if n.dstReg2 != asm.NilRegister {
   106  			ret = fmt.Sprintf("%s %s.%s, [%s + %s]", instName, RegisterName(n.srcReg), n.vectorArrangement, RegisterName(n.dstReg), RegisterName(n.dstReg2))
   107  		} else {
   108  			ret = fmt.Sprintf("%s %s.%s, [%s + 0x%x]", instName, RegisterName(n.srcReg), n.vectorArrangement, RegisterName(n.dstReg), n.dstConst)
   109  		}
   110  	case operandTypesMemoryToVectorRegister:
   111  		ret = fmt.Sprintf("%s [%s], %s.%s", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.vectorArrangement)
   112  	case operandTypesVectorRegisterToVectorRegister:
   113  		ret = fmt.Sprintf("%s %[2]s.%[4]s, %[3]s.%[4]s", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.vectorArrangement)
   114  	case operandTypesStaticConstToVectorRegister:
   115  		ret = fmt.Sprintf("%s $%#x %s.%s", instName, n.staticConst.Raw, RegisterName(n.dstReg), n.vectorArrangement)
   116  	case operandTypesTwoVectorRegistersToVectorRegister:
   117  		ret = fmt.Sprintf("%s (%s.%[5]s, %[3]s.%[5]s), %[4]s.%[5]s", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2), RegisterName(n.dstReg), n.vectorArrangement)
   118  	}
   119  	return
   120  }
   121  
   122  // operandType represents where an operand is placed for an instruction.
   123  // Note: this is almost the same as obj.AddrType in GO assembler.
   124  type operandType byte
   125  
   126  const (
   127  	operandTypeNone operandType = iota
   128  	operandTypeRegister
   129  	operandTypeLeftShiftedRegister
   130  	operandTypeTwoRegisters
   131  	operandTypeThreeRegisters
   132  	operandTypeRegisterAndConst
   133  	operandTypeMemory
   134  	operandTypeConst
   135  	operandTypeBranch
   136  	operandTypeSIMDByte
   137  	operandTypeTwoSIMDBytes
   138  	operandTypeVectorRegister
   139  	operandTypeTwoVectorRegisters
   140  	operandTypeStaticConst
   141  )
   142  
   143  // String implements fmt.Stringer.
   144  func (o operandType) String() (ret string) {
   145  	switch o {
   146  	case operandTypeNone:
   147  		ret = "none"
   148  	case operandTypeRegister:
   149  		ret = "register"
   150  	case operandTypeLeftShiftedRegister:
   151  		ret = "left-shifted-register"
   152  	case operandTypeTwoRegisters:
   153  		ret = "two-registers"
   154  	case operandTypeRegisterAndConst:
   155  		ret = "register-and-const"
   156  	case operandTypeMemory:
   157  		ret = "memory"
   158  	case operandTypeConst:
   159  		ret = "const"
   160  	case operandTypeBranch:
   161  		ret = "branch"
   162  	case operandTypeSIMDByte:
   163  		ret = "simd-byte"
   164  	case operandTypeTwoSIMDBytes:
   165  		ret = "two-simd-bytes"
   166  	case operandTypeVectorRegister:
   167  		ret = "vector-register"
   168  	case operandTypeStaticConst:
   169  		ret = "static-const"
   170  	case operandTypeTwoVectorRegisters:
   171  		ret = "two-vector-registers"
   172  	}
   173  	return
   174  }
   175  
// operandTypes represents the only combinations of two operandTypes used by wazero.
// The src/dst pair determines how nodeImpl's operand fields are interpreted
// and which encode* method handles the node (see encodeNode).
type operandTypes struct{ src, dst operandType }

var (
	operandTypesNoneToNone                         = operandTypes{operandTypeNone, operandTypeNone}
	operandTypesNoneToRegister                     = operandTypes{operandTypeNone, operandTypeRegister}
	operandTypesNoneToBranch                       = operandTypes{operandTypeNone, operandTypeBranch}
	operandTypesRegisterToRegister                 = operandTypes{operandTypeRegister, operandTypeRegister}
	operandTypesLeftShiftedRegisterToRegister      = operandTypes{operandTypeLeftShiftedRegister, operandTypeRegister}
	operandTypesTwoRegistersToRegister             = operandTypes{operandTypeTwoRegisters, operandTypeRegister}
	operandTypesThreeRegistersToRegister           = operandTypes{operandTypeThreeRegisters, operandTypeRegister}
	operandTypesTwoRegistersToNone                 = operandTypes{operandTypeTwoRegisters, operandTypeNone}
	operandTypesRegisterAndConstToNone             = operandTypes{operandTypeRegisterAndConst, operandTypeNone}
	operandTypesRegisterToMemory                   = operandTypes{operandTypeRegister, operandTypeMemory}
	operandTypesMemoryToRegister                   = operandTypes{operandTypeMemory, operandTypeRegister}
	operandTypesConstToRegister                    = operandTypes{operandTypeConst, operandTypeRegister}
	operandTypesRegisterToVectorRegister           = operandTypes{operandTypeRegister, operandTypeVectorRegister}
	operandTypesVectorRegisterToRegister           = operandTypes{operandTypeVectorRegister, operandTypeRegister}
	operandTypesMemoryToVectorRegister             = operandTypes{operandTypeMemory, operandTypeVectorRegister}
	operandTypesVectorRegisterToMemory             = operandTypes{operandTypeVectorRegister, operandTypeMemory}
	operandTypesVectorRegisterToVectorRegister     = operandTypes{operandTypeVectorRegister, operandTypeVectorRegister}
	operandTypesTwoVectorRegistersToVectorRegister = operandTypes{operandTypeTwoVectorRegisters, operandTypeVectorRegister}
	operandTypesStaticConstToVectorRegister        = operandTypes{operandTypeStaticConst, operandTypeVectorRegister}
)
   200  
   201  // String implements fmt.Stringer
   202  func (o operandTypes) String() string {
   203  	return fmt.Sprintf("from:%s,to:%s", o.src, o.dst)
   204  }
   205  
   206  const (
   207  	maxSignedInt26 int64 = 1<<25 - 1
   208  	minSignedInt26 int64 = -(1 << 25)
   209  
   210  	maxSignedInt19 int64 = 1<<19 - 1
   211  	minSignedInt19 int64 = -(1 << 19)
   212  )
   213  
// AssemblerImpl implements Assembler.
type AssemblerImpl struct {
	asm.BaseAssemblerImpl
	// Root is the head and Current the tail of the instruction linked list.
	Root, Current *nodeImpl
	// Buf accumulates the encoded machine code.
	Buf *bytes.Buffer
	// temporaryRegister is a scratch register reserved for the assembler itself.
	temporaryRegister asm.Register
	// nodeCount counts appended nodes; used to pre-size Buf in Assemble.
	nodeCount int
	// pool collects static constants until flushed by maybeFlushConstPool.
	pool *asm.StaticConstPool
	// MaxDisplacementForConstantPool is fixed to defaultMaxDisplacementForConstPool
	// but have it as a field here for testability.
	MaxDisplacementForConstantPool int
}
   226  
   227  func NewAssembler(temporaryRegister asm.Register) *AssemblerImpl {
   228  	return &AssemblerImpl{
   229  		Buf: bytes.NewBuffer(nil), temporaryRegister: temporaryRegister,
   230  		pool:                           asm.NewStaticConstPool(),
   231  		MaxDisplacementForConstantPool: defaultMaxDisplacementForConstPool,
   232  	}
   233  }
   234  
   235  // newNode creates a new Node and appends it into the linked list.
   236  func (a *AssemblerImpl) newNode(instruction asm.Instruction, types operandTypes) *nodeImpl {
   237  	n := &nodeImpl{
   238  		instruction: instruction,
   239  		next:        nil,
   240  		types:       types,
   241  		jumpOrigins: map[*nodeImpl]struct{}{},
   242  	}
   243  
   244  	a.addNode(n)
   245  	return n
   246  }
   247  
   248  // addNode appends the new node into the linked list.
   249  func (a *AssemblerImpl) addNode(node *nodeImpl) {
   250  	a.nodeCount++
   251  
   252  	if a.Root == nil {
   253  		a.Root = node
   254  		a.Current = node
   255  	} else {
   256  		parent := a.Current
   257  		parent.next = node
   258  		a.Current = node
   259  	}
   260  
   261  	for _, o := range a.SetBranchTargetOnNextNodes {
   262  		origin := o.(*nodeImpl)
   263  		origin.jumpTarget = node
   264  	}
   265  	a.SetBranchTargetOnNextNodes = nil
   266  }
   267  
   268  // Assemble implements asm.AssemblerBase
   269  func (a *AssemblerImpl) Assemble() ([]byte, error) {
   270  	// arm64 has 32-bit fixed length instructions,
   271  	// but note that some nodes are encoded as multiple instructions,
   272  	// so the resulting binary might not be the size of count*8.
   273  	a.Buf.Grow(a.nodeCount * 8)
   274  
   275  	for n := a.Root; n != nil; n = n.next {
   276  		n.offsetInBinaryField = uint64(a.Buf.Len())
   277  		if err := a.encodeNode(n); err != nil {
   278  			return nil, err
   279  		}
   280  		a.maybeFlushConstPool(n.next == nil)
   281  	}
   282  
   283  	code := a.bytes()
   284  	for _, cb := range a.OnGenerateCallbacks {
   285  		if err := cb(code); err != nil {
   286  			return nil, err
   287  		}
   288  	}
   289  	return code, nil
   290  }
   291  
   292  const defaultMaxDisplacementForConstPool = (1 << 20) - 1 - 4 // -4 for unconditional branch to skip the constants.
   293  
// maybeFlushConstPool flushes the constant pool if endOfBinary or a boundary condition was met.
//
// On flush it first emits an unconditional branch over the constant data (so
// execution falling through skips the pool), then writes each constant's raw
// bytes (recording its offset), pads to 4-byte alignment, and resets the pool.
func (a *AssemblerImpl) maybeFlushConstPool(endOfBinary bool) {
	// Nothing to flush until at least one constant has been used.
	if a.pool.FirstUseOffsetInBinary == nil {
		return
	}

	// If endOfBinary = true, we no longer need to emit the instructions, therefore
	// flush all the constants.
	if endOfBinary ||
		// Also, if the offset between the first usage of the constant pool and
		// the first constant would exceed 2^20 -1(= 2MiB-1), which is the maximum offset
		// for LDR(literal)/ADR instruction, flush all the constants in the pool.
		(a.Buf.Len()+a.pool.PoolSizeInBytes-int(*a.pool.FirstUseOffsetInBinary)) >= a.MaxDisplacementForConstantPool {

		// Before emitting consts, we have to add br instruction to skip the const pool.
		// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1123-L1129
		// skipOffset is in 4-byte words: the pool size plus the branch itself,
		// rounded up if the pool is not word-aligned.
		skipOffset := a.pool.PoolSizeInBytes/4 + 1
		if a.pool.PoolSizeInBytes%4 != 0 {
			skipOffset++
		}
		if endOfBinary {
			// If this is the end of binary, we never reach this block,
			// so offset can be zero (which is the behavior of Go's assembler).
			skipOffset = 0
		}

		// Unconditional branch: 0x14 opcode byte with the 26-bit word offset,
		// written little-endian.
		a.Buf.Write([]byte{
			byte(skipOffset),
			byte(skipOffset >> 8),
			byte(skipOffset >> 16),
			0x14,
		})

		// Then adding the consts into the binary.
		for _, c := range a.pool.Consts {
			c.SetOffsetInBinary(uint64(a.Buf.Len()))
			a.Buf.Write(c.Raw)
		}

		// arm64 instructions are 4-byte (32-bit) aligned, so we must pad the zero consts here.
		if pad := a.Buf.Len() % 4; pad != 0 {
			a.Buf.Write(make([]byte, 4-pad))
		}

		// After the flush, reset the constant pool.
		a.pool = asm.NewStaticConstPool()
	}
}
   342  
   343  // bytes returns the encoded binary.
   344  func (a *AssemblerImpl) bytes() []byte {
   345  	// 16 bytes alignment to match our impl with golang-asm.
   346  	// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L62
   347  	//
   348  	// TODO: Delete after golang-asm removal.
   349  	if pad := 16 - a.Buf.Len()%16; pad > 0 && pad != 16 {
   350  		a.Buf.Write(make([]byte, pad))
   351  	}
   352  	return a.Buf.Bytes()
   353  }
   354  
// encodeNode encodes the given node into the assembler's buffer, dispatching
// to the encode* method matching the node's operand-type combination. On
// failure the error is wrapped with the node's string form for debuggability.
func (a *AssemblerImpl) encodeNode(n *nodeImpl) (err error) {
	switch n.types {
	case operandTypesNoneToNone:
		err = a.encodeNoneToNone(n)
	case operandTypesNoneToRegister:
		err = a.encodeJumpToRegister(n)
	case operandTypesNoneToBranch:
		err = a.encodeRelativeBranch(n)
	case operandTypesRegisterToRegister:
		err = a.encodeRegisterToRegister(n)
	case operandTypesLeftShiftedRegisterToRegister:
		err = a.encodeLeftShiftedRegisterToRegister(n)
	case operandTypesTwoRegistersToRegister:
		err = a.encodeTwoRegistersToRegister(n)
	case operandTypesThreeRegistersToRegister:
		err = a.encodeThreeRegistersToRegister(n)
	case operandTypesTwoRegistersToNone:
		err = a.encodeTwoRegistersToNone(n)
	case operandTypesRegisterAndConstToNone:
		err = a.encodeRegisterAndConstToNone(n)
	case operandTypesRegisterToMemory:
		err = a.encodeRegisterToMemory(n)
	case operandTypesMemoryToRegister:
		err = a.encodeMemoryToRegister(n)
	case operandTypesConstToRegister:
		err = a.encodeConstToRegister(n)
	case operandTypesRegisterToVectorRegister:
		err = a.encodeRegisterToVectorRegister(n)
	case operandTypesVectorRegisterToRegister:
		err = a.encodeVectorRegisterToRegister(n)
	case operandTypesMemoryToVectorRegister:
		err = a.encodeMemoryToVectorRegister(n)
	case operandTypesVectorRegisterToMemory:
		err = a.encodeVectorRegisterToMemory(n)
	case operandTypesVectorRegisterToVectorRegister:
		err = a.encodeVectorRegisterToVectorRegister(n)
	case operandTypesStaticConstToVectorRegister:
		err = a.encodeStaticConstToVectorRegister(n)
	case operandTypesTwoVectorRegistersToVectorRegister:
		err = a.encodeTwoVectorRegistersToVectorRegister(n)
	default:
		err = fmt.Errorf("encoder undefined for [%s] operand type", n.types)
	}
	if err != nil {
		err = fmt.Errorf("%w: %s", err, n) // Ensure the error is debuggable by including the string value of the node.
	}
	return
}
   404  
// CompileStandAlone implements the same method as documented on asm.AssemblerBase.
// It records an instruction that takes no operands (e.g. NOP).
func (a *AssemblerImpl) CompileStandAlone(instruction asm.Instruction) asm.Node {
	return a.newNode(instruction, operandTypesNoneToNone)
}
   409  
   410  // CompileConstToRegister implements the same method as documented on asm.AssemblerBase.
   411  func (a *AssemblerImpl) CompileConstToRegister(
   412  	instruction asm.Instruction,
   413  	value asm.ConstantValue,
   414  	destinationReg asm.Register,
   415  ) (inst asm.Node) {
   416  	n := a.newNode(instruction, operandTypesConstToRegister)
   417  	n.srcConst = value
   418  	n.dstReg = destinationReg
   419  	return n
   420  }
   421  
   422  // CompileRegisterToRegister implements the same method as documented on asm.AssemblerBase.
   423  func (a *AssemblerImpl) CompileRegisterToRegister(instruction asm.Instruction, from, to asm.Register) {
   424  	n := a.newNode(instruction, operandTypesRegisterToRegister)
   425  	n.srcReg = from
   426  	n.dstReg = to
   427  }
   428  
   429  // CompileMemoryToRegister implements the same method as documented on asm.AssemblerBase.
   430  func (a *AssemblerImpl) CompileMemoryToRegister(
   431  	instruction asm.Instruction,
   432  	sourceBaseReg asm.Register,
   433  	sourceOffsetConst asm.ConstantValue,
   434  	destinationReg asm.Register,
   435  ) {
   436  	n := a.newNode(instruction, operandTypesMemoryToRegister)
   437  	n.srcReg = sourceBaseReg
   438  	n.srcConst = sourceOffsetConst
   439  	n.dstReg = destinationReg
   440  }
   441  
   442  // CompileRegisterToMemory implements the same method as documented on asm.AssemblerBase.
   443  func (a *AssemblerImpl) CompileRegisterToMemory(
   444  	instruction asm.Instruction,
   445  	sourceRegister, destinationBaseRegister asm.Register,
   446  	destinationOffsetConst asm.ConstantValue,
   447  ) {
   448  	n := a.newNode(instruction, operandTypesRegisterToMemory)
   449  	n.srcReg = sourceRegister
   450  	n.dstReg = destinationBaseRegister
   451  	n.dstConst = destinationOffsetConst
   452  }
   453  
// CompileJump implements the same method as documented on asm.AssemblerBase.
// The branch target is assigned later via AssignJumpTarget on the returned node.
func (a *AssemblerImpl) CompileJump(jmpInstruction asm.Instruction) asm.Node {
	return a.newNode(jmpInstruction, operandTypesNoneToBranch)
}
   458  
   459  // CompileJumpToRegister implements the same method as documented on asm.AssemblerBase.
   460  func (a *AssemblerImpl) CompileJumpToRegister(jmpInstruction asm.Instruction, reg asm.Register) {
   461  	n := a.newNode(jmpInstruction, operandTypesNoneToRegister)
   462  	n.dstReg = reg
   463  }
   464  
   465  // CompileReadInstructionAddress implements the same method as documented on asm.AssemblerBase.
   466  func (a *AssemblerImpl) CompileReadInstructionAddress(
   467  	destinationRegister asm.Register,
   468  	beforeAcquisitionTargetInstruction asm.Instruction,
   469  ) {
   470  	n := a.newNode(ADR, operandTypesMemoryToRegister)
   471  	n.dstReg = destinationRegister
   472  	n.readInstructionAddressBeforeTargetInstruction = beforeAcquisitionTargetInstruction
   473  }
   474  
   475  // CompileMemoryWithRegisterOffsetToRegister implements Assembler.CompileMemoryWithRegisterOffsetToRegister
   476  func (a *AssemblerImpl) CompileMemoryWithRegisterOffsetToRegister(
   477  	instruction asm.Instruction,
   478  	srcBaseReg, srcOffsetReg, dstReg asm.Register,
   479  ) {
   480  	n := a.newNode(instruction, operandTypesMemoryToRegister)
   481  	n.dstReg = dstReg
   482  	n.srcReg = srcBaseReg
   483  	n.srcReg2 = srcOffsetReg
   484  }
   485  
   486  // CompileRegisterToMemoryWithRegisterOffset implements Assembler.CompileRegisterToMemoryWithRegisterOffset
   487  func (a *AssemblerImpl) CompileRegisterToMemoryWithRegisterOffset(
   488  	instruction asm.Instruction,
   489  	srcReg, dstBaseReg, dstOffsetReg asm.Register,
   490  ) {
   491  	n := a.newNode(instruction, operandTypesRegisterToMemory)
   492  	n.srcReg = srcReg
   493  	n.dstReg = dstBaseReg
   494  	n.dstReg2 = dstOffsetReg
   495  }
   496  
   497  // CompileTwoRegistersToRegister implements Assembler.CompileTwoRegistersToRegister
   498  func (a *AssemblerImpl) CompileTwoRegistersToRegister(instruction asm.Instruction, src1, src2, dst asm.Register) {
   499  	n := a.newNode(instruction, operandTypesTwoRegistersToRegister)
   500  	n.srcReg = src1
   501  	n.srcReg2 = src2
   502  	n.dstReg = dst
   503  }
   504  
   505  // CompileThreeRegistersToRegister implements Assembler.CompileThreeRegistersToRegister
   506  func (a *AssemblerImpl) CompileThreeRegistersToRegister(
   507  	instruction asm.Instruction,
   508  	src1, src2, src3, dst asm.Register,
   509  ) {
   510  	n := a.newNode(instruction, operandTypesThreeRegistersToRegister)
   511  	n.srcReg = src1
   512  	n.srcReg2 = src2
   513  	n.dstReg = src3 // To minimize the size of nodeImpl struct, we reuse dstReg for the third source operand.
   514  	n.dstReg2 = dst
   515  }
   516  
   517  // CompileTwoRegistersToNone implements Assembler.CompileTwoRegistersToNone
   518  func (a *AssemblerImpl) CompileTwoRegistersToNone(instruction asm.Instruction, src1, src2 asm.Register) {
   519  	n := a.newNode(instruction, operandTypesTwoRegistersToNone)
   520  	n.srcReg = src1
   521  	n.srcReg2 = src2
   522  }
   523  
   524  // CompileRegisterAndConstToNone implements Assembler.CompileRegisterAndConstToNone
   525  func (a *AssemblerImpl) CompileRegisterAndConstToNone(
   526  	instruction asm.Instruction,
   527  	src asm.Register,
   528  	srcConst asm.ConstantValue,
   529  ) {
   530  	n := a.newNode(instruction, operandTypesRegisterAndConstToNone)
   531  	n.srcReg = src
   532  	n.srcConst = srcConst
   533  }
   534  
   535  // CompileLeftShiftedRegisterToRegister implements Assembler.CompileLeftShiftedRegisterToRegister
   536  func (a *AssemblerImpl) CompileLeftShiftedRegisterToRegister(
   537  	instruction asm.Instruction,
   538  	shiftedSourceReg asm.Register,
   539  	shiftNum asm.ConstantValue,
   540  	srcReg, dstReg asm.Register,
   541  ) {
   542  	n := a.newNode(instruction, operandTypesLeftShiftedRegisterToRegister)
   543  	n.srcReg = srcReg
   544  	n.srcReg2 = shiftedSourceReg
   545  	n.srcConst = shiftNum
   546  	n.dstReg = dstReg
   547  }
   548  
   549  // CompileConditionalRegisterSet implements Assembler.CompileConditionalRegisterSet
   550  func (a *AssemblerImpl) CompileConditionalRegisterSet(cond asm.ConditionalRegisterState, dstReg asm.Register) {
   551  	n := a.newNode(CSET, operandTypesRegisterToRegister)
   552  	n.srcReg = conditionalRegisterStateToRegister(cond)
   553  	n.dstReg = dstReg
   554  }
   555  
   556  // CompileMemoryToVectorRegister implements Assembler.CompileMemoryToVectorRegister
   557  func (a *AssemblerImpl) CompileMemoryToVectorRegister(
   558  	instruction asm.Instruction, srcBaseReg asm.Register, dstOffset asm.ConstantValue, dstReg asm.Register, arrangement VectorArrangement,
   559  ) {
   560  	n := a.newNode(instruction, operandTypesMemoryToVectorRegister)
   561  	n.srcReg = srcBaseReg
   562  	n.srcConst = dstOffset
   563  	n.dstReg = dstReg
   564  	n.vectorArrangement = arrangement
   565  }
   566  
   567  // CompileMemoryWithRegisterOffsetToVectorRegister implements Assembler.CompileMemoryWithRegisterOffsetToVectorRegister
   568  func (a *AssemblerImpl) CompileMemoryWithRegisterOffsetToVectorRegister(instruction asm.Instruction,
   569  	srcBaseReg, srcOffsetRegister asm.Register, dstReg asm.Register, arrangement VectorArrangement,
   570  ) {
   571  	n := a.newNode(instruction, operandTypesMemoryToVectorRegister)
   572  	n.srcReg = srcBaseReg
   573  	n.srcReg2 = srcOffsetRegister
   574  	n.dstReg = dstReg
   575  	n.vectorArrangement = arrangement
   576  }
   577  
   578  // CompileVectorRegisterToMemory implements Assembler.CompileVectorRegisterToMemory
   579  func (a *AssemblerImpl) CompileVectorRegisterToMemory(
   580  	instruction asm.Instruction, srcReg, dstBaseReg asm.Register, dstOffset asm.ConstantValue, arrangement VectorArrangement,
   581  ) {
   582  	n := a.newNode(instruction, operandTypesVectorRegisterToMemory)
   583  	n.srcReg = srcReg
   584  	n.dstReg = dstBaseReg
   585  	n.dstConst = dstOffset
   586  	n.vectorArrangement = arrangement
   587  }
   588  
   589  // CompileVectorRegisterToMemoryWithRegisterOffset implements Assembler.CompileVectorRegisterToMemoryWithRegisterOffset
   590  func (a *AssemblerImpl) CompileVectorRegisterToMemoryWithRegisterOffset(instruction asm.Instruction,
   591  	srcReg, dstBaseReg, dstOffsetRegister asm.Register, arrangement VectorArrangement,
   592  ) {
   593  	n := a.newNode(instruction, operandTypesVectorRegisterToMemory)
   594  	n.srcReg = srcReg
   595  	n.dstReg = dstBaseReg
   596  	n.dstReg2 = dstOffsetRegister
   597  	n.vectorArrangement = arrangement
   598  }
   599  
   600  // CompileRegisterToVectorRegister implements Assembler.CompileRegisterToVectorRegister
   601  func (a *AssemblerImpl) CompileRegisterToVectorRegister(
   602  	instruction asm.Instruction, srcReg, dstReg asm.Register, arrangement VectorArrangement, index VectorIndex,
   603  ) {
   604  	n := a.newNode(instruction, operandTypesRegisterToVectorRegister)
   605  	n.srcReg = srcReg
   606  	n.dstReg = dstReg
   607  	n.vectorArrangement = arrangement
   608  	n.dstVectorIndex = index
   609  }
   610  
   611  // CompileVectorRegisterToRegister implements Assembler.CompileVectorRegisterToRegister
   612  func (a *AssemblerImpl) CompileVectorRegisterToRegister(instruction asm.Instruction, srcReg, dstReg asm.Register,
   613  	arrangement VectorArrangement, index VectorIndex,
   614  ) {
   615  	n := a.newNode(instruction, operandTypesVectorRegisterToRegister)
   616  	n.srcReg = srcReg
   617  	n.dstReg = dstReg
   618  	n.vectorArrangement = arrangement
   619  	n.srcVectorIndex = index
   620  }
   621  
   622  // CompileVectorRegisterToVectorRegister implements Assembler.CompileVectorRegisterToVectorRegister
   623  func (a *AssemblerImpl) CompileVectorRegisterToVectorRegister(
   624  	instruction asm.Instruction, srcReg, dstReg asm.Register, arrangement VectorArrangement, srcIndex, dstIndex VectorIndex,
   625  ) {
   626  	n := a.newNode(instruction, operandTypesVectorRegisterToVectorRegister)
   627  	n.srcReg = srcReg
   628  	n.dstReg = dstReg
   629  	n.vectorArrangement = arrangement
   630  	n.srcVectorIndex = srcIndex
   631  	n.dstVectorIndex = dstIndex
   632  }
   633  
   634  // CompileVectorRegisterToVectorRegisterWithConst implements Assembler.CompileVectorRegisterToVectorRegisterWithConst
   635  func (a *AssemblerImpl) CompileVectorRegisterToVectorRegisterWithConst(instruction asm.Instruction,
   636  	srcReg, dstReg asm.Register, arrangement VectorArrangement, c asm.ConstantValue,
   637  ) {
   638  	n := a.newNode(instruction, operandTypesVectorRegisterToVectorRegister)
   639  	n.srcReg = srcReg
   640  	n.srcConst = c
   641  	n.dstReg = dstReg
   642  	n.vectorArrangement = arrangement
   643  }
   644  
// CompileStaticConstToRegister implements Assembler.CompileStaticConstToRegister.
// The node uses memory-to-register operand types because the constant is read
// from the constant pool emitted into the binary (see maybeFlushConstPool).
func (a *AssemblerImpl) CompileStaticConstToRegister(instruction asm.Instruction, c *asm.StaticConst, dstReg asm.Register) {
	n := a.newNode(instruction, operandTypesMemoryToRegister)
	n.staticConst = c
	n.dstReg = dstReg
}
   651  
   652  // CompileStaticConstToVectorRegister implements Assembler.CompileStaticConstToVectorRegister
   653  func (a *AssemblerImpl) CompileStaticConstToVectorRegister(instruction asm.Instruction,
   654  	c *asm.StaticConst, dstReg asm.Register, arrangement VectorArrangement,
   655  ) {
   656  	n := a.newNode(instruction, operandTypesStaticConstToVectorRegister)
   657  	n.staticConst = c
   658  	n.dstReg = dstReg
   659  	n.vectorArrangement = arrangement
   660  }
   661  
   662  // CompileTwoVectorRegistersToVectorRegister implements Assembler.CompileTwoVectorRegistersToVectorRegister.
   663  func (a *AssemblerImpl) CompileTwoVectorRegistersToVectorRegister(instruction asm.Instruction, srcReg, srcReg2, dstReg asm.Register,
   664  	arrangement VectorArrangement,
   665  ) {
   666  	n := a.newNode(instruction, operandTypesTwoVectorRegistersToVectorRegister)
   667  	n.srcReg = srcReg
   668  	n.srcReg2 = srcReg2
   669  	n.dstReg = dstReg
   670  	n.vectorArrangement = arrangement
   671  }
   672  
   673  // CompileTwoVectorRegistersToVectorRegisterWithConst implements Assembler.CompileTwoVectorRegistersToVectorRegisterWithConst.
   674  func (a *AssemblerImpl) CompileTwoVectorRegistersToVectorRegisterWithConst(instruction asm.Instruction,
   675  	srcReg, srcReg2, dstReg asm.Register, arrangement VectorArrangement, c asm.ConstantValue,
   676  ) {
   677  	n := a.newNode(instruction, operandTypesTwoVectorRegistersToVectorRegister)
   678  	n.srcReg = srcReg
   679  	n.srcReg2 = srcReg2
   680  	n.srcConst = c
   681  	n.dstReg = dstReg
   682  	n.vectorArrangement = arrangement
   683  }
   684  
   685  func errorEncodingUnsupported(n *nodeImpl) error {
   686  	return fmt.Errorf("%s is unsupported for %s type", InstructionName(n.instruction), n.types)
   687  }
   688  
   689  func (a *AssemblerImpl) encodeNoneToNone(n *nodeImpl) (err error) {
   690  	if n.instruction != NOP {
   691  		err = errorEncodingUnsupported(n)
   692  	}
   693  	return
   694  }
   695  
   696  func (a *AssemblerImpl) encodeJumpToRegister(n *nodeImpl) (err error) {
   697  	// "Unconditional branch (register)" in https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions
   698  	var opc byte
   699  	switch n.instruction {
   700  	case RET:
   701  		opc = 0b0010
   702  	case B:
   703  		opc = 0b0000
   704  	default:
   705  		return errorEncodingUnsupported(n)
   706  	}
   707  
   708  	regBits, err := intRegisterBits(n.dstReg)
   709  	if err != nil {
   710  		return fmt.Errorf("invalid destination register: %w", err)
   711  	}
   712  
   713  	a.Buf.Write([]byte{
   714  		0x00 | (regBits << 5),
   715  		0x00 | (regBits >> 3),
   716  		0b000_11111 | (opc << 5),
   717  		0b1101011_0 | (opc >> 3),
   718  	})
   719  	return
   720  }
   721  
// encodeRelativeBranch emits an unconditional (B) or conditional (BCOND*)
// PC-relative branch. Because the target's offset is unknown at encode time,
// a 4-byte placeholder is written now and patched in an on-generate callback
// once all node offsets have been assigned.
func (a *AssemblerImpl) encodeRelativeBranch(n *nodeImpl) (err error) {
	switch n.instruction {
	case B, BCONDEQ, BCONDGE, BCONDGT, BCONDHI, BCONDHS, BCONDLE, BCONDLO, BCONDLS, BCONDLT, BCONDMI, BCONDNE, BCONDVS, BCONDPL:
	default:
		return errorEncodingUnsupported(n)
	}

	if n.jumpTarget == nil {
		return fmt.Errorf("branch target must be set for %s", InstructionName(n.instruction))
	}

	// At this point, we don't yet know that target's branch, so emit the placeholder (4 bytes).
	a.Buf.Write([]byte{0, 0, 0, 0})

	a.AddOnGenerateCallBack(func(code []byte) error {
		// condBits is the 4-bit cond field of B.cond; the sentinel value marks
		// the unconditional B encoding instead.
		var condBits byte
		const condBitsUnconditional = 0xff // Indicates this is not conditional jump.

		// https://developer.arm.com/documentation/den0024/a/CHDEEABE
		switch n.instruction {
		case B:
			condBits = condBitsUnconditional
		case BCONDEQ:
			condBits = 0b0000
		case BCONDGE:
			condBits = 0b1010
		case BCONDGT:
			condBits = 0b1100
		case BCONDHI:
			condBits = 0b1000
		case BCONDHS:
			condBits = 0b0010
		case BCONDLE:
			condBits = 0b1101
		case BCONDLO:
			condBits = 0b0011
		case BCONDLS:
			condBits = 0b1001
		case BCONDLT:
			condBits = 0b1011
		case BCONDMI:
			condBits = 0b0100
		case BCONDPL:
			condBits = 0b0101
		case BCONDNE:
			condBits = 0b0001
		case BCONDVS:
			condBits = 0b0110
		}

		// The branch offset is relative to the branch instruction itself, and
		// must be a multiple of 4 (instruction size).
		branchInstOffset := int64(n.OffsetInBinary())
		offset := int64(n.jumpTarget.OffsetInBinary()) - branchInstOffset
		if offset%4 != 0 {
			return errors.New("BUG: relative jump offset must be 4 bytes aligned")
		}

		branchInst := code[branchInstOffset : branchInstOffset+4]
		if condBits == condBitsUnconditional {
			// Unconditional B: 26-bit signed word offset (imm26).
			imm26 := offset / 4
			if imm26 < minSignedInt26 || imm26 > maxSignedInt26 {
				// In theory this could happen if a Wasm binary has a huge single label (more than 128MB for a single block),
				// and in that case, we use load the offset into a register and do the register jump, but to avoid the complexity,
				// we impose this limit for now as that would be *unlikely* happen in practice.
				return fmt.Errorf("relative jump offset %d/4 must be within %d and %d", offset, minSignedInt26, maxSignedInt26)
			}
			// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B--Branch-?lang=en
			branchInst[0] = byte(imm26)
			branchInst[1] = byte(imm26 >> 8)
			branchInst[2] = byte(imm26 >> 16)
			branchInst[3] = (byte(imm26 >> 24 & 0b000000_11)) | 0b000101_00
		} else {
			// Conditional B.cond: 19-bit signed word offset (imm19), cond in bits 0-3.
			imm19 := offset / 4
			if imm19 < minSignedInt19 || imm19 > maxSignedInt19 {
				// This should be a bug in our compiler as the conditional jumps are only used in the small offsets (~a few bytes),
				// and if ever happens, compiler can be fixed.
				return fmt.Errorf("BUG: relative jump offset %d/4(=%d) must be within %d and %d", offset, imm19, minSignedInt19, maxSignedInt19)
			}
			// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B-cond--Branch-conditionally-?lang=en
			branchInst[0] = (byte(imm19<<5) & 0b111_0_0000) | condBits
			branchInst[1] = byte(imm19 >> 3)
			branchInst[2] = byte(imm19 >> 11)
			branchInst[3] = 0b01010100
		}
		return nil
	})
	return
}
   809  
   810  func checkRegisterToRegisterType(src, dst asm.Register, requireSrcInt, requireDstInt bool) (err error) {
   811  	isSrcInt, isDstInt := isIntRegister(src), isIntRegister(dst)
   812  	if isSrcInt && !requireSrcInt {
   813  		err = fmt.Errorf("src requires float register but got %s", RegisterName(src))
   814  	} else if !isSrcInt && requireSrcInt {
   815  		err = fmt.Errorf("src requires int register but got %s", RegisterName(src))
   816  	} else if isDstInt && !requireDstInt {
   817  		err = fmt.Errorf("dst requires float register but got %s", RegisterName(dst))
   818  	} else if !isDstInt && requireDstInt {
   819  		err = fmt.Errorf("dst requires int register but got %s", RegisterName(dst))
   820  	}
   821  	return
   822  }
   823  
   824  func (a *AssemblerImpl) encodeRegisterToRegister(n *nodeImpl) (err error) {
   825  	switch inst := n.instruction; inst {
   826  	case ADD, ADDW, SUB:
   827  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
   828  			return
   829  		}
   830  
   831  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_shift
   832  		var sfops byte
   833  		switch inst {
   834  		case ADD:
   835  			sfops = 0b100
   836  		case ADDW:
   837  		case SUB:
   838  			sfops = 0b110
   839  		}
   840  
   841  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
   842  		a.Buf.Write([]byte{
   843  			(dstRegBits << 5) | dstRegBits,
   844  			dstRegBits >> 3,
   845  			srcRegBits,
   846  			(sfops << 5) | 0b01011,
   847  		})
   848  	case CLZ, CLZW, RBIT, RBITW:
   849  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
   850  			return
   851  		}
   852  
   853  		var sf, opcode byte
   854  		switch inst {
   855  		case CLZ:
   856  			// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CLZ--Count-Leading-Zeros-?lang=en
   857  			sf, opcode = 0b1, 0b000_100
   858  		case CLZW:
   859  			// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CLZ--Count-Leading-Zeros-?lang=en
   860  			sf, opcode = 0b0, 0b000_100
   861  		case RBIT:
   862  			// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/RBIT--Reverse-Bits-?lang=en
   863  			sf, opcode = 0b1, 0b000_000
   864  		case RBITW:
   865  			// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/RBIT--Reverse-Bits-?lang=en
   866  			sf, opcode = 0b0, 0b000_000
   867  		}
   868  		if inst == CLZ {
   869  			sf = 1
   870  		}
   871  
   872  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
   873  		a.Buf.Write([]byte{
   874  			(srcRegBits << 5) | dstRegBits,
   875  			opcode<<2 | (srcRegBits >> 3),
   876  			0b110_00000,
   877  			(sf << 7) | 0b0_1011010,
   878  		})
   879  	case CSET:
   880  		if !isConditionalRegister(n.srcReg) {
   881  			return fmt.Errorf("CSET requires conditional register but got %s", RegisterName(n.srcReg))
   882  		}
   883  
   884  		dstRegBits, err := intRegisterBits(n.dstReg)
   885  		if err != nil {
   886  			return err
   887  		}
   888  
   889  		// CSET encodes the conditional bits with its least significant bit inverted.
   890  		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CSET--Conditional-Set--an-alias-of-CSINC-?lang=en
   891  		//
   892  		// https://developer.arm.com/documentation/den0024/a/CHDEEABE
   893  		var conditionalBits byte
   894  		switch n.srcReg {
   895  		case RegCondEQ:
   896  			conditionalBits = 0b0001
   897  		case RegCondNE:
   898  			conditionalBits = 0b0000
   899  		case RegCondHS:
   900  			conditionalBits = 0b0011
   901  		case RegCondLO:
   902  			conditionalBits = 0b0010
   903  		case RegCondMI:
   904  			conditionalBits = 0b0101
   905  		case RegCondPL:
   906  			conditionalBits = 0b0100
   907  		case RegCondVS:
   908  			conditionalBits = 0b0111
   909  		case RegCondVC:
   910  			conditionalBits = 0b0110
   911  		case RegCondHI:
   912  			conditionalBits = 0b1001
   913  		case RegCondLS:
   914  			conditionalBits = 0b1000
   915  		case RegCondGE:
   916  			conditionalBits = 0b1011
   917  		case RegCondLT:
   918  			conditionalBits = 0b1010
   919  		case RegCondGT:
   920  			conditionalBits = 0b1101
   921  		case RegCondLE:
   922  			conditionalBits = 0b1100
   923  		case RegCondAL:
   924  			conditionalBits = 0b1111
   925  		case RegCondNV:
   926  			conditionalBits = 0b1110
   927  		}
   928  
   929  		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CSET--Conditional-Set--an-alias-of-CSINC-?lang=en
   930  		a.Buf.Write([]byte{
   931  			0b111_00000 | dstRegBits,
   932  			(conditionalBits << 4) | 0b0000_0111,
   933  			0b100_11111,
   934  			0b10011010,
   935  		})
   936  
   937  	case FABSD, FABSS, FNEGD, FNEGS, FSQRTD, FSQRTS, FCVTSD, FCVTDS, FRINTMD, FRINTMS,
   938  		FRINTND, FRINTNS, FRINTPD, FRINTPS, FRINTZD, FRINTZS:
   939  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, false, false); err != nil {
   940  			return
   941  		}
   942  
   943  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
   944  
   945  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#floatdp1
   946  		var tp, opcode byte
   947  		switch inst {
   948  		case FABSD:
   949  			opcode, tp = 0b000001, 0b01
   950  		case FABSS:
   951  			opcode, tp = 0b000001, 0b00
   952  		case FNEGD:
   953  			opcode, tp = 0b000010, 0b01
   954  		case FNEGS:
   955  			opcode, tp = 0b000010, 0b00
   956  		case FSQRTD:
   957  			opcode, tp = 0b000011, 0b01
   958  		case FSQRTS:
   959  			opcode, tp = 0b000011, 0b00
   960  		case FCVTSD:
   961  			opcode, tp = 0b000101, 0b00
   962  		case FCVTDS:
   963  			opcode, tp = 0b000100, 0b01
   964  		case FRINTMD:
   965  			opcode, tp = 0b001010, 0b01
   966  		case FRINTMS:
   967  			opcode, tp = 0b001010, 0b00
   968  		case FRINTND:
   969  			opcode, tp = 0b001000, 0b01
   970  		case FRINTNS:
   971  			opcode, tp = 0b001000, 0b00
   972  		case FRINTPD:
   973  			opcode, tp = 0b001001, 0b01
   974  		case FRINTPS:
   975  			opcode, tp = 0b001001, 0b00
   976  		case FRINTZD:
   977  			opcode, tp = 0b001011, 0b01
   978  		case FRINTZS:
   979  			opcode, tp = 0b001011, 0b00
   980  		}
   981  		a.Buf.Write([]byte{
   982  			(srcRegBits << 5) | dstRegBits,
   983  			(opcode << 7) | 0b0_10000_00 | (srcRegBits >> 3),
   984  			tp<<6 | 0b00_1_00000 | opcode>>1,
   985  			0b0_00_11110,
   986  		})
   987  
   988  	case FADDD, FADDS, FDIVS, FDIVD, FMAXD, FMAXS, FMIND, FMINS, FMULS, FMULD:
   989  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, false, false); err != nil {
   990  			return
   991  		}
   992  
   993  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
   994  
   995  		// "Floating-point data-processing (2 source)" in
   996  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#floatdp1
   997  		var tp, opcode byte
   998  		switch inst {
   999  		case FADDD:
  1000  			opcode, tp = 0b0010, 0b01
  1001  		case FADDS:
  1002  			opcode, tp = 0b0010, 0b00
  1003  		case FDIVD:
  1004  			opcode, tp = 0b0001, 0b01
  1005  		case FDIVS:
  1006  			opcode, tp = 0b0001, 0b00
  1007  		case FMAXD:
  1008  			opcode, tp = 0b0100, 0b01
  1009  		case FMAXS:
  1010  			opcode, tp = 0b0100, 0b00
  1011  		case FMIND:
  1012  			opcode, tp = 0b0101, 0b01
  1013  		case FMINS:
  1014  			opcode, tp = 0b0101, 0b00
  1015  		case FMULS:
  1016  			opcode, tp = 0b0000, 0b00
  1017  		case FMULD:
  1018  			opcode, tp = 0b0000, 0b01
  1019  		}
  1020  
  1021  		a.Buf.Write([]byte{
  1022  			(dstRegBits << 5) | dstRegBits,
  1023  			opcode<<4 | 0b0000_10_00 | (dstRegBits >> 3),
  1024  			tp<<6 | 0b00_1_00000 | srcRegBits,
  1025  			0b0001_1110,
  1026  		})
  1027  
  1028  	case FCVTZSD, FCVTZSDW, FCVTZSS, FCVTZSSW, FCVTZUD, FCVTZUDW, FCVTZUS, FCVTZUSW:
  1029  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, false, true); err != nil {
  1030  			return
  1031  		}
  1032  
  1033  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
  1034  
  1035  		// "Conversion between floating-point and integer" in
  1036  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#floatdp1
  1037  		var sf, tp, opcode byte
  1038  		switch inst {
  1039  		case FCVTZSD: // Double to signed 64-bit
  1040  			sf, tp, opcode = 0b1, 0b01, 0b000
  1041  		case FCVTZSDW: // Double to signed 32-bit.
  1042  			sf, tp, opcode = 0b0, 0b01, 0b000
  1043  		case FCVTZSS: // Single to signed 64-bit.
  1044  			sf, tp, opcode = 0b1, 0b00, 0b000
  1045  		case FCVTZSSW: // Single to signed 32-bit.
  1046  			sf, tp, opcode = 0b0, 0b00, 0b000
  1047  		case FCVTZUD: // Double to unsigned 64-bit.
  1048  			sf, tp, opcode = 0b1, 0b01, 0b001
  1049  		case FCVTZUDW: // Double to unsigned 32-bit.
  1050  			sf, tp, opcode = 0b0, 0b01, 0b001
  1051  		case FCVTZUS: // Single to unsigned 64-bit.
  1052  			sf, tp, opcode = 0b1, 0b00, 0b001
  1053  		case FCVTZUSW: // Single to unsigned 32-bit.
  1054  			sf, tp, opcode = 0b0, 0b00, 0b001
  1055  		}
  1056  
  1057  		a.Buf.Write([]byte{
  1058  			(srcRegBits << 5) | dstRegBits,
  1059  			0 | (srcRegBits >> 3),
  1060  			tp<<6 | 0b00_1_11_000 | opcode,
  1061  			sf<<7 | 0b0_0_0_11110,
  1062  		})
  1063  
  1064  	case FMOVD, FMOVS:
  1065  		isSrcInt, isDstInt := isIntRegister(n.srcReg), isIntRegister(n.dstReg)
  1066  		if isSrcInt && isDstInt {
  1067  			return errors.New("FMOV needs at least one of operands to be integer")
  1068  		}
  1069  
  1070  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
  1071  		// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FMOV--register---Floating-point-Move-register-without-conversion-?lang=en
  1072  		if !isSrcInt && !isDstInt { // Float to float.
  1073  			var tp byte
  1074  			if inst == FMOVD {
  1075  				tp = 0b01
  1076  			}
  1077  			a.Buf.Write([]byte{
  1078  				(srcRegBits << 5) | dstRegBits,
  1079  				0b0_10000_00 | (srcRegBits >> 3),
  1080  				tp<<6 | 0b00_1_00000,
  1081  				0b000_11110,
  1082  			})
  1083  		} else if isSrcInt && !isDstInt { // Int to float.
  1084  			var tp, sf byte
  1085  			if inst == FMOVD {
  1086  				tp, sf = 0b01, 0b1
  1087  			}
  1088  			a.Buf.Write([]byte{
  1089  				(srcRegBits << 5) | dstRegBits,
  1090  				srcRegBits >> 3,
  1091  				tp<<6 | 0b00_1_00_111,
  1092  				sf<<7 | 0b0_00_11110,
  1093  			})
  1094  		} else { // Float to int.
  1095  			var tp, sf byte
  1096  			if inst == FMOVD {
  1097  				tp, sf = 0b01, 0b1
  1098  			}
  1099  			a.Buf.Write([]byte{
  1100  				(srcRegBits << 5) | dstRegBits,
  1101  				srcRegBits >> 3,
  1102  				tp<<6 | 0b00_1_00_110,
  1103  				sf<<7 | 0b0_00_11110,
  1104  			})
  1105  		}
  1106  
  1107  	case MOVD, MOVW:
  1108  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
  1109  			return
  1110  		}
  1111  
  1112  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
  1113  		if n.srcReg == RegRZR && inst == MOVD {
  1114  			// If this is 64-bit mov from zero register, then we encode this as MOVK.
  1115  			// See "Move wide (immediate)" in
  1116  			// https://developer.arm.com/documentation/ddi0602/2021-06/Index-by-Encoding/Data-Processing----Immediate
  1117  			a.Buf.Write([]byte{
  1118  				dstRegBits,
  1119  				0x0,
  1120  				0b1000_0000,
  1121  				0b1_10_10010,
  1122  			})
  1123  		} else {
  1124  			// MOV can be encoded as ORR (shifted register): "ORR Wd, WZR, Wm".
  1125  			// https://developer.arm.com/documentation/100069/0609/A64-General-Instructions/MOV--register-
  1126  			var sf byte
  1127  			if inst == MOVD {
  1128  				sf = 0b1
  1129  			}
  1130  			a.Buf.Write([]byte{
  1131  				(zeroRegisterBits << 5) | dstRegBits,
  1132  				zeroRegisterBits >> 3,
  1133  				0b000_00000 | srcRegBits,
  1134  				sf<<7 | 0b0_01_01010,
  1135  			})
  1136  		}
  1137  
  1138  	case MRS:
  1139  		if n.srcReg != RegFPSR {
  1140  			return fmt.Errorf("MRS has only support for FPSR register as a src but got %s", RegisterName(n.srcReg))
  1141  		}
  1142  
  1143  		// For how to specify FPSR register, see "Accessing FPSR" in:
  1144  		// https://developer.arm.com/documentation/ddi0595/2021-12/AArch64-Registers/FPSR--Floating-point-Status-Register?lang=en
  1145  		dstRegBits := registerBits(n.dstReg)
  1146  		a.Buf.Write([]byte{
  1147  			0b001<<5 | dstRegBits,
  1148  			0b0100<<4 | 0b0100,
  1149  			0b0011_0000 | 0b11<<3 | 0b011,
  1150  			0b1101_0101,
  1151  		})
  1152  
  1153  	case MSR:
  1154  		if n.dstReg != RegFPSR {
  1155  			return fmt.Errorf("MSR has only support for FPSR register as a dst but got %s", RegisterName(n.srcReg))
  1156  		}
  1157  
  1158  		// For how to specify FPSR register, see "Accessing FPSR" in:
  1159  		// https://developer.arm.com/documentation/ddi0595/2021-12/AArch64-Registers/FPSR--Floating-point-Status-Register?lang=en
  1160  		srcRegBits := registerBits(n.srcReg)
  1161  		a.Buf.Write([]byte{
  1162  			0b001<<5 | srcRegBits,
  1163  			0b0100<<4 | 0b0100,
  1164  			0b0001_0000 | 0b11<<3 | 0b011,
  1165  			0b1101_0101,
  1166  		})
  1167  
  1168  	case MUL, MULW:
  1169  		// Multiplications are encoded as MADD (zero register, src, dst), dst = zero + (src * dst) = src * dst.
  1170  		// See "Data-processing (3 source)" in
  1171  		// https://developer.arm.com/documentation/ddi0602/2021-06/Index-by-Encoding/Data-Processing----Register?lang=en
  1172  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
  1173  			return
  1174  		}
  1175  
  1176  		var sf byte
  1177  		if inst == MUL {
  1178  			sf = 0b1
  1179  		}
  1180  
  1181  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
  1182  
  1183  		a.Buf.Write([]byte{
  1184  			dstRegBits<<5 | dstRegBits,
  1185  			zeroRegisterBits<<2 | dstRegBits>>3,
  1186  			srcRegBits,
  1187  			sf<<7 | 0b11011,
  1188  		})
  1189  
  1190  	case NEG, NEGW:
  1191  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
  1192  
  1193  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
  1194  			return
  1195  		}
  1196  
  1197  		// NEG is encoded as "SUB dst, XZR, src" = "dst = 0 - src"
  1198  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_shift
  1199  		var sf byte
  1200  		if inst == NEG {
  1201  			sf = 0b1
  1202  		}
  1203  
  1204  		a.Buf.Write([]byte{
  1205  			(zeroRegisterBits << 5) | dstRegBits,
  1206  			zeroRegisterBits >> 3,
  1207  			srcRegBits,
  1208  			sf<<7 | 0b0_10_00000 | 0b0_00_01011,
  1209  		})
  1210  
  1211  	case SDIV, SDIVW, UDIV, UDIVW:
  1212  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
  1213  
  1214  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
  1215  			return
  1216  		}
  1217  
  1218  		// See "Data-processing (2 source)" in
  1219  		// https://developer.arm.com/documentation/ddi0602/2021-06/Index-by-Encoding/Data-Processing----Register?lang=en
  1220  		var sf, opcode byte
  1221  		switch inst {
  1222  		case SDIV:
  1223  			sf, opcode = 0b1, 0b000011
  1224  		case SDIVW:
  1225  			sf, opcode = 0b0, 0b000011
  1226  		case UDIV:
  1227  			sf, opcode = 0b1, 0b000010
  1228  		case UDIVW:
  1229  			sf, opcode = 0b0, 0b000010
  1230  		}
  1231  
  1232  		a.Buf.Write([]byte{
  1233  			(dstRegBits << 5) | dstRegBits,
  1234  			opcode<<2 | (dstRegBits >> 3),
  1235  			0b110_00000 | srcRegBits,
  1236  			sf<<7 | 0b0_00_11010,
  1237  		})
  1238  
  1239  	case SCVTFD, SCVTFWD, SCVTFS, SCVTFWS, UCVTFD, UCVTFS, UCVTFWD, UCVTFWS:
  1240  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
  1241  
  1242  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, false); err != nil {
  1243  			return
  1244  		}
  1245  
  1246  		// "Conversion between floating-point and integer" in
  1247  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#floatdp1
  1248  		var sf, tp, opcode byte
  1249  		switch inst {
  1250  		case SCVTFD: // 64-bit integer to double
  1251  			sf, tp, opcode = 0b1, 0b01, 0b010
  1252  		case SCVTFWD: // 32-bit integer to double
  1253  			sf, tp, opcode = 0b0, 0b01, 0b010
  1254  		case SCVTFS: // 64-bit integer to single
  1255  			sf, tp, opcode = 0b1, 0b00, 0b010
  1256  		case SCVTFWS: // 32-bit integer to single
  1257  			sf, tp, opcode = 0b0, 0b00, 0b010
  1258  		case UCVTFD: // 64-bit to double
  1259  			sf, tp, opcode = 0b1, 0b01, 0b011
  1260  		case UCVTFWD: // 32-bit to double
  1261  			sf, tp, opcode = 0b0, 0b01, 0b011
  1262  		case UCVTFS: // 64-bit to single
  1263  			sf, tp, opcode = 0b1, 0b00, 0b011
  1264  		case UCVTFWS: // 32-bit to single
  1265  			sf, tp, opcode = 0b0, 0b00, 0b011
  1266  		}
  1267  
  1268  		a.Buf.Write([]byte{
  1269  			(srcRegBits << 5) | dstRegBits,
  1270  			srcRegBits >> 3,
  1271  			tp<<6 | 0b00_1_00_000 | opcode,
  1272  			sf<<7 | 0b0_0_0_11110,
  1273  		})
  1274  
  1275  	case SXTB, SXTBW, SXTH, SXTHW, SXTW:
  1276  		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
  1277  			return
  1278  		}
  1279  
  1280  		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
  1281  		if n.srcReg == RegRZR {
  1282  			// If the source is zero register, we encode as MOV dst, zero.
  1283  			var sf byte
  1284  			if inst == MOVD {
  1285  				sf = 0b1
  1286  			}
  1287  			a.Buf.Write([]byte{
  1288  				(zeroRegisterBits << 5) | dstRegBits,
  1289  				zeroRegisterBits >> 3,
  1290  				0b000_00000 | srcRegBits,
  1291  				sf<<7 | 0b0_01_01010,
  1292  			})
  1293  			return
  1294  		}
  1295  
  1296  		// SXTB is encoded as "SBFM Wd, Wn, #0, #7"
  1297  		// https://developer.arm.com/documentation/dui0801/g/A64-General-Instructions/SXTB
  1298  		// SXTH is encoded as "SBFM Wd, Wn, #0, #15"
  1299  		// https://developer.arm.com/documentation/dui0801/g/A64-General-Instructions/SXTH
  1300  		// SXTW is encoded as "SBFM Xd, Xn, #0, #31"
  1301  		// https://developer.arm.com/documentation/dui0802/b/A64-General-Instructions/SXTW
  1302  
  1303  		var n, sf, imms, opc byte
  1304  		switch inst {
  1305  		case SXTB:
  1306  			n, sf, imms = 0b1, 0b1, 0x7
  1307  		case SXTBW:
  1308  			n, sf, imms = 0b0, 0b0, 0x7
  1309  		case SXTH:
  1310  			n, sf, imms = 0b1, 0b1, 0xf
  1311  		case SXTHW:
  1312  			n, sf, imms = 0b0, 0b0, 0xf
  1313  		case SXTW:
  1314  			n, sf, imms = 0b1, 0b1, 0x1f
  1315  		}
  1316  
  1317  		a.Buf.Write([]byte{
  1318  			(srcRegBits << 5) | dstRegBits,
  1319  			imms<<2 | (srcRegBits >> 3),
  1320  			n << 6,
  1321  			sf<<7 | opc<<5 | 0b10011,
  1322  		})
  1323  	default:
  1324  		return errorEncodingUnsupported(n)
  1325  	}
  1326  	return
  1327  }
  1328  
  1329  func (a *AssemblerImpl) encodeLeftShiftedRegisterToRegister(n *nodeImpl) (err error) {
  1330  	baseRegBits, err := intRegisterBits(n.srcReg)
  1331  	if err != nil {
  1332  		return err
  1333  	}
  1334  	shiftTargetRegBits, err := intRegisterBits(n.srcReg2)
  1335  	if err != nil {
  1336  		return err
  1337  	}
  1338  	dstRegBits, err := intRegisterBits(n.dstReg)
  1339  	if err != nil {
  1340  		return err
  1341  	}
  1342  
  1343  	switch n.instruction {
  1344  	case ADD:
  1345  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_shift
  1346  		const logicalLeftShiftBits = 0b00
  1347  		if n.srcConst < 0 || n.srcConst > 64 {
  1348  			return fmt.Errorf("shift amount must fit in unsigned 6-bit integer (0-64) but got %d", n.srcConst)
  1349  		}
  1350  		shiftByte := byte(n.srcConst)
  1351  		a.Buf.Write([]byte{
  1352  			(baseRegBits << 5) | dstRegBits,
  1353  			(shiftByte << 2) | (baseRegBits >> 3),
  1354  			(logicalLeftShiftBits << 6) | shiftTargetRegBits,
  1355  			0b1000_1011,
  1356  		})
  1357  	default:
  1358  		return errorEncodingUnsupported(n)
  1359  	}
  1360  	return
  1361  }
  1362  
// encodeTwoRegistersToRegister encodes three-operand instructions of the form
// "dst = src2 <op> src" (note the operand order: srcReg2 is the left-hand
// operand / Rn, srcReg the right-hand operand / Rm) and appends the 4-byte
// machine code to a.Buf.
func (a *AssemblerImpl) encodeTwoRegistersToRegister(n *nodeImpl) (err error) {
	switch inst := n.instruction; inst {
	case AND, ANDW, ORR, ORRW, EOR, EORW:
		// See "Logical (shifted register)" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en
		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)
		// sf selects 64- vs 32-bit, opc selects AND/ORR/EOR.
		var sf, opc byte
		switch inst {
		case AND:
			sf, opc = 0b1, 0b00
		case ANDW:
			sf, opc = 0b0, 0b00
		case ORR:
			sf, opc = 0b1, 0b01
		case ORRW:
			sf, opc = 0b0, 0b01
		case EOR:
			sf, opc = 0b1, 0b10
		case EORW:
			sf, opc = 0b0, 0b10
		}
		a.Buf.Write([]byte{
			(srcReg2Bits << 5) | dstRegBits,
			srcReg2Bits >> 3,
			srcRegBits,
			sf<<7 | opc<<5 | 0b01010,
		})
	case ASR, ASRW, LSL, LSLW, LSR, LSRW, ROR, RORW:
		// See "Data-processing (2 source)" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en
		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)

		// Variable-amount shifts: dst = src2 shifted by src.
		var sf, opcode byte
		switch inst {
		case ASR:
			sf, opcode = 0b1, 0b001010
		case ASRW:
			sf, opcode = 0b0, 0b001010
		case LSL:
			sf, opcode = 0b1, 0b001000
		case LSLW:
			sf, opcode = 0b0, 0b001000
		case LSR:
			sf, opcode = 0b1, 0b001001
		case LSRW:
			sf, opcode = 0b0, 0b001001
		case ROR:
			sf, opcode = 0b1, 0b001011
		case RORW:
			sf, opcode = 0b0, 0b001011
		}
		a.Buf.Write([]byte{
			(srcReg2Bits << 5) | dstRegBits,
			opcode<<2 | (srcReg2Bits >> 3),
			0b110_00000 | srcRegBits,
			sf<<7 | 0b0_00_11010,
		})
	case SDIV, SDIVW, UDIV, UDIVW:
		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)

		// See "Data-processing (2 source)" in
		// https://developer.arm.com/documentation/ddi0602/2021-06/Index-by-Encoding/Data-Processing----Register?lang=en
		var sf, opcode byte
		switch inst {
		case SDIV:
			sf, opcode = 0b1, 0b000011
		case SDIVW:
			sf, opcode = 0b0, 0b000011
		case UDIV:
			sf, opcode = 0b1, 0b000010
		case UDIVW:
			sf, opcode = 0b0, 0b000010
		}

		a.Buf.Write([]byte{
			(srcReg2Bits << 5) | dstRegBits,
			opcode<<2 | (srcReg2Bits >> 3),
			0b110_00000 | srcRegBits,
			sf<<7 | 0b0_00_11010,
		})
	case SUB, SUBW:
		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)

		// See "Add/subtract (shifted register)" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en
		var sf byte // zero for SUBW (32-bit subtract).
		if inst == SUB {
			sf = 0b1
		}

		a.Buf.Write([]byte{
			(srcReg2Bits << 5) | dstRegBits,
			srcReg2Bits >> 3,
			srcRegBits,
			sf<<7 | 0b0_10_01011,
		})
	case FSUBD, FSUBS:
		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)

		// See "Floating-point data-processing (2 source)" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
		var tp byte // zero for FSUBS (single precision); 0b01 selects double.
		if inst == FSUBD {
			tp = 0b01
		}
		a.Buf.Write([]byte{
			(srcReg2Bits << 5) | dstRegBits,
			0b0011_10_00 | (srcReg2Bits >> 3),
			tp<<6 | 0b00_1_00000 | srcRegBits,
			0b0_00_11110,
		})
	default:
		return errorEncodingUnsupported(n)
	}
	return
}
  1479  
// encodeThreeRegistersToRegister encodes instructions taking three source
// registers and one destination. Note the operand mapping: n.dstReg holds the
// THIRD source operand and n.dstReg2 holds the actual destination register.
func (a *AssemblerImpl) encodeThreeRegistersToRegister(n *nodeImpl) (err error) {
	switch n.instruction {
	case MSUB, MSUBW:
		// Dst = Src2 - (Src1 * Src3)
		// "Data-processing (3 source)" in:
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en
		src1RegBits, err := intRegisterBits(n.srcReg)
		if err != nil {
			return err
		}
		src2RegBits, err := intRegisterBits(n.srcReg2)
		if err != nil {
			return err
		}
		// Third source operand is carried in n.dstReg (see function comment).
		src3RegBits, err := intRegisterBits(n.dstReg)
		if err != nil {
			return err
		}
		// The real destination register is n.dstReg2.
		dstRegBits, err := intRegisterBits(n.dstReg2)
		if err != nil {
			return err
		}

		var sf byte // is zero for MSUBW (32-bit MSUB).
		if n.instruction == MSUB {
			sf = 0b1
		}

		a.Buf.Write([]byte{
			(src3RegBits << 5) | dstRegBits,
			0b1_0000000 | (src2RegBits << 2) | (src3RegBits >> 3),
			src1RegBits,
			sf<<7 | 0b00_11011,
		})
	default:
		return errorEncodingUnsupported(n)
	}
	return
}
  1519  
// encodeTwoRegistersToNone encodes comparison instructions that take two
// source registers and produce no register result (only condition flags):
// integer CMP/CMPW and floating-point FCMPS/FCMPD.
func (a *AssemblerImpl) encodeTwoRegistersToNone(n *nodeImpl) (err error) {
	switch n.instruction {
	case CMPW, CMP:
		// Compare on two registers is an alias for "SUBS (src1, src2) ZERO"
		// which can be encoded as SUBS (shifted registers) with zero shifting.
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_shift
		src1RegBits, err := intRegisterBits(n.srcReg)
		if err != nil {
			return err
		}
		src2RegBits, err := intRegisterBits(n.srcReg2)
		if err != nil {
			return err
		}

		// op packs sf|op|S: 0b111 for 64-bit CMP, 0b011 for 32-bit CMPW.
		var op byte
		if n.instruction == CMP {
			op = 0b111
		} else {
			op = 0b011
		}

		// Destination is the zero register, discarding the subtraction result.
		a.Buf.Write([]byte{
			(src2RegBits << 5) | zeroRegisterBits,
			src2RegBits >> 3,
			src1RegBits,
			0b01011 | (op << 5),
		})
	case FCMPS, FCMPD:
		// "Floating-point compare" section in:
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
		src1RegBits, err := vectorRegisterBits(n.srcReg)
		if err != nil {
			return err
		}
		src2RegBits, err := vectorRegisterBits(n.srcReg2)
		if err != nil {
			return err
		}

		var ftype byte // is zero for FCMPS (single precision float compare).
		if n.instruction == FCMPD {
			ftype = 0b01
		}
		a.Buf.Write([]byte{
			src2RegBits << 5,
			0b001000_00 | (src2RegBits >> 3),
			ftype<<6 | 0b1_00000 | src1RegBits,
			0b000_11110,
		})
	default:
		return errorEncodingUnsupported(n)
	}
	return
}
  1575  
// encodeRegisterAndConstToNone encodes an instruction which reads a register
// (n.srcReg) and a constant (n.srcConst) and writes no register result. Only
// CMP (immediate) is supported, encoded as SUBS (immediate) into the zero register.
func (a *AssemblerImpl) encodeRegisterAndConstToNone(n *nodeImpl) (err error) {
	if n.instruction != CMP {
		return errorEncodingUnsupported(n)
	}

	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CMP--immediate---Compare--immediate---an-alias-of-SUBS--immediate--?lang=en
	if n.srcConst < 0 || n.srcConst > 4095 {
		// The constant must fit the unsigned 12-bit "imm12" field.
		return fmt.Errorf("immediate for CMP must fit in 0 to 4095 but got %d", n.srcConst)
	} else if n.srcReg == RegRZR {
		return errors.New("zero register is not supported for CMP (immediate)")
	}

	srcRegBits, err := intRegisterBits(n.srcReg)
	if err != nil {
		return err
	}

	// The 32-bit instruction word is written least-significant byte first.
	a.Buf.Write([]byte{
		(srcRegBits << 5) | zeroRegisterBits,
		(byte(n.srcConst) << 2) | (srcRegBits >> 3),
		byte(n.srcConst >> 6),
		0b111_10001,
	})
	return
}
  1601  
  1602  func fitInSigned9Bits(v int64) bool {
  1603  	return v >= -256 && v <= 255
  1604  }
  1605  
// encodeLoadOrStoreWithRegisterOffset emits one load/store instruction whose address
// is base register + offset register. opcode, size and v select the concrete variant
// (load vs. store, transfer width, integer vs. vector register file).
func (a *AssemblerImpl) encodeLoadOrStoreWithRegisterOffset(
	baseRegBits, offsetRegBits, targetRegBits byte, opcode, size, v byte,
) {
	// See "Load/store register (register offset)".
	// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldst_regoff
	// The fixed bits below select the no-shift form; the word is written
	// least-significant byte first.
	a.Buf.Write([]byte{
		(baseRegBits << 5) | targetRegBits,
		0b011_010_00 | (baseRegBits >> 3),
		opcode<<6 | 0b00_1_00000 | offsetRegBits,
		size<<6 | v<<2 | 0b00_111_0_00,
	})
}
  1618  
  1619  // validateMemoryOffset validates the memory offset if the given offset can be encoded in the assembler.
  1620  // In theory, offset can be any, but for simplicity of our homemade assembler, we limit the offset range
  1621  // that can be encoded enough for supporting compiler.
  1622  func validateMemoryOffset(offset int64) (err error) {
  1623  	if offset > 255 && offset%4 != 0 {
  1624  		// This is because we only have large offsets for load/store with Wasm value stack or reading type IDs, and its offset
  1625  		// is always multiplied by 4 or 8 (== the size of uint32 or uint64 == the type of wasm.FunctionTypeID or value stack in Go)
  1626  		err = fmt.Errorf("large memory offset (>255) must be a multiple of 4 but got %d", offset)
  1627  	} else if offset < -256 { // 9-bit signed integer's minimum = 2^8.
  1628  		err = fmt.Errorf("negative memory offset must be larget than or equal -256 but got %d", offset)
  1629  	} else if offset > 1<<31-1 {
  1630  		return fmt.Errorf("large memory offset must be less than %d but got %d", 1<<31-1, offset)
  1631  	}
  1632  	return
  1633  }
  1634  
// encodeLoadOrStoreWithConstOffset encodes load/store instructions with the constant offset.
//
// baseRegBits/targetRegBits are the encoded base-address and data registers, offset is the
// constant byte offset, opcode/size/v select the concrete load/store variant, and
// datasize/datasizeLog2 give the transfer width in bytes and its log2.
//
// Note: Encoding strategy intentionally matches the Go assembler: https://go.dev/doc/asm
func (a *AssemblerImpl) encodeLoadOrStoreWithConstOffset(
	baseRegBits, targetRegBits byte,
	offset int64,
	opcode, size, v byte,
	datasize, datasizeLog2 int64,
) (err error) {
	if err = validateMemoryOffset(offset); err != nil {
		return
	}

	if fitInSigned9Bits(offset) {
		// See "LDAPR/STLR (unscaled immediate)"
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldapstl_unscaled
		if offset < 0 || offset%datasize != 0 {
			// This case is encoded as one "unscaled signed store".
			a.Buf.Write([]byte{
				(baseRegBits << 5) | targetRegBits,
				byte(offset<<4) | (baseRegBits >> 3),
				opcode<<6 | (0b00_00_11111 & byte(offset>>4)),
				size<<6 | v<<2 | 0b00_1_11_0_00,
			})
			return
		}
	}

	// At this point we have the assumption that offset is positive.
	// Plus if it is a multiple of datasize, then it can be encoded as a single "unsigned immediate".
	if offset%datasize == 0 &&
		offset < (1<<12)<<datasizeLog2 {
		// The immediate field holds the offset scaled down by the transfer width.
		m := offset / datasize
		a.Buf.Write([]byte{
			(baseRegBits << 5) | targetRegBits,
			(byte(m << 2)) | (baseRegBits >> 3),
			opcode<<6 | 0b00_111111&byte(m>>6),
			size<<6 | v<<2 | 0b00_1_11_0_01,
		})
		return
	}

	// Otherwise, we need multiple instructions: the offset is first materialized
	// (or added in two steps) via the temporary register.
	tmpRegBits := registerBits(a.temporaryRegister)
	offset32 := int32(offset)

	// Go's assembler adds a const into the const pool at this point,
	// regardless of its usage; e.g. if we enter the then block of the following if statement,
	// the const is not used but it is added into the const pool.
	c := asm.NewStaticConst(make([]byte, 4))
	binary.LittleEndian.PutUint32(c.Raw, uint32(offset))
	a.pool.AddConst(c, uint64(a.Buf.Len()))

	// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L3529-L3532
	// If the offset is within 24-bits, we can load it with two ADD instructions.
	hi := offset32 - (offset32 & (0xfff << uint(datasizeLog2)))
	if hi&^0xfff000 == 0 {
		var sfops byte = 0b100
		m := ((offset32 - hi) >> datasizeLog2) & 0xfff
		hi >>= 12

		// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L3534-L3535
		// ADD (immediate, shifted by 12): tmp = base + (hi << 12).
		a.Buf.Write([]byte{
			(baseRegBits << 5) | tmpRegBits,
			(byte(hi) << 2) | (baseRegBits >> 3),
			0b01<<6 /* shift by 12 */ | byte(hi>>6),
			sfops<<5 | 0b10001,
		})

		// Load/store (unsigned immediate) with the remaining scaled offset m off tmp.
		a.Buf.Write([]byte{
			(tmpRegBits << 5) | targetRegBits,
			(byte(m << 2)) | (tmpRegBits >> 3),
			opcode<<6 | 0b00_111111&byte(m>>6),
			size<<6 | v<<2 | 0b00_1_11_0_01,
		})
	} else {
		// This case we load the const via ldr(literal) into temp register,
		// and the target const is placed after this instruction below.
		loadLiteralOffsetInBinary := uint64(a.Buf.Len())

		// First we emit the ldr(literal) with offset zero as we don't yet know the const's placement in the binary.
		// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDR--literal---Load-Register--literal--
		a.Buf.Write([]byte{tmpRegBits, 0x0, 0x0, 0b00_011_0_00})

		// Set the callback for the constant, and we set properly the offset in the callback.

		c.AddOffsetFinalizedCallback(func(offsetOfConst uint64) {
			// ldr(literal) encodes offset divided by 4.
			offset := (int(offsetOfConst) - int(loadLiteralOffsetInBinary)) / 4
			bin := a.Buf.Bytes()
			// Patch the 19-bit immediate into the previously emitted ldr(literal).
			bin[loadLiteralOffsetInBinary] |= byte(offset << 5)
			bin[loadLiteralOffsetInBinary+1] |= byte(offset >> 3)
			bin[loadLiteralOffsetInBinary+2] |= byte(offset >> 11)
		})

		// Then, load the constant with the register offset.
		// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDR--register---Load-Register--register--
		a.Buf.Write([]byte{
			(baseRegBits << 5) | targetRegBits,
			0b011_010_00 | (baseRegBits >> 3),
			opcode<<6 | 0b00_1_00000 | tmpRegBits,
			size<<6 | v<<2 | 0b00_111_0_00,
		})
	}
	return
}
  1741  
// storeInstructionTable maps each supported store instruction to its encoding parameters.
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldst_regoff
var storeInstructionTable = map[asm.Instruction]struct {
	// size and v are the identically named fields of the load/store encodings.
	size, v byte
	// datasize is the number of bytes transferred; datasizeLog2 == log2(datasize).
	datasize, datasizeLog2 int64
	// isTargetFloat is true when the stored data comes from a vector (floating-point) register.
	isTargetFloat bool
}{
	STRD:  {size: 0b11, v: 0x0, datasize: 8, datasizeLog2: 3},
	STRW:  {size: 0b10, v: 0x0, datasize: 4, datasizeLog2: 2},
	STRH:  {size: 0b01, v: 0x0, datasize: 2, datasizeLog2: 1},
	STRB:  {size: 0b00, v: 0x0, datasize: 1, datasizeLog2: 0},
	FSTRD: {size: 0b11, v: 0x1, datasize: 8, datasizeLog2: 3, isTargetFloat: true},
	FSTRS: {size: 0b10, v: 0x1, datasize: 4, datasizeLog2: 2, isTargetFloat: true},
}
  1755  
  1756  func (a *AssemblerImpl) encodeRegisterToMemory(n *nodeImpl) (err error) {
  1757  	inst, ok := storeInstructionTable[n.instruction]
  1758  	if !ok {
  1759  		return errorEncodingUnsupported(n)
  1760  	}
  1761  
  1762  	var srcRegBits byte
  1763  	if inst.isTargetFloat {
  1764  		srcRegBits, err = vectorRegisterBits(n.srcReg)
  1765  	} else {
  1766  		srcRegBits, err = intRegisterBits(n.srcReg)
  1767  	}
  1768  	if err != nil {
  1769  		return
  1770  	}
  1771  
  1772  	baseRegBits, err := intRegisterBits(n.dstReg)
  1773  	if err != nil {
  1774  		return err
  1775  	}
  1776  
  1777  	const opcode = 0x00 // opcode for store instructions.
  1778  	if n.dstReg2 != asm.NilRegister {
  1779  		offsetRegBits, err := intRegisterBits(n.dstReg2)
  1780  		if err != nil {
  1781  			return err
  1782  		}
  1783  		a.encodeLoadOrStoreWithRegisterOffset(baseRegBits, offsetRegBits, srcRegBits, opcode, inst.size, inst.v)
  1784  	} else {
  1785  		err = a.encodeLoadOrStoreWithConstOffset(baseRegBits, srcRegBits, n.dstConst, opcode, inst.size, inst.v, inst.datasize, inst.datasizeLog2)
  1786  	}
  1787  	return
  1788  }
  1789  
// encodeADR encodes the ADR instruction, which materializes a PC-relative address
// into n.dstReg. The displacement is unknown at emission time, so a zero-offset ADR
// is emitted first and the immediate is patched later: either when the constant pool
// finalizes the static const's offset (n.staticConst != nil), or via an on-generate
// callback that locates the target instruction node (the
// CompileReadInstructionAddress case).
func (a *AssemblerImpl) encodeADR(n *nodeImpl) (err error) {
	dstRegBits, err := intRegisterBits(n.dstReg)
	if err != nil {
		return err
	}

	adrInstructionOffsetInBinary := uint64(a.Buf.Len())

	// At this point, we don't yet know the target offset to read from,
	// so we emit the ADR instruction with 0 offset, and replace later in the callback.
	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADR--Form-PC-relative-address-?lang=en
	a.Buf.Write([]byte{dstRegBits, 0x0, 0x0, 0b10000})

	// This case, the ADR's target offset is for the staticConst's initial address.
	if sc := n.staticConst; sc != nil {
		a.pool.AddConst(sc, adrInstructionOffsetInBinary)
		sc.AddOffsetFinalizedCallback(func(offsetOfConst uint64) {
			adrInstructionBytes := a.Buf.Bytes()[adrInstructionOffsetInBinary : adrInstructionOffsetInBinary+4]
			offset := int(offsetOfConst) - int(adrInstructionOffsetInBinary)

			// Patch the 21-bit immediate (immlo in byte 3, immhi across bytes 0-2).
			// See https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADR--Form-PC-relative-address-?lang=en
			adrInstructionBytes[3] |= byte(offset & 0b00000011 << 5)
			offset >>= 2
			adrInstructionBytes[0] |= byte(offset << 5)
			offset >>= 3
			adrInstructionBytes[1] |= byte(offset)
			offset >>= 8
			adrInstructionBytes[2] |= byte(offset)
		})
		return
	}

	a.AddOnGenerateCallBack(func(code []byte) error {
		// Find the target instruction node.
		targetNode := n
		for ; targetNode != nil; targetNode = targetNode.next {
			if targetNode.instruction == n.readInstructionAddressBeforeTargetInstruction {
				// The actual target is the node right after the matched instruction.
				targetNode = targetNode.next
				break
			}
		}

		if targetNode == nil {
			return fmt.Errorf("BUG: target instruction %s not found for ADR", InstructionName(n.readInstructionAddressBeforeTargetInstruction))
		}

		offset := targetNode.OffsetInBinary() - n.OffsetInBinary()
		if i64 := int64(offset); i64 >= 1<<20 || i64 < -1<<20 {
			// We could support offset over 20-bit range by special casing them here,
			// but 20-bit range should be enough for our impl. If the necessity comes up,
			// we could add the special casing here to support arbitrary large offset.
			return fmt.Errorf("BUG: too large offset for ADR: %#x", offset)
		}

		adrInstructionBytes := code[n.OffsetInBinary() : n.OffsetInBinary()+4]
		// According to the binary format of ADR instruction:
		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADR--Form-PC-relative-address-?lang=en
		adrInstructionBytes[3] |= byte(offset & 0b00000011 << 5)
		offset >>= 2
		adrInstructionBytes[0] |= byte(offset << 5)
		offset >>= 3
		adrInstructionBytes[1] |= byte(offset)
		offset >>= 8
		adrInstructionBytes[2] |= byte(offset)
		return nil
	})
	return
}
  1858  
// loadInstructionTable maps each supported load instruction to its encoding parameters.
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldst_regoff
var loadInstructionTable = map[asm.Instruction]struct {
	// size, v and opcode are the identically named fields of the load/store encodings.
	size, v, opcode byte
	// datasize is the number of bytes transferred; datasizeLog2 == log2(datasize).
	datasize, datasizeLog2 int64
	// isTargetFloat is true when the loaded data goes into a vector (floating-point) register.
	isTargetFloat bool
}{
	FLDRD:  {size: 0b11, v: 0x1, datasize: 8, datasizeLog2: 3, isTargetFloat: true, opcode: 0b01},
	FLDRS:  {size: 0b10, v: 0x1, datasize: 4, datasizeLog2: 2, isTargetFloat: true, opcode: 0b01},
	LDRD:   {size: 0b11, v: 0x0, datasize: 8, datasizeLog2: 3, opcode: 0b01},
	LDRW:   {size: 0b10, v: 0x0, datasize: 4, datasizeLog2: 2, opcode: 0b01},
	LDRSHD: {size: 0b01, v: 0x0, datasize: 2, datasizeLog2: 1, opcode: 0b10},
	LDRSHW: {size: 0b01, v: 0x0, datasize: 2, datasizeLog2: 1, opcode: 0b11},
	LDRH:   {size: 0b01, v: 0x0, datasize: 2, datasizeLog2: 1, opcode: 0b01},
	LDRSBD: {size: 0b00, v: 0x0, datasize: 1, datasizeLog2: 0, opcode: 0b10},
	LDRSBW: {size: 0b00, v: 0x0, datasize: 1, datasizeLog2: 0, opcode: 0b11},
	LDRB:   {size: 0b00, v: 0x0, datasize: 1, datasizeLog2: 0, opcode: 0b01},
	LDRSW:  {size: 0b10, v: 0x0, datasize: 4, datasizeLog2: 2, opcode: 0b10},
}
  1877  
// encodeMemoryToRegister encodes the load instructions (and ADR, which also produces
// an address in a register). The memory operand is either [n.srcReg + n.srcReg2]
// (register offset) or [n.srcReg + n.srcConst] (constant offset); the result goes
// into n.dstReg.
func (a *AssemblerImpl) encodeMemoryToRegister(n *nodeImpl) (err error) {
	if n.instruction == ADR {
		return a.encodeADR(n)
	}

	inst, ok := loadInstructionTable[n.instruction]
	if !ok {
		return errorEncodingUnsupported(n)
	}

	// Floating-point loads target the vector register file; others the integer file.
	var dstRegBits byte
	if inst.isTargetFloat {
		dstRegBits, err = vectorRegisterBits(n.dstReg)
	} else {
		dstRegBits, err = intRegisterBits(n.dstReg)
	}
	if err != nil {
		return
	}
	baseRegBits, err := intRegisterBits(n.srcReg)
	if err != nil {
		return err
	}

	if n.srcReg2 != asm.NilRegister {
		// Address = base register + offset register.
		offsetRegBits, err := intRegisterBits(n.srcReg2)
		if err != nil {
			return err
		}
		a.encodeLoadOrStoreWithRegisterOffset(baseRegBits, offsetRegBits, dstRegBits, inst.opcode,
			inst.size, inst.v)
	} else {
		// Address = base register + constant offset.
		err = a.encodeLoadOrStoreWithConstOffset(baseRegBits, dstRegBits, n.srcConst, inst.opcode,
			inst.size, inst.v, inst.datasize, inst.datasizeLog2)
	}
	return
}
  1915  
  1916  // const16bitAligned check if the value is on the 16-bit alignment.
  1917  // If so, returns the shift num divided by 16, and otherwise -1.
  1918  func const16bitAligned(v int64) (ret int) {
  1919  	ret = -1
  1920  	for s := 0; s < 64; s += 16 {
  1921  		if (uint64(v) &^ (uint64(0xffff) << uint(s))) == 0 {
  1922  			ret = s / 16
  1923  			break
  1924  		}
  1925  	}
  1926  	return
  1927  }
  1928  
// isBitMaskImmediate determines if the value can be encoded as "bitmask immediate".
//
//	Such an immediate is a 32-bit or 64-bit pattern viewed as a vector of identical elements of size e = 2, 4, 8, 16, 32, or 64 bits.
//	Each element contains the same sub-pattern: a single run of 1 to e-1 non-zero bits, rotated by 0 to e-1 bits.
//
// See https://developer.arm.com/documentation/dui0802/b/A64-General-Instructions/MOV--bitmask-immediate-
func isBitMaskImmediate(x uint64) bool {
	// All zeros and ones are not "bitmask immediate" by definition.
	if x == 0 || x == 0xffff_ffff_ffff_ffff {
		return false
	}

	// Find the smallest element size e for which x is a repetition of the same
	// e-bit pattern, then sign-extend one element into x for the run check below.
	switch {
	case x != x>>32|x<<32:
		// e = 64
	case x != x>>16|x<<48:
		// e = 32 (x == x>>32|x<<32).
		// e.g. 0x00ff_ff00_00ff_ff00
		x = uint64(int32(x))
	case x != x>>8|x<<56:
		// e = 16 (x == x>>16|x<<48).
		// e.g. 0x00ff_00ff_00ff_00ff
		x = uint64(int16(x))
	case x != x>>4|x<<60:
		// e = 8 (x == x>>8|x<<56).
		// e.g. 0x0f0f_0f0f_0f0f_0f0f
		x = uint64(int8(x))
	default:
		// e = 4 or 2.
		return true
	}
	// The element must be a single contiguous run of ones, possibly rotated
	// (a wrapped run makes the complement ^x a contiguous run instead).
	return sequenceOfSetbits(x) || sequenceOfSetbits(^x)
}
  1962  
  1963  // sequenceOfSetbits returns true if the number's binary representation is the sequence set bit (1).
  1964  // For example: 0b1110 -> true, 0b1010 -> false
  1965  func sequenceOfSetbits(x uint64) bool {
  1966  	y := getLowestBit(x)
  1967  	// If x is a sequence of set bit, this should results in the number
  1968  	// with only one set bit (i.e. power of two).
  1969  	y += x
  1970  	return (y-1)&y == 0
  1971  }
  1972  
  1973  func getLowestBit(x uint64) uint64 {
  1974  	// See https://stackoverflow.com/questions/12247186/find-the-lowest-set-bit
  1975  	return x & (^x + 1)
  1976  }
  1977  
// addOrSub64BitRegisters emits "src1Reg = src1Reg +/- src2Reg" as a single
// ADD/SUB (shifted register) instruction with zero shift amount. sfops packs
// the top three bits (sf/op/S) of the instruction word, e.g. 0b100 for 64-bit ADD.
func (a *AssemblerImpl) addOrSub64BitRegisters(sfops byte, src1RegBits byte, src2RegBits byte) {
	// src1Reg = src1Reg +/- src2Reg
	a.Buf.Write([]byte{
		(src1RegBits << 5) | src1RegBits,
		src1RegBits >> 3,
		src2RegBits,
		sfops<<5 | 0b01011,
	})
}
  1987  
  1988  // See "Logical (immediate)" in
  1989  // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Immediate
  1990  var logicalImmediate = map[asm.Instruction]struct {
  1991  	sf, opc  byte
  1992  	resolver func(imm asm.ConstantValue) (imms, immr, N byte, err error)
  1993  }{
  1994  	ANDIMM32: {sf: 0b0, opc: 0b00, resolver: func(imm asm.ConstantValue) (imms, immr, N byte, err error) {
  1995  		if !isBitMaskImmediate(uint64(imm)) {
  1996  			err = fmt.Errorf("const %d must be valid bitmask immediate for %s", imm, InstructionName(ANDIMM64))
  1997  			return
  1998  		}
  1999  		immr, imms, N = bitmaskImmediate(uint64(imm), false)
  2000  		return
  2001  	}},
  2002  	ANDIMM64: {sf: 0b1, opc: 0b00, resolver: func(imm asm.ConstantValue) (imms, immr, N byte, err error) {
  2003  		if !isBitMaskImmediate(uint64(imm)) {
  2004  			err = fmt.Errorf("const %d must be valid bitmask immediate for %s", imm, InstructionName(ANDIMM64))
  2005  			return
  2006  		}
  2007  		immr, imms, N = bitmaskImmediate(uint64(imm), true)
  2008  		return
  2009  	}},
  2010  }
  2011  
// bitmaskImmediate encodes the constant c into the (immr, imms, N) fields of a
// "Logical (immediate)" instruction; is64bit selects the 64-bit (N=1) encoding.
// The caller must ensure isBitMaskImmediate(c) holds beforehand.
func bitmaskImmediate(c uint64, is64bit bool) (immr, imms, N byte) {
	// Determine the element size: the smallest power of two e such that c is a
	// repetition of the same e-bit pattern, then reduce c to one sign-extended element.
	var size uint32
	switch {
	case c != c>>32|c<<32:
		size = 64
	case c != c>>16|c<<48:
		size = 32
		c = uint64(int32(c))
	case c != c>>8|c<<56:
		size = 16
		c = uint64(int16(c))
	case c != c>>4|c<<60:
		size = 8
		c = uint64(int8(c))
	case c != c>>2|c<<62:
		size = 4
		c = uint64(int64(c<<60) >> 60)
	default:
		size = 2
		c = uint64(int64(c<<62) >> 62)
	}

	// A negative element means the run of ones wraps around; work on the
	// complement and undo the effect below.
	neg := false
	if int64(c) < 0 {
		c = ^c
		neg = true
	}

	// NOTE(review): getOnesSequenceSize is defined elsewhere in this file;
	// it appears to return the length and starting bit position of the run of
	// ones in c — confirm against its definition.
	onesSize, nonZeroPos := getOnesSequenceSize(c)
	if neg {
		nonZeroPos = onesSize + nonZeroPos
		onesSize = size - onesSize
	}

	var mode byte = 32
	if is64bit {
		N, mode = 0b1, 64
	}

	// immr holds the rotation, imms encodes the element size and run length
	// per the "Logical (immediate)" field layout.
	immr = byte((size - nonZeroPos) & (size - 1) & uint32(mode-1))
	imms = byte((onesSize - 1) | 63&^(size<<1-1))
	return
}
  2055  
// encodeConstToRegister encodes the instructions whose source operand is the
// constant n.srcConst and whose destination is the register n.dstReg:
// ADD/ADDS/SUB/SUBS (immediate), the logical immediates (via logicalImmediate),
// MOVW/MOVD, and the LSR/LSL immediate shifts. Constants that cannot be encoded
// as an immediate are first materialized into the temporary register.
func (a *AssemblerImpl) encodeConstToRegister(n *nodeImpl) (err error) {
	// Alias for readability.
	c := n.srcConst

	dstRegBits, err := intRegisterBits(n.dstReg)
	if err != nil {
		return err
	}

	if log, ok := logicalImmediate[n.instruction]; ok {
		// See "Logical (immediate)" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Immediate
		imms, immr, N, err := log.resolver(c)
		if err != nil {
			return err
		}

		// dstReg = dstReg <op> immediate (source and destination register are the same).
		a.Buf.Write([]byte{
			(dstRegBits << 5) | dstRegBits,
			imms<<2 | dstRegBits>>3,
			N<<6 | immr,
			log.sf<<7 | log.opc<<5 | 0b10010,
		})
		return nil
	}

	// TODO: refactor and generalize the following like ^ logicalImmediate, etc.
	switch inst := n.instruction; inst {
	case ADD, ADDS, SUB, SUBS:
		// sfops packs the sf/op/S bits of the add/subtract encodings.
		var sfops byte
		if inst == ADD {
			sfops = 0b100
		} else if inst == ADDS {
			sfops = 0b101
		} else if inst == SUB {
			sfops = 0b110
		} else if inst == SUBS {
			sfops = 0b111
		}

		if c == 0 {
			// If the constant equals zero, we encode it as ADD (register) with zero register.
			a.addOrSub64BitRegisters(sfops, dstRegBits, zeroRegisterBits)
			return
		}

		if c >= 0 && (c <= 0xfff || (c&0xfff) == 0 && (uint64(c>>12) <= 0xfff)) {
			// If the const can be represented as "imm12" or "imm12 << 12": one instruction
			// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L2992

			if c <= 0xfff {
				a.Buf.Write([]byte{
					(dstRegBits << 5) | dstRegBits,
					(byte(c) << 2) | (dstRegBits >> 3),
					byte(c >> 6),
					sfops<<5 | 0b10001,
				})
			} else {
				c >>= 12
				a.Buf.Write([]byte{
					(dstRegBits << 5) | dstRegBits,
					(byte(c) << 2) | (dstRegBits >> 3),
					0b01<<6 /* shift by 12 */ | byte(c>>6),
					sfops<<5 | 0b10001,
				})
			}
			return
		}

		if t := const16bitAligned(c); t >= 0 {
			// If the const can fit within 16-bit alignment, for example, 0xffff, 0xffff_0000 or 0xffff_0000_0000_0000
			// We could load it into temporary with movk.
			// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L4029
			tmpRegBits := registerBits(a.temporaryRegister)

			// MOVZ $c, tmpReg with shifting.
			a.load16bitAlignedConst(c>>(16*t), byte(t), tmpRegBits, false, true)

			// ADD/SUB tmpReg, dstReg
			a.addOrSub64BitRegisters(sfops, dstRegBits, tmpRegBits)
			return
		} else if t := const16bitAligned(^c); t >= 0 {
			// Also if the reverse of the const can fit within 16-bit range, do the same ^^.
			// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L4029
			tmpRegBits := registerBits(a.temporaryRegister)

			// MOVN $c, tmpReg with shifting.
			a.load16bitAlignedConst(^c>>(16*t), byte(t), tmpRegBits, true, true)

			// ADD/SUB tmpReg, dstReg
			a.addOrSub64BitRegisters(sfops, dstRegBits, tmpRegBits)
			return
		}

		if uc := uint64(c); isBitMaskImmediate(uc) {
			// If the const can be represented as "bitmask immediate", we load it via ORR into temp register.
			// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L6570-L6583
			tmpRegBits := registerBits(a.temporaryRegister)
			// ORR $c, tmpReg
			a.loadConstViaBitMaskImmediate(uc, tmpRegBits, true)

			// ADD/SUB tmpReg, dstReg
			a.addOrSub64BitRegisters(sfops, dstRegBits, tmpRegBits)
			return
		}

		// If the value fits within 24-bit, then we emit two add instructions
		if 0 <= c && c <= 0xffffff && inst != SUBS && inst != ADDS {
			// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L3849-L3862
			a.Buf.Write([]byte{
				(dstRegBits << 5) | dstRegBits,
				(byte(c) << 2) | (dstRegBits >> 3),
				byte(c & 0xfff >> 6),
				sfops<<5 | 0b10001,
			})
			c = c >> 12
			a.Buf.Write([]byte{
				(dstRegBits << 5) | dstRegBits,
				(byte(c) << 2) | (dstRegBits >> 3),
				0b01_000000 /* shift by 12 */ | byte(c>>6),
				sfops<<5 | 0b10001,
			})
			return
		}

		// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L3163-L3203
		// Otherwise we use MOVZ and MOVNs for loading const into tmpRegister.
		tmpRegBits := registerBits(a.temporaryRegister)
		a.load64bitConst(c, tmpRegBits)
		a.addOrSub64BitRegisters(sfops, dstRegBits, tmpRegBits)
	case MOVW:
		if c == 0 {
			// Move of zero is emitted as a register move from the zero register.
			a.Buf.Write([]byte{
				(zeroRegisterBits << 5) | dstRegBits,
				zeroRegisterBits >> 3,
				0b000_00000 | zeroRegisterBits,
				0b0_01_01010,
			})
			return
		}

		// Following the logic here:
		// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1637
		c32 := uint32(c)
		ic := int64(c32)
		if ic >= 0 && (ic <= 0xfff || (ic&0xfff) == 0 && (uint64(ic>>12) <= 0xfff)) {
			if isBitMaskImmediate(uint64(c)) {
				a.loadConstViaBitMaskImmediate(uint64(c), dstRegBits, false)
				return
			}
		}

		if t := const16bitAligned(int64(c32)); t >= 0 {
			// If the const can fit within 16-bit alignment, for example, 0xffff, 0xffff_0000 or 0xffff_0000_0000_0000
			// We could load it into temporary with movk.
			a.load16bitAlignedConst(int64(c32)>>(16*t), byte(t), dstRegBits, false, false)
		} else if t := const16bitAligned(int64(^c32)); t >= 0 {
			// Also, if the reverse of the const can fit within 16-bit range, do the same ^^.
			a.load16bitAlignedConst(int64(^c32)>>(16*t), byte(t), dstRegBits, true, false)
		} else if isBitMaskImmediate(uint64(c)) {
			a.loadConstViaBitMaskImmediate(uint64(c), dstRegBits, false)
		} else {
			// Otherwise, we use MOVZ and MOVK to load it.
			// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L6623-L6630
			c16 := uint16(c32)
			// MOVZ: https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ
			a.Buf.Write([]byte{
				(byte(c16) << 5) | dstRegBits,
				byte(c16 >> 3),
				1<<7 | byte(c16>>11),
				0b0_10_10010,
			})
			// MOVK: https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVK
			c16 = uint16(c32 >> 16)
			if c16 != 0 {
				a.Buf.Write([]byte{
					(byte(c16) << 5) | dstRegBits,
					byte(c16 >> 3),
					1<<7 | 0b0_01_00000 /* shift by 16 */ | byte(c16>>11),
					0b0_11_10010,
				})
			}
		}
	case MOVD:
		// Following the logic here:
		// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1798-L1852
		if c >= 0 && (c <= 0xfff || (c&0xfff) == 0 && (uint64(c>>12) <= 0xfff)) {
			if isBitMaskImmediate(uint64(c)) {
				a.loadConstViaBitMaskImmediate(uint64(c), dstRegBits, true)
				return
			}
		}

		if t := const16bitAligned(c); t >= 0 {
			// If the const can fit within 16-bit alignment, for example, 0xffff, 0xffff_0000 or 0xffff_0000_0000_0000
			// We could load it into temporary with movk.
			a.load16bitAlignedConst(c>>(16*t), byte(t), dstRegBits, false, true)
		} else if t := const16bitAligned(^c); t >= 0 {
			// Also, if the reverse of the const can fit within 16-bit range, do the same ^^.
			a.load16bitAlignedConst((^c)>>(16*t), byte(t), dstRegBits, true, true)
		} else if isBitMaskImmediate(uint64(c)) {
			a.loadConstViaBitMaskImmediate(uint64(c), dstRegBits, true)
		} else {
			a.load64bitConst(c, dstRegBits)
		}
	case LSR:
		if c == 0 {
			err = errors.New("LSR with zero constant should be optimized out")
			return
		} else if c < 0 || c > 63 {
			err = fmt.Errorf("LSR requires immediate to be within 0 to 63, but got %d", c)
			return
		}

		// LSR(immediate) is an alias of UBFM
		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LSR--immediate---Logical-Shift-Right--immediate---an-alias-of-UBFM-?lang=en
		a.Buf.Write([]byte{
			(dstRegBits << 5) | dstRegBits,
			0b111111_00 | dstRegBits>>3,
			0b01_000000 | byte(c),
			0b110_10011,
		})
	case LSL:
		if c == 0 {
			err = errors.New("LSL with zero constant should be optimized out")
			return
		} else if c < 0 || c > 63 {
			err = fmt.Errorf("LSL requires immediate to be within 0 to 63, but got %d", c)
			return
		}

		// LSL(immediate) is an alias of UBFM
		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LSL--immediate---Logical-Shift-Left--immediate---an-alias-of-UBFM-
		cb := byte(c)
		a.Buf.Write([]byte{
			(dstRegBits << 5) | dstRegBits,
			(0b111111-cb)<<2 | dstRegBits>>3,
			0b01_000000 | (64 - cb),
			0b110_10011,
		})

	default:
		return errorEncodingUnsupported(n)
	}
	return
}
  2302  
  2303  func (a *AssemblerImpl) movk(v uint64, shfitNum int, dstRegBits byte) {
  2304  	// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVK
  2305  	a.Buf.Write([]byte{
  2306  		(byte(v) << 5) | dstRegBits,
  2307  		byte(v >> 3),
  2308  		1<<7 | byte(shfitNum)<<5 | (0b000_11111 & byte(v>>11)),
  2309  		0b1_11_10010,
  2310  	})
  2311  }
  2312  
  2313  func (a *AssemblerImpl) movz(v uint64, shfitNum int, dstRegBits byte) {
  2314  	// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ
  2315  	a.Buf.Write([]byte{
  2316  		(byte(v) << 5) | dstRegBits,
  2317  		byte(v >> 3),
  2318  		1<<7 | byte(shfitNum)<<5 | (0b000_11111 & byte(v>>11)),
  2319  		0b1_10_10010,
  2320  	})
  2321  }
  2322  
  2323  func (a *AssemblerImpl) movn(v uint64, shfitNum int, dstRegBits byte) {
  2324  	// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ
  2325  	a.Buf.Write([]byte{
  2326  		(byte(v) << 5) | dstRegBits,
  2327  		byte(v >> 3),
  2328  		1<<7 | byte(shfitNum)<<5 | (0b000_11111 & byte(v>>11)),
  2329  		0b1_00_10010,
  2330  	})
  2331  }
  2332  
  2333  // load64bitConst loads a 64-bit constant into the register, following the same logic to decide how to load large 64-bit
  2334  // consts as in the Go assembler.
  2335  //
  2336  // See https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L6632-L6759
  2337  func (a *AssemblerImpl) load64bitConst(c int64, dstRegBits byte) {
  2338  	var bits [4]uint64
  2339  	var zeros, negs int
  2340  	for i := 0; i < 4; i++ {
  2341  		bits[i] = uint64((c >> uint(i*16)) & 0xffff)
  2342  		if v := bits[i]; v == 0 {
  2343  			zeros++
  2344  		} else if v == 0xffff {
  2345  			negs++
  2346  		}
  2347  	}
  2348  
  2349  	if zeros == 3 {
  2350  		// one MOVZ instruction.
  2351  		for i, v := range bits {
  2352  			if v != 0 {
  2353  				a.movz(v, i, dstRegBits)
  2354  			}
  2355  		}
  2356  	} else if negs == 3 {
  2357  		// one MOVN instruction.
  2358  		for i, v := range bits {
  2359  			if v != 0xffff {
  2360  				v = ^v
  2361  				a.movn(v, i, dstRegBits)
  2362  			}
  2363  		}
  2364  	} else if zeros == 2 {
  2365  		// one MOVZ then one OVK.
  2366  		var movz bool
  2367  		for i, v := range bits {
  2368  			if !movz && v != 0 { // MOVZ.
  2369  				// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ
  2370  				a.movz(v, i, dstRegBits)
  2371  				movz = true
  2372  			} else if v != 0 {
  2373  				a.movk(v, i, dstRegBits)
  2374  			}
  2375  		}
  2376  
  2377  	} else if negs == 2 {
  2378  		// one MOVN then one or two MOVK.
  2379  		var movn bool
  2380  		for i, v := range bits { // Emit MOVN.
  2381  			if !movn && v != 0xffff {
  2382  				v = ^v
  2383  				// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN
  2384  				a.movn(v, i, dstRegBits)
  2385  				movn = true
  2386  			} else if v != 0xffff {
  2387  				a.movk(v, i, dstRegBits)
  2388  			}
  2389  		}
  2390  
  2391  	} else if zeros == 1 {
  2392  		// one MOVZ then two MOVK.
  2393  		var movz bool
  2394  		for i, v := range bits {
  2395  			if !movz && v != 0 { // MOVZ.
  2396  				// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ
  2397  				a.movz(v, i, dstRegBits)
  2398  				movz = true
  2399  			} else if v != 0 {
  2400  				a.movk(v, i, dstRegBits)
  2401  			}
  2402  		}
  2403  
  2404  	} else if negs == 1 {
  2405  		// one MOVN then two MOVK.
  2406  		var movn bool
  2407  		for i, v := range bits { // Emit MOVN.
  2408  			if !movn && v != 0xffff {
  2409  				v = ^v
  2410  				// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN
  2411  				a.movn(v, i, dstRegBits)
  2412  				movn = true
  2413  			} else if v != 0xffff {
  2414  				a.movk(v, i, dstRegBits)
  2415  			}
  2416  		}
  2417  
  2418  	} else {
  2419  		// one MOVZ then tree MOVK.
  2420  		var movz bool
  2421  		for i, v := range bits {
  2422  			if !movz && v != 0 { // MOVZ.
  2423  				// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ
  2424  				a.movz(v, i, dstRegBits)
  2425  				movz = true
  2426  			} else if v != 0 {
  2427  				a.movk(v, i, dstRegBits)
  2428  			}
  2429  		}
  2430  
  2431  	}
  2432  }
  2433  
// load16bitAlignedConst loads a constant whose significant bits fit in a single
// 16-bit chunk into the register, placed at bit position shiftNum*16.
//
// c must fit in 16 bits: the visible callers pass c>>(16*t) after checking the
// value with const16bitAligned. Bits of c above 15 would corrupt the hw/sf
// fields of the encoding.
//
//   - reverse: emit MOVN (register becomes ^(c<<shift)) instead of MOVZ.
//   - dst64bit: set the sf bit so the full 64-bit register is written.
func (a *AssemblerImpl) load16bitAlignedConst(c int64, shiftNum byte, regBits byte, reverse bool, dst64bit bool) {
	var lastByte byte
	if reverse {
		// MOVN: https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN
		lastByte = 0b0_00_10010
	} else {
		// MOVZ: https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ
		lastByte = 0b0_10_10010
	}
	if dst64bit {
		lastByte |= 0b1 << 7 // sf bit: operate on the full 64-bit register.
	}
	// Little-endian bytes of: sf|opc|100101|hw(shiftNum)|imm16(c)|Rd(regBits).
	a.Buf.Write([]byte{
		(byte(c) << 5) | regBits,
		byte(c >> 3),
		1<<7 | (shiftNum << 5) | byte(c>>11),
		lastByte,
	})
}
  2453  
// loadConstViaBitMaskImmediate loads the constant with ORR (bitmask immediate).
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ORR--immediate---Bitwise-OR--immediate--?lang=en
//
// The caller has already verified c with isBitMaskImmediate, i.e. c is a
// replicated, rotated run of ones.
func (a *AssemblerImpl) loadConstViaBitMaskImmediate(c uint64, regBits byte, dst64bit bool) {
	// Find the smallest power-of-two element size (2..64 bits) whose replication
	// reproduces the full 64-bit pattern, then sign-extend one element into c so
	// the rest of the analysis only has to look at a single element.
	var size uint32
	switch {
	case c != c>>32|c<<32:
		size = 64
	case c != c>>16|c<<48:
		size = 32
		c = uint64(int32(c))
	case c != c>>8|c<<56:
		size = 16
		c = uint64(int16(c))
	case c != c>>4|c<<60:
		size = 8
		c = uint64(int8(c))
	case c != c>>2|c<<62:
		size = 4
		c = uint64(int64(c<<60) >> 60)
	default:
		size = 2
		c = uint64(int64(c<<62) >> 62)
	}

	// If the element's top bit is set its run of ones wraps around; analyze the
	// complement (a non-wrapping run) and convert the result back below.
	neg := false
	if int64(c) < 0 {
		c = ^c
		neg = true
	}

	onesSize, nonZeroPos := getOnesSequenceSize(c)
	if neg {
		// The original (wrapping) run starts right after the complement's run and
		// occupies the remaining size-onesSize bits of the element.
		nonZeroPos = onesSize + nonZeroPos
		onesSize = size - onesSize
	}

	// See the following article for understanding the encoding.
	// https://dinfuehr.github.io/blog/encoding-of-immediate-values-on-aarch64/
	var n byte
	mode := 32
	if dst64bit && size == 64 {
		n = 0b1
		mode = 64
	}

	// immr (r): right-rotation placing the run at bit 0;
	// imms (s): run length minus one, merged with the element-size marker bits.
	r := byte((size - nonZeroPos) & (size - 1) & uint32(mode-1))
	s := byte((onesSize - 1) | 63&^(size<<1-1))

	var sf byte
	if dst64bit {
		sf = 0b1 // Operate on the full 64-bit register.
	}
	// ORR <Rd>, <zero register>, #imm -- little-endian instruction bytes.
	a.Buf.Write([]byte{
		(zeroRegisterBits << 5) | regBits,
		s<<2 | (zeroRegisterBits >> 3),
		n<<6 | r,
		sf<<7 | 0b0_01_10010,
	})
}
  2513  
  2514  func getOnesSequenceSize(x uint64) (size, nonZeroPos uint32) {
  2515  	// Take 0b00111000 for example:
  2516  	y := getLowestBit(x)               // = 0b0000100
  2517  	nonZeroPos = setBitPos(y)          // = 2
  2518  	size = setBitPos(x+y) - nonZeroPos // = setBitPos(0b0100000) - 2 = 5 - 2 = 3
  2519  	return
  2520  }
  2521  
  2522  func setBitPos(x uint64) (ret uint32) {
  2523  	for ; ; ret++ {
  2524  		if x == 0b1 {
  2525  			break
  2526  		}
  2527  		x = x >> 1
  2528  	}
  2529  	return
  2530  }
  2531  
  2532  func checkArrangementIndexPair(arr VectorArrangement, index VectorIndex) (err error) {
  2533  	if arr == VectorArrangementNone {
  2534  		return nil
  2535  	}
  2536  	var valid bool
  2537  	switch arr {
  2538  	case VectorArrangement8B:
  2539  		valid = index < 8
  2540  	case VectorArrangement16B:
  2541  		valid = index < 16
  2542  	case VectorArrangement4H:
  2543  		valid = index < 4
  2544  	case VectorArrangement8H:
  2545  		valid = index < 8
  2546  	case VectorArrangement2S:
  2547  		valid = index < 2
  2548  	case VectorArrangement4S:
  2549  		valid = index < 4
  2550  	case VectorArrangement1D:
  2551  		valid = index < 1
  2552  	case VectorArrangement2D:
  2553  		valid = index < 2
  2554  	case VectorArrangementB:
  2555  		valid = index < 16
  2556  	case VectorArrangementH:
  2557  		valid = index < 8
  2558  	case VectorArrangementS:
  2559  		valid = index < 4
  2560  	case VectorArrangementD:
  2561  		valid = index < 2
  2562  	}
  2563  	if !valid {
  2564  		err = fmt.Errorf("invalid arrangement and index pair: %s[%d]", arr, index)
  2565  	}
  2566  	return
  2567  }
  2568  
  2569  func (a *AssemblerImpl) encodeMemoryToVectorRegister(n *nodeImpl) (err error) {
  2570  	srcBaseRegBits, err := intRegisterBits(n.srcReg)
  2571  	if err != nil {
  2572  		return err
  2573  	}
  2574  
  2575  	dstVectorRegBits, err := vectorRegisterBits(n.dstReg)
  2576  	if err != nil {
  2577  		return err
  2578  	}
  2579  
  2580  	switch n.instruction {
  2581  	case VMOV: // translated as LDR(immediate,SIMD&FP)
  2582  		// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/LDR--immediate--SIMD-FP---Load-SIMD-FP-Register--immediate-offset--?lang=en
  2583  		var size, opcode byte
  2584  		var dataSize, dataSizeLog2 int64
  2585  		switch n.vectorArrangement {
  2586  		case VectorArrangementB:
  2587  			size, opcode, dataSize, dataSizeLog2 = 0b00, 0b01, 1, 0
  2588  		case VectorArrangementH:
  2589  			size, opcode, dataSize, dataSizeLog2 = 0b01, 0b01, 2, 1
  2590  		case VectorArrangementS:
  2591  			size, opcode, dataSize, dataSizeLog2 = 0b10, 0b01, 4, 2
  2592  		case VectorArrangementD:
  2593  			size, opcode, dataSize, dataSizeLog2 = 0b11, 0b01, 8, 3
  2594  		case VectorArrangementQ:
  2595  			size, opcode, dataSize, dataSizeLog2 = 0b00, 0b11, 16, 4
  2596  		}
  2597  		const v = 1 // v as in https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldst_pos
  2598  		if n.srcReg2 != asm.NilRegister {
  2599  			offsetRegBits, err := intRegisterBits(n.srcReg2)
  2600  			if err != nil {
  2601  				return err
  2602  			}
  2603  			a.encodeLoadOrStoreWithRegisterOffset(srcBaseRegBits, offsetRegBits, dstVectorRegBits, opcode, size, v)
  2604  		} else {
  2605  			err = a.encodeLoadOrStoreWithConstOffset(srcBaseRegBits, dstVectorRegBits,
  2606  				n.srcConst, opcode, size, v, dataSize, dataSizeLog2)
  2607  		}
  2608  	case LD1R:
  2609  		if n.srcReg2 != asm.NilRegister || n.srcConst != 0 {
  2610  			return fmt.Errorf("offset for %s is not implemented", InstructionName(LD1R))
  2611  		}
  2612  
  2613  		var size, q byte
  2614  		switch n.vectorArrangement {
  2615  		case VectorArrangement8B:
  2616  			size, q = 0b00, 0b0
  2617  		case VectorArrangement16B:
  2618  			size, q = 0b00, 0b1
  2619  		case VectorArrangement4H:
  2620  			size, q = 0b01, 0b0
  2621  		case VectorArrangement8H:
  2622  			size, q = 0b01, 0b1
  2623  		case VectorArrangement2S:
  2624  			size, q = 0b10, 0b0
  2625  		case VectorArrangement4S:
  2626  			size, q = 0b10, 0b1
  2627  		case VectorArrangement1D:
  2628  			size, q = 0b11, 0b0
  2629  		case VectorArrangement2D:
  2630  			size, q = 0b11, 0b1
  2631  		}
  2632  
  2633  		// No offset encoding.
  2634  		// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/LD1R--Load-one-single-element-structure-and-Replicate-to-all-lanes--of-one-register--?lang=en#iclass_as_post_index
  2635  		a.Buf.Write([]byte{
  2636  			(srcBaseRegBits << 5) | dstVectorRegBits,
  2637  			0b11_000000 | size<<2 | srcBaseRegBits>>3,
  2638  			0b01_000000,
  2639  			q<<6 | 0b1101,
  2640  		})
  2641  	default:
  2642  		return errorEncodingUnsupported(n)
  2643  	}
  2644  	return
  2645  }
  2646  
  2647  func arrangementSizeQ(arr VectorArrangement) (size, q byte) {
  2648  	switch arr {
  2649  	case VectorArrangement8B:
  2650  		size, q = 0b00, 0
  2651  	case VectorArrangement16B:
  2652  		size, q = 0b00, 1
  2653  	case VectorArrangement4H:
  2654  		size, q = 0b01, 0
  2655  	case VectorArrangement8H:
  2656  		size, q = 0b01, 1
  2657  	case VectorArrangement2S:
  2658  		size, q = 0b10, 0
  2659  	case VectorArrangement4S:
  2660  		size, q = 0b10, 1
  2661  	case VectorArrangement1D:
  2662  		size, q = 0b11, 0
  2663  	case VectorArrangement2D:
  2664  		size, q = 0b11, 1
  2665  	}
  2666  	return
  2667  }
  2668  
// encodeVectorRegisterToMemory encodes stores whose source is a vector
// (SIMD&FP) register and whose destination is memory addressed by an integer
// base register (dstReg) plus either a register offset (dstReg2) or a constant
// offset (dstConst).
func (a *AssemblerImpl) encodeVectorRegisterToMemory(n *nodeImpl) (err error) {
	srcVectorRegBits, err := vectorRegisterBits(n.srcReg)
	if err != nil {
		return err
	}

	dstBaseRegBits, err := intRegisterBits(n.dstReg)
	if err != nil {
		return err
	}

	switch n.instruction {
	case VMOV: // translated as STR(immediate,SIMD&FP)
		// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/STR--immediate--SIMD-FP---Store-SIMD-FP-register--immediate-offset--
		// size/opcode select the access width; dataSize/dataSizeLog2 are used by
		// the const-offset encoder to scale/align the immediate.
		var size, opcode byte
		var dataSize, dataSizeLog2 int64
		switch n.vectorArrangement {
		case VectorArrangementB:
			size, opcode, dataSize, dataSizeLog2 = 0b00, 0b00, 1, 0
		case VectorArrangementH:
			size, opcode, dataSize, dataSizeLog2 = 0b01, 0b00, 2, 1
		case VectorArrangementS:
			size, opcode, dataSize, dataSizeLog2 = 0b10, 0b00, 4, 2
		case VectorArrangementD:
			size, opcode, dataSize, dataSizeLog2 = 0b11, 0b00, 8, 3
		case VectorArrangementQ:
			size, opcode, dataSize, dataSizeLog2 = 0b00, 0b10, 16, 4
		}
		const v = 1 // v as in https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldst_pos

		if n.dstReg2 != asm.NilRegister {
			// Register-offset form.
			offsetRegBits, err := intRegisterBits(n.dstReg2)
			if err != nil {
				return err
			}
			a.encodeLoadOrStoreWithRegisterOffset(dstBaseRegBits, offsetRegBits, srcVectorRegBits, opcode, size, v)
		} else {
			// Constant-offset form.
			err = a.encodeLoadOrStoreWithConstOffset(dstBaseRegBits, srcVectorRegBits,
				n.dstConst, opcode, size, v, dataSize, dataSizeLog2)
		}
	default:
		return errorEncodingUnsupported(n)
	}
	return
}
  2714  
  2715  func (a *AssemblerImpl) encodeStaticConstToVectorRegister(n *nodeImpl) (err error) {
  2716  	if n.instruction != VMOV {
  2717  		return errorEncodingUnsupported(n)
  2718  	}
  2719  
  2720  	dstRegBits, err := vectorRegisterBits(n.dstReg)
  2721  	if err != nil {
  2722  		return err
  2723  	}
  2724  
  2725  	// LDR (literal, SIMD&FP)
  2726  	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/LDR--literal--SIMD-FP---Load-SIMD-FP-Register--PC-relative-literal--
  2727  	var opc byte
  2728  	var constLength int
  2729  	switch n.vectorArrangement {
  2730  	case VectorArrangementS:
  2731  		opc, constLength = 0b00, 4
  2732  	case VectorArrangementD:
  2733  		opc, constLength = 0b01, 8
  2734  	case VectorArrangementQ:
  2735  		opc, constLength = 0b10, 16
  2736  	}
  2737  
  2738  	loadLiteralOffsetInBinary := uint64(a.Buf.Len())
  2739  	a.pool.AddConst(n.staticConst, loadLiteralOffsetInBinary)
  2740  
  2741  	if len(n.staticConst.Raw) != constLength {
  2742  		return fmt.Errorf("invalid const length for %s: want %d but was %d",
  2743  			n.vectorArrangement, constLength, len(n.staticConst.Raw))
  2744  	}
  2745  
  2746  	a.Buf.Write([]byte{dstRegBits, 0x0, 0x0, opc<<6 | 0b11100})
  2747  	n.staticConst.AddOffsetFinalizedCallback(func(offsetOfConst uint64) {
  2748  		// LDR (literal, SIMD&FP) encodes offset divided by 4.
  2749  		offset := (int(offsetOfConst) - int(loadLiteralOffsetInBinary)) / 4
  2750  		bin := a.Buf.Bytes()
  2751  		bin[loadLiteralOffsetInBinary] |= byte(offset << 5)
  2752  		bin[loadLiteralOffsetInBinary+1] |= byte(offset >> 3)
  2753  		bin[loadLiteralOffsetInBinary+2] |= byte(offset >> 11)
  2754  	})
  2755  	return
  2756  }
  2757  
// advancedSIMDTwoRegisterMisc holds information to encode instructions as "Advanced SIMD two-register miscellaneous" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
//
// Each entry gives the instruction's U and opcode fields plus the per-arrangement
// Q/size fields; an arrangement absent from qAndSize is not encodable for that instruction.
var advancedSIMDTwoRegisterMisc = map[asm.Instruction]struct {
	u, opcode byte
	qAndSize  map[VectorArrangement]qAndSize
}{
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/NOT--Bitwise-NOT--vector--?lang=en
	NOT: {
		u: 0b1, opcode: 0b00101,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement16B: {size: 0b00, q: 0b1},
			VectorArrangement8B:  {size: 0b00, q: 0b0},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FNEG--vector---Floating-point-Negate--vector--?lang=en
	VFNEG: {
		u: 0b1, opcode: 0b01111,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement4S: {size: 0b10, q: 0b1},
			VectorArrangement2S: {size: 0b10, q: 0b0},
			VectorArrangement2D: {size: 0b11, q: 0b1},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FABS--vector---Floating-point-Absolute-value--vector--?lang=en
	VFABS: {u: 0, opcode: 0b01111, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2D: {size: 0b11, q: 0b1},
		VectorArrangement4S: {size: 0b10, q: 0b1},
		VectorArrangement2S: {size: 0b10, q: 0b0},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FSQRT--vector---Floating-point-Square-Root--vector--?lang=en
	VFSQRT: {u: 1, opcode: 0b11111, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2D: {size: 0b11, q: 0b1},
		VectorArrangement4S: {size: 0b10, q: 0b1},
		VectorArrangement2S: {size: 0b10, q: 0b0},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTM--vector---Floating-point-Round-to-Integral--toward-Minus-infinity--vector--?lang=en
	VFRINTM: {u: 0, opcode: 0b11001, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2D: {size: 0b01, q: 0b1},
		VectorArrangement4S: {size: 0b00, q: 0b1},
		VectorArrangement2S: {size: 0b00, q: 0b0},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTN--vector---Floating-point-Round-to-Integral--to-nearest-with-ties-to-even--vector--?lang=en
	VFRINTN: {u: 0, opcode: 0b11000, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2D: {size: 0b01, q: 0b1},
		VectorArrangement4S: {size: 0b00, q: 0b1},
		VectorArrangement2S: {size: 0b00, q: 0b0},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTP--vector---Floating-point-Round-to-Integral--toward-Plus-infinity--vector--?lang=en
	VFRINTP: {u: 0, opcode: 0b11000, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2D: {size: 0b11, q: 0b1},
		VectorArrangement4S: {size: 0b10, q: 0b1},
		VectorArrangement2S: {size: 0b10, q: 0b0},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTZ--vector---Floating-point-Round-to-Integral--toward-Zero--vector--?lang=en
	VFRINTZ: {u: 0, opcode: 0b11001, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2D: {size: 0b11, q: 0b1},
		VectorArrangement4S: {size: 0b10, q: 0b1},
		VectorArrangement2S: {size: 0b10, q: 0b0},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CNT--Population-Count-per-byte-?lang=en
	VCNT: {u: 0b0, opcode: 0b00101, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement8B:  {size: 0b00, q: 0b0},
		VectorArrangement16B: {size: 0b00, q: 0b1},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/NEG--vector---Negate--vector--?lang=en
	VNEG: {u: 0b1, opcode: 0b01011, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ABS--Absolute-value--vector--?lang=en
	VABS: {u: 0b0, opcode: 0b01011, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/REV64--Reverse-elements-in-64-bit-doublewords--vector--?lang=en
	REV64: {u: 0b0, opcode: 0b00000, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/XTN--XTN2--Extract-Narrow-?lang=en
	XTN: {u: 0b0, opcode: 0b10010, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2D: {q: 0, size: 0b10},
		VectorArrangement4S: {q: 0, size: 0b01},
		VectorArrangement8H: {q: 0, size: 0b00},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SHLL--SHLL2--Shift-Left-Long--by-element-size--?lang=en
	SHLL: {u: 0b1, opcode: 0b10011, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement8B: {q: 0b00, size: 0b00},
		VectorArrangement4H: {q: 0b00, size: 0b01},
		VectorArrangement2S: {q: 0b00, size: 0b10},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMEQ--zero---Compare-bitwise-Equal-to-zero--vector--?lang=en
	CMEQZERO: {u: 0b0, opcode: 0b01001, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SADDLP--Signed-Add-Long-Pairwise-?lang=en
	SADDLP: {u: 0b0, opcode: 0b00010, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UADDLP--Unsigned-Add-Long-Pairwise-?lang=en
	UADDLP: {u: 0b1, opcode: 0b00010, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTZS--vector--integer---Floating-point-Convert-to-Signed-integer--rounding-toward-Zero--vector--?lang=en
	VFCVTZS: {u: 0b0, opcode: 0b11011, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement4S: {size: 0b10, q: 0b1},
		VectorArrangement2S: {size: 0b10, q: 0b0},
		VectorArrangement2D: {size: 0b11, q: 0b1},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTZU--vector--integer---Floating-point-Convert-to-Unsigned-integer--rounding-toward-Zero--vector--?lang=en
	VFCVTZU: {u: 0b1, opcode: 0b11011, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement4S: {size: 0b10, q: 0b1},
		VectorArrangement2S: {size: 0b10, q: 0b0},
		VectorArrangement2D: {size: 0b11, q: 0b1},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQXTN--SQXTN2--Signed-saturating-extract-Narrow-?lang=en
	SQXTN: {u: 0b0, opcode: 0b10100, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement8B: {q: 0b0, size: 0b00},
		VectorArrangement4H: {q: 0b0, size: 0b01},
		VectorArrangement2S: {q: 0b0, size: 0b10},
	}},

	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQXTN--SQXTN2--Signed-saturating-extract-Narrow-?lang=en
	SQXTN2: {u: 0b0, opcode: 0b10100, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement16B: {q: 0b1, size: 0b00},
		VectorArrangement8H:  {q: 0b1, size: 0b01},
		VectorArrangement4S:  {q: 0b1, size: 0b10},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UQXTN--UQXTN2--Unsigned-saturating-extract-Narrow-?lang=en
	UQXTN: {u: 0b1, opcode: 0b10100, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQXTUN--SQXTUN2--Signed-saturating-extract-Unsigned-Narrow-?lang=en
	SQXTUN: {u: 0b1, opcode: 0b10010, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement8B: {q: 0b0, size: 0b00},
		VectorArrangement4H: {q: 0b0, size: 0b01},
		VectorArrangement2S: {q: 0b0, size: 0b10},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQXTUN--SQXTUN2--Signed-saturating-extract-Unsigned-Narrow-?lang=en
	SQXTUN2: {u: 0b1, opcode: 0b10010, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement16B: {q: 0b1, size: 0b00},
		VectorArrangement8H:  {q: 0b1, size: 0b01},
		VectorArrangement4S:  {q: 0b1, size: 0b10},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SCVTF--vector--integer---Signed-integer-Convert-to-Floating-point--vector--?lang=en
	VSCVTF: {u: 0b0, opcode: 0b11101, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2D: {q: 0b1, size: 0b01},
		VectorArrangement4S: {q: 0b1, size: 0b00},
		VectorArrangement2S: {q: 0b0, size: 0b00},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UCVTF--vector--integer---Unsigned-integer-Convert-to-Floating-point--vector--?lang=en
	VUCVTF: {u: 0b1, opcode: 0b11101, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2D: {q: 0b1, size: 0b01},
		VectorArrangement4S: {q: 0b1, size: 0b00},
		VectorArrangement2S: {q: 0b0, size: 0b00},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTL--FCVTL2--Floating-point-Convert-to-higher-precision-Long--vector--?lang=en
	FCVTL: {u: 0b0, opcode: 0b10111, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2S: {size: 0b01, q: 0b0},
		VectorArrangement4H: {size: 0b00, q: 0b0},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTN--FCVTN2--Floating-point-Convert-to-lower-precision-Narrow--vector--?lang=en
	FCVTN: {u: 0b0, opcode: 0b10110, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2S: {size: 0b01, q: 0b0},
		VectorArrangement4H: {size: 0b00, q: 0b0},
	}},
}
  2907  
// advancedSIMDThreeDifferent holds information to encode instructions as "Advanced SIMD three different" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
//
// Each entry gives the instruction's U and opcode fields plus the per-arrangement
// Q/size fields; an arrangement absent from qAndSize is not encodable for that instruction.
var advancedSIMDThreeDifferent = map[asm.Instruction]struct {
	u, opcode byte
	qAndSize  map[VectorArrangement]qAndSize
}{
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMLAL--UMLAL2--vector---Unsigned-Multiply-Add-Long--vector--?lang=en
	VUMLAL: {u: 0b1, opcode: 0b1000, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement2S: {q: 0b0, size: 0b10},
		VectorArrangement4H: {q: 0b0, size: 0b01},
		VectorArrangement8B: {q: 0b0, size: 0b00},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMULL--SMULL2--vector---Signed-Multiply-Long--vector--?lang=en
	SMULL: {u: 0b0, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement8B: {q: 0b0, size: 0b00},
		VectorArrangement4H: {q: 0b0, size: 0b01},
		VectorArrangement2S: {q: 0b0, size: 0b10},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMULL--SMULL2--vector---Signed-Multiply-Long--vector--?lang=en
	SMULL2: {u: 0b0, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement16B: {q: 0b1, size: 0b00},
		VectorArrangement8H:  {q: 0b1, size: 0b01},
		VectorArrangement4S:  {q: 0b1, size: 0b10},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMULL--UMULL2--vector---Unsigned-Multiply-Long--vector--?lang=en
	UMULL: {u: 0b1, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement8B: {q: 0b0, size: 0b00},
		VectorArrangement4H: {q: 0b0, size: 0b01},
		VectorArrangement2S: {q: 0b0, size: 0b10},
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMULL--UMULL2--vector---Unsigned-Multiply-Long--vector--?lang=en
	UMULL2: {u: 0b1, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement16B: {q: 0b1, size: 0b00},
		VectorArrangement8H:  {q: 0b1, size: 0b01},
		VectorArrangement4S:  {q: 0b1, size: 0b10},
	}},
}
  2945  
  2946  // advancedSIMDThreeSame holds information to encode instructions as "Advanced SIMD three same" in
  2947  // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  2948  var advancedSIMDThreeSame = map[asm.Instruction]struct {
  2949  	u, opcode byte
  2950  	qAndSize  map[VectorArrangement]qAndSize
  2951  }{
  2952  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/AND--vector---Bitwise-AND--vector--?lang=en
  2953  	VAND: {
  2954  		u: 0b0, opcode: 0b00011,
  2955  		qAndSize: map[VectorArrangement]qAndSize{
  2956  			VectorArrangement16B: {size: 0b00, q: 0b1},
  2957  			VectorArrangement8B:  {size: 0b00, q: 0b0},
  2958  		},
  2959  	},
  2960  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/BSL--Bitwise-Select-?lang=en
  2961  	BSL: {
  2962  		u: 0b1, opcode: 0b00011,
  2963  		qAndSize: map[VectorArrangement]qAndSize{
  2964  			VectorArrangement16B: {size: 0b01, q: 0b1},
  2965  			VectorArrangement8B:  {size: 0b01, q: 0b0},
  2966  		},
  2967  	},
  2968  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/EOR--vector---Bitwise-Exclusive-OR--vector--?lang=en
  2969  	EOR: {
  2970  		u: 0b1, opcode: 0b00011,
  2971  		qAndSize: map[VectorArrangement]qAndSize{
  2972  			VectorArrangement16B: {size: 0b00, q: 0b1},
  2973  			VectorArrangement8B:  {size: 0b00, q: 0b0},
  2974  		},
  2975  	},
  2976  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ORR--vector--register---Bitwise-inclusive-OR--vector--register--?lang=en
  2977  	VORR: {
  2978  		u: 0b0, opcode: 0b00011,
  2979  		qAndSize: map[VectorArrangement]qAndSize{
  2980  			VectorArrangement16B: {size: 0b10, q: 0b1},
  2981  			VectorArrangement8B:  {size: 0b10, q: 0b0},
  2982  		},
  2983  	},
  2984  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/BIC--vector--register---Bitwise-bit-Clear--vector--register--?lang=en
  2985  	BIC: {
  2986  		u: 0b0, opcode: 0b00011,
  2987  		qAndSize: map[VectorArrangement]qAndSize{
  2988  			VectorArrangement16B: {size: 0b01, q: 0b1},
  2989  			VectorArrangement8B:  {size: 0b01, q: 0b0},
  2990  		},
  2991  	},
  2992  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FADD--vector---Floating-point-Add--vector--?lang=en
  2993  	VFADDS: {
  2994  		u: 0b0, opcode: 0b11010,
  2995  		qAndSize: map[VectorArrangement]qAndSize{
  2996  			VectorArrangement4S: {size: 0b00, q: 0b1},
  2997  			VectorArrangement2S: {size: 0b00, q: 0b0},
  2998  		},
  2999  	},
  3000  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FADD--vector---Floating-point-Add--vector--?lang=en
  3001  	VFADDD: {
  3002  		u: 0b0, opcode: 0b11010,
  3003  		qAndSize: map[VectorArrangement]qAndSize{
  3004  			VectorArrangement2D: {size: 0b01, q: 0b1},
  3005  		},
  3006  	},
  3007  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FSUB--vector---Floating-point-Subtract--vector--?lang=en
  3008  	VFSUBS: {
  3009  		u: 0b0, opcode: 0b11010,
  3010  		qAndSize: map[VectorArrangement]qAndSize{
  3011  			VectorArrangement4S: {size: 0b10, q: 0b1},
  3012  			VectorArrangement2S: {size: 0b10, q: 0b0},
  3013  		},
  3014  	},
  3015  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FSUB--vector---Floating-point-Subtract--vector--?lang=en
  3016  	VFSUBD: {
  3017  		u: 0b0, opcode: 0b11010,
  3018  		qAndSize: map[VectorArrangement]qAndSize{
  3019  			VectorArrangement2D: {size: 0b11, q: 0b1},
  3020  		},
  3021  	},
  3022  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMAXP--Unsigned-Maximum-Pairwise-?lang=en
  3023  	UMAXP: {u: 0b1, opcode: 0b10100, qAndSize: defaultQAndSize},
  3024  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMEQ--register---Compare-bitwise-Equal--vector--?lang=en
  3025  	CMEQ: {u: 0b1, opcode: 0b10001, qAndSize: defaultQAndSize},
  3026  	// https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/ADDP--vector-
  3027  	VADDP: {u: 0b0, opcode: 0b10111, qAndSize: defaultQAndSize},
  3028  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADD--vector---Add--vector--?lang=en
  3029  	VADD: {u: 0, opcode: 0b10000, qAndSize: defaultQAndSize},
  3030  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SUB--vector---Subtract--vector--?lang=en
  3031  	VSUB: {u: 1, opcode: 0b10000, qAndSize: defaultQAndSize},
  3032  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHL--Signed-Shift-Left--register--?lang=en
  3033  	SSHL: {u: 0, opcode: 0b01000, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/USHL--Unsigned-Shift-Left--register--?lang=en
  3035  	USHL: {u: 0b1, opcode: 0b01000, qAndSize: defaultQAndSize},
  3036  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMGT--register---Compare-signed-Greater-than--vector--?lang=en
  3037  	CMGT: {u: 0b0, opcode: 0b00110, qAndSize: defaultQAndSize},
  3038  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMHI--register---Compare-unsigned-Higher--vector--?lang=en
  3039  	CMHI: {u: 0b1, opcode: 0b00110, qAndSize: defaultQAndSize},
  3040  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMGE--register---Compare-signed-Greater-than-or-Equal--vector--?lang=en
  3041  	CMGE: {u: 0b0, opcode: 0b00111, qAndSize: defaultQAndSize},
  3042  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMHS--register---Compare-unsigned-Higher-or-Same--vector--?lang=en
  3043  	CMHS: {u: 0b1, opcode: 0b00111, qAndSize: defaultQAndSize},
  3044  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCMEQ--register---Floating-point-Compare-Equal--vector--?lang=en
  3045  	FCMEQ: {
  3046  		u: 0b0, opcode: 0b11100,
  3047  		qAndSize: map[VectorArrangement]qAndSize{
  3048  			VectorArrangement4S: {size: 0b00, q: 0b1},
  3049  			VectorArrangement2S: {size: 0b00, q: 0b0},
  3050  			VectorArrangement2D: {size: 0b01, q: 0b1},
  3051  		},
  3052  	},
  3053  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCMGT--register---Floating-point-Compare-Greater-than--vector--?lang=en
  3054  	FCMGT: {
  3055  		u: 0b1, opcode: 0b11100,
  3056  		qAndSize: map[VectorArrangement]qAndSize{
  3057  			VectorArrangement4S: {size: 0b10, q: 0b1},
  3058  			VectorArrangement2S: {size: 0b10, q: 0b0},
  3059  			VectorArrangement2D: {size: 0b11, q: 0b1},
  3060  		},
  3061  	},
  3062  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCMGE--register---Floating-point-Compare-Greater-than-or-Equal--vector--?lang=en
  3063  	FCMGE: {
  3064  		u: 0b1, opcode: 0b11100,
  3065  		qAndSize: map[VectorArrangement]qAndSize{
  3066  			VectorArrangement4S: {size: 0b00, q: 0b1},
  3067  			VectorArrangement2S: {size: 0b00, q: 0b0},
  3068  			VectorArrangement2D: {size: 0b01, q: 0b1},
  3069  		},
  3070  	},
  3071  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FMIN--vector---Floating-point-minimum--vector--?lang=en
  3072  	VFMIN: {
  3073  		u: 0b0, opcode: 0b11110,
  3074  		qAndSize: map[VectorArrangement]qAndSize{
  3075  			VectorArrangement4S: {size: 0b10, q: 0b1},
  3076  			VectorArrangement2S: {size: 0b10, q: 0b0},
  3077  			VectorArrangement2D: {size: 0b11, q: 0b1},
  3078  		},
  3079  	},
  3080  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FMAX--vector---Floating-point-Maximum--vector--?lang=en
  3081  	VFMAX: {
  3082  		u: 0b0, opcode: 0b11110,
  3083  		qAndSize: map[VectorArrangement]qAndSize{
  3084  			VectorArrangement4S: {size: 0b00, q: 0b1},
  3085  			VectorArrangement2S: {size: 0b00, q: 0b0},
  3086  			VectorArrangement2D: {size: 0b01, q: 0b1},
  3087  		},
  3088  	},
  3089  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FMUL--vector---Floating-point-Multiply--vector--?lang=en
  3090  	VFMUL: {
  3091  		u: 0b1, opcode: 0b11011,
  3092  		qAndSize: map[VectorArrangement]qAndSize{
  3093  			VectorArrangement4S: {size: 0b00, q: 0b1},
  3094  			VectorArrangement2S: {size: 0b00, q: 0b0},
  3095  			VectorArrangement2D: {size: 0b01, q: 0b1},
  3096  		},
  3097  	},
  3098  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FDIV--vector---Floating-point-Divide--vector--?lang=en
  3099  	VFDIV: {
  3100  		u: 0b1, opcode: 0b11111,
  3101  		qAndSize: map[VectorArrangement]qAndSize{
  3102  			VectorArrangement4S: {size: 0b00, q: 0b1},
  3103  			VectorArrangement2S: {size: 0b00, q: 0b0},
  3104  			VectorArrangement2D: {size: 0b01, q: 0b1},
  3105  		},
  3106  	},
  3107  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/MUL--vector---Multiply--vector--?lang=en
  3108  	VMUL: {u: 0b0, opcode: 0b10011, qAndSize: defaultQAndSize},
  3109  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQADD--Signed-saturating-Add-?lang=en
  3110  	VSQADD: {u: 0b0, opcode: 0b00001, qAndSize: defaultQAndSize},
  3111  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UQADD--Unsigned-saturating-Add-?lang=en
  3112  	VUQADD: {u: 0b1, opcode: 0b00001, qAndSize: defaultQAndSize},
  3113  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMIN--Signed-Minimum--vector--?lang=en
  3114  	SMIN: {u: 0b0, opcode: 0b01101, qAndSize: defaultQAndSize},
  3115  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMAX--Signed-Maximum--vector--?lang=en
  3116  	SMAX: {u: 0b0, opcode: 0b01100, qAndSize: defaultQAndSize},
  3117  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMIN--Unsigned-Minimum--vector--?lang=en
  3118  	UMIN: {u: 0b1, opcode: 0b01101, qAndSize: defaultQAndSize},
  3119  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMAX--Unsigned-Maximum--vector--?lang=en
  3120  	UMAX: {u: 0b1, opcode: 0b01100, qAndSize: defaultQAndSize},
  3121  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/URHADD--Unsigned-Rounding-Halving-Add-?lang=en
  3122  	URHADD: {u: 0b1, opcode: 0b00010, qAndSize: defaultQAndSize},
  3123  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQSUB--Signed-saturating-Subtract-?lang=en
  3124  	VSQSUB: {u: 0b0, opcode: 0b00101, qAndSize: defaultQAndSize},
  3125  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UQSUB--Unsigned-saturating-Subtract-?lang=en
  3126  	VUQSUB: {u: 0b1, opcode: 0b00101, qAndSize: defaultQAndSize},
  3127  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/BIT--Bitwise-Insert-if-True-?lang=en
  3128  	VBIT: {u: 0b1, opcode: 0b00011, qAndSize: map[VectorArrangement]qAndSize{
  3129  		VectorArrangement8B:  {q: 0b0, size: 0b10},
  3130  		VectorArrangement16B: {q: 0b1, size: 0b10},
  3131  	}},
  3132  	SQRDMULH: {u: 0b1, opcode: 0b10110, qAndSize: map[VectorArrangement]qAndSize{
  3133  		VectorArrangement4H: {q: 0b0, size: 0b01},
  3134  		VectorArrangement8H: {q: 0b1, size: 0b01},
  3135  		VectorArrangement2S: {q: 0b0, size: 0b10},
  3136  		VectorArrangement4S: {q: 0b1, size: 0b10},
  3137  	}},
  3138  }
  3139  
// qAndSize is a pair of "Q" and "size" that appear in https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
type qAndSize struct{ q, size byte }
  3142  
// defaultQAndSize maps a vector arrangement to the default qAndSize which is encoded by many instructions.
//
// "size" selects the element width (0b00=byte, 0b01=halfword, 0b10=word, 0b11=doubleword) and
// "q" selects the vector register width (0b0=64-bit, 0b1=128-bit).
var defaultQAndSize = map[VectorArrangement]qAndSize{
	VectorArrangement8B:  {size: 0b00, q: 0b0},
	VectorArrangement16B: {size: 0b00, q: 0b1},
	VectorArrangement4H:  {size: 0b01, q: 0b0},
	VectorArrangement8H:  {size: 0b01, q: 0b1},
	VectorArrangement2S:  {size: 0b10, q: 0b0},
	VectorArrangement4S:  {size: 0b10, q: 0b1},
	VectorArrangement1D:  {size: 0b11, q: 0b0},
	VectorArrangement2D:  {size: 0b11, q: 0b1},
}
  3154  
// advancedSIMDAcrossLanes holds information to encode instructions as "Advanced SIMD across lanes" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
//
// qAndSize lists only the arrangements each instruction supports; lookups for
// other arrangements fail and are reported by the caller.
var advancedSIMDAcrossLanes = map[asm.Instruction]struct {
	u, opcode byte
	qAndSize  map[VectorArrangement]qAndSize
}{
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADDV--Add-across-Vector-?lang=en
	ADDV: {
		u: 0b0, opcode: 0b11011,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement16B: {size: 0b00, q: 0b1},
			VectorArrangement8B:  {size: 0b00, q: 0b0},
			VectorArrangement8H:  {size: 0b01, q: 0b1},
			VectorArrangement4H:  {size: 0b01, q: 0b0},
			VectorArrangement4S:  {size: 0b10, q: 0b1},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMINV--Unsigned-Minimum-across-Vector-?lang=en
	UMINV: {
		u: 0b1, opcode: 0b11010,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement16B: {size: 0b00, q: 0b1},
			VectorArrangement8B:  {size: 0b00, q: 0b0},
			VectorArrangement8H:  {size: 0b01, q: 0b1},
			VectorArrangement4H:  {size: 0b01, q: 0b0},
			VectorArrangement4S:  {size: 0b10, q: 0b1},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UADDLV--Unsigned-sum-Long-across-Vector-?lang=en
	UADDLV: {u: 0b1, opcode: 0b00011, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement16B: {size: 0b00, q: 0b1},
		VectorArrangement8B:  {size: 0b00, q: 0b0},
		VectorArrangement8H:  {size: 0b01, q: 0b1},
		VectorArrangement4H:  {size: 0b01, q: 0b0},
		VectorArrangement4S:  {size: 0b10, q: 0b1},
	}},
}
  3191  
// advancedSIMDScalarPairwise holds information to encode instructions as "Advanced SIMD scalar pairwise" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
var advancedSIMDScalarPairwise = map[asm.Instruction]struct {
	u, opcode byte
	// size maps each supported vector arrangement to the "size" field of the encoding.
	size map[VectorArrangement]byte
}{
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADDP--scalar---Add-Pair-of-elements--scalar--?lang=en
	ADDP: {u: 0b0, opcode: 0b11011, size: map[VectorArrangement]byte{VectorArrangement2D: 0b11}},
}
  3201  
  3202  // advancedSIMDCopy holds information to encode instructions as "Advanced SIMD copy" in
  3203  // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3204  var advancedSIMDCopy = map[asm.Instruction]struct {
  3205  	op byte
  3206  	// TODO: extract common implementation of resolver.
  3207  	resolver func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error)
  3208  }{
  3209  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/DUP--element---Duplicate-vector-element-to-vector-or-scalar-?lang=en
  3210  	DUPELEM: {op: 0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
  3211  		imm4 = 0b0000
  3212  		q = 0b1
  3213  
  3214  		switch arr {
  3215  		case VectorArrangementB:
  3216  			imm5 |= 0b1
  3217  			imm5 |= byte(srcIndex) << 1
  3218  		case VectorArrangementH:
  3219  			imm5 |= 0b10
  3220  			imm5 |= byte(srcIndex) << 2
  3221  		case VectorArrangementS:
  3222  			imm5 |= 0b100
  3223  			imm5 |= byte(srcIndex) << 3
  3224  		case VectorArrangementD:
  3225  			imm5 |= 0b1000
  3226  			imm5 |= byte(srcIndex) << 4
  3227  		default:
  3228  			err = fmt.Errorf("unsupported arrangement for DUPELEM: %d", arr)
  3229  		}
  3230  
  3231  		return
  3232  	}},
  3233  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/DUP--general---Duplicate-general-purpose-register-to-vector-?lang=en
  3234  	DUPGEN: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
  3235  		imm4 = 0b0001
  3236  		switch arr {
  3237  		case VectorArrangement8B:
  3238  			imm5 = 0b1
  3239  		case VectorArrangement16B:
  3240  			imm5 = 0b1
  3241  			q = 0b1
  3242  		case VectorArrangement4H:
  3243  			imm5 = 0b10
  3244  		case VectorArrangement8H:
  3245  			imm5 = 0b10
  3246  			q = 0b1
  3247  		case VectorArrangement2S:
  3248  			imm5 = 0b100
  3249  		case VectorArrangement4S:
  3250  			imm5 = 0b100
  3251  			q = 0b1
  3252  		case VectorArrangement2D:
  3253  			imm5 = 0b1000
  3254  			q = 0b1
  3255  		default:
  3256  			err = fmt.Errorf("unsupported arrangement for DUPGEN: %s", arr)
  3257  		}
  3258  		return
  3259  	}},
  3260  	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/INS--general---Insert-vector-element-from-general-purpose-register-?lang=en
  3261  	INSGEN: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
  3262  		imm4, q = 0b0011, 0b1
  3263  		switch arr {
  3264  		case VectorArrangementB:
  3265  			imm5 |= 0b1
  3266  			imm5 |= byte(dstIndex) << 1
  3267  		case VectorArrangementH:
  3268  			imm5 |= 0b10
  3269  			imm5 |= byte(dstIndex) << 2
  3270  		case VectorArrangementS:
  3271  			imm5 |= 0b100
  3272  			imm5 |= byte(dstIndex) << 3
  3273  		case VectorArrangementD:
  3274  			imm5 |= 0b1000
  3275  			imm5 |= byte(dstIndex) << 4
  3276  		default:
  3277  			err = fmt.Errorf("unsupported arrangement for INSGEN: %s", arr)
  3278  		}
  3279  		return
  3280  	}},
  3281  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMOV--Unsigned-Move-vector-element-to-general-purpose-register-?lang=en
  3282  	UMOV: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
  3283  		imm4 = 0b0111
  3284  		switch arr {
  3285  		case VectorArrangementB:
  3286  			imm5 |= 0b1
  3287  			imm5 |= byte(srcIndex) << 1
  3288  		case VectorArrangementH:
  3289  			imm5 |= 0b10
  3290  			imm5 |= byte(srcIndex) << 2
  3291  		case VectorArrangementS:
  3292  			imm5 |= 0b100
  3293  			imm5 |= byte(srcIndex) << 3
  3294  		case VectorArrangementD:
  3295  			imm5 |= 0b1000
  3296  			imm5 |= byte(srcIndex) << 4
  3297  			q = 0b1
  3298  		default:
  3299  			err = fmt.Errorf("unsupported arrangement for UMOV: %s", arr)
  3300  		}
  3301  		return
  3302  	}},
  3303  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMOV--Signed-Move-vector-element-to-general-purpose-register-?lang=en
  3304  	SMOV32: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
  3305  		imm4 = 0b0101
  3306  		switch arr {
  3307  		case VectorArrangementB:
  3308  			imm5 |= 0b1
  3309  			imm5 |= byte(srcIndex) << 1
  3310  		case VectorArrangementH:
  3311  			imm5 |= 0b10
  3312  			imm5 |= byte(srcIndex) << 2
  3313  		default:
  3314  			err = fmt.Errorf("unsupported arrangement for SMOV32: %s", arr)
  3315  		}
  3316  		return
  3317  	}},
  3318  	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/INS--element---Insert-vector-element-from-another-vector-element-?lang=en
  3319  	INSELEM: {op: 0b1, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
  3320  		q = 0b1
  3321  		switch arr {
  3322  		case VectorArrangementB:
  3323  			imm5 |= 0b1
  3324  			imm5 |= byte(dstIndex) << 1
  3325  			imm4 = byte(srcIndex)
  3326  		case VectorArrangementH:
  3327  			imm5 |= 0b10
  3328  			imm5 |= byte(dstIndex) << 2
  3329  			imm4 = byte(srcIndex) << 1
  3330  		case VectorArrangementS:
  3331  			imm5 |= 0b100
  3332  			imm5 |= byte(dstIndex) << 3
  3333  			imm4 = byte(srcIndex) << 2
  3334  		case VectorArrangementD:
  3335  			imm5 |= 0b1000
  3336  			imm5 |= byte(dstIndex) << 4
  3337  			imm4 = byte(srcIndex) << 3
  3338  		default:
  3339  			err = fmt.Errorf("unsupported arrangement for INSELEM: %d", arr)
  3340  		}
  3341  		return
  3342  	}},
  3343  }
  3344  
// advancedSIMDTableLookup holds information to encode instructions as "Advanced SIMD table lookup" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
var advancedSIMDTableLookup = map[asm.Instruction]struct {
	op, op2, Len byte
	// q maps each supported vector arrangement to the "Q" (register width) bit.
	q map[VectorArrangement]byte
}{
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/TBL--Table-vector-Lookup-?lang=en
	// TBL1 uses one table register (Len=0b00); TBL2 uses two consecutive table registers (Len=0b01).
	TBL1: {op: 0, op2: 0, Len: 0b00, q: map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8B: 0b0}},
	TBL2: {op: 0, op2: 0, Len: 0b01, q: map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8B: 0b0}},
}
  3354  
// advancedSIMDShiftByImmediate holds information to encode instructions as "Advanced SIMD shift by immediate" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
var advancedSIMDShiftByImmediate = map[asm.Instruction]struct {
	U, opcode byte
	// q maps each supported vector arrangement to the "Q" (register width) bit.
	q map[VectorArrangement]byte
	// immResolver translates the shift amount and arrangement into the immh and immb fields.
	immResolver func(shiftAmount int64, arr VectorArrangement) (immh, immb byte, err error)
}{
	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SSHLL--SSHLL2--Signed-Shift-Left-Long--immediate--
	SSHLL: {
		U: 0b0, opcode: 0b10100,
		q:           map[VectorArrangement]byte{VectorArrangement8B: 0b0, VectorArrangement4H: 0b0, VectorArrangement2S: 0b0},
		immResolver: immResolverForSIMDSiftLeftByImmediate,
	},
	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SSHLL--SSHLL2--Signed-Shift-Left-Long--immediate--
	SSHLL2: {
		U: 0b0, opcode: 0b10100,
		q:           map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8H: 0b1, VectorArrangement4S: 0b1},
		immResolver: immResolverForSIMDSiftLeftByImmediate,
	},
	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/USHLL--USHLL2--Unsigned-Shift-Left-Long--immediate--
	USHLL: {
		U: 0b1, opcode: 0b10100,
		q:           map[VectorArrangement]byte{VectorArrangement8B: 0b0, VectorArrangement4H: 0b0, VectorArrangement2S: 0b0},
		immResolver: immResolverForSIMDSiftLeftByImmediate,
	},
	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/USHLL--USHLL2--Unsigned-Shift-Left-Long--immediate--
	USHLL2: {
		U: 0b1, opcode: 0b10100,
		q:           map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8H: 0b1, VectorArrangement4S: 0b1},
		immResolver: immResolverForSIMDSiftLeftByImmediate,
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHR--Signed-Shift-Right--immediate--?lang=en
	SSHR: {
		U: 0b0, opcode: 0b00000,
		q: map[VectorArrangement]byte{
			VectorArrangement16B: 0b1, VectorArrangement8H: 0b1, VectorArrangement4S: 0b1, VectorArrangement2D: 0b1,
			VectorArrangement8B: 0b0, VectorArrangement4H: 0b0, VectorArrangement2S: 0b0,
		},
		// For right shifts the encoded immediate is (2*element_width - shift), with immh
		// marking the element width and the low bits of the value spilling into immb.
		//
		// NOTE(review): a shift amount of 0 (e.g. 8 - 0 = 8 in the byte case) would not fit
		// in the 3-bit immb field — this appears to assume callers only emit shifts in
		// 1..(element_width-1). TODO confirm against the call sites.
		immResolver: func(shiftAmount int64, arr VectorArrangement) (immh, immb byte, err error) {
			switch arr {
			case VectorArrangement16B, VectorArrangement8B:
				immh = 0b0001
				immb = 8 - byte(shiftAmount&0b111)
			case VectorArrangement8H, VectorArrangement4H:
				v := 16 - byte(shiftAmount&0b1111)
				immb = v & 0b111
				immh = 0b0010 | (v >> 3)
			case VectorArrangement4S, VectorArrangement2S:
				v := 32 - byte(shiftAmount&0b11111)
				immb = v & 0b111
				immh = 0b0100 | (v >> 3)
			case VectorArrangement2D:
				v := 64 - byte(shiftAmount&0b111111)
				immb = v & 0b111
				immh = 0b1000 | (v >> 3)
			default:
				err = fmt.Errorf("unsupported arrangement %s", arr)
			}
			return
		},
	},
}
  3417  
// advancedSIMDPermute holds information to encode instructions as "Advanced SIMD permute" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
var advancedSIMDPermute = map[asm.Instruction]struct {
	opcode byte
}{
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ZIP1--Zip-vectors--primary--?lang=en
	ZIP1: {opcode: 0b011},
}
  3425  
  3426  func immResolverForSIMDSiftLeftByImmediate(shiftAmount int64, arr VectorArrangement) (immh, immb byte, err error) {
  3427  	switch arr {
  3428  	case VectorArrangement16B, VectorArrangement8B:
  3429  		immb = byte(shiftAmount)
  3430  		immh = 0b0001
  3431  	case VectorArrangement8H, VectorArrangement4H:
  3432  		immb = byte(shiftAmount) & 0b111
  3433  		immh = 0b0010 | byte(shiftAmount>>3)
  3434  	case VectorArrangement4S, VectorArrangement2S:
  3435  		immb = byte(shiftAmount) & 0b111
  3436  		immh = 0b0100 | byte(shiftAmount>>3)
  3437  	default:
  3438  		err = fmt.Errorf("unsupported arrangement %s", arr)
  3439  	}
  3440  	return
  3441  }
  3442  
  3443  // encodeAdvancedSIMDCopy encodes instruction as "Advanced SIMD copy" in
  3444  // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3445  func (a *AssemblerImpl) encodeAdvancedSIMDCopy(srcRegBits, dstRegBits, op, imm5, imm4, q byte) {
  3446  	a.Buf.Write([]byte{
  3447  		(srcRegBits << 5) | dstRegBits,
  3448  		imm4<<3 | 0b1<<2 | srcRegBits>>3,
  3449  		imm5,
  3450  		q<<6 | op<<5 | 0b1110,
  3451  	})
  3452  }
  3453  
  3454  // encodeAdvancedSIMDThreeSame encodes instruction as  "Advanced SIMD three same" in
  3455  // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3456  func (a *AssemblerImpl) encodeAdvancedSIMDThreeSame(src1, src2, dst, opcode, size, q, u byte) {
  3457  	a.Buf.Write([]byte{
  3458  		(src2 << 5) | dst,
  3459  		opcode<<3 | 1<<2 | src2>>3,
  3460  		size<<6 | 0b1<<5 | src1,
  3461  		q<<6 | u<<5 | 0b1110,
  3462  	})
  3463  }
  3464  
  3465  // encodeAdvancedSIMDThreeDifferent encodes instruction as  "Advanced SIMD three different" in
  3466  // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3467  func (a *AssemblerImpl) encodeAdvancedSIMDThreeDifferent(src1, src2, dst, opcode, size, q, u byte) {
  3468  	a.Buf.Write([]byte{
  3469  		(src2 << 5) | dst,
  3470  		opcode<<4 | src2>>3,
  3471  		size<<6 | 0b1<<5 | src1,
  3472  		q<<6 | u<<5 | 0b1110,
  3473  	})
  3474  }
  3475  
  3476  // encodeAdvancedSIMDPermute encodes instruction as  "Advanced SIMD permute" in
  3477  // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3478  func (a *AssemblerImpl) encodeAdvancedSIMDPermute(src1, src2, dst, opcode, size, q byte) {
  3479  	a.Buf.Write([]byte{
  3480  		(src2 << 5) | dst,
  3481  		opcode<<4 | 0b1<<3 | src2>>3,
  3482  		size<<6 | src1,
  3483  		q<<6 | 0b1110,
  3484  	})
  3485  }
  3486  
  3487  func (a *AssemblerImpl) encodeVectorRegisterToVectorRegister(n *nodeImpl) (err error) {
  3488  	var srcVectorRegBits byte
  3489  	if n.srcReg != RegRZR {
  3490  		srcVectorRegBits, err = vectorRegisterBits(n.srcReg)
  3491  	} else if n.instruction == CMEQZERO {
  3492  		// CMEQZERO has RegRZR as the src, and we apply the instruction to the same register as the destination.
  3493  		srcVectorRegBits, err = vectorRegisterBits(n.dstReg)
  3494  	}
  3495  
  3496  	if err != nil {
  3497  		return err
  3498  	}
  3499  
  3500  	dstVectorRegBits, err := vectorRegisterBits(n.dstReg)
  3501  	if err != nil {
  3502  		return err
  3503  	}
  3504  
  3505  	if simdCopy, ok := advancedSIMDCopy[n.instruction]; ok {
  3506  		imm5, imm4, q, err := simdCopy.resolver(n.srcVectorIndex, n.dstVectorIndex, n.vectorArrangement)
  3507  		if err != nil {
  3508  			return err
  3509  		}
  3510  		a.encodeAdvancedSIMDCopy(srcVectorRegBits, dstVectorRegBits, simdCopy.op, imm5, imm4, q)
  3511  		return nil
  3512  	}
  3513  
  3514  	if scalarPairwise, ok := advancedSIMDScalarPairwise[n.instruction]; ok {
  3515  		// See "Advanced SIMD scalar pairwise" in
  3516  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3517  		size, ok := scalarPairwise.size[n.vectorArrangement]
  3518  		if !ok {
  3519  			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
  3520  		}
  3521  		a.Buf.Write([]byte{
  3522  			(srcVectorRegBits << 5) | dstVectorRegBits,
  3523  			scalarPairwise.opcode<<4 | 1<<3 | srcVectorRegBits>>3,
  3524  			size<<6 | 0b11<<4 | scalarPairwise.opcode>>4,
  3525  			0b1<<6 | scalarPairwise.u<<5 | 0b11110,
  3526  		})
  3527  		return
  3528  	}
  3529  
  3530  	if twoRegMisc, ok := advancedSIMDTwoRegisterMisc[n.instruction]; ok {
  3531  		// See "Advanced SIMD two-register miscellaneous" in
  3532  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3533  		qs, ok := twoRegMisc.qAndSize[n.vectorArrangement]
  3534  		if !ok {
  3535  			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
  3536  		}
  3537  		a.Buf.Write([]byte{
  3538  			(srcVectorRegBits << 5) | dstVectorRegBits,
  3539  			twoRegMisc.opcode<<4 | 0b1<<3 | srcVectorRegBits>>3,
  3540  			qs.size<<6 | 0b1<<5 | twoRegMisc.opcode>>4,
  3541  			qs.q<<6 | twoRegMisc.u<<5 | 0b01110,
  3542  		})
  3543  		return nil
  3544  	}
  3545  
  3546  	if threeSame, ok := advancedSIMDThreeSame[n.instruction]; ok {
  3547  		qs, ok := threeSame.qAndSize[n.vectorArrangement]
  3548  		if !ok {
  3549  			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
  3550  		}
  3551  		a.encodeAdvancedSIMDThreeSame(srcVectorRegBits, dstVectorRegBits, dstVectorRegBits, threeSame.opcode, qs.size, qs.q, threeSame.u)
  3552  		return nil
  3553  	}
  3554  
  3555  	if threeDifferent, ok := advancedSIMDThreeDifferent[n.instruction]; ok {
  3556  		qs, ok := threeDifferent.qAndSize[n.vectorArrangement]
  3557  		if !ok {
  3558  			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
  3559  		}
  3560  		a.encodeAdvancedSIMDThreeDifferent(srcVectorRegBits, dstVectorRegBits, dstVectorRegBits, threeDifferent.opcode, qs.size, qs.q, threeDifferent.u)
  3561  		return nil
  3562  	}
  3563  
  3564  	if acrossLanes, ok := advancedSIMDAcrossLanes[n.instruction]; ok {
  3565  		// See "Advanced SIMD across lanes" in
  3566  		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
  3567  		qs, ok := acrossLanes.qAndSize[n.vectorArrangement]
  3568  		if !ok {
  3569  			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
  3570  		}
  3571  		a.Buf.Write([]byte{
  3572  			(srcVectorRegBits << 5) | dstVectorRegBits,
  3573  			acrossLanes.opcode<<4 | 0b1<<3 | srcVectorRegBits>>3,
  3574  			qs.size<<6 | 0b11000<<1 | acrossLanes.opcode>>4,
  3575  			qs.q<<6 | acrossLanes.u<<5 | 0b01110,
  3576  		})
  3577  		return nil
  3578  	}
  3579  
  3580  	if lookup, ok := advancedSIMDTableLookup[n.instruction]; ok {
  3581  		q, ok := lookup.q[n.vectorArrangement]
  3582  		if !ok {
  3583  			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
  3584  		}
  3585  		a.Buf.Write([]byte{
  3586  			(srcVectorRegBits << 5) | dstVectorRegBits,
  3587  			lookup.Len<<5 | lookup.op<<4 | srcVectorRegBits>>3,
  3588  			lookup.op2<<6 | dstVectorRegBits,
  3589  			q<<6 | 0b1110,
  3590  		})
  3591  		return
  3592  	}
  3593  
  3594  	if shiftByImmediate, ok := advancedSIMDShiftByImmediate[n.instruction]; ok {
  3595  		immh, immb, err := shiftByImmediate.immResolver(n.srcConst, n.vectorArrangement)
  3596  		if err != nil {
  3597  			return err
  3598  		}
  3599  
  3600  		q, ok := shiftByImmediate.q[n.vectorArrangement]
  3601  		if !ok {
  3602  			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
  3603  		}
  3604  
  3605  		a.Buf.Write([]byte{
  3606  			(srcVectorRegBits << 5) | dstVectorRegBits,
  3607  			shiftByImmediate.opcode<<3 | 0b1<<2 | srcVectorRegBits>>3,
  3608  			immh<<3 | immb,
  3609  			q<<6 | shiftByImmediate.U<<5 | 0b1111,
  3610  		})
  3611  		return nil
  3612  	}
  3613  
  3614  	if permute, ok := advancedSIMDPermute[n.instruction]; ok {
  3615  		size, q := arrangementSizeQ(n.vectorArrangement)
  3616  		a.encodeAdvancedSIMDPermute(srcVectorRegBits, dstVectorRegBits, dstVectorRegBits, permute.opcode, size, q)
  3617  		return
  3618  	}
  3619  	return errorEncodingUnsupported(n)
  3620  }
  3621  
  3622  func (a *AssemblerImpl) encodeTwoVectorRegistersToVectorRegister(n *nodeImpl) (err error) {
  3623  	var srcRegBits, srcRegBits2, dstRegBits byte
  3624  	srcRegBits, err = vectorRegisterBits(n.srcReg)
  3625  	if err != nil {
  3626  		return err
  3627  	}
  3628  
  3629  	srcRegBits2, err = vectorRegisterBits(n.srcReg2)
  3630  	if err != nil {
  3631  		return err
  3632  	}
  3633  
  3634  	dstRegBits, err = vectorRegisterBits(n.dstReg)
  3635  	if err != nil {
  3636  		return err
  3637  	}
  3638  
  3639  	if threeSame, ok := advancedSIMDThreeSame[n.instruction]; ok {
  3640  		qs, ok := threeSame.qAndSize[n.vectorArrangement]
  3641  		if !ok {
  3642  			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
  3643  		}
  3644  		a.encodeAdvancedSIMDThreeSame(srcRegBits, srcRegBits2, dstRegBits, threeSame.opcode, qs.size, qs.q, threeSame.u)
  3645  		return nil
  3646  	}
  3647  
  3648  	if threeDifferent, ok := advancedSIMDThreeDifferent[n.instruction]; ok {
  3649  		qs, ok := threeDifferent.qAndSize[n.vectorArrangement]
  3650  		if !ok {
  3651  			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
  3652  		}
  3653  		a.encodeAdvancedSIMDThreeDifferent(srcRegBits, srcRegBits2, dstRegBits, threeDifferent.opcode, qs.size, qs.q, threeDifferent.u)
  3654  		return nil
  3655  	}
  3656  
  3657  	if permute, ok := advancedSIMDPermute[n.instruction]; ok {
  3658  		size, q := arrangementSizeQ(n.vectorArrangement)
  3659  		a.encodeAdvancedSIMDPermute(srcRegBits, srcRegBits2, dstRegBits, permute.opcode, size, q)
  3660  		return
  3661  	}
  3662  
  3663  	if n.instruction == EXT {
  3664  		// EXT is the only instruction in "Advanced SIMD extract", so inline the encoding here.
  3665  		// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/EXT--Extract-vector-from-pair-of-vectors-?lang=en
  3666  		var q, imm4 byte
  3667  		switch n.vectorArrangement {
  3668  		case VectorArrangement16B:
  3669  			imm4 = 0b1111 & byte(n.srcConst)
  3670  			q = 0b1
  3671  		case VectorArrangement8B:
  3672  			imm4 = 0b111 & byte(n.srcConst)
  3673  		default:
  3674  			return fmt.Errorf("invalid arrangement %s for EXT", n.vectorArrangement)
  3675  		}
  3676  		a.Buf.Write([]byte{
  3677  			(srcRegBits2 << 5) | dstRegBits,
  3678  			imm4<<3 | srcRegBits2>>3,
  3679  			srcRegBits,
  3680  			q<<6 | 0b101110,
  3681  		})
  3682  		return
  3683  	}
  3684  	return
  3685  }
  3686  
  3687  func (a *AssemblerImpl) encodeVectorRegisterToRegister(n *nodeImpl) (err error) {
  3688  	if err = checkArrangementIndexPair(n.vectorArrangement, n.srcVectorIndex); err != nil {
  3689  		return
  3690  	}
  3691  
  3692  	srcVecRegBits, err := vectorRegisterBits(n.srcReg)
  3693  	if err != nil {
  3694  		return err
  3695  	}
  3696  
  3697  	dstRegBits, err := intRegisterBits(n.dstReg)
  3698  	if err != nil {
  3699  		return err
  3700  	}
  3701  
  3702  	if simdCopy, ok := advancedSIMDCopy[n.instruction]; ok {
  3703  		imm5, imm4, q, err := simdCopy.resolver(n.srcVectorIndex, n.dstVectorIndex, n.vectorArrangement)
  3704  		if err != nil {
  3705  			return err
  3706  		}
  3707  		a.encodeAdvancedSIMDCopy(srcVecRegBits, dstRegBits, simdCopy.op, imm5, imm4, q)
  3708  		return nil
  3709  	}
  3710  	return errorEncodingUnsupported(n)
  3711  }
  3712  
  3713  func (a *AssemblerImpl) encodeRegisterToVectorRegister(n *nodeImpl) (err error) {
  3714  	srcRegBits, err := intRegisterBits(n.srcReg)
  3715  	if err != nil {
  3716  		return err
  3717  	}
  3718  
  3719  	dstVectorRegBits, err := vectorRegisterBits(n.dstReg)
  3720  	if err != nil {
  3721  		return err
  3722  	}
  3723  
  3724  	if simdCopy, ok := advancedSIMDCopy[n.instruction]; ok {
  3725  		imm5, imm4, q, err := simdCopy.resolver(n.srcVectorIndex, n.dstVectorIndex, n.vectorArrangement)
  3726  		if err != nil {
  3727  			return err
  3728  		}
  3729  		a.encodeAdvancedSIMDCopy(srcRegBits, dstVectorRegBits, simdCopy.op, imm5, imm4, q)
  3730  		return nil
  3731  	}
  3732  	return errorEncodingUnsupported(n)
  3733  }
  3734  
  3735  var zeroRegisterBits byte = 0b11111
  3736  
  3737  func isIntRegister(r asm.Register) bool {
  3738  	return RegR0 <= r && r <= RegRZR
  3739  }
  3740  
  3741  func isVectorRegister(r asm.Register) bool {
  3742  	return RegV0 <= r && r <= RegV31
  3743  }
  3744  
  3745  func isConditionalRegister(r asm.Register) bool {
  3746  	return RegCondEQ <= r && r <= RegCondNV
  3747  }
  3748  
  3749  func intRegisterBits(r asm.Register) (ret byte, err error) {
  3750  	if !isIntRegister(r) {
  3751  		err = fmt.Errorf("%s is not integer", RegisterName(r))
  3752  	} else {
  3753  		ret = byte(r - RegR0)
  3754  	}
  3755  	return
  3756  }
  3757  
  3758  func vectorRegisterBits(r asm.Register) (ret byte, err error) {
  3759  	if !isVectorRegister(r) {
  3760  		err = fmt.Errorf("%s is not vector", RegisterName(r))
  3761  	} else {
  3762  		ret = byte(r - RegV0)
  3763  	}
  3764  	return
  3765  }
  3766  
  3767  func registerBits(r asm.Register) (ret byte) {
  3768  	if isIntRegister(r) {
  3769  		ret = byte(r - RegR0)
  3770  	} else {
  3771  		ret = byte(r - RegV0)
  3772  	}
  3773  	return
  3774  }