github.com/tetratelabs/wazero@v1.7.1/internal/engine/wazevo/backend/isa/arm64/instr.go

github.com/tetratelabs/wazero@v1.7.1/internal/engine/wazevo/backend/isa/arm64/instr.go (about)

     1  package arm64
     2  
     3  import (
     4  	"fmt"
     5  	"math"
     6  
     7  	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
     8  	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
     9  	"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
    10  )
    11  
    12  type (
    13  	// instruction represents either a real instruction in arm64, or the meta instructions
    14  	// that are convenient for code generation. For example, inline constants are also treated
    15  	// as instructions.
    16  	//
    17  	// Basically, each instruction knows how to get encoded in binaries. Hence, the final output of compilation
    18  	// can be considered equivalent to the sequence of such instructions.
    19  	//
    20  	// Each field is interpreted depending on the kind.
    21  	//
    22  	// TODO: optimize the layout later once the impl settles.
    23  	instruction struct {
    24  		prev, next          *instruction
    25  		u1, u2, u3          uint64
    26  		rd, rm, rn, ra      operand
    27  		amode               addressMode
    28  		kind                instructionKind
    29  		addedBeforeRegAlloc bool
    30  	}
    31  
    32  	// instructionKind represents the kind of instruction.
    33  	// This controls how the instruction struct is interpreted.
    34  	instructionKind byte
    35  )
    36  
    37  func asNop0(i *instruction) {
    38  	i.kind = nop0
    39  }
    40  
    41  func setNext(i, next *instruction) {
    42  	i.next = next
    43  }
    44  
    45  func setPrev(i, prev *instruction) {
    46  	i.prev = prev
    47  }
    48  
    49  // IsCall implements regalloc.Instr IsCall.
    50  func (i *instruction) IsCall() bool {
    51  	return i.kind == call
    52  }
    53  
    54  // IsIndirectCall implements regalloc.Instr IsIndirectCall.
    55  func (i *instruction) IsIndirectCall() bool {
    56  	return i.kind == callInd
    57  }
    58  
    59  // IsReturn implements regalloc.Instr IsReturn.
    60  func (i *instruction) IsReturn() bool {
    61  	return i.kind == ret
    62  }
    63  
    64  // Next implements regalloc.Instr Next.
    65  func (i *instruction) Next() regalloc.Instr {
    66  	return i.next
    67  }
    68  
    69  // Prev implements regalloc.Instr Prev.
    70  func (i *instruction) Prev() regalloc.Instr {
    71  	return i.prev
    72  }
    73  
    74  // AddedBeforeRegAlloc implements regalloc.Instr AddedBeforeRegAlloc.
    75  func (i *instruction) AddedBeforeRegAlloc() bool {
    76  	return i.addedBeforeRegAlloc
    77  }
    78  
    79  type defKind byte
    80  
    81  const (
    82  	defKindNone defKind = iota + 1
    83  	defKindRD
    84  	defKindCall
    85  )
    86  
    87  var defKinds = [numInstructionKinds]defKind{
    88  	adr:                  defKindRD,
    89  	aluRRR:               defKindRD,
    90  	aluRRRR:              defKindRD,
    91  	aluRRImm12:           defKindRD,
    92  	aluRRBitmaskImm:      defKindRD,
    93  	aluRRRShift:          defKindRD,
    94  	aluRRImmShift:        defKindRD,
    95  	aluRRRExtend:         defKindRD,
    96  	bitRR:                defKindRD,
    97  	movZ:                 defKindRD,
    98  	movK:                 defKindRD,
    99  	movN:                 defKindRD,
   100  	mov32:                defKindRD,
   101  	mov64:                defKindRD,
   102  	fpuMov64:             defKindRD,
   103  	fpuMov128:            defKindRD,
   104  	fpuRR:                defKindRD,
   105  	fpuRRR:               defKindRD,
   106  	nop0:                 defKindNone,
   107  	call:                 defKindCall,
   108  	callInd:              defKindCall,
   109  	ret:                  defKindNone,
   110  	store8:               defKindNone,
   111  	store16:              defKindNone,
   112  	store32:              defKindNone,
   113  	store64:              defKindNone,
   114  	exitSequence:         defKindNone,
   115  	condBr:               defKindNone,
   116  	br:                   defKindNone,
   117  	brTableSequence:      defKindNone,
   118  	cSet:                 defKindRD,
   119  	extend:               defKindRD,
   120  	fpuCmp:               defKindNone,
   121  	uLoad8:               defKindRD,
   122  	uLoad16:              defKindRD,
   123  	uLoad32:              defKindRD,
   124  	sLoad8:               defKindRD,
   125  	sLoad16:              defKindRD,
   126  	sLoad32:              defKindRD,
   127  	uLoad64:              defKindRD,
   128  	fpuLoad32:            defKindRD,
   129  	fpuLoad64:            defKindRD,
   130  	fpuLoad128:           defKindRD,
   131  	vecLoad1R:            defKindRD,
   132  	loadFpuConst32:       defKindRD,
   133  	loadFpuConst64:       defKindRD,
   134  	loadFpuConst128:      defKindRD,
   135  	fpuStore32:           defKindNone,
   136  	fpuStore64:           defKindNone,
   137  	fpuStore128:          defKindNone,
   138  	udf:                  defKindNone,
   139  	cSel:                 defKindRD,
   140  	fpuCSel:              defKindRD,
   141  	movToVec:             defKindRD,
   142  	movFromVec:           defKindRD,
   143  	movFromVecSigned:     defKindRD,
   144  	vecDup:               defKindRD,
   145  	vecDupElement:        defKindRD,
   146  	vecExtract:           defKindRD,
   147  	vecMisc:              defKindRD,
   148  	vecMovElement:        defKindRD,
   149  	vecLanes:             defKindRD,
   150  	vecShiftImm:          defKindRD,
   151  	vecTbl:               defKindRD,
   152  	vecTbl2:              defKindRD,
   153  	vecPermute:           defKindRD,
   154  	vecRRR:               defKindRD,
   155  	vecRRRRewrite:        defKindNone,
   156  	fpuToInt:             defKindRD,
   157  	intToFpu:             defKindRD,
   158  	cCmpImm:              defKindNone,
   159  	movToFPSR:            defKindNone,
   160  	movFromFPSR:          defKindRD,
   161  	emitSourceOffsetInfo: defKindNone,
   162  	atomicRmw:            defKindRD,
   163  	atomicCas:            defKindNone,
   164  	atomicLoad:           defKindRD,
   165  	atomicStore:          defKindNone,
   166  	dmb:                  defKindNone,
   167  	loadConstBlockArg:    defKindRD,
   168  }
   169  
   170  // Defs returns the list of regalloc.VReg that are defined by the instruction.
   171  // In order to reduce the number of allocations, the caller can pass the slice to be used.
   172  func (i *instruction) Defs(regs *[]regalloc.VReg) []regalloc.VReg {
   173  	*regs = (*regs)[:0]
   174  	switch defKinds[i.kind] {
   175  	case defKindNone:
   176  	case defKindRD:
   177  		*regs = append(*regs, i.rd.nr())
   178  	case defKindCall:
   179  		_, _, retIntRealRegs, retFloatRealRegs, _ := backend.ABIInfoFromUint64(i.u2)
   180  		for i := byte(0); i < retIntRealRegs; i++ {
   181  			*regs = append(*regs, regInfo.RealRegToVReg[intParamResultRegs[i]])
   182  		}
   183  		for i := byte(0); i < retFloatRealRegs; i++ {
   184  			*regs = append(*regs, regInfo.RealRegToVReg[floatParamResultRegs[i]])
   185  		}
   186  	default:
   187  		panic(fmt.Sprintf("defKind for %v not defined", i))
   188  	}
   189  	return *regs
   190  }
   191  
   192  // AssignDef implements regalloc.Instr AssignDef.
   193  func (i *instruction) AssignDef(reg regalloc.VReg) {
   194  	switch defKinds[i.kind] {
   195  	case defKindNone:
   196  	case defKindRD:
   197  		i.rd = i.rd.assignReg(reg)
   198  	case defKindCall:
   199  		panic("BUG: call instructions shouldn't be assigned")
   200  	default:
   201  		panic(fmt.Sprintf("defKind for %v not defined", i))
   202  	}
   203  }
   204  
   205  type useKind byte
   206  
   207  const (
   208  	useKindNone useKind = iota + 1
   209  	useKindRN
   210  	useKindRNRM
   211  	useKindRNRMRA
   212  	useKindRNRN1RM
   213  	useKindCall
   214  	useKindCallInd
   215  	useKindAMode
   216  	useKindRNAMode
   217  	useKindCond
   218  	// useKindRDRewrite indicates an instruction where RD is used both as a source and destination.
   219  	// A temporary register for RD must be allocated explicitly with the source copied to this
   220  	// register before the instruction and the value copied from this register to the instruction
   221  	// return register.
   222  	useKindRDRewrite
   223  )
   224  
   225  var useKinds = [numInstructionKinds]useKind{
   226  	udf:                  useKindNone,
   227  	aluRRR:               useKindRNRM,
   228  	aluRRRR:              useKindRNRMRA,
   229  	aluRRImm12:           useKindRN,
   230  	aluRRBitmaskImm:      useKindRN,
   231  	aluRRRShift:          useKindRNRM,
   232  	aluRRImmShift:        useKindRN,
   233  	aluRRRExtend:         useKindRNRM,
   234  	bitRR:                useKindRN,
   235  	movZ:                 useKindNone,
   236  	movK:                 useKindNone,
   237  	movN:                 useKindNone,
   238  	mov32:                useKindRN,
   239  	mov64:                useKindRN,
   240  	fpuMov64:             useKindRN,
   241  	fpuMov128:            useKindRN,
   242  	fpuRR:                useKindRN,
   243  	fpuRRR:               useKindRNRM,
   244  	nop0:                 useKindNone,
   245  	call:                 useKindCall,
   246  	callInd:              useKindCallInd,
   247  	ret:                  useKindNone,
   248  	store8:               useKindRNAMode,
   249  	store16:              useKindRNAMode,
   250  	store32:              useKindRNAMode,
   251  	store64:              useKindRNAMode,
   252  	exitSequence:         useKindRN,
   253  	condBr:               useKindCond,
   254  	br:                   useKindNone,
   255  	brTableSequence:      useKindRN,
   256  	cSet:                 useKindNone,
   257  	extend:               useKindRN,
   258  	fpuCmp:               useKindRNRM,
   259  	uLoad8:               useKindAMode,
   260  	uLoad16:              useKindAMode,
   261  	uLoad32:              useKindAMode,
   262  	sLoad8:               useKindAMode,
   263  	sLoad16:              useKindAMode,
   264  	sLoad32:              useKindAMode,
   265  	uLoad64:              useKindAMode,
   266  	fpuLoad32:            useKindAMode,
   267  	fpuLoad64:            useKindAMode,
   268  	fpuLoad128:           useKindAMode,
   269  	fpuStore32:           useKindRNAMode,
   270  	fpuStore64:           useKindRNAMode,
   271  	fpuStore128:          useKindRNAMode,
   272  	loadFpuConst32:       useKindNone,
   273  	loadFpuConst64:       useKindNone,
   274  	loadFpuConst128:      useKindNone,
   275  	vecLoad1R:            useKindRN,
   276  	cSel:                 useKindRNRM,
   277  	fpuCSel:              useKindRNRM,
   278  	movToVec:             useKindRN,
   279  	movFromVec:           useKindRN,
   280  	movFromVecSigned:     useKindRN,
   281  	vecDup:               useKindRN,
   282  	vecDupElement:        useKindRN,
   283  	vecExtract:           useKindRNRM,
   284  	cCmpImm:              useKindRN,
   285  	vecMisc:              useKindRN,
   286  	vecMovElement:        useKindRN,
   287  	vecLanes:             useKindRN,
   288  	vecShiftImm:          useKindRN,
   289  	vecTbl:               useKindRNRM,
   290  	vecTbl2:              useKindRNRN1RM,
   291  	vecRRR:               useKindRNRM,
   292  	vecRRRRewrite:        useKindRDRewrite,
   293  	vecPermute:           useKindRNRM,
   294  	fpuToInt:             useKindRN,
   295  	intToFpu:             useKindRN,
   296  	movToFPSR:            useKindRN,
   297  	movFromFPSR:          useKindNone,
   298  	adr:                  useKindNone,
   299  	emitSourceOffsetInfo: useKindNone,
   300  	atomicRmw:            useKindRNRM,
   301  	atomicCas:            useKindRDRewrite,
   302  	atomicLoad:           useKindRN,
   303  	atomicStore:          useKindRNRM,
   304  	loadConstBlockArg:    useKindNone,
   305  	dmb:                  useKindNone,
   306  }
   307  
   308  // Uses returns the list of regalloc.VReg that are used by the instruction.
   309  // In order to reduce the number of allocations, the caller can pass the slice to be used.
   310  func (i *instruction) Uses(regs *[]regalloc.VReg) []regalloc.VReg {
   311  	*regs = (*regs)[:0]
   312  	switch useKinds[i.kind] {
   313  	case useKindNone:
   314  	case useKindRN:
   315  		if rn := i.rn.reg(); rn.Valid() {
   316  			*regs = append(*regs, rn)
   317  		}
   318  	case useKindRNRM:
   319  		if rn := i.rn.reg(); rn.Valid() {
   320  			*regs = append(*regs, rn)
   321  		}
   322  		if rm := i.rm.reg(); rm.Valid() {
   323  			*regs = append(*regs, rm)
   324  		}
   325  	case useKindRNRMRA:
   326  		if rn := i.rn.reg(); rn.Valid() {
   327  			*regs = append(*regs, rn)
   328  		}
   329  		if rm := i.rm.reg(); rm.Valid() {
   330  			*regs = append(*regs, rm)
   331  		}
   332  		if ra := i.ra.reg(); ra.Valid() {
   333  			*regs = append(*regs, ra)
   334  		}
   335  	case useKindRNRN1RM:
   336  		if rn := i.rn.reg(); rn.Valid() && rn.IsRealReg() {
   337  			rn1 := regalloc.FromRealReg(rn.RealReg()+1, rn.RegType())
   338  			*regs = append(*regs, rn, rn1)
   339  		}
   340  		if rm := i.rm.reg(); rm.Valid() {
   341  			*regs = append(*regs, rm)
   342  		}
   343  	case useKindAMode:
   344  		if amodeRN := i.amode.rn; amodeRN.Valid() {
   345  			*regs = append(*regs, amodeRN)
   346  		}
   347  		if amodeRM := i.amode.rm; amodeRM.Valid() {
   348  			*regs = append(*regs, amodeRM)
   349  		}
   350  	case useKindRNAMode:
   351  		*regs = append(*regs, i.rn.reg())
   352  		if amodeRN := i.amode.rn; amodeRN.Valid() {
   353  			*regs = append(*regs, amodeRN)
   354  		}
   355  		if amodeRM := i.amode.rm; amodeRM.Valid() {
   356  			*regs = append(*regs, amodeRM)
   357  		}
   358  	case useKindCond:
   359  		cnd := cond(i.u1)
   360  		if cnd.kind() != condKindCondFlagSet {
   361  			*regs = append(*regs, cnd.register())
   362  		}
   363  	case useKindCallInd:
   364  		*regs = append(*regs, i.rn.nr())
   365  		fallthrough
   366  	case useKindCall:
   367  		argIntRealRegs, argFloatRealRegs, _, _, _ := backend.ABIInfoFromUint64(i.u2)
   368  		for i := byte(0); i < argIntRealRegs; i++ {
   369  			*regs = append(*regs, regInfo.RealRegToVReg[intParamResultRegs[i]])
   370  		}
   371  		for i := byte(0); i < argFloatRealRegs; i++ {
   372  			*regs = append(*regs, regInfo.RealRegToVReg[floatParamResultRegs[i]])
   373  		}
   374  	case useKindRDRewrite:
   375  		*regs = append(*regs, i.rn.reg())
   376  		*regs = append(*regs, i.rm.reg())
   377  		*regs = append(*regs, i.rd.reg())
   378  	default:
   379  		panic(fmt.Sprintf("useKind for %v not defined", i))
   380  	}
   381  	return *regs
   382  }
   383  
   384  func (i *instruction) AssignUse(index int, reg regalloc.VReg) {
   385  	switch useKinds[i.kind] {
   386  	case useKindNone:
   387  	case useKindRN:
   388  		if rn := i.rn.reg(); rn.Valid() {
   389  			i.rn = i.rn.assignReg(reg)
   390  		}
   391  	case useKindRNRM:
   392  		if index == 0 {
   393  			if rn := i.rn.reg(); rn.Valid() {
   394  				i.rn = i.rn.assignReg(reg)
   395  			}
   396  		} else {
   397  			if rm := i.rm.reg(); rm.Valid() {
   398  				i.rm = i.rm.assignReg(reg)
   399  			}
   400  		}
   401  	case useKindRDRewrite:
   402  		if index == 0 {
   403  			if rn := i.rn.reg(); rn.Valid() {
   404  				i.rn = i.rn.assignReg(reg)
   405  			}
   406  		} else if index == 1 {
   407  			if rm := i.rm.reg(); rm.Valid() {
   408  				i.rm = i.rm.assignReg(reg)
   409  			}
   410  		} else {
   411  			if rd := i.rd.reg(); rd.Valid() {
   412  				i.rd = i.rd.assignReg(reg)
   413  			}
   414  		}
   415  	case useKindRNRN1RM:
   416  		if index == 0 {
   417  			if rn := i.rn.reg(); rn.Valid() {
   418  				i.rn = i.rn.assignReg(reg)
   419  			}
   420  			if rn1 := i.rn.reg() + 1; rn1.Valid() {
   421  				i.rm = i.rm.assignReg(reg + 1)
   422  			}
   423  		} else {
   424  			if rm := i.rm.reg(); rm.Valid() {
   425  				i.rm = i.rm.assignReg(reg)
   426  			}
   427  		}
   428  	case useKindRNRMRA:
   429  		if index == 0 {
   430  			if rn := i.rn.reg(); rn.Valid() {
   431  				i.rn = i.rn.assignReg(reg)
   432  			}
   433  		} else if index == 1 {
   434  			if rm := i.rm.reg(); rm.Valid() {
   435  				i.rm = i.rm.assignReg(reg)
   436  			}
   437  		} else {
   438  			if ra := i.ra.reg(); ra.Valid() {
   439  				i.ra = i.ra.assignReg(reg)
   440  			}
   441  		}
   442  	case useKindAMode:
   443  		if index == 0 {
   444  			if amodeRN := i.amode.rn; amodeRN.Valid() {
   445  				i.amode.rn = reg
   446  			}
   447  		} else {
   448  			if amodeRM := i.amode.rm; amodeRM.Valid() {
   449  				i.amode.rm = reg
   450  			}
   451  		}
   452  	case useKindRNAMode:
   453  		if index == 0 {
   454  			i.rn = i.rn.assignReg(reg)
   455  		} else if index == 1 {
   456  			if amodeRN := i.amode.rn; amodeRN.Valid() {
   457  				i.amode.rn = reg
   458  			} else {
   459  				panic("BUG")
   460  			}
   461  		} else {
   462  			if amodeRM := i.amode.rm; amodeRM.Valid() {
   463  				i.amode.rm = reg
   464  			} else {
   465  				panic("BUG")
   466  			}
   467  		}
   468  	case useKindCond:
   469  		c := cond(i.u1)
   470  		switch c.kind() {
   471  		case condKindRegisterZero:
   472  			i.u1 = uint64(registerAsRegZeroCond(reg))
   473  		case condKindRegisterNotZero:
   474  			i.u1 = uint64(registerAsRegNotZeroCond(reg))
   475  		}
   476  	case useKindCall:
   477  		panic("BUG: call instructions shouldn't be assigned")
   478  	case useKindCallInd:
   479  		i.rn = i.rn.assignReg(reg)
   480  	default:
   481  		panic(fmt.Sprintf("useKind for %v not defined", i))
   482  	}
   483  }
   484  
   485  func (i *instruction) asCall(ref ssa.FuncRef, abi *backend.FunctionABI) {
   486  	i.kind = call
   487  	i.u1 = uint64(ref)
   488  	if abi != nil {
   489  		i.u2 = abi.ABIInfoAsUint64()
   490  	}
   491  }
   492  
   493  func (i *instruction) asCallIndirect(ptr regalloc.VReg, abi *backend.FunctionABI) {
   494  	i.kind = callInd
   495  	i.rn = operandNR(ptr)
   496  	if abi != nil {
   497  		i.u2 = abi.ABIInfoAsUint64()
   498  	}
   499  }
   500  
   501  func (i *instruction) callFuncRef() ssa.FuncRef {
   502  	return ssa.FuncRef(i.u1)
   503  }
   504  
   505  // shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false)
   506  func (i *instruction) asMOVZ(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) {
   507  	i.kind = movZ
   508  	i.rd = operandNR(dst)
   509  	i.u1 = imm
   510  	i.u2 = shift
   511  	if dst64bit {
   512  		i.u3 = 1
   513  	}
   514  }
   515  
   516  // shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false)
   517  func (i *instruction) asMOVK(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) {
   518  	i.kind = movK
   519  	i.rd = operandNR(dst)
   520  	i.u1 = imm
   521  	i.u2 = shift
   522  	if dst64bit {
   523  		i.u3 = 1
   524  	}
   525  }
   526  
   527  // shift must be divided by 16 and must be in range 0-3 (if dst64bit is true) or 0-1 (if dst64bit is false)
   528  func (i *instruction) asMOVN(dst regalloc.VReg, imm uint64, shift uint64, dst64bit bool) {
   529  	i.kind = movN
   530  	i.rd = operandNR(dst)
   531  	i.u1 = imm
   532  	i.u2 = shift
   533  	if dst64bit {
   534  		i.u3 = 1
   535  	}
   536  }
   537  
   538  func (i *instruction) asNop0() *instruction {
   539  	i.kind = nop0
   540  	return i
   541  }
   542  
   543  func (i *instruction) asNop0WithLabel(l label) {
   544  	i.kind = nop0
   545  	i.u1 = uint64(l)
   546  }
   547  
   548  func (i *instruction) nop0Label() label {
   549  	return label(i.u1)
   550  }
   551  
   552  func (i *instruction) asRet() {
   553  	i.kind = ret
   554  }
   555  
   556  func (i *instruction) asStorePair64(src1, src2 regalloc.VReg, amode addressMode) {
   557  	i.kind = storeP64
   558  	i.rn = operandNR(src1)
   559  	i.rm = operandNR(src2)
   560  	i.amode = amode
   561  }
   562  
   563  func (i *instruction) asLoadPair64(src1, src2 regalloc.VReg, amode addressMode) {
   564  	i.kind = loadP64
   565  	i.rn = operandNR(src1)
   566  	i.rm = operandNR(src2)
   567  	i.amode = amode
   568  }
   569  
   570  func (i *instruction) asStore(src operand, amode addressMode, sizeInBits byte) {
   571  	switch sizeInBits {
   572  	case 8:
   573  		i.kind = store8
   574  	case 16:
   575  		i.kind = store16
   576  	case 32:
   577  		if src.reg().RegType() == regalloc.RegTypeInt {
   578  			i.kind = store32
   579  		} else {
   580  			i.kind = fpuStore32
   581  		}
   582  	case 64:
   583  		if src.reg().RegType() == regalloc.RegTypeInt {
   584  			i.kind = store64
   585  		} else {
   586  			i.kind = fpuStore64
   587  		}
   588  	case 128:
   589  		i.kind = fpuStore128
   590  	}
   591  	i.rn = src
   592  	i.amode = amode
   593  }
   594  
   595  func (i *instruction) asSLoad(dst operand, amode addressMode, sizeInBits byte) {
   596  	switch sizeInBits {
   597  	case 8:
   598  		i.kind = sLoad8
   599  	case 16:
   600  		i.kind = sLoad16
   601  	case 32:
   602  		i.kind = sLoad32
   603  	default:
   604  		panic("BUG")
   605  	}
   606  	i.rd = dst
   607  	i.amode = amode
   608  }
   609  
   610  func (i *instruction) asULoad(dst operand, amode addressMode, sizeInBits byte) {
   611  	switch sizeInBits {
   612  	case 8:
   613  		i.kind = uLoad8
   614  	case 16:
   615  		i.kind = uLoad16
   616  	case 32:
   617  		i.kind = uLoad32
   618  	case 64:
   619  		i.kind = uLoad64
   620  	}
   621  	i.rd = dst
   622  	i.amode = amode
   623  }
   624  
   625  func (i *instruction) asFpuLoad(dst operand, amode addressMode, sizeInBits byte) {
   626  	switch sizeInBits {
   627  	case 32:
   628  		i.kind = fpuLoad32
   629  	case 64:
   630  		i.kind = fpuLoad64
   631  	case 128:
   632  		i.kind = fpuLoad128
   633  	}
   634  	i.rd = dst
   635  	i.amode = amode
   636  }
   637  
   638  func (i *instruction) asVecLoad1R(rd, rn operand, arr vecArrangement) {
   639  	// NOTE: currently only has support for no-offset loads, though it is suspicious that
   640  	// we would need to support offset load (that is only available for post-index).
   641  	i.kind = vecLoad1R
   642  	i.rd = rd
   643  	i.rn = rn
   644  	i.u1 = uint64(arr)
   645  }
   646  
   647  func (i *instruction) asCSet(rd regalloc.VReg, mask bool, c condFlag) {
   648  	i.kind = cSet
   649  	i.rd = operandNR(rd)
   650  	i.u1 = uint64(c)
   651  	if mask {
   652  		i.u2 = 1
   653  	}
   654  }
   655  
   656  func (i *instruction) asCSel(rd, rn, rm operand, c condFlag, _64bit bool) {
   657  	i.kind = cSel
   658  	i.rd = rd
   659  	i.rn = rn
   660  	i.rm = rm
   661  	i.u1 = uint64(c)
   662  	if _64bit {
   663  		i.u3 = 1
   664  	}
   665  }
   666  
   667  func (i *instruction) asFpuCSel(rd, rn, rm operand, c condFlag, _64bit bool) {
   668  	i.kind = fpuCSel
   669  	i.rd = rd
   670  	i.rn = rn
   671  	i.rm = rm
   672  	i.u1 = uint64(c)
   673  	if _64bit {
   674  		i.u3 = 1
   675  	}
   676  }
   677  
   678  func (i *instruction) asBr(target label) {
   679  	if target == labelReturn {
   680  		panic("BUG: call site should special case for returnLabel")
   681  	}
   682  	i.kind = br
   683  	i.u1 = uint64(target)
   684  }
   685  
   686  func (i *instruction) asBrTableSequence(indexReg regalloc.VReg, targetIndex, targetCounts int) {
   687  	i.kind = brTableSequence
   688  	i.rn = operandNR(indexReg)
   689  	i.u1 = uint64(targetIndex)
   690  	i.u2 = uint64(targetCounts)
   691  }
   692  
   693  func (i *instruction) brTableSequenceOffsetsResolved() {
   694  	i.u3 = 1 // indicate that the offsets are resolved, for debugging.
   695  }
   696  
   697  func (i *instruction) brLabel() label {
   698  	return label(i.u1)
   699  }
   700  
   701  // brOffsetResolved is called when the target label is resolved.
   702  func (i *instruction) brOffsetResolve(offset int64) {
   703  	i.u2 = uint64(offset)
   704  	i.u3 = 1 // indicate that the offset is resolved, for debugging.
   705  }
   706  
   707  func (i *instruction) brOffset() int64 {
   708  	return int64(i.u2)
   709  }
   710  
   711  // asCondBr encodes a conditional branch instruction. is64bit is only needed when cond is not flag.
   712  func (i *instruction) asCondBr(c cond, target label, is64bit bool) {
   713  	i.kind = condBr
   714  	i.u1 = c.asUint64()
   715  	i.u2 = uint64(target)
   716  	if is64bit {
   717  		i.u3 = 1
   718  	}
   719  }
   720  
   721  func (i *instruction) setCondBrTargets(target label) {
   722  	i.u2 = uint64(target)
   723  }
   724  
   725  func (i *instruction) condBrLabel() label {
   726  	return label(i.u2)
   727  }
   728  
   729  // condBrOffsetResolve is called when the target label is resolved.
   730  func (i *instruction) condBrOffsetResolve(offset int64) {
   731  	i.rd.data = uint64(offset)
   732  	i.rd.data2 = 1 // indicate that the offset is resolved, for debugging.
   733  }
   734  
   735  // condBrOffsetResolved returns true if condBrOffsetResolve is already called.
   736  func (i *instruction) condBrOffsetResolved() bool {
   737  	return i.rd.data2 == 1
   738  }
   739  
   740  func (i *instruction) condBrOffset() int64 {
   741  	return int64(i.rd.data)
   742  }
   743  
   744  func (i *instruction) condBrCond() cond {
   745  	return cond(i.u1)
   746  }
   747  
   748  func (i *instruction) condBr64bit() bool {
   749  	return i.u3 == 1
   750  }
   751  
   752  func (i *instruction) asLoadFpuConst32(rd regalloc.VReg, raw uint64) {
   753  	i.kind = loadFpuConst32
   754  	i.u1 = raw
   755  	i.rd = operandNR(rd)
   756  }
   757  
   758  func (i *instruction) asLoadFpuConst64(rd regalloc.VReg, raw uint64) {
   759  	i.kind = loadFpuConst64
   760  	i.u1 = raw
   761  	i.rd = operandNR(rd)
   762  }
   763  
   764  func (i *instruction) asLoadFpuConst128(rd regalloc.VReg, lo, hi uint64) {
   765  	i.kind = loadFpuConst128
   766  	i.u1 = lo
   767  	i.u2 = hi
   768  	i.rd = operandNR(rd)
   769  }
   770  
   771  func (i *instruction) asFpuCmp(rn, rm operand, is64bit bool) {
   772  	i.kind = fpuCmp
   773  	i.rn, i.rm = rn, rm
   774  	if is64bit {
   775  		i.u3 = 1
   776  	}
   777  }
   778  
   779  func (i *instruction) asCCmpImm(rn operand, imm uint64, c condFlag, flag byte, is64bit bool) {
   780  	i.kind = cCmpImm
   781  	i.rn = rn
   782  	i.rm.data = imm
   783  	i.u1 = uint64(c)
   784  	i.u2 = uint64(flag)
   785  	if is64bit {
   786  		i.u3 = 1
   787  	}
   788  }
   789  
   790  // asALU setups a basic ALU instruction.
   791  func (i *instruction) asALU(aluOp aluOp, rd, rn, rm operand, dst64bit bool) {
   792  	switch rm.kind {
   793  	case operandKindNR:
   794  		i.kind = aluRRR
   795  	case operandKindSR:
   796  		i.kind = aluRRRShift
   797  	case operandKindER:
   798  		i.kind = aluRRRExtend
   799  	case operandKindImm12:
   800  		i.kind = aluRRImm12
   801  	default:
   802  		panic("BUG")
   803  	}
   804  	i.u1 = uint64(aluOp)
   805  	i.rd, i.rn, i.rm = rd, rn, rm
   806  	if dst64bit {
   807  		i.u3 = 1
   808  	}
   809  }
   810  
   811  // asALU setups a basic ALU instruction.
   812  func (i *instruction) asALURRRR(aluOp aluOp, rd, rn, rm, ra operand, dst64bit bool) {
   813  	i.kind = aluRRRR
   814  	i.u1 = uint64(aluOp)
   815  	i.rd, i.rn, i.rm, i.ra = rd, rn, rm, ra
   816  	if dst64bit {
   817  		i.u3 = 1
   818  	}
   819  }
   820  
   821  // asALUShift setups a shift based ALU instruction.
   822  func (i *instruction) asALUShift(aluOp aluOp, rd, rn, rm operand, dst64bit bool) {
   823  	switch rm.kind {
   824  	case operandKindNR:
   825  		i.kind = aluRRR // If the shift amount op is a register, then the instruction is encoded as a normal ALU instruction with two register operands.
   826  	case operandKindShiftImm:
   827  		i.kind = aluRRImmShift
   828  	default:
   829  		panic("BUG")
   830  	}
   831  	i.u1 = uint64(aluOp)
   832  	i.rd, i.rn, i.rm = rd, rn, rm
   833  	if dst64bit {
   834  		i.u3 = 1
   835  	}
   836  }
   837  
   838  func (i *instruction) asALUBitmaskImm(aluOp aluOp, rd, rn regalloc.VReg, imm uint64, dst64bit bool) {
   839  	i.kind = aluRRBitmaskImm
   840  	i.u1 = uint64(aluOp)
   841  	i.rn, i.rd = operandNR(rn), operandNR(rd)
   842  	i.u2 = imm
   843  	if dst64bit {
   844  		i.u3 = 1
   845  	}
   846  }
   847  
   848  func (i *instruction) asMovToFPSR(rn regalloc.VReg) {
   849  	i.kind = movToFPSR
   850  	i.rn = operandNR(rn)
   851  }
   852  
   853  func (i *instruction) asMovFromFPSR(rd regalloc.VReg) {
   854  	i.kind = movFromFPSR
   855  	i.rd = operandNR(rd)
   856  }
   857  
   858  func (i *instruction) asBitRR(bitOp bitOp, rd, rn regalloc.VReg, is64bit bool) {
   859  	i.kind = bitRR
   860  	i.rn, i.rd = operandNR(rn), operandNR(rd)
   861  	i.u1 = uint64(bitOp)
   862  	if is64bit {
   863  		i.u2 = 1
   864  	}
   865  }
   866  
   867  func (i *instruction) asFpuRRR(op fpuBinOp, rd, rn, rm operand, dst64bit bool) {
   868  	i.kind = fpuRRR
   869  	i.u1 = uint64(op)
   870  	i.rd, i.rn, i.rm = rd, rn, rm
   871  	if dst64bit {
   872  		i.u3 = 1
   873  	}
   874  }
   875  
   876  func (i *instruction) asFpuRR(op fpuUniOp, rd, rn operand, dst64bit bool) {
   877  	i.kind = fpuRR
   878  	i.u1 = uint64(op)
   879  	i.rd, i.rn = rd, rn
   880  	if dst64bit {
   881  		i.u3 = 1
   882  	}
   883  }
   884  
   885  func (i *instruction) asExtend(rd, rn regalloc.VReg, fromBits, toBits byte, signed bool) {
   886  	i.kind = extend
   887  	i.rn, i.rd = operandNR(rn), operandNR(rd)
   888  	i.u1 = uint64(fromBits)
   889  	i.u2 = uint64(toBits)
   890  	if signed {
   891  		i.u3 = 1
   892  	}
   893  }
   894  
   895  func (i *instruction) asMove32(rd, rn regalloc.VReg) {
   896  	i.kind = mov32
   897  	i.rn, i.rd = operandNR(rn), operandNR(rd)
   898  }
   899  
   900  func (i *instruction) asMove64(rd, rn regalloc.VReg) *instruction {
   901  	i.kind = mov64
   902  	i.rn, i.rd = operandNR(rn), operandNR(rd)
   903  	return i
   904  }
   905  
   906  func (i *instruction) asFpuMov64(rd, rn regalloc.VReg) {
   907  	i.kind = fpuMov64
   908  	i.rn, i.rd = operandNR(rn), operandNR(rd)
   909  }
   910  
   911  func (i *instruction) asFpuMov128(rd, rn regalloc.VReg) *instruction {
   912  	i.kind = fpuMov128
   913  	i.rn, i.rd = operandNR(rn), operandNR(rd)
   914  	return i
   915  }
   916  
   917  func (i *instruction) asMovToVec(rd, rn operand, arr vecArrangement, index vecIndex) {
   918  	i.kind = movToVec
   919  	i.rd = rd
   920  	i.rn = rn
   921  	i.u1, i.u2 = uint64(arr), uint64(index)
   922  }
   923  
   924  func (i *instruction) asMovFromVec(rd, rn operand, arr vecArrangement, index vecIndex, signed bool) {
   925  	if signed {
   926  		i.kind = movFromVecSigned
   927  	} else {
   928  		i.kind = movFromVec
   929  	}
   930  	i.rd = rd
   931  	i.rn = rn
   932  	i.u1, i.u2 = uint64(arr), uint64(index)
   933  }
   934  
   935  func (i *instruction) asVecDup(rd, rn operand, arr vecArrangement) {
   936  	i.kind = vecDup
   937  	i.u1 = uint64(arr)
   938  	i.rn, i.rd = rn, rd
   939  }
   940  
   941  func (i *instruction) asVecDupElement(rd, rn operand, arr vecArrangement, index vecIndex) {
   942  	i.kind = vecDupElement
   943  	i.u1 = uint64(arr)
   944  	i.rn, i.rd = rn, rd
   945  	i.u2 = uint64(index)
   946  }
   947  
   948  func (i *instruction) asVecExtract(rd, rn, rm operand, arr vecArrangement, index uint32) {
   949  	i.kind = vecExtract
   950  	i.u1 = uint64(arr)
   951  	i.rn, i.rm, i.rd = rn, rm, rd
   952  	i.u2 = uint64(index)
   953  }
   954  
   955  func (i *instruction) asVecMovElement(rd, rn operand, arr vecArrangement, rdIndex, rnIndex vecIndex) {
   956  	i.kind = vecMovElement
   957  	i.u1 = uint64(arr)
   958  	i.u2, i.u3 = uint64(rdIndex), uint64(rnIndex)
   959  	i.rn, i.rd = rn, rd
   960  }
   961  
   962  func (i *instruction) asVecMisc(op vecOp, rd, rn operand, arr vecArrangement) {
   963  	i.kind = vecMisc
   964  	i.u1 = uint64(op)
   965  	i.rn, i.rd = rn, rd
   966  	i.u2 = uint64(arr)
   967  }
   968  
   969  func (i *instruction) asVecLanes(op vecOp, rd, rn operand, arr vecArrangement) {
   970  	i.kind = vecLanes
   971  	i.u1 = uint64(op)
   972  	i.rn, i.rd = rn, rd
   973  	i.u2 = uint64(arr)
   974  }
   975  
   976  func (i *instruction) asVecShiftImm(op vecOp, rd, rn, rm operand, arr vecArrangement) *instruction {
   977  	i.kind = vecShiftImm
   978  	i.u1 = uint64(op)
   979  	i.rn, i.rm, i.rd = rn, rm, rd
   980  	i.u2 = uint64(arr)
   981  	return i
   982  }
   983  
   984  func (i *instruction) asVecTbl(nregs byte, rd, rn, rm operand, arr vecArrangement) {
   985  	switch nregs {
   986  	case 0, 1:
   987  		i.kind = vecTbl
   988  	case 2:
   989  		i.kind = vecTbl2
   990  		if !rn.reg().IsRealReg() {
   991  			panic("rn is not a RealReg")
   992  		}
   993  		if rn.realReg() == v31 {
   994  			panic("rn cannot be v31")
   995  		}
   996  	default:
   997  		panic(fmt.Sprintf("unsupported number of registers %d", nregs))
   998  	}
   999  	i.rn, i.rm, i.rd = rn, rm, rd
  1000  	i.u2 = uint64(arr)
  1001  }
  1002  
  1003  func (i *instruction) asVecPermute(op vecOp, rd, rn, rm operand, arr vecArrangement) {
  1004  	i.kind = vecPermute
  1005  	i.u1 = uint64(op)
  1006  	i.rn, i.rm, i.rd = rn, rm, rd
  1007  	i.u2 = uint64(arr)
  1008  }
  1009  
  1010  func (i *instruction) asVecRRR(op vecOp, rd, rn, rm operand, arr vecArrangement) *instruction {
  1011  	i.kind = vecRRR
  1012  	i.u1 = uint64(op)
  1013  	i.rn, i.rd, i.rm = rn, rd, rm
  1014  	i.u2 = uint64(arr)
  1015  	return i
  1016  }
  1017  
  1018  // asVecRRRRewrite encodes a vector instruction that rewrites the destination register.
  1019  // IMPORTANT: the destination register must be already defined before this instruction.
  1020  func (i *instruction) asVecRRRRewrite(op vecOp, rd, rn, rm operand, arr vecArrangement) {
  1021  	i.kind = vecRRRRewrite
  1022  	i.u1 = uint64(op)
  1023  	i.rn, i.rd, i.rm = rn, rd, rm
  1024  	i.u2 = uint64(arr)
  1025  }
  1026  
  1027  func (i *instruction) IsCopy() bool {
  1028  	op := i.kind
  1029  	// We do not include mov32 as it is not a copy instruction in the sense that it does not preserve the upper 32 bits,
  1030  	// and it is only used in the translation of IReduce, not the actual copy indeed.
  1031  	return op == mov64 || op == fpuMov64 || op == fpuMov128
  1032  }
  1033  
  1034  // String implements fmt.Stringer.
  1035  func (i *instruction) String() (str string) {
  1036  	is64SizeBitToSize := func(u3 uint64) byte {
  1037  		if u3 == 0 {
  1038  			return 32
  1039  		}
  1040  		return 64
  1041  	}
  1042  
  1043  	switch i.kind {
  1044  	case nop0:
  1045  		if i.u1 != 0 {
  1046  			l := label(i.u1)
  1047  			str = fmt.Sprintf("%s:", l)
  1048  		} else {
  1049  			str = "nop0"
  1050  		}
  1051  	case aluRRR:
  1052  		size := is64SizeBitToSize(i.u3)
  1053  		str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(),
  1054  			formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size),
  1055  			i.rm.format(size))
  1056  	case aluRRRR:
  1057  		size := is64SizeBitToSize(i.u3)
  1058  		str = fmt.Sprintf("%s %s, %s, %s, %s", aluOp(i.u1).String(),
  1059  			formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.ra.nr(), size))
  1060  	case aluRRImm12:
  1061  		size := is64SizeBitToSize(i.u3)
  1062  		str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(),
  1063  			formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), i.rm.format(size))
  1064  	case aluRRBitmaskImm:
  1065  		size := is64SizeBitToSize(i.u3)
  1066  		rd, rn := formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size)
  1067  		if size == 32 {
  1068  			str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, uint32(i.u2))
  1069  		} else {
  1070  			str = fmt.Sprintf("%s %s, %s, #%#x", aluOp(i.u1).String(), rd, rn, i.u2)
  1071  		}
  1072  	case aluRRImmShift:
  1073  		size := is64SizeBitToSize(i.u3)
  1074  		str = fmt.Sprintf("%s %s, %s, %#x",
  1075  			aluOp(i.u1).String(),
  1076  			formatVRegSized(i.rd.nr(), size),
  1077  			formatVRegSized(i.rn.nr(), size),
  1078  			i.rm.shiftImm(),
  1079  		)
  1080  	case aluRRRShift:
  1081  		size := is64SizeBitToSize(i.u3)
  1082  		str = fmt.Sprintf("%s %s, %s, %s",
  1083  			aluOp(i.u1).String(),
  1084  			formatVRegSized(i.rd.nr(), size),
  1085  			formatVRegSized(i.rn.nr(), size),
  1086  			i.rm.format(size),
  1087  		)
  1088  	case aluRRRExtend:
  1089  		size := is64SizeBitToSize(i.u3)
  1090  		str = fmt.Sprintf("%s %s, %s, %s", aluOp(i.u1).String(),
  1091  			formatVRegSized(i.rd.nr(), size),
  1092  			formatVRegSized(i.rn.nr(), size),
  1093  			// Regardless of the source size, the register is formatted in 32-bit.
  1094  			i.rm.format(32),
  1095  		)
  1096  	case bitRR:
  1097  		size := is64SizeBitToSize(i.u2)
  1098  		str = fmt.Sprintf("%s %s, %s",
  1099  			bitOp(i.u1),
  1100  			formatVRegSized(i.rd.nr(), size),
  1101  			formatVRegSized(i.rn.nr(), size),
  1102  		)
  1103  	case uLoad8:
  1104  		str = fmt.Sprintf("ldrb %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
  1105  	case sLoad8:
  1106  		str = fmt.Sprintf("ldrsb %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
  1107  	case uLoad16:
  1108  		str = fmt.Sprintf("ldrh %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
  1109  	case sLoad16:
  1110  		str = fmt.Sprintf("ldrsh %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
  1111  	case uLoad32:
  1112  		str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
  1113  	case sLoad32:
  1114  		str = fmt.Sprintf("ldrs %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
  1115  	case uLoad64:
  1116  		str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 64), i.amode.format(64))
  1117  	case store8:
  1118  		str = fmt.Sprintf("strb %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(8))
  1119  	case store16:
  1120  		str = fmt.Sprintf("strh %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(16))
  1121  	case store32:
  1122  		str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(32))
  1123  	case store64:
  1124  		str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.amode.format(64))
  1125  	case storeP64:
  1126  		str = fmt.Sprintf("stp %s, %s, %s",
  1127  			formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.amode.format(64))
  1128  	case loadP64:
  1129  		str = fmt.Sprintf("ldp %s, %s, %s",
  1130  			formatVRegSized(i.rn.nr(), 64), formatVRegSized(i.rm.nr(), 64), i.amode.format(64))
  1131  	case mov64:
  1132  		str = fmt.Sprintf("mov %s, %s",
  1133  			formatVRegSized(i.rd.nr(), 64),
  1134  			formatVRegSized(i.rn.nr(), 64))
  1135  	case mov32:
  1136  		str = fmt.Sprintf("mov %s, %s", formatVRegSized(i.rd.nr(), 32), formatVRegSized(i.rn.nr(), 32))
  1137  	case movZ:
  1138  		size := is64SizeBitToSize(i.u3)
  1139  		str = fmt.Sprintf("movz %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16)
  1140  	case movN:
  1141  		size := is64SizeBitToSize(i.u3)
  1142  		str = fmt.Sprintf("movn %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16)
  1143  	case movK:
  1144  		size := is64SizeBitToSize(i.u3)
  1145  		str = fmt.Sprintf("movk %s, #%#x, lsl %d", formatVRegSized(i.rd.nr(), size), uint16(i.u1), i.u2*16)
  1146  	case extend:
  1147  		fromBits, toBits := byte(i.u1), byte(i.u2)
  1148  
  1149  		var signedStr string
  1150  		if i.u3 == 1 {
  1151  			signedStr = "s"
  1152  		} else {
  1153  			signedStr = "u"
  1154  		}
  1155  		var fromStr string
  1156  		switch fromBits {
  1157  		case 8:
  1158  			fromStr = "b"
  1159  		case 16:
  1160  			fromStr = "h"
  1161  		case 32:
  1162  			fromStr = "w"
  1163  		}
  1164  		str = fmt.Sprintf("%sxt%s %s, %s", signedStr, fromStr, formatVRegSized(i.rd.nr(), toBits), formatVRegSized(i.rn.nr(), 32))
  1165  	case cSel:
  1166  		size := is64SizeBitToSize(i.u3)
  1167  		str = fmt.Sprintf("csel %s, %s, %s, %s",
  1168  			formatVRegSized(i.rd.nr(), size),
  1169  			formatVRegSized(i.rn.nr(), size),
  1170  			formatVRegSized(i.rm.nr(), size),
  1171  			condFlag(i.u1),
  1172  		)
  1173  	case cSet:
  1174  		if i.u2 != 0 {
  1175  			str = fmt.Sprintf("csetm %s, %s", formatVRegSized(i.rd.nr(), 64), condFlag(i.u1))
  1176  		} else {
  1177  			str = fmt.Sprintf("cset %s, %s", formatVRegSized(i.rd.nr(), 64), condFlag(i.u1))
  1178  		}
  1179  	case cCmpImm:
  1180  		size := is64SizeBitToSize(i.u3)
  1181  		str = fmt.Sprintf("ccmp %s, #%#x, #%#x, %s",
  1182  			formatVRegSized(i.rn.nr(), size), i.rm.data,
  1183  			i.u2&0b1111,
  1184  			condFlag(i.u1))
  1185  	case fpuMov64:
  1186  		str = fmt.Sprintf("mov %s, %s",
  1187  			formatVRegVec(i.rd.nr(), vecArrangement8B, vecIndexNone),
  1188  			formatVRegVec(i.rn.nr(), vecArrangement8B, vecIndexNone))
  1189  	case fpuMov128:
  1190  		str = fmt.Sprintf("mov %s, %s",
  1191  			formatVRegVec(i.rd.nr(), vecArrangement16B, vecIndexNone),
  1192  			formatVRegVec(i.rn.nr(), vecArrangement16B, vecIndexNone))
  1193  	case fpuMovFromVec:
  1194  		panic("TODO")
  1195  	case fpuRR:
  1196  		dstSz := is64SizeBitToSize(i.u3)
  1197  		srcSz := dstSz
  1198  		op := fpuUniOp(i.u1)
  1199  		switch op {
  1200  		case fpuUniOpCvt32To64:
  1201  			srcSz = 32
  1202  		case fpuUniOpCvt64To32:
  1203  			srcSz = 64
  1204  		}
  1205  		str = fmt.Sprintf("%s %s, %s", op.String(),
  1206  			formatVRegSized(i.rd.nr(), dstSz), formatVRegSized(i.rn.nr(), srcSz))
  1207  	case fpuRRR:
  1208  		size := is64SizeBitToSize(i.u3)
  1209  		str = fmt.Sprintf("%s %s, %s, %s", fpuBinOp(i.u1).String(),
  1210  			formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size))
  1211  	case fpuRRI:
  1212  		panic("TODO")
  1213  	case fpuRRRR:
  1214  		panic("TODO")
  1215  	case fpuCmp:
  1216  		size := is64SizeBitToSize(i.u3)
  1217  		str = fmt.Sprintf("fcmp %s, %s",
  1218  			formatVRegSized(i.rn.nr(), size), formatVRegSized(i.rm.nr(), size))
  1219  	case fpuLoad32:
  1220  		str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 32), i.amode.format(32))
  1221  	case fpuStore32:
  1222  		str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 32), i.amode.format(64))
  1223  	case fpuLoad64:
  1224  		str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 64), i.amode.format(64))
  1225  	case fpuStore64:
  1226  		str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 64), i.amode.format(64))
  1227  	case fpuLoad128:
  1228  		str = fmt.Sprintf("ldr %s, %s", formatVRegSized(i.rd.nr(), 128), i.amode.format(64))
  1229  	case fpuStore128:
  1230  		str = fmt.Sprintf("str %s, %s", formatVRegSized(i.rn.nr(), 128), i.amode.format(64))
  1231  	case loadFpuConst32:
  1232  		str = fmt.Sprintf("ldr %s, #8; b 8; data.f32 %f", formatVRegSized(i.rd.nr(), 32), math.Float32frombits(uint32(i.u1)))
  1233  	case loadFpuConst64:
  1234  		str = fmt.Sprintf("ldr %s, #8; b 16; data.f64 %f", formatVRegSized(i.rd.nr(), 64), math.Float64frombits(i.u1))
  1235  	case loadFpuConst128:
  1236  		str = fmt.Sprintf("ldr %s, #8; b 32; data.v128  %016x %016x",
  1237  			formatVRegSized(i.rd.nr(), 128), i.u1, i.u2)
  1238  	case fpuToInt:
  1239  		var op, src, dst string
  1240  		if signed := i.u1 == 1; signed {
  1241  			op = "fcvtzs"
  1242  		} else {
  1243  			op = "fcvtzu"
  1244  		}
  1245  		if src64 := i.u2 == 1; src64 {
  1246  			src = formatVRegWidthVec(i.rn.nr(), vecArrangementD)
  1247  		} else {
  1248  			src = formatVRegWidthVec(i.rn.nr(), vecArrangementS)
  1249  		}
  1250  		if dst64 := i.u3 == 1; dst64 {
  1251  			dst = formatVRegSized(i.rd.nr(), 64)
  1252  		} else {
  1253  			dst = formatVRegSized(i.rd.nr(), 32)
  1254  		}
  1255  		str = fmt.Sprintf("%s %s, %s", op, dst, src)
  1256  
  1257  	case intToFpu:
  1258  		var op, src, dst string
  1259  		if signed := i.u1 == 1; signed {
  1260  			op = "scvtf"
  1261  		} else {
  1262  			op = "ucvtf"
  1263  		}
  1264  		if src64 := i.u2 == 1; src64 {
  1265  			src = formatVRegSized(i.rn.nr(), 64)
  1266  		} else {
  1267  			src = formatVRegSized(i.rn.nr(), 32)
  1268  		}
  1269  		if dst64 := i.u3 == 1; dst64 {
  1270  			dst = formatVRegWidthVec(i.rd.nr(), vecArrangementD)
  1271  		} else {
  1272  			dst = formatVRegWidthVec(i.rd.nr(), vecArrangementS)
  1273  		}
  1274  		str = fmt.Sprintf("%s %s, %s", op, dst, src)
  1275  	case fpuCSel:
  1276  		size := is64SizeBitToSize(i.u3)
  1277  		str = fmt.Sprintf("fcsel %s, %s, %s, %s",
  1278  			formatVRegSized(i.rd.nr(), size),
  1279  			formatVRegSized(i.rn.nr(), size),
  1280  			formatVRegSized(i.rm.nr(), size),
  1281  			condFlag(i.u1),
  1282  		)
  1283  	case movToVec:
  1284  		var size byte
  1285  		arr := vecArrangement(i.u1)
  1286  		switch arr {
  1287  		case vecArrangementB, vecArrangementH, vecArrangementS:
  1288  			size = 32
  1289  		case vecArrangementD:
  1290  			size = 64
  1291  		default:
  1292  			panic("unsupported arrangement " + arr.String())
  1293  		}
  1294  		str = fmt.Sprintf("ins %s, %s", formatVRegVec(i.rd.nr(), arr, vecIndex(i.u2)), formatVRegSized(i.rn.nr(), size))
  1295  	case movFromVec, movFromVecSigned:
  1296  		var size byte
  1297  		var opcode string
  1298  		arr := vecArrangement(i.u1)
  1299  		signed := i.kind == movFromVecSigned
  1300  		switch arr {
  1301  		case vecArrangementB, vecArrangementH, vecArrangementS:
  1302  			size = 32
  1303  			if signed {
  1304  				opcode = "smov"
  1305  			} else {
  1306  				opcode = "umov"
  1307  			}
  1308  		case vecArrangementD:
  1309  			size = 64
  1310  			if signed {
  1311  				opcode = "smov"
  1312  			} else {
  1313  				opcode = "mov"
  1314  			}
  1315  		default:
  1316  			panic("unsupported arrangement " + arr.String())
  1317  		}
  1318  		str = fmt.Sprintf("%s %s, %s", opcode, formatVRegSized(i.rd.nr(), size), formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2)))
  1319  	case vecDup:
  1320  		str = fmt.Sprintf("dup %s, %s",
  1321  			formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone),
  1322  			formatVRegSized(i.rn.nr(), 64),
  1323  		)
  1324  	case vecDupElement:
  1325  		arr := vecArrangement(i.u1)
  1326  		str = fmt.Sprintf("dup %s, %s",
  1327  			formatVRegVec(i.rd.nr(), arr, vecIndexNone),
  1328  			formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2)),
  1329  		)
  1330  	case vecDupFromFpu:
  1331  		panic("TODO")
  1332  	case vecExtract:
  1333  		str = fmt.Sprintf("ext %s, %s, %s, #%d",
  1334  			formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone),
  1335  			formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndexNone),
  1336  			formatVRegVec(i.rm.nr(), vecArrangement(i.u1), vecIndexNone),
  1337  			uint32(i.u2),
  1338  		)
  1339  	case vecExtend:
  1340  		panic("TODO")
  1341  	case vecMovElement:
  1342  		str = fmt.Sprintf("mov %s, %s",
  1343  			formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndex(i.u2)),
  1344  			formatVRegVec(i.rn.nr(), vecArrangement(i.u1), vecIndex(i.u3)),
  1345  		)
  1346  	case vecMiscNarrow:
  1347  		panic("TODO")
  1348  	case vecRRR, vecRRRRewrite:
  1349  		str = fmt.Sprintf("%s %s, %s, %s",
  1350  			vecOp(i.u1),
  1351  			formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone),
  1352  			formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone),
  1353  			formatVRegVec(i.rm.nr(), vecArrangement(i.u2), vecIndexNone),
  1354  		)
  1355  	case vecMisc:
  1356  		vop := vecOp(i.u1)
  1357  		if vop == vecOpCmeq0 {
  1358  			str = fmt.Sprintf("cmeq %s, %s, #0",
  1359  				formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone),
  1360  				formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone))
  1361  		} else {
  1362  			str = fmt.Sprintf("%s %s, %s",
  1363  				vop,
  1364  				formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone),
  1365  				formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone))
  1366  		}
  1367  	case vecLanes:
  1368  		arr := vecArrangement(i.u2)
  1369  		var destArr vecArrangement
  1370  		switch arr {
  1371  		case vecArrangement8B, vecArrangement16B:
  1372  			destArr = vecArrangementH
  1373  		case vecArrangement4H, vecArrangement8H:
  1374  			destArr = vecArrangementS
  1375  		case vecArrangement4S:
  1376  			destArr = vecArrangementD
  1377  		default:
  1378  			panic("invalid arrangement " + arr.String())
  1379  		}
  1380  		str = fmt.Sprintf("%s %s, %s",
  1381  			vecOp(i.u1),
  1382  			formatVRegWidthVec(i.rd.nr(), destArr),
  1383  			formatVRegVec(i.rn.nr(), arr, vecIndexNone))
  1384  	case vecShiftImm:
  1385  		arr := vecArrangement(i.u2)
  1386  		str = fmt.Sprintf("%s %s, %s, #%d",
  1387  			vecOp(i.u1),
  1388  			formatVRegVec(i.rd.nr(), arr, vecIndexNone),
  1389  			formatVRegVec(i.rn.nr(), arr, vecIndexNone),
  1390  			i.rm.shiftImm())
  1391  	case vecTbl:
  1392  		arr := vecArrangement(i.u2)
  1393  		str = fmt.Sprintf("tbl %s, { %s }, %s",
  1394  			formatVRegVec(i.rd.nr(), arr, vecIndexNone),
  1395  			formatVRegVec(i.rn.nr(), vecArrangement16B, vecIndexNone),
  1396  			formatVRegVec(i.rm.nr(), arr, vecIndexNone))
  1397  	case vecTbl2:
  1398  		arr := vecArrangement(i.u2)
  1399  		rd, rn, rm := i.rd.nr(), i.rn.nr(), i.rm.nr()
  1400  		rn1 := regalloc.FromRealReg(rn.RealReg()+1, rn.RegType())
  1401  		str = fmt.Sprintf("tbl %s, { %s, %s }, %s",
  1402  			formatVRegVec(rd, arr, vecIndexNone),
  1403  			formatVRegVec(rn, vecArrangement16B, vecIndexNone),
  1404  			formatVRegVec(rn1, vecArrangement16B, vecIndexNone),
  1405  			formatVRegVec(rm, arr, vecIndexNone))
  1406  	case vecPermute:
  1407  		arr := vecArrangement(i.u2)
  1408  		str = fmt.Sprintf("%s %s, %s, %s",
  1409  			vecOp(i.u1),
  1410  			formatVRegVec(i.rd.nr(), arr, vecIndexNone),
  1411  			formatVRegVec(i.rn.nr(), arr, vecIndexNone),
  1412  			formatVRegVec(i.rm.nr(), arr, vecIndexNone))
  1413  	case movToFPSR:
  1414  		str = fmt.Sprintf("msr fpsr, %s", formatVRegSized(i.rn.nr(), 64))
  1415  	case movFromFPSR:
  1416  		str = fmt.Sprintf("mrs %s fpsr", formatVRegSized(i.rd.nr(), 64))
  1417  	case call:
  1418  		str = fmt.Sprintf("bl %s", ssa.FuncRef(i.u1))
  1419  	case callInd:
  1420  		str = fmt.Sprintf("bl %s", formatVRegSized(i.rn.nr(), 64))
  1421  	case ret:
  1422  		str = "ret"
  1423  	case br:
  1424  		target := label(i.u1)
  1425  		if i.u3 != 0 {
  1426  			str = fmt.Sprintf("b #%#x (%s)", i.brOffset(), target.String())
  1427  		} else {
  1428  			str = fmt.Sprintf("b %s", target.String())
  1429  		}
  1430  	case condBr:
  1431  		size := is64SizeBitToSize(i.u3)
  1432  		c := cond(i.u1)
  1433  		target := label(i.u2)
  1434  		switch c.kind() {
  1435  		case condKindRegisterZero:
  1436  			if !i.condBrOffsetResolved() {
  1437  				str = fmt.Sprintf("cbz %s, (%s)", formatVRegSized(c.register(), size), target.String())
  1438  			} else {
  1439  				str = fmt.Sprintf("cbz %s, #%#x %s", formatVRegSized(c.register(), size), i.condBrOffset(), target.String())
  1440  			}
  1441  		case condKindRegisterNotZero:
  1442  			if offset := i.condBrOffset(); offset != 0 {
  1443  				str = fmt.Sprintf("cbnz %s, #%#x (%s)", formatVRegSized(c.register(), size), offset, target.String())
  1444  			} else {
  1445  				str = fmt.Sprintf("cbnz %s, %s", formatVRegSized(c.register(), size), target.String())
  1446  			}
  1447  		case condKindCondFlagSet:
  1448  			if offset := i.condBrOffset(); offset != 0 {
  1449  				if target == labelInvalid {
  1450  					str = fmt.Sprintf("b.%s #%#x", c.flag(), offset)
  1451  				} else {
  1452  					str = fmt.Sprintf("b.%s #%#x, (%s)", c.flag(), offset, target.String())
  1453  				}
  1454  			} else {
  1455  				str = fmt.Sprintf("b.%s %s", c.flag(), target.String())
  1456  			}
  1457  		}
  1458  	case adr:
  1459  		str = fmt.Sprintf("adr %s, #%#x", formatVRegSized(i.rd.nr(), 64), int64(i.u1))
  1460  	case brTableSequence:
  1461  		targetIndex := i.u1
  1462  		str = fmt.Sprintf("br_table_sequence %s, table_index=%d", formatVRegSized(i.rn.nr(), 64), targetIndex)
  1463  	case exitSequence:
  1464  		str = fmt.Sprintf("exit_sequence %s", formatVRegSized(i.rn.nr(), 64))
  1465  	case atomicRmw:
  1466  		m := atomicRmwOp(i.u1).String()
  1467  		size := byte(32)
  1468  		switch i.u2 {
  1469  		case 8:
  1470  			size = 64
  1471  		case 2:
  1472  			m = m + "h"
  1473  		case 1:
  1474  			m = m + "b"
  1475  		}
  1476  		str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), 64))
  1477  	case atomicCas:
  1478  		m := "casal"
  1479  		size := byte(32)
  1480  		switch i.u2 {
  1481  		case 8:
  1482  			size = 64
  1483  		case 2:
  1484  			m = m + "h"
  1485  		case 1:
  1486  			m = m + "b"
  1487  		}
  1488  		str = fmt.Sprintf("%s %s, %s, %s", m, formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rn.nr(), 64))
  1489  	case atomicLoad:
  1490  		m := "ldar"
  1491  		size := byte(32)
  1492  		switch i.u2 {
  1493  		case 8:
  1494  			size = 64
  1495  		case 2:
  1496  			m = m + "h"
  1497  		case 1:
  1498  			m = m + "b"
  1499  		}
  1500  		str = fmt.Sprintf("%s %s, %s", m, formatVRegSized(i.rd.nr(), size), formatVRegSized(i.rn.nr(), 64))
  1501  	case atomicStore:
  1502  		m := "stlr"
  1503  		size := byte(32)
  1504  		switch i.u2 {
  1505  		case 8:
  1506  			size = 64
  1507  		case 2:
  1508  			m = m + "h"
  1509  		case 1:
  1510  			m = m + "b"
  1511  		}
  1512  		str = fmt.Sprintf("%s %s, %s", m, formatVRegSized(i.rm.nr(), size), formatVRegSized(i.rn.nr(), 64))
  1513  	case dmb:
  1514  		str = "dmb"
  1515  	case udf:
  1516  		str = "udf"
  1517  	case emitSourceOffsetInfo:
  1518  		str = fmt.Sprintf("source_offset_info %d", ssa.SourceOffset(i.u1))
  1519  	case vecLoad1R:
  1520  		str = fmt.Sprintf("ld1r {%s}, [%s]", formatVRegVec(i.rd.nr(), vecArrangement(i.u1), vecIndexNone), formatVRegSized(i.rn.nr(), 64))
  1521  	case loadConstBlockArg:
  1522  		str = fmt.Sprintf("load_const_block_arg %s, %#x", formatVRegSized(i.rd.nr(), 64), i.u1)
  1523  	default:
  1524  		panic(i.kind)
  1525  	}
  1526  	return
  1527  }
  1528  
  1529  func (i *instruction) asAdr(rd regalloc.VReg, offset int64) {
  1530  	i.kind = adr
  1531  	i.rd = operandNR(rd)
  1532  	i.u1 = uint64(offset)
  1533  }
  1534  
  1535  func (i *instruction) asAtomicRmw(op atomicRmwOp, rn, rs, rt operand, size uint64) {
  1536  	i.kind = atomicRmw
  1537  	i.rd, i.rn, i.rm = rt, rn, rs
  1538  	i.u1 = uint64(op)
  1539  	i.u2 = size
  1540  }
  1541  
  1542  func (i *instruction) asAtomicCas(rn, rs, rt operand, size uint64) {
  1543  	i.kind = atomicCas
  1544  	i.rm, i.rn, i.rd = rt, rn, rs
  1545  	i.u2 = size
  1546  }
  1547  
  1548  func (i *instruction) asAtomicLoad(rn, rt operand, size uint64) {
  1549  	i.kind = atomicLoad
  1550  	i.rn, i.rd = rn, rt
  1551  	i.u2 = size
  1552  }
  1553  
  1554  func (i *instruction) asAtomicStore(rn, rt operand, size uint64) {
  1555  	i.kind = atomicStore
  1556  	i.rn, i.rm = rn, rt
  1557  	i.u2 = size
  1558  }
  1559  
  1560  func (i *instruction) asDMB() {
  1561  	i.kind = dmb
  1562  }
  1563  
  1564  // TODO: delete unnecessary things.
  1565  const (
  1566  	// nop0 represents a no-op of zero size.
  1567  	nop0 instructionKind = iota + 1
  1568  	// aluRRR represents an ALU operation with two register sources and a register destination.
  1569  	aluRRR
  1570  	// aluRRRR represents an ALU operation with three register sources and a register destination.
  1571  	aluRRRR
  1572  	// aluRRImm12 represents an ALU operation with a register source and an immediate-12 source, with a register destination.
  1573  	aluRRImm12
  1574  	// aluRRBitmaskImm represents an ALU operation with a register source and a bitmask immediate, with a register destination.
  1575  	aluRRBitmaskImm
  1576  	// aluRRImmShift represents an ALU operation with a register source and an immediate-shifted source, with a register destination.
  1577  	aluRRImmShift
  1578  	// aluRRRShift represents an ALU operation with two register sources, one of which can be shifted, with a register destination.
  1579  	aluRRRShift
  1580  	// aluRRRExtend represents an ALU operation with two register sources, one of which can be extended, with a register destination.
  1581  	aluRRRExtend
  1582  	// bitRR represents a bit op instruction with a single register source.
  1583  	bitRR
  1584  	// uLoad8 represents an unsigned 8-bit load.
  1585  	uLoad8
  1586  	// sLoad8 represents a signed 8-bit load into 64-bit register.
  1587  	sLoad8
  1588  	// uLoad16 represents an unsigned 16-bit load into 64-bit register.
  1589  	uLoad16
  1590  	// sLoad16 represents a signed 16-bit load into 64-bit register.
  1591  	sLoad16
  1592  	// uLoad32 represents an unsigned 32-bit load into 64-bit register.
  1593  	uLoad32
  1594  	// sLoad32 represents a signed 32-bit load into 64-bit register.
  1595  	sLoad32
  1596  	// uLoad64 represents a 64-bit load.
  1597  	uLoad64
  1598  	// store8 represents an 8-bit store.
  1599  	store8
  1600  	// store16 represents a 16-bit store.
  1601  	store16
  1602  	// store32 represents a 32-bit store.
  1603  	store32
  1604  	// store64 represents a 64-bit store.
  1605  	store64
  1606  	// storeP64 represents a store of a pair of registers.
  1607  	storeP64
  1608  	// loadP64 represents a load of a pair of registers.
  1609  	loadP64
  1610  	// mov64 represents a MOV instruction. These are encoded as ORR's but we keep them separate for better handling.
  1611  	mov64
  1612  	// mov32 represents a 32-bit MOV. This zeroes the top 32 bits of the destination.
  1613  	mov32
  1614  	// movZ represents a MOVZ with a 16-bit immediate.
  1615  	movZ
  1616  	// movN represents a MOVN with a 16-bit immediate.
  1617  	movN
  1618  	// movK represents a MOVK with a 16-bit immediate.
  1619  	movK
  1620  	// extend represents a sign- or zero-extend operation.
  1621  	extend
  1622  	// cSel represents a conditional-select operation.
  1623  	cSel
  1624  	// cSet represents a conditional-set operation.
  1625  	cSet
  1626  	// cCmpImm represents a conditional comparison with an immediate.
  1627  	cCmpImm
  1628  	// fpuMov64 represents a FPU move. Distinct from a vector-register move; moving just 64 bits appears to be significantly faster.
  1629  	fpuMov64
  1630  	// fpuMov128 represents a vector register move.
  1631  	fpuMov128
  1632  	// fpuMovFromVec represents a move to scalar from a vector element.
  1633  	fpuMovFromVec
  1634  	// fpuRR represents a 1-op FPU instruction.
  1635  	fpuRR
  1636  	// fpuRRR represents a 2-op FPU instruction.
  1637  	fpuRRR
  1638  	// fpuRRI represents a 2-op FPU instruction with immediate value.
  1639  	fpuRRI
  1640  	// fpuRRRR represents a 3-op FPU instruction.
  1641  	fpuRRRR
  1642  	// fpuCmp represents a FPU comparison, either 32 or 64 bit.
  1643  	fpuCmp
  1644  	// fpuLoad32 represents a floating-point load, single-precision (32 bit).
  1645  	fpuLoad32
  1646  	// fpuStore32 represents a floating-point store, single-precision (32 bit).
  1647  	fpuStore32
  1648  	// fpuLoad64 represents a floating-point load, double-precision (64 bit).
  1649  	fpuLoad64
  1650  	// fpuStore64 represents a floating-point store, double-precision (64 bit).
  1651  	fpuStore64
  1652  	// fpuLoad128 represents a floating-point/vector load, 128 bit.
  1653  	fpuLoad128
  1654  	// fpuStore128 represents a floating-point/vector store, 128 bit.
  1655  	fpuStore128
  1656  	// loadFpuConst32 represents a load of a 32-bit floating-point constant.
  1657  	loadFpuConst32
  1658  	// loadFpuConst64 represents a load of a 64-bit floating-point constant.
  1659  	loadFpuConst64
  1660  	// loadFpuConst128 represents a load of a 128-bit floating-point constant.
  1661  	loadFpuConst128
  1662  	// vecLoad1R represents a load of a one single-element structure that replicates to all lanes of a vector.
  1663  	vecLoad1R
  1664  	// fpuToInt represents a conversion from FP to integer.
  1665  	fpuToInt
  1666  	// intToFpu represents a conversion from integer to FP.
  1667  	intToFpu
  1668  	// fpuCSel represents a 32/64-bit FP conditional select.
  1669  	fpuCSel
  1670  	// movToVec represents a move to a vector element from a GPR.
  1671  	movToVec
  1672  	// movFromVec represents an unsigned move from a vector element to a GPR.
  1673  	movFromVec
  1674  	// movFromVecSigned represents a signed move from a vector element to a GPR.
  1675  	movFromVecSigned
  1676  	// vecDup represents a duplication of general-purpose register to vector.
  1677  	vecDup
  1678  	// vecDupElement represents a duplication of a vector element to vector or scalar.
  1679  	vecDupElement
  1680  	// vecDupFromFpu represents a duplication of scalar to vector.
  1681  	vecDupFromFpu
  1682  	// vecExtract represents a vector extraction operation.
  1683  	vecExtract
  1684  	// vecExtend represents a vector extension operation.
  1685  	vecExtend
  1686  	// vecMovElement represents a move vector element to another vector element operation.
  1687  	vecMovElement
  1688  	// vecMiscNarrow represents a vector narrowing operation.
  1689  	vecMiscNarrow
  1690  	// vecRRR represents a vector ALU operation.
  1691  	vecRRR
  1692  	// vecRRRRewrite is exactly the same as vecRRR except that this rewrites the destination register.
  1693  	// For example, BSL instruction rewrites the destination register, and the existing value influences the result.
  1694  	// Therefore, the "destination" register in vecRRRRewrite will be treated as "use" which makes the register outlive
  1695  	// the instruction while this instruction doesn't have "def" in the context of register allocation.
  1696  	vecRRRRewrite
  1697  	// vecMisc represents a vector two register miscellaneous instruction.
  1698  	vecMisc
  1699  	// vecLanes represents a vector instruction across lanes.
  1700  	vecLanes
  1701  	// vecShiftImm represents a SIMD scalar shift by immediate instruction.
  1702  	vecShiftImm
  1703  	// vecTbl represents a table vector lookup - single register table.
  1704  	vecTbl
  1705  	// vecTbl2 represents a table vector lookup - two register table.
  1706  	vecTbl2
  1707  	// vecPermute represents a vector permute instruction.
  1708  	vecPermute
  1709  	// movToNZCV represents a move to the FPSR.
  1710  	movToFPSR
  1711  	// movFromNZCV represents a move from the FPSR.
  1712  	movFromFPSR
  1713  	// call represents a machine call instruction.
  1714  	call
  1715  	// callInd represents a machine indirect-call instruction.
  1716  	callInd
  1717  	// ret represents a machine return instruction.
  1718  	ret
  1719  	// br represents an unconditional branch.
  1720  	br
  1721  	// condBr represents a conditional branch.
  1722  	condBr
  1723  	// adr represents a compute the address (using a PC-relative offset) of a memory location.
  1724  	adr
  1725  	// brTableSequence represents a jump-table sequence.
  1726  	brTableSequence
  1727  	// exitSequence consists of multiple instructions, and exits the execution immediately.
  1728  	// See encodeExitSequence.
  1729  	exitSequence
  1730  	// atomicRmw represents an atomic read-modify-write operation with two register sources and a register destination.
  1731  	atomicRmw
  1732  	// atomicCas represents an atomic compare-and-swap operation with three register sources. The value is loaded to
  1733  	// the source register containing the comparison value.
  1734  	atomicCas
  1735  	// atomicLoad represents an atomic load with one source register and a register destination.
  1736  	atomicLoad
  1737  	// atomicStore represents an atomic store with two source registers and no destination.
  1738  	atomicStore
  1739  	// dmb represents the data memory barrier instruction in inner-shareable (ish) mode.
  1740  	dmb
  1741  	// UDF is the undefined instruction. For debugging only.
  1742  	udf
  1743  	// loadConstBlockArg represents a load of a constant block argument.
  1744  	loadConstBlockArg
  1745  
  1746  	// emitSourceOffsetInfo is a dummy instruction to emit source offset info.
  1747  	// The existence of this instruction does not affect the execution.
  1748  	emitSourceOffsetInfo
  1749  
  1750  	// ------------------- do not define below this line -------------------
  1751  	numInstructionKinds
  1752  )
  1753  
  1754  func (i *instruction) asLoadConstBlockArg(v uint64, typ ssa.Type, dst regalloc.VReg) *instruction {
  1755  	i.kind = loadConstBlockArg
  1756  	i.u1 = v
  1757  	i.u2 = uint64(typ)
  1758  	i.rd = operandNR(dst)
  1759  	return i
  1760  }
  1761  
  1762  func (i *instruction) loadConstBlockArgData() (v uint64, typ ssa.Type, dst regalloc.VReg) {
  1763  	return i.u1, ssa.Type(i.u2), i.rd.nr()
  1764  }
  1765  
  1766  func (i *instruction) asEmitSourceOffsetInfo(l ssa.SourceOffset) *instruction {
  1767  	i.kind = emitSourceOffsetInfo
  1768  	i.u1 = uint64(l)
  1769  	return i
  1770  }
  1771  
  1772  func (i *instruction) sourceOffsetInfo() ssa.SourceOffset {
  1773  	return ssa.SourceOffset(i.u1)
  1774  }
  1775  
  1776  func (i *instruction) asUDF() *instruction {
  1777  	i.kind = udf
  1778  	return i
  1779  }
  1780  
  1781  func (i *instruction) asFpuToInt(rd, rn operand, rdSigned, src64bit, dst64bit bool) {
  1782  	i.kind = fpuToInt
  1783  	i.rn = rn
  1784  	i.rd = rd
  1785  	if rdSigned {
  1786  		i.u1 = 1
  1787  	}
  1788  	if src64bit {
  1789  		i.u2 = 1
  1790  	}
  1791  	if dst64bit {
  1792  		i.u3 = 1
  1793  	}
  1794  }
  1795  
  1796  func (i *instruction) asIntToFpu(rd, rn operand, rnSigned, src64bit, dst64bit bool) {
  1797  	i.kind = intToFpu
  1798  	i.rn = rn
  1799  	i.rd = rd
  1800  	if rnSigned {
  1801  		i.u1 = 1
  1802  	}
  1803  	if src64bit {
  1804  		i.u2 = 1
  1805  	}
  1806  	if dst64bit {
  1807  		i.u3 = 1
  1808  	}
  1809  }
  1810  
  1811  func (i *instruction) asExitSequence(ctx regalloc.VReg) *instruction {
  1812  	i.kind = exitSequence
  1813  	i.rn = operandNR(ctx)
  1814  	return i
  1815  }
  1816  
  1817  // aluOp determines the type of ALU operation. Instructions whose kind is one of
  1818  // aluRRR, aluRRRR, aluRRImm12, aluRRBitmaskImm, aluRRImmShift, aluRRRShift and aluRRRExtend
  1819  // would use this type.
  1820  type aluOp int
  1821  
  1822  func (a aluOp) String() string {
  1823  	switch a {
  1824  	case aluOpAdd:
  1825  		return "add"
  1826  	case aluOpSub:
  1827  		return "sub"
  1828  	case aluOpOrr:
  1829  		return "orr"
  1830  	case aluOpOrn:
  1831  		return "orn"
  1832  	case aluOpAnd:
  1833  		return "and"
  1834  	case aluOpAnds:
  1835  		return "ands"
  1836  	case aluOpBic:
  1837  		return "bic"
  1838  	case aluOpEor:
  1839  		return "eor"
  1840  	case aluOpAddS:
  1841  		return "adds"
  1842  	case aluOpSubS:
  1843  		return "subs"
  1844  	case aluOpSMulH:
  1845  		return "sMulH"
  1846  	case aluOpUMulH:
  1847  		return "uMulH"
  1848  	case aluOpSDiv:
  1849  		return "sdiv"
  1850  	case aluOpUDiv:
  1851  		return "udiv"
  1852  	case aluOpRotR:
  1853  		return "ror"
  1854  	case aluOpLsr:
  1855  		return "lsr"
  1856  	case aluOpAsr:
  1857  		return "asr"
  1858  	case aluOpLsl:
  1859  		return "lsl"
  1860  	case aluOpMAdd:
  1861  		return "madd"
  1862  	case aluOpMSub:
  1863  		return "msub"
  1864  	}
  1865  	panic(int(a))
  1866  }
  1867  
  1868  const (
  1869  	// 32/64-bit Add.
  1870  	aluOpAdd aluOp = iota
  1871  	// 32/64-bit Subtract.
  1872  	aluOpSub
  1873  	// 32/64-bit Bitwise OR.
  1874  	aluOpOrr
  1875  	// 32/64-bit Bitwise OR NOT.
  1876  	aluOpOrn
  1877  	// 32/64-bit Bitwise AND.
  1878  	aluOpAnd
  1879  	// 32/64-bit Bitwise ANDS.
  1880  	aluOpAnds
  1881  	// 32/64-bit Bitwise AND NOT.
  1882  	aluOpBic
  1883  	// 32/64-bit Bitwise XOR (Exclusive OR).
  1884  	aluOpEor
  1885  	// 32/64-bit Add setting flags.
  1886  	aluOpAddS
  1887  	// 32/64-bit Subtract setting flags.
  1888  	aluOpSubS
  1889  	// Signed multiply, high-word result.
  1890  	aluOpSMulH
  1891  	// Unsigned multiply, high-word result.
  1892  	aluOpUMulH
  1893  	// 64-bit Signed divide.
  1894  	aluOpSDiv
  1895  	// 64-bit Unsigned divide.
  1896  	aluOpUDiv
  1897  	// 32/64-bit Rotate right.
  1898  	aluOpRotR
  1899  	// 32/64-bit Logical shift right.
  1900  	aluOpLsr
  1901  	// 32/64-bit Arithmetic shift right.
  1902  	aluOpAsr
  1903  	// 32/64-bit Logical shift left.
  1904  	aluOpLsl /// Multiply-add
  1905  
  1906  	// MAdd and MSub are only applicable for aluRRRR.
  1907  	aluOpMAdd
  1908  	aluOpMSub
  1909  )
  1910  
  1911  // vecOp determines the type of vector operation. Instructions whose kind is one of
  1912  // vecOpCnt would use this type.
  1913  type vecOp int
  1914  
  1915  // String implements fmt.Stringer.
  1916  func (b vecOp) String() string {
  1917  	switch b {
  1918  	case vecOpCnt:
  1919  		return "cnt"
  1920  	case vecOpCmeq:
  1921  		return "cmeq"
  1922  	case vecOpCmgt:
  1923  		return "cmgt"
  1924  	case vecOpCmhi:
  1925  		return "cmhi"
  1926  	case vecOpCmge:
  1927  		return "cmge"
  1928  	case vecOpCmhs:
  1929  		return "cmhs"
  1930  	case vecOpFcmeq:
  1931  		return "fcmeq"
  1932  	case vecOpFcmgt:
  1933  		return "fcmgt"
  1934  	case vecOpFcmge:
  1935  		return "fcmge"
  1936  	case vecOpCmeq0:
  1937  		return "cmeq0"
  1938  	case vecOpUaddlv:
  1939  		return "uaddlv"
  1940  	case vecOpBit:
  1941  		return "bit"
  1942  	case vecOpBic:
  1943  		return "bic"
  1944  	case vecOpBsl:
  1945  		return "bsl"
  1946  	case vecOpNot:
  1947  		return "not"
  1948  	case vecOpAnd:
  1949  		return "and"
  1950  	case vecOpOrr:
  1951  		return "orr"
  1952  	case vecOpEOR:
  1953  		return "eor"
  1954  	case vecOpFadd:
  1955  		return "fadd"
  1956  	case vecOpAdd:
  1957  		return "add"
  1958  	case vecOpAddp:
  1959  		return "addp"
  1960  	case vecOpAddv:
  1961  		return "addv"
  1962  	case vecOpSub:
  1963  		return "sub"
  1964  	case vecOpFsub:
  1965  		return "fsub"
  1966  	case vecOpSmin:
  1967  		return "smin"
  1968  	case vecOpUmin:
  1969  		return "umin"
  1970  	case vecOpUminv:
  1971  		return "uminv"
  1972  	case vecOpSmax:
  1973  		return "smax"
  1974  	case vecOpUmax:
  1975  		return "umax"
  1976  	case vecOpUmaxp:
  1977  		return "umaxp"
  1978  	case vecOpUrhadd:
  1979  		return "urhadd"
  1980  	case vecOpFmul:
  1981  		return "fmul"
  1982  	case vecOpSqrdmulh:
  1983  		return "sqrdmulh"
  1984  	case vecOpMul:
  1985  		return "mul"
  1986  	case vecOpUmlal:
  1987  		return "umlal"
  1988  	case vecOpFdiv:
  1989  		return "fdiv"
  1990  	case vecOpFsqrt:
  1991  		return "fsqrt"
  1992  	case vecOpAbs:
  1993  		return "abs"
  1994  	case vecOpFabs:
  1995  		return "fabs"
  1996  	case vecOpNeg:
  1997  		return "neg"
  1998  	case vecOpFneg:
  1999  		return "fneg"
  2000  	case vecOpFrintp:
  2001  		return "frintp"
  2002  	case vecOpFrintm:
  2003  		return "frintm"
  2004  	case vecOpFrintn:
  2005  		return "frintn"
  2006  	case vecOpFrintz:
  2007  		return "frintz"
  2008  	case vecOpFcvtl:
  2009  		return "fcvtl"
  2010  	case vecOpFcvtn:
  2011  		return "fcvtn"
  2012  	case vecOpFcvtzu:
  2013  		return "fcvtzu"
  2014  	case vecOpFcvtzs:
  2015  		return "fcvtzs"
  2016  	case vecOpScvtf:
  2017  		return "scvtf"
  2018  	case vecOpUcvtf:
  2019  		return "ucvtf"
  2020  	case vecOpSqxtn:
  2021  		return "sqxtn"
  2022  	case vecOpUqxtn:
  2023  		return "uqxtn"
  2024  	case vecOpSqxtun:
  2025  		return "sqxtun"
  2026  	case vecOpRev64:
  2027  		return "rev64"
  2028  	case vecOpXtn:
  2029  		return "xtn"
  2030  	case vecOpShll:
  2031  		return "shll"
  2032  	case vecOpSshl:
  2033  		return "sshl"
  2034  	case vecOpSshll:
  2035  		return "sshll"
  2036  	case vecOpUshl:
  2037  		return "ushl"
  2038  	case vecOpUshll:
  2039  		return "ushll"
  2040  	case vecOpSshr:
  2041  		return "sshr"
  2042  	case vecOpZip1:
  2043  		return "zip1"
  2044  	case vecOpFmin:
  2045  		return "fmin"
  2046  	case vecOpFmax:
  2047  		return "fmax"
  2048  	case vecOpSmull:
  2049  		return "smull"
  2050  	case vecOpSmull2:
  2051  		return "smull2"
  2052  	}
  2053  	panic(int(b))
  2054  }
  2055  
  2056  const (
  2057  	vecOpCnt vecOp = iota
  2058  	vecOpCmeq0
  2059  	vecOpCmeq
  2060  	vecOpCmgt
  2061  	vecOpCmhi
  2062  	vecOpCmge
  2063  	vecOpCmhs
  2064  	vecOpFcmeq
  2065  	vecOpFcmgt
  2066  	vecOpFcmge
  2067  	vecOpUaddlv
  2068  	vecOpBit
  2069  	vecOpBic
  2070  	vecOpBsl
  2071  	vecOpNot
  2072  	vecOpAnd
  2073  	vecOpOrr
  2074  	vecOpEOR
  2075  	vecOpAdd
  2076  	vecOpFadd
  2077  	vecOpAddv
  2078  	vecOpSqadd
  2079  	vecOpUqadd
  2080  	vecOpAddp
  2081  	vecOpSub
  2082  	vecOpFsub
  2083  	vecOpSqsub
  2084  	vecOpUqsub
  2085  	vecOpSmin
  2086  	vecOpUmin
  2087  	vecOpUminv
  2088  	vecOpFmin
  2089  	vecOpSmax
  2090  	vecOpUmax
  2091  	vecOpUmaxp
  2092  	vecOpFmax
  2093  	vecOpUrhadd
  2094  	vecOpMul
  2095  	vecOpFmul
  2096  	vecOpSqrdmulh
  2097  	vecOpUmlal
  2098  	vecOpFdiv
  2099  	vecOpFsqrt
  2100  	vecOpAbs
  2101  	vecOpFabs
  2102  	vecOpNeg
  2103  	vecOpFneg
  2104  	vecOpFrintm
  2105  	vecOpFrintn
  2106  	vecOpFrintp
  2107  	vecOpFrintz
  2108  	vecOpFcvtl
  2109  	vecOpFcvtn
  2110  	vecOpFcvtzs
  2111  	vecOpFcvtzu
  2112  	vecOpScvtf
  2113  	vecOpUcvtf
  2114  	vecOpSqxtn
  2115  	vecOpSqxtun
  2116  	vecOpUqxtn
  2117  	vecOpRev64
  2118  	vecOpXtn
  2119  	vecOpShll
  2120  	vecOpSshl
  2121  	vecOpSshll
  2122  	vecOpUshl
  2123  	vecOpUshll
  2124  	vecOpSshr
  2125  	vecOpZip1
  2126  	vecOpSmull
  2127  	vecOpSmull2
  2128  )
  2129  
  2130  // bitOp determines the type of bitwise operation. Instructions whose kind is one of
  2131  // bitOpRbit and bitOpClz would use this type.
  2132  type bitOp int
  2133  
  2134  // String implements fmt.Stringer.
  2135  func (b bitOp) String() string {
  2136  	switch b {
  2137  	case bitOpRbit:
  2138  		return "rbit"
  2139  	case bitOpClz:
  2140  		return "clz"
  2141  	}
  2142  	panic(int(b))
  2143  }
  2144  
  2145  const (
  2146  	// 32/64-bit Rbit.
  2147  	bitOpRbit bitOp = iota
  2148  	// 32/64-bit Clz.
  2149  	bitOpClz
  2150  )
  2151  
  2152  // fpuUniOp represents a unary floating-point unit (FPU) operation.
  2153  type fpuUniOp byte
  2154  
  2155  const (
  2156  	fpuUniOpNeg fpuUniOp = iota
  2157  	fpuUniOpCvt32To64
  2158  	fpuUniOpCvt64To32
  2159  	fpuUniOpSqrt
  2160  	fpuUniOpRoundPlus
  2161  	fpuUniOpRoundMinus
  2162  	fpuUniOpRoundZero
  2163  	fpuUniOpRoundNearest
  2164  	fpuUniOpAbs
  2165  )
  2166  
  2167  // String implements the fmt.Stringer.
  2168  func (f fpuUniOp) String() string {
  2169  	switch f {
  2170  	case fpuUniOpNeg:
  2171  		return "fneg"
  2172  	case fpuUniOpCvt32To64:
  2173  		return "fcvt"
  2174  	case fpuUniOpCvt64To32:
  2175  		return "fcvt"
  2176  	case fpuUniOpSqrt:
  2177  		return "fsqrt"
  2178  	case fpuUniOpRoundPlus:
  2179  		return "frintp"
  2180  	case fpuUniOpRoundMinus:
  2181  		return "frintm"
  2182  	case fpuUniOpRoundZero:
  2183  		return "frintz"
  2184  	case fpuUniOpRoundNearest:
  2185  		return "frintn"
  2186  	case fpuUniOpAbs:
  2187  		return "fabs"
  2188  	}
  2189  	panic(int(f))
  2190  }
  2191  
  2192  // fpuBinOp represents a binary floating-point unit (FPU) operation.
  2193  type fpuBinOp byte
  2194  
  2195  const (
  2196  	fpuBinOpAdd = iota
  2197  	fpuBinOpSub
  2198  	fpuBinOpMul
  2199  	fpuBinOpDiv
  2200  	fpuBinOpMax
  2201  	fpuBinOpMin
  2202  )
  2203  
  2204  // String implements the fmt.Stringer.
  2205  func (f fpuBinOp) String() string {
  2206  	switch f {
  2207  	case fpuBinOpAdd:
  2208  		return "fadd"
  2209  	case fpuBinOpSub:
  2210  		return "fsub"
  2211  	case fpuBinOpMul:
  2212  		return "fmul"
  2213  	case fpuBinOpDiv:
  2214  		return "fdiv"
  2215  	case fpuBinOpMax:
  2216  		return "fmax"
  2217  	case fpuBinOpMin:
  2218  		return "fmin"
  2219  	}
  2220  	panic(int(f))
  2221  }
  2222  
  2223  // extMode represents the mode of a register operand extension.
  2224  // For example, aluRRRExtend instructions need this info to determine the extensions.
  2225  type extMode byte
  2226  
  2227  const (
  2228  	extModeNone extMode = iota
  2229  	// extModeZeroExtend64 suggests a zero-extension to 32 bits if the original bit size is less than 32.
  2230  	extModeZeroExtend32
  2231  	// extModeSignExtend64 stands for a sign-extension to 32 bits if the original bit size is less than 32.
  2232  	extModeSignExtend32
  2233  	// extModeZeroExtend64 suggests a zero-extension to 64 bits if the original bit size is less than 64.
  2234  	extModeZeroExtend64
  2235  	// extModeSignExtend64 stands for a sign-extension to 64 bits if the original bit size is less than 64.
  2236  	extModeSignExtend64
  2237  )
  2238  
  2239  func (e extMode) bits() byte {
  2240  	switch e {
  2241  	case extModeZeroExtend32, extModeSignExtend32:
  2242  		return 32
  2243  	case extModeZeroExtend64, extModeSignExtend64:
  2244  		return 64
  2245  	default:
  2246  		return 0
  2247  	}
  2248  }
  2249  
  2250  func (e extMode) signed() bool {
  2251  	switch e {
  2252  	case extModeSignExtend32, extModeSignExtend64:
  2253  		return true
  2254  	default:
  2255  		return false
  2256  	}
  2257  }
  2258  
  2259  func extModeOf(t ssa.Type, signed bool) extMode {
  2260  	switch t.Bits() {
  2261  	case 32:
  2262  		if signed {
  2263  			return extModeSignExtend32
  2264  		}
  2265  		return extModeZeroExtend32
  2266  	case 64:
  2267  		if signed {
  2268  			return extModeSignExtend64
  2269  		}
  2270  		return extModeZeroExtend64
  2271  	default:
  2272  		panic("TODO? do we need narrower than 32 bits?")
  2273  	}
  2274  }
  2275  
  2276  type extendOp byte
  2277  
  2278  const (
  2279  	extendOpUXTB extendOp = 0b000
  2280  	extendOpUXTH extendOp = 0b001
  2281  	extendOpUXTW extendOp = 0b010
  2282  	// extendOpUXTX does nothing, but convenient symbol that officially exists. See:
  2283  	// https://stackoverflow.com/questions/72041372/what-do-the-uxtx-and-sxtx-extensions-mean-for-32-bit-aarch64-adds-instruct
  2284  	extendOpUXTX extendOp = 0b011
  2285  	extendOpSXTB extendOp = 0b100
  2286  	extendOpSXTH extendOp = 0b101
  2287  	extendOpSXTW extendOp = 0b110
  2288  	// extendOpSXTX does nothing, but convenient symbol that officially exists. See:
  2289  	// https://stackoverflow.com/questions/72041372/what-do-the-uxtx-and-sxtx-extensions-mean-for-32-bit-aarch64-adds-instruct
  2290  	extendOpSXTX extendOp = 0b111
  2291  	extendOpNone extendOp = 0xff
  2292  )
  2293  
  2294  func (e extendOp) srcBits() byte {
  2295  	switch e {
  2296  	case extendOpUXTB, extendOpSXTB:
  2297  		return 8
  2298  	case extendOpUXTH, extendOpSXTH:
  2299  		return 16
  2300  	case extendOpUXTW, extendOpSXTW:
  2301  		return 32
  2302  	case extendOpUXTX, extendOpSXTX:
  2303  		return 64
  2304  	}
  2305  	panic(int(e))
  2306  }
  2307  
  2308  func (e extendOp) String() string {
  2309  	switch e {
  2310  	case extendOpUXTB:
  2311  		return "UXTB"
  2312  	case extendOpUXTH:
  2313  		return "UXTH"
  2314  	case extendOpUXTW:
  2315  		return "UXTW"
  2316  	case extendOpUXTX:
  2317  		return "UXTX"
  2318  	case extendOpSXTB:
  2319  		return "SXTB"
  2320  	case extendOpSXTH:
  2321  		return "SXTH"
  2322  	case extendOpSXTW:
  2323  		return "SXTW"
  2324  	case extendOpSXTX:
  2325  		return "SXTX"
  2326  	}
  2327  	panic(int(e))
  2328  }
  2329  
  2330  func extendOpFrom(signed bool, from byte) extendOp {
  2331  	switch from {
  2332  	case 8:
  2333  		if signed {
  2334  			return extendOpSXTB
  2335  		}
  2336  		return extendOpUXTB
  2337  	case 16:
  2338  		if signed {
  2339  			return extendOpSXTH
  2340  		}
  2341  		return extendOpUXTH
  2342  	case 32:
  2343  		if signed {
  2344  			return extendOpSXTW
  2345  		}
  2346  		return extendOpUXTW
  2347  	case 64:
  2348  		if signed {
  2349  			return extendOpSXTX
  2350  		}
  2351  		return extendOpUXTX
  2352  	}
  2353  	panic("invalid extendOpFrom")
  2354  }
  2355  
  2356  type shiftOp byte
  2357  
  2358  const (
  2359  	shiftOpLSL shiftOp = 0b00
  2360  	shiftOpLSR shiftOp = 0b01
  2361  	shiftOpASR shiftOp = 0b10
  2362  	shiftOpROR shiftOp = 0b11
  2363  )
  2364  
  2365  func (s shiftOp) String() string {
  2366  	switch s {
  2367  	case shiftOpLSL:
  2368  		return "lsl"
  2369  	case shiftOpLSR:
  2370  		return "lsr"
  2371  	case shiftOpASR:
  2372  		return "asr"
  2373  	case shiftOpROR:
  2374  		return "ror"
  2375  	}
  2376  	panic(int(s))
  2377  }
  2378  
  2379  const exitSequenceSize = 6 * 4 // 6 instructions as in encodeExitSequence.
  2380  
  2381  // size returns the size of the instruction in encoded bytes.
  2382  func (i *instruction) size() int64 {
  2383  	switch i.kind {
  2384  	case exitSequence:
  2385  		return exitSequenceSize // 5 instructions as in encodeExitSequence.
  2386  	case nop0, loadConstBlockArg:
  2387  		return 0
  2388  	case emitSourceOffsetInfo:
  2389  		return 0
  2390  	case loadFpuConst32:
  2391  		if i.u1 == 0 {
  2392  			return 4 // zero loading can be encoded as a single instruction.
  2393  		}
  2394  		return 4 + 4 + 4
  2395  	case loadFpuConst64:
  2396  		if i.u1 == 0 {
  2397  			return 4 // zero loading can be encoded as a single instruction.
  2398  		}
  2399  		return 4 + 4 + 8
  2400  	case loadFpuConst128:
  2401  		if i.u1 == 0 && i.u2 == 0 {
  2402  			return 4 // zero loading can be encoded as a single instruction.
  2403  		}
  2404  		return 4 + 4 + 16
  2405  	case brTableSequence:
  2406  		return 4*4 + int64(i.u2)*4
  2407  	default:
  2408  		return 4
  2409  	}
  2410  }
  2411  
  2412  // vecArrangement is the arrangement of data within a vector register.
  2413  type vecArrangement byte
  2414  
  2415  const (
  2416  	// vecArrangementNone is an arrangement indicating no data is stored.
  2417  	vecArrangementNone vecArrangement = iota
  2418  	// vecArrangement8B is an arrangement of 8 bytes (64-bit vector)
  2419  	vecArrangement8B
  2420  	// vecArrangement16B is an arrangement of 16 bytes (128-bit vector)
  2421  	vecArrangement16B
  2422  	// vecArrangement4H is an arrangement of 4 half precisions (64-bit vector)
  2423  	vecArrangement4H
  2424  	// vecArrangement8H is an arrangement of 8 half precisions (128-bit vector)
  2425  	vecArrangement8H
  2426  	// vecArrangement2S is an arrangement of 2 single precisions (64-bit vector)
  2427  	vecArrangement2S
  2428  	// vecArrangement4S is an arrangement of 4 single precisions (128-bit vector)
  2429  	vecArrangement4S
  2430  	// vecArrangement1D is an arrangement of 1 double precision (64-bit vector)
  2431  	vecArrangement1D
  2432  	// vecArrangement2D is an arrangement of 2 double precisions (128-bit vector)
  2433  	vecArrangement2D
  2434  
  2435  	// Assign each vector size specifier to a vector arrangement ID.
  2436  	// Instructions can only have an arrangement or a size specifier, but not both, so it
  2437  	// simplifies the internal representation of vector instructions by being able to
  2438  	// store either into the same field.
  2439  
  2440  	// vecArrangementB is a size specifier of byte
  2441  	vecArrangementB
  2442  	// vecArrangementH is a size specifier of word (16-bit)
  2443  	vecArrangementH
  2444  	// vecArrangementS is a size specifier of double word (32-bit)
  2445  	vecArrangementS
  2446  	// vecArrangementD is a size specifier of quad word (64-bit)
  2447  	vecArrangementD
  2448  	// vecArrangementQ is a size specifier of the entire vector (128-bit)
  2449  	vecArrangementQ
  2450  )
  2451  
  2452  // String implements fmt.Stringer
  2453  func (v vecArrangement) String() (ret string) {
  2454  	switch v {
  2455  	case vecArrangement8B:
  2456  		ret = "8B"
  2457  	case vecArrangement16B:
  2458  		ret = "16B"
  2459  	case vecArrangement4H:
  2460  		ret = "4H"
  2461  	case vecArrangement8H:
  2462  		ret = "8H"
  2463  	case vecArrangement2S:
  2464  		ret = "2S"
  2465  	case vecArrangement4S:
  2466  		ret = "4S"
  2467  	case vecArrangement1D:
  2468  		ret = "1D"
  2469  	case vecArrangement2D:
  2470  		ret = "2D"
  2471  	case vecArrangementB:
  2472  		ret = "B"
  2473  	case vecArrangementH:
  2474  		ret = "H"
  2475  	case vecArrangementS:
  2476  		ret = "S"
  2477  	case vecArrangementD:
  2478  		ret = "D"
  2479  	case vecArrangementQ:
  2480  		ret = "Q"
  2481  	case vecArrangementNone:
  2482  		ret = "none"
  2483  	default:
  2484  		panic(v)
  2485  	}
  2486  	return
  2487  }
  2488  
  2489  // vecIndex is the index of an element of a vector register
  2490  type vecIndex byte
  2491  
  2492  // vecIndexNone indicates no vector index specified.
  2493  const vecIndexNone = ^vecIndex(0)
  2494  
  2495  func ssaLaneToArrangement(lane ssa.VecLane) vecArrangement {
  2496  	switch lane {
  2497  	case ssa.VecLaneI8x16:
  2498  		return vecArrangement16B
  2499  	case ssa.VecLaneI16x8:
  2500  		return vecArrangement8H
  2501  	case ssa.VecLaneI32x4:
  2502  		return vecArrangement4S
  2503  	case ssa.VecLaneI64x2:
  2504  		return vecArrangement2D
  2505  	case ssa.VecLaneF32x4:
  2506  		return vecArrangement4S
  2507  	case ssa.VecLaneF64x2:
  2508  		return vecArrangement2D
  2509  	default:
  2510  		panic(lane)
  2511  	}
  2512  }
  2513  
  2514  // atomicRmwOp is the type of atomic read-modify-write operation.
  2515  type atomicRmwOp byte
  2516  
  2517  const (
  2518  	// atomicRmwOpAdd is an atomic add operation.
  2519  	atomicRmwOpAdd atomicRmwOp = iota
  2520  	// atomicRmwOpClr is an atomic clear operation, i.e. AND NOT.
  2521  	atomicRmwOpClr
  2522  	// atomicRmwOpSet is an atomic set operation, i.e. OR.
  2523  	atomicRmwOpSet
  2524  	// atomicRmwOpEor is an atomic exclusive OR operation.
  2525  	atomicRmwOpEor
  2526  	// atomicRmwOpSwp is an atomic swap operation.
  2527  	atomicRmwOpSwp
  2528  )
  2529  
  2530  // String implements fmt.Stringer
  2531  func (a atomicRmwOp) String() string {
  2532  	switch a {
  2533  	case atomicRmwOpAdd:
  2534  		return "ldaddal"
  2535  	case atomicRmwOpClr:
  2536  		return "ldclral"
  2537  	case atomicRmwOpSet:
  2538  		return "ldsetal"
  2539  	case atomicRmwOpEor:
  2540  		return "ldeoral"
  2541  	case atomicRmwOpSwp:
  2542  		return "swpal"
  2543  	}
  2544  	panic(fmt.Sprintf("unknown atomicRmwOp: %d", a))
  2545  }