github.com/bananabytelabs/wazero@v0.0.0-20240105073314-54b22a776da8/internal/asm/arm64/impl.go

package arm64

import (
	"encoding/binary"
	"errors"
	"fmt"

	"github.com/bananabytelabs/wazero/internal/asm"
)

type nodeImpl struct {
	// jumpTarget holds the target node in the linked list for the jump-kind instruction.
	jumpTarget *nodeImpl
	// next holds the next node from this node in the assembled linked list.
	next        *nodeImpl
	staticConst *asm.StaticConst

	instruction                      asm.Instruction
	types                            operandTypes
	srcReg, srcReg2, dstReg, dstReg2 asm.Register
	srcConst, dstConst               asm.ConstantValue

	offsetInBinary asm.NodeOffsetInBinary

	// readInstructionAddressBeforeTargetInstruction holds the instruction right before the target of
	// the read-instruction-address instruction. See asm.assemblerBase.CompileReadInstructionAddress.
	readInstructionAddressBeforeTargetInstruction asm.Instruction

	vectorArrangement              VectorArrangement
	srcVectorIndex, dstVectorIndex VectorIndex
}

// AssignJumpTarget implements the same method as documented on asm.Node.
func (n *nodeImpl) AssignJumpTarget(target asm.Node) {
	n.jumpTarget = target.(*nodeImpl)
}

// AssignDestinationConstant implements the same method as documented on asm.Node.
func (n *nodeImpl) AssignDestinationConstant(value asm.ConstantValue) {
	n.dstConst = value
}

// AssignSourceConstant implements the same method as documented on asm.Node.
func (n *nodeImpl) AssignSourceConstant(value asm.ConstantValue) {
	n.srcConst = value
}

// OffsetInBinary implements the same method as documented on asm.Node.
func (n *nodeImpl) OffsetInBinary() asm.NodeOffsetInBinary {
	return n.offsetInBinary
}

// String implements fmt.Stringer.
//
// This is for debugging purposes, and the format is similar to the AT&T assembly syntax,
// meaning that this should look like "INSTRUCTION ${from}, ${to}" where each operand
// may be wrapped in '[]' to represent a memory location, and multiple operands
// are wrapped in '()'.
func (n *nodeImpl) String() (ret string) {
	instName := InstructionName(n.instruction)
	switch n.types {
	case operandTypesNoneToNone:
		ret = instName
	case operandTypesNoneToRegister:
		ret = fmt.Sprintf("%s %s", instName, RegisterName(n.dstReg))
	case operandTypesNoneToBranch:
		ret = fmt.Sprintf("%s {%v}", instName, n.jumpTarget)
	case operandTypesRegisterToRegister:
		ret = fmt.Sprintf("%s %s, %s", instName, RegisterName(n.srcReg), RegisterName(n.dstReg))
	case operandTypesLeftShiftedRegisterToRegister:
		ret = fmt.Sprintf("%s (%s, %s << %d), %s", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2), n.srcConst, RegisterName(n.dstReg))
	case operandTypesTwoRegistersToRegister:
		ret = fmt.Sprintf("%s (%s, %s), %s", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2), RegisterName(n.dstReg))
	case operandTypesThreeRegistersToRegister:
		ret = fmt.Sprintf("%s (%s, %s, %s), %s", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2), RegisterName(n.dstReg), RegisterName(n.dstReg2))
	case operandTypesTwoRegistersToNone:
		ret = fmt.Sprintf("%s (%s, %s)", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2))
	case operandTypesRegisterAndConstToNone:
		ret = fmt.Sprintf("%s (%s, 0x%x)", instName, RegisterName(n.srcReg), n.srcConst)
	case operandTypesRegisterAndConstToRegister:
		ret = fmt.Sprintf("%s (%s, 0x%x), %s", instName, RegisterName(n.srcReg), n.srcConst, RegisterName(n.dstReg))
	case operandTypesRegisterToMemory:
		if n.dstReg2 != asm.NilRegister {
			ret = fmt.Sprintf("%s %s, [%s + %s]", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), RegisterName(n.dstReg2))
		} else {
			ret = fmt.Sprintf("%s %s, [%s + 0x%x]", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.dstConst)
		}
	case operandTypesMemoryToRegister:
		if n.srcReg2 != asm.NilRegister {
			ret = fmt.Sprintf("%s [%s + %s], %s", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2), RegisterName(n.dstReg))
		} else {
			ret = fmt.Sprintf("%s [%s + 0x%x], %s", instName, RegisterName(n.srcReg), n.srcConst, RegisterName(n.dstReg))
		}
	case operandTypesConstToRegister:
		ret = fmt.Sprintf("%s 0x%x, %s", instName, n.srcConst, RegisterName(n.dstReg))
	case operandTypesRegisterToVectorRegister:
		ret = fmt.Sprintf("%s %s, %s.%s[%d]", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.vectorArrangement, n.dstVectorIndex)
	case operandTypesVectorRegisterToRegister:
		ret = fmt.Sprintf("%s %s.%s[%d], %s", instName, RegisterName(n.srcReg), n.vectorArrangement, n.srcVectorIndex, RegisterName(n.dstReg))
	case operandTypesVectorRegisterToMemory:
		if n.dstReg2 != asm.NilRegister {
			ret = fmt.Sprintf("%s %s.%s, [%s + %s]", instName, RegisterName(n.srcReg), n.vectorArrangement, RegisterName(n.dstReg), RegisterName(n.dstReg2))
		} else {
			ret = fmt.Sprintf("%s %s.%s, [%s + 0x%x]", instName, RegisterName(n.srcReg), n.vectorArrangement, RegisterName(n.dstReg), n.dstConst)
		}
	case operandTypesMemoryToVectorRegister:
		ret = fmt.Sprintf("%s [%s], %s.%s", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.vectorArrangement)
	case operandTypesVectorRegisterToVectorRegister:
		ret = fmt.Sprintf("%s %[2]s.%[4]s, %[3]s.%[4]s", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.vectorArrangement)
	case operandTypesStaticConstToVectorRegister:
		ret = fmt.Sprintf("%s $%#x, %s.%s", instName, n.staticConst.Raw, RegisterName(n.dstReg), n.vectorArrangement)
	case operandTypesTwoVectorRegistersToVectorRegister:
		ret = fmt.Sprintf("%s (%s.%[5]s, %[3]s.%[5]s), %[4]s.%[5]s", instName, RegisterName(n.srcReg), RegisterName(n.srcReg2), RegisterName(n.dstReg), n.vectorArrangement)
	}
	return
}
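
// For instance, a hypothetical RegisterToRegister ADD node from R0 to R1
// would render roughly as follows (illustrative; the exact register names
// come from RegisterName):
//
//	ADD R0, R1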

// operandTypes represents types of operands of a node.
type operandTypes byte

const (
	operandTypesNoneToNone operandTypes = iota
	operandTypesNoneToRegister
	operandTypesNoneToBranch
	operandTypesRegisterToRegister
	operandTypesLeftShiftedRegisterToRegister
	operandTypesTwoRegistersToRegister
	operandTypesThreeRegistersToRegister
	operandTypesTwoRegistersToNone
	operandTypesRegisterAndConstToNone
	operandTypesRegisterAndConstToRegister
	operandTypesRegisterToMemory
	operandTypesMemoryToRegister
	operandTypesConstToRegister
	operandTypesRegisterToVectorRegister
	operandTypesVectorRegisterToRegister
	operandTypesMemoryToVectorRegister
	operandTypesVectorRegisterToMemory
	operandTypesVectorRegisterToVectorRegister
	operandTypesTwoVectorRegistersToVectorRegister
	operandTypesStaticConstToVectorRegister
)

// String implements fmt.Stringer.
func (o operandTypes) String() (ret string) {
	switch o {
	case operandTypesNoneToNone:
		ret = "NoneToNone"
	case operandTypesNoneToRegister:
		ret = "NoneToRegister"
	case operandTypesNoneToBranch:
		ret = "NoneToBranch"
	case operandTypesRegisterToRegister:
		ret = "RegisterToRegister"
	case operandTypesLeftShiftedRegisterToRegister:
		ret = "LeftShiftedRegisterToRegister"
	case operandTypesTwoRegistersToRegister:
		ret = "TwoRegistersToRegister"
	case operandTypesThreeRegistersToRegister:
		ret = "ThreeRegistersToRegister"
	case operandTypesTwoRegistersToNone:
		ret = "TwoRegistersToNone"
	case operandTypesRegisterAndConstToNone:
		ret = "RegisterAndConstToNone"
	case operandTypesRegisterAndConstToRegister:
		ret = "RegisterAndConstToRegister"
	case operandTypesRegisterToMemory:
		ret = "RegisterToMemory"
	case operandTypesMemoryToRegister:
		ret = "MemoryToRegister"
	case operandTypesConstToRegister:
		ret = "ConstToRegister"
	case operandTypesRegisterToVectorRegister:
		ret = "RegisterToVectorRegister"
	case operandTypesVectorRegisterToRegister:
		ret = "VectorRegisterToRegister"
	case operandTypesMemoryToVectorRegister:
		ret = "MemoryToVectorRegister"
	case operandTypesVectorRegisterToMemory:
		ret = "VectorRegisterToMemory"
	case operandTypesVectorRegisterToVectorRegister:
		ret = "VectorRegisterToVectorRegister"
	case operandTypesTwoVectorRegistersToVectorRegister:
		ret = "TwoVectorRegistersToVectorRegister"
	case operandTypesStaticConstToVectorRegister:
		ret = "StaticConstToVectorRegister"
	}
	return
}

const (
	maxSignedInt26 int64 = 1<<25 - 1
	minSignedInt26 int64 = -(1 << 25)

	maxSignedInt19 int64 = 1<<19 - 1
	minSignedInt19 int64 = -(1 << 19)
)

// AssemblerImpl implements Assembler.
type AssemblerImpl struct {
	root    *nodeImpl
	current *nodeImpl
	asm.BaseAssemblerImpl
	relativeJumpNodes   []*nodeImpl
	adrInstructionNodes []*nodeImpl
	nodePool            nodePool
	pool                asm.StaticConstPool
	nodeCount           int

	// MaxDisplacementForConstantPool is fixed to defaultMaxDisplacementForConstPool,
	// but is kept as a field here for testability.
	MaxDisplacementForConstantPool int

	temporaryRegister asm.Register
}

const nodePageSize = 128

type nodePage = [nodePageSize]nodeImpl

// nodePool is the central allocation pool for nodeImpl used by a single AssemblerImpl.
// This reduces allocations across compilations when the AssemblerImpl is reused.
type nodePool struct {
	pages []*nodePage
	index int
}

// allocNode allocates a new nodeImpl for use from the pool.
// This expands the pool if there is no space left for it.
func (n *nodePool) allocNode() *nodeImpl {
	if n.index == nodePageSize {
		if len(n.pages) == cap(n.pages) {
			n.pages = append(n.pages, new(nodePage))
		} else {
			i := len(n.pages)
			n.pages = n.pages[:i+1]
			if n.pages[i] == nil {
				n.pages[i] = new(nodePage)
			}
		}
		n.index = 0
	}
	ret := &n.pages[len(n.pages)-1][n.index]
	n.index++
	return ret
}

func (n *nodePool) reset() {
	for _, ns := range n.pages {
		page := ns[:]
		for i := range page {
			page[i] = nodeImpl{}
		}
	}
	n.pages = n.pages[:0]
	n.index = nodePageSize
}

func NewAssembler(temporaryRegister asm.Register) *AssemblerImpl {
	return &AssemblerImpl{
		nodePool:                       nodePool{index: nodePageSize},
		temporaryRegister:              temporaryRegister,
		pool:                           asm.NewStaticConstPool(),
		MaxDisplacementForConstantPool: defaultMaxDisplacementForConstPool,
	}
}

// AllocateNOP implements asm.AssemblerBase.
func (a *AssemblerImpl) AllocateNOP() asm.Node {
	n := a.nodePool.allocNode()
	n.instruction = NOP
	n.types = operandTypesNoneToNone
	return n
}

// Add implements asm.AssemblerBase.
func (a *AssemblerImpl) Add(n asm.Node) {
	a.addNode(n.(*nodeImpl))
}

// Reset implements asm.AssemblerBase.
func (a *AssemblerImpl) Reset() {
	pool := a.pool
	pool.Reset()
	*a = AssemblerImpl{
		nodePool:            a.nodePool,
		pool:                pool,
		temporaryRegister:   a.temporaryRegister,
		adrInstructionNodes: a.adrInstructionNodes[:0],
		relativeJumpNodes:   a.relativeJumpNodes[:0],
		BaseAssemblerImpl: asm.BaseAssemblerImpl{
			SetBranchTargetOnNextNodes: a.SetBranchTargetOnNextNodes[:0],
			JumpTableEntries:           a.JumpTableEntries[:0],
		},
	}
	a.nodePool.reset()
}

// newNode creates a new Node and appends it into the linked list.
func (a *AssemblerImpl) newNode(instruction asm.Instruction, types operandTypes) *nodeImpl {
	n := a.nodePool.allocNode()
	n.instruction = instruction
	n.types = types

	a.addNode(n)
	return n
}

// addNode appends the new node into the linked list.
func (a *AssemblerImpl) addNode(node *nodeImpl) {
	a.nodeCount++

	if a.root == nil {
		a.root = node
		a.current = node
	} else {
		parent := a.current
		parent.next = node
		a.current = node
	}

	for _, o := range a.SetBranchTargetOnNextNodes {
		origin := o.(*nodeImpl)
		origin.jumpTarget = node
	}
	// Reuse the underlying slice to avoid re-allocations.
	a.SetBranchTargetOnNextNodes = a.SetBranchTargetOnNextNodes[:0]
}
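
// Illustrative sketch of how the pool amortizes allocations (hypothetical
// usage; real callers go through NewAssembler, which seeds index to
// nodePageSize so the first allocNode call allocates a page):
//
//	var pool nodePool
//	pool.index = nodePageSize // as NewAssembler does
//	n1 := pool.allocNode()    // allocates a fresh 128-node page
//	n2 := pool.allocNode()    // served from the same page, no allocation
//	pool.reset()              // zeroes the nodes; pages stay around for reuse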

// Assemble implements asm.AssemblerBase.
func (a *AssemblerImpl) Assemble(buf asm.Buffer) error {
	// arm64 instructions are a fixed 4 bytes, but some nodes are encoded as
	// multiple instructions, so we reserve 8 bytes (two instructions) per node
	// as an estimate; the resulting binary may not be exactly nodeCount*8 bytes.
	buf.Grow(a.nodeCount * 8)

	for n := a.root; n != nil; n = n.next {
		n.offsetInBinary = uint64(buf.Len())
		if err := a.encodeNode(buf, n); err != nil {
			return err
		}
		a.maybeFlushConstPool(buf, n.next == nil)
	}

	code := buf.Bytes()

	if err := a.FinalizeJumpTableEntry(code); err != nil {
		return err
	}

	for _, rel := range a.relativeJumpNodes {
		if err := a.relativeBranchFinalize(code, rel); err != nil {
			return err
		}
	}

	for _, adr := range a.adrInstructionNodes {
		if err := a.finalizeADRInstructionNode(code, adr); err != nil {
			return err
		}
	}
	return nil
}

const defaultMaxDisplacementForConstPool = (1 << 20) - 1 - 4 // -4 for the unconditional branch that skips the constants.

// maybeFlushConstPool flushes the constant pool if endOfBinary or a boundary condition was met.
func (a *AssemblerImpl) maybeFlushConstPool(buf asm.Buffer, endOfBinary bool) {
	if a.pool.Empty() {
		return
	}

	// If endOfBinary = true, we no longer need to emit instructions, therefore
	// flush all the constants.
	if endOfBinary ||
		// Also, if the offset between the first usage of the constant pool and
		// the first constant would exceed 2^20-1 (= 1MiB-1), which is the maximum
		// offset for LDR(literal)/ADR instructions, flush all the constants in the pool.
		(buf.Len()+a.pool.PoolSizeInBytes-int(a.pool.FirstUseOffsetInBinary)) >= a.MaxDisplacementForConstantPool {

		// Before emitting the consts, we have to add a br instruction to skip over the const pool.
		// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1123-L1129
		skipOffset := a.pool.PoolSizeInBytes/4 + 1
		if a.pool.PoolSizeInBytes%4 != 0 {
			skipOffset++
		}
		if endOfBinary {
			// If this is the end of the binary, the skip branch is never executed,
			// so its offset can be left as zero (matching the behavior of Go's assembler).
			skipOffset = 0
		}

		buf.Append4Bytes(
			byte(skipOffset),
			byte(skipOffset>>8),
			byte(skipOffset>>16),
			0x14,
		)

		// Then add the consts into the binary.
		for _, c := range a.pool.Consts {
			c.SetOffsetInBinary(uint64(buf.Len()))
			buf.AppendBytes(c.Raw)
		}

		// arm64 instructions are 4-byte (32-bit) aligned, so we pad with zeros here to realign.
		if pad := buf.Len() % 4; pad != 0 {
			buf.AppendBytes(make([]byte, 4-pad))
		}

		// After the flush, reset the constant pool.
		a.pool.Reset()
	}
}
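
// Worked example of the skip-branch arithmetic above (hypothetical pool):
// with PoolSizeInBytes = 12, skipOffset = 12/4 + 1 = 4, so the emitted
// unconditional B jumps 4 instructions (16 bytes) forward: over itself
// plus the 12 bytes of constants.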

// encodeNode encodes the given node into the buffer.
func (a *AssemblerImpl) encodeNode(buf asm.Buffer, n *nodeImpl) (err error) {
	switch n.types {
	case operandTypesNoneToNone:
		err = a.encodeNoneToNone(buf, n)
	case operandTypesNoneToRegister:
		err = a.encodeJumpToRegister(buf, n)
	case operandTypesNoneToBranch:
		err = a.encodeRelativeBranch(buf, n)
	case operandTypesRegisterToRegister:
		err = a.encodeRegisterToRegister(buf, n)
	case operandTypesLeftShiftedRegisterToRegister:
		err = a.encodeLeftShiftedRegisterToRegister(buf, n)
	case operandTypesTwoRegistersToRegister:
		err = a.encodeTwoRegistersToRegister(buf, n)
	case operandTypesThreeRegistersToRegister:
		err = a.encodeThreeRegistersToRegister(buf, n)
	case operandTypesTwoRegistersToNone:
		err = a.encodeTwoRegistersToNone(buf, n)
	case operandTypesRegisterAndConstToNone:
		err = a.encodeRegisterAndConstToNone(buf, n)
	case operandTypesRegisterToMemory:
		err = a.encodeRegisterToMemory(buf, n)
	case operandTypesMemoryToRegister:
		err = a.encodeMemoryToRegister(buf, n)
	case operandTypesRegisterAndConstToRegister, operandTypesConstToRegister:
		err = a.encodeConstToRegister(buf, n)
	case operandTypesRegisterToVectorRegister:
		err = a.encodeRegisterToVectorRegister(buf, n)
	case operandTypesVectorRegisterToRegister:
		err = a.encodeVectorRegisterToRegister(buf, n)
	case operandTypesMemoryToVectorRegister:
		err = a.encodeMemoryToVectorRegister(buf, n)
	case operandTypesVectorRegisterToMemory:
		err = a.encodeVectorRegisterToMemory(buf, n)
	case operandTypesVectorRegisterToVectorRegister:
		err = a.encodeVectorRegisterToVectorRegister(buf, n)
	case operandTypesStaticConstToVectorRegister:
		err = a.encodeStaticConstToVectorRegister(buf, n)
	case operandTypesTwoVectorRegistersToVectorRegister:
		err = a.encodeTwoVectorRegistersToVectorRegister(buf, n)
	default:
		err = fmt.Errorf("encoder undefined for [%s] operand type", n.types)
	}
	if err != nil {
		err = fmt.Errorf("%w: %s", err, n) // Ensure the error is debuggable by including the string value of the node.
	}
	return
}

// CompileStandAlone implements the same method as documented on asm.AssemblerBase.
func (a *AssemblerImpl) CompileStandAlone(instruction asm.Instruction) asm.Node {
	return a.newNode(instruction, operandTypesNoneToNone)
}

// CompileConstToRegister implements the same method as documented on asm.AssemblerBase.
func (a *AssemblerImpl) CompileConstToRegister(
	instruction asm.Instruction,
	value asm.ConstantValue,
	destinationReg asm.Register,
) (inst asm.Node) {
	n := a.newNode(instruction, operandTypesConstToRegister)
	n.srcConst = value
	n.dstReg = destinationReg
	return n
}

// CompileRegisterToRegister implements the same method as documented on asm.AssemblerBase.
func (a *AssemblerImpl) CompileRegisterToRegister(instruction asm.Instruction, from, to asm.Register) {
	n := a.newNode(instruction, operandTypesRegisterToRegister)
	n.srcReg = from
	n.dstReg = to
}
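
// A minimal, hypothetical usage sketch of the Compile*/Assemble API
// (the register choice and instruction operands are assumptions for
// illustration only):
//
//	a := NewAssembler(RegR27)
//	a.CompileConstToRegister(MOVD, 16, RegR1)
//	a.CompileRegisterToRegister(ADD, RegR1, RegR2)
//	// err := a.Assemble(buf) // buf is an asm.Buffer provided by the caller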

// CompileMemoryToRegister implements the same method as documented on asm.AssemblerBase.
func (a *AssemblerImpl) CompileMemoryToRegister(
	instruction asm.Instruction,
	sourceBaseReg asm.Register,
	sourceOffsetConst asm.ConstantValue,
	destinationReg asm.Register,
) {
	n := a.newNode(instruction, operandTypesMemoryToRegister)
	n.srcReg = sourceBaseReg
	n.srcConst = sourceOffsetConst
	n.dstReg = destinationReg
}

// CompileRegisterToMemory implements the same method as documented on asm.AssemblerBase.
func (a *AssemblerImpl) CompileRegisterToMemory(
	instruction asm.Instruction,
	sourceRegister, destinationBaseRegister asm.Register,
	destinationOffsetConst asm.ConstantValue,
) {
	n := a.newNode(instruction, operandTypesRegisterToMemory)
	n.srcReg = sourceRegister
	n.dstReg = destinationBaseRegister
	n.dstConst = destinationOffsetConst
}

// CompileJump implements the same method as documented on asm.AssemblerBase.
func (a *AssemblerImpl) CompileJump(jmpInstruction asm.Instruction) asm.Node {
	return a.newNode(jmpInstruction, operandTypesNoneToBranch)
}

// CompileJumpToRegister implements the same method as documented on asm.AssemblerBase.
func (a *AssemblerImpl) CompileJumpToRegister(jmpInstruction asm.Instruction, reg asm.Register) {
	n := a.newNode(jmpInstruction, operandTypesNoneToRegister)
	n.dstReg = reg
}

// CompileReadInstructionAddress implements the same method as documented on asm.AssemblerBase.
func (a *AssemblerImpl) CompileReadInstructionAddress(
	destinationRegister asm.Register,
	beforeAcquisitionTargetInstruction asm.Instruction,
) {
	n := a.newNode(ADR, operandTypesMemoryToRegister)
	n.dstReg = destinationRegister
	n.readInstructionAddressBeforeTargetInstruction = beforeAcquisitionTargetInstruction
}

// CompileMemoryWithRegisterOffsetToRegister implements Assembler.CompileMemoryWithRegisterOffsetToRegister
func (a *AssemblerImpl) CompileMemoryWithRegisterOffsetToRegister(
	instruction asm.Instruction,
	srcBaseReg, srcOffsetReg, dstReg asm.Register,
) {
	n := a.newNode(instruction, operandTypesMemoryToRegister)
	n.dstReg = dstReg
	n.srcReg = srcBaseReg
	n.srcReg2 = srcOffsetReg
}

// CompileRegisterToMemoryWithRegisterOffset implements Assembler.CompileRegisterToMemoryWithRegisterOffset
func (a *AssemblerImpl) CompileRegisterToMemoryWithRegisterOffset(
	instruction asm.Instruction,
	srcReg, dstBaseReg, dstOffsetReg asm.Register,
) {
	n := a.newNode(instruction, operandTypesRegisterToMemory)
	n.srcReg = srcReg
	n.dstReg = dstBaseReg
	n.dstReg2 = dstOffsetReg
}

// CompileTwoRegistersToRegister implements Assembler.CompileTwoRegistersToRegister
func (a *AssemblerImpl) CompileTwoRegistersToRegister(instruction asm.Instruction, src1, src2, dst asm.Register) {
	n := a.newNode(instruction, operandTypesTwoRegistersToRegister)
	n.srcReg = src1
	n.srcReg2 = src2
	n.dstReg = dst
}

// CompileThreeRegistersToRegister implements Assembler.CompileThreeRegistersToRegister
func (a *AssemblerImpl) CompileThreeRegistersToRegister(
	instruction asm.Instruction,
	src1, src2, src3, dst asm.Register,
) {
	n := a.newNode(instruction, operandTypesThreeRegistersToRegister)
	n.srcReg = src1
	n.srcReg2 = src2
	n.dstReg = src3 // To minimize the size of the nodeImpl struct, we reuse dstReg for the third source operand.
	n.dstReg2 = dst
}
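
// Note on the field reuse above: a hypothetical MSUB (dst = src2 - src1*src3)
// compiled via CompileThreeRegistersToRegister(MSUB, R0, R1, R2, R3) stores
// srcReg=R0, srcReg2=R1, dstReg=R2 (the third *source*), and dstReg2=R3
// (the real destination), matching how encodeThreeRegistersToRegister
// reads them back.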

// CompileTwoRegistersToNone implements Assembler.CompileTwoRegistersToNone
func (a *AssemblerImpl) CompileTwoRegistersToNone(instruction asm.Instruction, src1, src2 asm.Register) {
	n := a.newNode(instruction, operandTypesTwoRegistersToNone)
	n.srcReg = src1
	n.srcReg2 = src2
}

// CompileRegisterAndConstToNone implements Assembler.CompileRegisterAndConstToNone
func (a *AssemblerImpl) CompileRegisterAndConstToNone(
	instruction asm.Instruction,
	src asm.Register,
	srcConst asm.ConstantValue,
) {
	n := a.newNode(instruction, operandTypesRegisterAndConstToNone)
	n.srcReg = src
	n.srcConst = srcConst
}

// CompileRegisterAndConstToRegister implements Assembler.CompileRegisterAndConstToRegister
func (a *AssemblerImpl) CompileRegisterAndConstToRegister(
	instruction asm.Instruction,
	src asm.Register,
	srcConst asm.ConstantValue,
	dst asm.Register,
) {
	n := a.newNode(instruction, operandTypesRegisterAndConstToRegister)
	n.srcReg = src
	n.srcConst = srcConst
	n.dstReg = dst
}

// CompileLeftShiftedRegisterToRegister implements Assembler.CompileLeftShiftedRegisterToRegister
func (a *AssemblerImpl) CompileLeftShiftedRegisterToRegister(
	instruction asm.Instruction,
	shiftedSourceReg asm.Register,
	shiftNum asm.ConstantValue,
	srcReg, dstReg asm.Register,
) {
	n := a.newNode(instruction, operandTypesLeftShiftedRegisterToRegister)
	n.srcReg = srcReg
	n.srcReg2 = shiftedSourceReg
	n.srcConst = shiftNum
	n.dstReg = dstReg
}

// CompileConditionalRegisterSet implements Assembler.CompileConditionalRegisterSet
func (a *AssemblerImpl) CompileConditionalRegisterSet(cond asm.ConditionalRegisterState, dstReg asm.Register) {
	n := a.newNode(CSET, operandTypesRegisterToRegister)
	n.srcReg = conditionalRegisterStateToRegister(cond)
	n.dstReg = dstReg
}

// CompileMemoryToVectorRegister implements Assembler.CompileMemoryToVectorRegister
func (a *AssemblerImpl) CompileMemoryToVectorRegister(
	instruction asm.Instruction, srcBaseReg asm.Register, dstOffset asm.ConstantValue, dstReg asm.Register, arrangement VectorArrangement,
) {
	n := a.newNode(instruction, operandTypesMemoryToVectorRegister)
	n.srcReg = srcBaseReg
	n.srcConst = dstOffset
	n.dstReg = dstReg
	n.vectorArrangement = arrangement
}

// CompileMemoryWithRegisterOffsetToVectorRegister implements Assembler.CompileMemoryWithRegisterOffsetToVectorRegister
func (a *AssemblerImpl) CompileMemoryWithRegisterOffsetToVectorRegister(instruction asm.Instruction,
	srcBaseReg, srcOffsetRegister asm.Register, dstReg asm.Register, arrangement VectorArrangement,
) {
	n := a.newNode(instruction, operandTypesMemoryToVectorRegister)
	n.srcReg = srcBaseReg
	n.srcReg2 = srcOffsetRegister
	n.dstReg = dstReg
	n.vectorArrangement = arrangement
}

// CompileVectorRegisterToMemory implements Assembler.CompileVectorRegisterToMemory
func (a *AssemblerImpl) CompileVectorRegisterToMemory(
	instruction asm.Instruction, srcReg, dstBaseReg asm.Register, dstOffset asm.ConstantValue, arrangement VectorArrangement,
) {
	n := a.newNode(instruction, operandTypesVectorRegisterToMemory)
	n.srcReg = srcReg
	n.dstReg = dstBaseReg
	n.dstConst = dstOffset
	n.vectorArrangement = arrangement
}

// CompileVectorRegisterToMemoryWithRegisterOffset implements Assembler.CompileVectorRegisterToMemoryWithRegisterOffset
func (a *AssemblerImpl) CompileVectorRegisterToMemoryWithRegisterOffset(instruction asm.Instruction,
	srcReg, dstBaseReg, dstOffsetRegister asm.Register, arrangement VectorArrangement,
) {
	n := a.newNode(instruction, operandTypesVectorRegisterToMemory)
	n.srcReg = srcReg
	n.dstReg = dstBaseReg
	n.dstReg2 = dstOffsetRegister
	n.vectorArrangement = arrangement
}

// CompileRegisterToVectorRegister implements Assembler.CompileRegisterToVectorRegister
func (a *AssemblerImpl) CompileRegisterToVectorRegister(
	instruction asm.Instruction, srcReg, dstReg asm.Register, arrangement VectorArrangement, index VectorIndex,
) {
	n := a.newNode(instruction, operandTypesRegisterToVectorRegister)
	n.srcReg = srcReg
	n.dstReg = dstReg
	n.vectorArrangement = arrangement
	n.dstVectorIndex = index
}

// CompileVectorRegisterToRegister implements Assembler.CompileVectorRegisterToRegister
func (a *AssemblerImpl) CompileVectorRegisterToRegister(instruction asm.Instruction, srcReg, dstReg asm.Register,
	arrangement VectorArrangement, index VectorIndex,
) {
	n := a.newNode(instruction, operandTypesVectorRegisterToRegister)
	n.srcReg = srcReg
	n.dstReg = dstReg
	n.vectorArrangement = arrangement
	n.srcVectorIndex = index
}

// CompileVectorRegisterToVectorRegister implements Assembler.CompileVectorRegisterToVectorRegister
func (a *AssemblerImpl) CompileVectorRegisterToVectorRegister(
	instruction asm.Instruction, srcReg, dstReg asm.Register, arrangement VectorArrangement, srcIndex, dstIndex VectorIndex,
) {
	n := a.newNode(instruction, operandTypesVectorRegisterToVectorRegister)
	n.srcReg = srcReg
	n.dstReg = dstReg
	n.vectorArrangement = arrangement
	n.srcVectorIndex = srcIndex
	n.dstVectorIndex = dstIndex
}

// CompileVectorRegisterToVectorRegisterWithConst implements Assembler.CompileVectorRegisterToVectorRegisterWithConst
func (a *AssemblerImpl) CompileVectorRegisterToVectorRegisterWithConst(instruction asm.Instruction,
	srcReg, dstReg asm.Register, arrangement VectorArrangement, c asm.ConstantValue,
) {
	n := a.newNode(instruction, operandTypesVectorRegisterToVectorRegister)
	n.srcReg = srcReg
	n.srcConst = c
	n.dstReg = dstReg
	n.vectorArrangement = arrangement
}

// CompileStaticConstToRegister implements Assembler.CompileStaticConstToRegister
func (a *AssemblerImpl) CompileStaticConstToRegister(instruction asm.Instruction, c *asm.StaticConst, dstReg asm.Register) {
	n := a.newNode(instruction, operandTypesMemoryToRegister)
	n.staticConst = c
	n.dstReg = dstReg
}

// CompileStaticConstToVectorRegister implements Assembler.CompileStaticConstToVectorRegister
func (a *AssemblerImpl) CompileStaticConstToVectorRegister(instruction asm.Instruction,
	c *asm.StaticConst, dstReg asm.Register, arrangement VectorArrangement,
) {
	n := a.newNode(instruction, operandTypesStaticConstToVectorRegister)
	n.staticConst = c
	n.dstReg = dstReg
	n.vectorArrangement = arrangement
}
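
// Hypothetical sketch of feeding a 128-bit constant to a vector register;
// the instruction and arrangement names here are assumptions for
// illustration, not taken from this file:
//
//	c := asm.NewStaticConst(make([]byte, 16)) // 16-byte constant payload
//	a.CompileStaticConstToVectorRegister(VMOV, c, RegV0, VectorArrangement2D)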

// CompileTwoVectorRegistersToVectorRegister implements Assembler.CompileTwoVectorRegistersToVectorRegister.
func (a *AssemblerImpl) CompileTwoVectorRegistersToVectorRegister(instruction asm.Instruction, srcReg, srcReg2, dstReg asm.Register,
	arrangement VectorArrangement,
) {
	n := a.newNode(instruction, operandTypesTwoVectorRegistersToVectorRegister)
	n.srcReg = srcReg
	n.srcReg2 = srcReg2
	n.dstReg = dstReg
	n.vectorArrangement = arrangement
}

// CompileTwoVectorRegistersToVectorRegisterWithConst implements Assembler.CompileTwoVectorRegistersToVectorRegisterWithConst.
func (a *AssemblerImpl) CompileTwoVectorRegistersToVectorRegisterWithConst(instruction asm.Instruction,
	srcReg, srcReg2, dstReg asm.Register, arrangement VectorArrangement, c asm.ConstantValue,
) {
	n := a.newNode(instruction, operandTypesTwoVectorRegistersToVectorRegister)
	n.srcReg = srcReg
	n.srcReg2 = srcReg2
	n.srcConst = c
	n.dstReg = dstReg
	n.vectorArrangement = arrangement
}

func errorEncodingUnsupported(n *nodeImpl) error {
	return fmt.Errorf("%s is unsupported for %s type", InstructionName(n.instruction), n.types)
}

func (a *AssemblerImpl) encodeNoneToNone(buf asm.Buffer, n *nodeImpl) error {
	switch n.instruction {
	case UDF:
		buf.Append4Bytes(0, 0, 0, 0)
		return nil
	case NOP:
		return nil
	default:
		return errorEncodingUnsupported(n)
	}
}

func (a *AssemblerImpl) encodeJumpToRegister(buf asm.Buffer, n *nodeImpl) error {
	// "Unconditional branch (register)" in https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Branches--Exception-Generating-and-System-instructions
	var opc byte
	switch n.instruction {
	case RET:
		opc = 0b0010
	case B:
		opc = 0b0000
	default:
		return errorEncodingUnsupported(n)
	}

	regBits, err := intRegisterBits(n.dstReg)
	if err != nil {
		return fmt.Errorf("invalid destination register: %w", err)
	}

	buf.Append4Bytes(
		0x00|(regBits<<5),
		0x00|(regBits>>3),
		0b000_11111|(opc<<5),
		0b1101011_0|(opc>>3),
	)
	return err
}
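
// Worked example for the unconditional-branch relocation below (hypothetical
// offsets): a B placed at binary offset 8 targeting offset 32 gets
// imm26 = (32-8)/4 = 6, producing the bytes 0x06 0x00 0x00 0x14
// (0x14000006 in little-endian order), per the B encoding referenced in
// relativeBranchFinalize.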

func (a *AssemblerImpl) relativeBranchFinalize(code []byte, n *nodeImpl) error {
	var condBits byte
	const condBitsUnconditional = 0xff // Indicates that this is not a conditional jump.

	// https://developer.arm.com/documentation/den0024/a/CHDEEABE
	switch n.instruction {
	case B:
		condBits = condBitsUnconditional
	case BCONDEQ:
		condBits = 0b0000
	case BCONDGE:
		condBits = 0b1010
	case BCONDGT:
		condBits = 0b1100
	case BCONDHI:
		condBits = 0b1000
	case BCONDHS:
		condBits = 0b0010
	case BCONDLE:
		condBits = 0b1101
	case BCONDLO:
		condBits = 0b0011
	case BCONDLS:
		condBits = 0b1001
	case BCONDLT:
		condBits = 0b1011
	case BCONDMI:
		condBits = 0b0100
	case BCONDPL:
		condBits = 0b0101
	case BCONDNE:
		condBits = 0b0001
	case BCONDVS:
		condBits = 0b0110
	case BCONDVC:
		condBits = 0b0111
	}

	branchInstOffset := int64(n.OffsetInBinary())
	offset := int64(n.jumpTarget.OffsetInBinary()) - branchInstOffset
	if offset%4 != 0 {
		return errors.New("BUG: relative jump offset must be 4 bytes aligned")
	}

	branchInst := code[branchInstOffset : branchInstOffset+4]
	if condBits == condBitsUnconditional {
		imm26 := offset >> 2 // divide by 4.
		if imm26 < minSignedInt26 || imm26 > maxSignedInt26 {
			// In theory this could happen if a Wasm binary has a single huge block (more than 128MB),
			// in which case we would load the offset into a register and do a register jump. To avoid
			// that complexity, we impose this limit for now, as it is unlikely to happen in practice.
			return fmt.Errorf("relative jump offset %d/4 must be within %d and %d", offset, minSignedInt26, maxSignedInt26)
		}
		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B--Branch-?lang=en
		branchInst[0] = byte(imm26)
		branchInst[1] = byte(imm26 >> 8)
		branchInst[2] = byte(imm26 >> 16)
		branchInst[3] = (byte(imm26 >> 24 & 0b000000_11)) | 0b000101_00
	} else {
		imm19 := offset >> 2 // divide by 4.
		if imm19 < minSignedInt19 || imm19 > maxSignedInt19 {
			// This would be a bug in our compiler, as conditional jumps are only used over small
			// offsets (~a few bytes); if this ever happens, the compiler can be fixed.
			return fmt.Errorf("BUG: relative jump offset %d/4(=%d) must be within %d and %d", offset, imm19, minSignedInt19, maxSignedInt19)
		}
		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/B-cond--Branch-conditionally-?lang=en
		branchInst[0] = (byte(imm19<<5) & 0b111_0_0000) | condBits
		branchInst[1] = byte(imm19 >> 3)
		branchInst[2] = byte(imm19 >> 11)
		branchInst[3] = 0b01010100
	}
	return nil
}

func (a *AssemblerImpl) encodeRelativeBranch(buf asm.Buffer, n *nodeImpl) error {
	switch n.instruction {
	case B, BCONDEQ, BCONDGE, BCONDGT, BCONDHI, BCONDHS, BCONDLE, BCONDLO, BCONDLS, BCONDLT, BCONDMI, BCONDNE, BCONDVS, BCONDVC, BCONDPL:
	default:
		return errorEncodingUnsupported(n)
	}

	if n.jumpTarget == nil {
		return fmt.Errorf("branch target must be set for %s", InstructionName(n.instruction))
	}

	// At this point, we don't yet know the branch target's offset, so we emit a 4-byte
	// placeholder and patch it in relativeBranchFinalize once all offsets are known.
	buf.Append4Bytes(0, 0, 0, 0)
	a.relativeJumpNodes = append(a.relativeJumpNodes, n)
	return nil
}
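
// Hypothetical sketch of the two-pass branch resolution driven by the
// placeholder above:
//
//	br := a.CompileJump(B)              // placeholder emitted during Assemble
//	target := a.CompileStandAlone(NOP)  // some later node
//	br.AssignJumpTarget(target)         // patched by relativeBranchFinalize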

func checkRegisterToRegisterType(src, dst asm.Register, requireSrcInt, requireDstInt bool) (err error) {
	isSrcInt, isDstInt := isIntRegister(src), isIntRegister(dst)
	if isSrcInt && !requireSrcInt {
		err = fmt.Errorf("src requires float register but got %s", RegisterName(src))
	} else if !isSrcInt && requireSrcInt {
		err = fmt.Errorf("src requires int register but got %s", RegisterName(src))
	} else if isDstInt && !requireDstInt {
		err = fmt.Errorf("dst requires float register but got %s", RegisterName(dst))
	} else if !isDstInt && requireDstInt {
		err = fmt.Errorf("dst requires int register but got %s", RegisterName(dst))
	}
	return
}

func (a *AssemblerImpl) encodeRegisterToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	switch inst := n.instruction; inst {
	case ADD, ADDW, SUB:
		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
			return
		}

		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_shift
		var sfops byte
		switch inst {
		case ADD:
			sfops = 0b100
		case ADDW:
		case SUB:
			sfops = 0b110
		}

		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
		buf.Append4Bytes(
			(dstRegBits<<5)|dstRegBits,
			dstRegBits>>3,
			srcRegBits,
			(sfops<<5)|0b01011,
		)
	case CLZ, CLZW, RBIT, RBITW:
		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
			return
		}

		var sf, opcode byte
		switch inst {
		case CLZ:
			// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CLZ--Count-Leading-Zeros-?lang=en
			sf, opcode = 0b1, 0b000_100
		case CLZW:
			// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CLZ--Count-Leading-Zeros-?lang=en
			sf, opcode = 0b0, 0b000_100
		case RBIT:
			// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/RBIT--Reverse-Bits-?lang=en
			sf, opcode = 0b1, 0b000_000
		case RBITW:
			// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/RBIT--Reverse-Bits-?lang=en
			sf, opcode = 0b0, 0b000_000
		}

		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
		buf.Append4Bytes(
			(srcRegBits<<5)|dstRegBits,
			opcode<<2|(srcRegBits>>3),
			0b110_00000,
			(sf<<7)|0b0_1011010,
		)
	case CSET:
		if !isConditionalRegister(n.srcReg) {
			return fmt.Errorf("CSET requires conditional register but got %s", RegisterName(n.srcReg))
		}

		dstRegBits, err := intRegisterBits(n.dstReg)
		if err != nil {
			return err
		}

		// CSET encodes the conditional bits with its least significant bit inverted.
		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CSET--Conditional-Set--an-alias-of-CSINC-?lang=en
		//
		// https://developer.arm.com/documentation/den0024/a/CHDEEABE
		var conditionalBits byte
		switch n.srcReg {
		case RegCondEQ:
			conditionalBits = 0b0001
		case RegCondNE:
			conditionalBits = 0b0000
		case RegCondHS:
			conditionalBits = 0b0011
		case RegCondLO:
			conditionalBits = 0b0010
		case RegCondMI:
			conditionalBits = 0b0101
		case RegCondPL:
			conditionalBits = 0b0100
		case RegCondVS:
			conditionalBits = 0b0111
		case RegCondVC:
			conditionalBits = 0b0110
		case RegCondHI:
			conditionalBits = 0b1001
		case RegCondLS:
			conditionalBits = 0b1000
		case RegCondGE:
			conditionalBits = 0b1011
		case RegCondLT:
			conditionalBits = 0b1010
		case RegCondGT:
			conditionalBits = 0b1101
		case RegCondLE:
			conditionalBits = 0b1100
		case RegCondAL:
			conditionalBits = 0b1111
		case RegCondNV:
			conditionalBits = 0b1110
		}

		// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CSET--Conditional-Set--an-alias-of-CSINC-?lang=en
		buf.Append4Bytes(
			0b111_00000|dstRegBits,
			(conditionalBits<<4)|0b0000_0111,
			0b100_11111,
			0b10011010,
		)

	case FABSD, FABSS, FNEGD, FNEGS, FSQRTD, FSQRTS, FCVTSD, FCVTDS, FRINTMD, FRINTMS,
		FRINTND, FRINTNS, FRINTPD, FRINTPS, FRINTZD, FRINTZS:
		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, false, false); err != nil {
			return
		}

		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)

		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#floatdp1
		var tp, opcode byte
		switch inst {
		case FABSD:
			opcode, tp = 0b000001, 0b01
		case FABSS:
			opcode, tp = 0b000001, 0b00
		case FNEGD:
			opcode, tp = 0b000010, 0b01
		case FNEGS:
			opcode, tp = 0b000010, 0b00
		case FSQRTD:
			opcode, tp = 0b000011, 0b01
		case FSQRTS:
			opcode, tp = 0b000011, 0b00
		case FCVTSD:
			opcode, tp = 0b000101, 0b00
		case FCVTDS:
			opcode, tp = 0b000100, 0b01
		case FRINTMD:
			opcode, tp = 0b001010, 0b01
		case FRINTMS:
			opcode, tp = 0b001010, 0b00
		case FRINTND:
			opcode, tp = 0b001000, 0b01
		case FRINTNS:
			opcode, tp = 0b001000, 0b00
		case FRINTPD:
			opcode, tp = 0b001001, 0b01
		case FRINTPS:
			opcode, tp = 0b001001, 0b00
		case FRINTZD:
			opcode, tp = 0b001011, 0b01
		case FRINTZS:
			opcode, tp = 0b001011, 0b00
		}
		buf.Append4Bytes(
			(srcRegBits<<5)|dstRegBits,
			(opcode<<7)|0b0_10000_00|(srcRegBits>>3),
			tp<<6|0b00_1_00000|opcode>>1,
			0b0_00_11110,
		)

	case FADDD, FADDS, FDIVS, FDIVD, FMAXD, FMAXS, FMIND, FMINS, FMULS, FMULD:
		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, false, false); err != nil {
			return
		}

		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)

		// "Floating-point data-processing (2 source)" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#floatdp1
		var tp, opcode byte
		switch inst {
		case FADDD:
			opcode, tp = 0b0010, 0b01
		case FADDS:
			opcode, tp = 0b0010, 0b00
		case FDIVD:
			opcode, tp = 0b0001, 0b01
		case FDIVS:
			opcode, tp = 0b0001, 0b00
		case FMAXD:
			opcode, tp = 0b0100, 0b01
		case FMAXS:
			opcode, tp = 0b0100, 0b00
		case FMIND:
			opcode, tp = 0b0101, 0b01
		case FMINS:
			opcode, tp = 0b0101, 0b00
		case FMULS:
			opcode, tp = 0b0000, 0b00
		case FMULD:
			opcode, tp = 0b0000, 0b01
		}

		buf.Append4Bytes(
			(dstRegBits<<5)|dstRegBits,
			opcode<<4|0b0000_10_00|(dstRegBits>>3),
			tp<<6|0b00_1_00000|srcRegBits,
			0b0001_1110,
		)

	case FCVTZSD, FCVTZSDW, FCVTZSS, FCVTZSSW, FCVTZUD, FCVTZUDW, FCVTZUS, FCVTZUSW:
		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, false, true); err != nil {
			return
		}

		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)

		// "Conversion between floating-point and integer" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#floatdp1
		var sf, tp, opcode byte
		switch inst {
		case FCVTZSD: // Double to signed 64-bit.
			sf, tp, opcode = 0b1, 0b01, 0b000
		case FCVTZSDW: // Double to signed 32-bit.
			sf, tp, opcode = 0b0, 0b01, 0b000
		case FCVTZSS: // Single to signed 64-bit.
			sf, tp, opcode = 0b1, 0b00, 0b000
		case FCVTZSSW: // Single to signed 32-bit.
			sf, tp, opcode = 0b0, 0b00, 0b000
		case FCVTZUD: // Double to unsigned 64-bit.
			sf, tp, opcode = 0b1, 0b01, 0b001
		case FCVTZUDW: // Double to unsigned 32-bit.
			sf, tp, opcode = 0b0, 0b01, 0b001
		case FCVTZUS: // Single to unsigned 64-bit.
			sf, tp, opcode = 0b1, 0b00, 0b001
		case FCVTZUSW: // Single to unsigned 32-bit.
			sf, tp, opcode = 0b0, 0b00, 0b001
		}

		buf.Append4Bytes(
			(srcRegBits<<5)|dstRegBits,
			0|(srcRegBits>>3),
			tp<<6|0b00_1_11_000|opcode,
			sf<<7|0b0_0_0_11110,
		)

	case FMOVD, FMOVS:
		isSrcInt, isDstInt := isIntRegister(n.srcReg), isIntRegister(n.dstReg)
		if isSrcInt && isDstInt {
			return errors.New("FMOV requires at least one of the operands to be a float register")
		}

		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
		// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FMOV--register---Floating-point-Move-register-without-conversion-?lang=en
		if !isSrcInt && !isDstInt { // Float to float.
			var tp byte
			if inst == FMOVD {
				tp = 0b01
			}
			buf.Append4Bytes(
				(srcRegBits<<5)|dstRegBits,
				0b0_10000_00|(srcRegBits>>3),
				tp<<6|0b00_1_00000,
				0b000_11110,
			)
		} else if isSrcInt && !isDstInt { // Int to float.
			var tp, sf byte
			if inst == FMOVD {
				tp, sf = 0b01, 0b1
			}
			buf.Append4Bytes(
				(srcRegBits<<5)|dstRegBits,
				srcRegBits>>3,
				tp<<6|0b00_1_00_111,
				sf<<7|0b0_00_11110,
			)
		} else { // Float to int.
			var tp, sf byte
			if inst == FMOVD {
				tp, sf = 0b01, 0b1
			}
			buf.Append4Bytes(
				(srcRegBits<<5)|dstRegBits,
				srcRegBits>>3,
				tp<<6|0b00_1_00_110,
				sf<<7|0b0_00_11110,
			)
		}

	case MOVD, MOVW:
		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
			return
		}
		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)

		if n.srcReg == RegSP || n.dstReg == RegSP {
			// Moving between a general-purpose register and the stack pointer.
			// https://developer.arm.com/documentation/ddi0602/2021-12/Base-Instructions/MOV--to-from-SP---Move-between-register-and-stack-pointer--an-alias-of-ADD--immediate--
			buf.Append4Bytes(
				(srcRegBits<<5)|dstRegBits,
				srcRegBits>>3,
				0x0,
				0b1001_0001,
			)
			return
		}

		if n.srcReg == RegRZR && inst == MOVD {
			// If this is a 64-bit mov from the zero register, then we encode this as MOVK.
			// See "Move wide (immediate)" in
			// https://developer.arm.com/documentation/ddi0602/2021-06/Index-by-Encoding/Data-Processing----Immediate
			buf.Append4Bytes(
				dstRegBits,
				0x0,
				0b1000_0000,
				0b1_10_10010,
			)
		} else {
			// MOV can be encoded as ORR (shifted register): "ORR Wd, WZR, Wm".
			// https://developer.arm.com/documentation/100069/0609/A64-General-Instructions/MOV--register-
			var sf byte
			if inst == MOVD {
				sf = 0b1
			}
			buf.Append4Bytes(
				(zeroRegisterBits<<5)|dstRegBits,
				zeroRegisterBits>>3,
				0b000_00000|srcRegBits,
				sf<<7|0b0_01_01010,
			)
		}

	case MRS:
		if n.srcReg != RegFPSR {
			return fmt.Errorf("MRS only supports the FPSR register as the source but got %s", RegisterName(n.srcReg))
		}

		// For how to specify the FPSR register, see "Accessing FPSR" in:
		// https://developer.arm.com/documentation/ddi0595/2021-12/AArch64-Registers/FPSR--Floating-point-Status-Register?lang=en
		dstRegBits := registerBits(n.dstReg)
		buf.Append4Bytes(
			0b001<<5|dstRegBits,
			0b0100<<4|0b0100,
			0b0011_0000|0b11<<3|0b011,
			0b1101_0101,
		)

	case MSR:
		if n.dstReg != RegFPSR {
			return fmt.Errorf("MSR only supports the FPSR register as the destination but got %s", RegisterName(n.dstReg))
		}

		// For how to specify the FPSR register, see "Accessing FPSR" in:
		// https://developer.arm.com/documentation/ddi0595/2021-12/AArch64-Registers/FPSR--Floating-point-Status-Register?lang=en
		srcRegBits := registerBits(n.srcReg)
		buf.Append4Bytes(
			0b001<<5|srcRegBits,
			0b0100<<4|0b0100,
			0b0001_0000|0b11<<3|0b011,
			0b1101_0101,
		)
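
	// Worked example for the MUL case below (hypothetical operands):
	// CompileRegisterToRegister(MUL, R0, R1) computes R1 = R0*R1, emitted
	// as the MADD form "R1 = XZR + R0*R1", i.e. Rd=Rn=R1, Rm=R0, Ra=XZR.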

	case MUL, MULW:
		// Multiplications are encoded as MADD with the zero register as the addend:
		// dst = zero + (src * dst) = src * dst.
		// See "Data-processing (3 source)" in
		// https://developer.arm.com/documentation/ddi0602/2021-06/Index-by-Encoding/Data-Processing----Register?lang=en
		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
			return
		}

		var sf byte
		if inst == MUL {
			sf = 0b1
		}

		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)

		buf.Append4Bytes(
			dstRegBits<<5|dstRegBits,
			zeroRegisterBits<<2|dstRegBits>>3,
			srcRegBits,
			sf<<7|0b11011,
		)

	case NEG, NEGW:
		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)

		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
			return
		}

		// NEG is encoded as "SUB dst, XZR, src" = "dst = 0 - src"
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_shift
		var sf byte
		if inst == NEG {
			sf = 0b1
		}

		buf.Append4Bytes(
			(zeroRegisterBits<<5)|dstRegBits,
			zeroRegisterBits>>3,
			srcRegBits,
			sf<<7|0b0_10_00000|0b0_00_01011,
		)

	case SDIV, SDIVW, UDIV, UDIVW:
		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)

		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
			return
		}

		// See "Data-processing (2 source)" in
		// https://developer.arm.com/documentation/ddi0602/2021-06/Index-by-Encoding/Data-Processing----Register?lang=en
		var sf, opcode byte
		switch inst {
		case SDIV:
			sf, opcode = 0b1, 0b000011
		case SDIVW:
			sf, opcode = 0b0, 0b000011
		case UDIV:
			sf, opcode = 0b1, 0b000010
		case UDIVW:
			sf, opcode = 0b0, 0b000010
		}

		buf.Append4Bytes(
			(dstRegBits<<5)|dstRegBits,
			opcode<<2|(dstRegBits>>3),
			0b110_00000|srcRegBits,
			sf<<7|0b0_00_11010,
		)

	case SCVTFD, SCVTFWD, SCVTFS, SCVTFWS, UCVTFD, UCVTFS, UCVTFWD, UCVTFWS:
		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)

		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, false); err != nil {
			return
		}

		// "Conversion between floating-point and integer" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#floatdp1
		var sf, tp, opcode byte
		switch inst {
		case SCVTFD: // 64-bit signed integer to double.
			sf, tp, opcode = 0b1, 0b01, 0b010
		case SCVTFWD: // 32-bit signed integer to double.
			sf, tp, opcode = 0b0, 0b01, 0b010
		case SCVTFS: // 64-bit signed integer to single.
			sf, tp, opcode = 0b1, 0b00, 0b010
		case SCVTFWS: // 32-bit signed integer to single.
			sf, tp, opcode = 0b0, 0b00, 0b010
		case UCVTFD: // 64-bit unsigned integer to double.
			sf, tp, opcode = 0b1, 0b01, 0b011
		case UCVTFWD: // 32-bit unsigned integer to double.
			sf, tp, opcode = 0b0, 0b01, 0b011
		case UCVTFS: // 64-bit unsigned integer to single.
			sf, tp, opcode = 0b1, 0b00, 0b011
		case UCVTFWS: // 32-bit unsigned integer to single.
			sf, tp, opcode = 0b0, 0b00, 0b011
		}

		buf.Append4Bytes(
			(srcRegBits<<5)|dstRegBits,
			srcRegBits>>3,
			tp<<6|0b00_1_00_000|opcode,
			sf<<7|0b0_0_0_11110,
		)

	case SXTB, SXTBW, SXTH, SXTHW, SXTW:
		if err = checkRegisterToRegisterType(n.srcReg, n.dstReg, true, true); err != nil {
			return
		}

		srcRegBits, dstRegBits := registerBits(n.srcReg), registerBits(n.dstReg)
		if n.srcReg == RegRZR {
			// If the source is the zero register, the result is zero regardless of the
			// extension width, so we encode this as a 32-bit MOV dst, zero.
			buf.Append4Bytes(
				(zeroRegisterBits<<5)|dstRegBits,
				zeroRegisterBits>>3,
				0b000_00000|srcRegBits,
				0b0_01_01010,
			)
			return
		}

		// SXTB is encoded as "SBFM Wd, Wn, #0, #7"
		// https://developer.arm.com/documentation/dui0801/g/A64-General-Instructions/SXTB
		// SXTH is encoded as "SBFM Wd, Wn, #0, #15"
		// https://developer.arm.com/documentation/dui0801/g/A64-General-Instructions/SXTH
		// SXTW is encoded as "SBFM Xd, Xn, #0, #31"
		// https://developer.arm.com/documentation/dui0802/b/A64-General-Instructions/SXTW

		var nBit, sf, imms, opc byte // nBit is SBFM's N field; renamed to avoid shadowing the node n.
		switch inst {
		case SXTB:
			nBit, sf, imms = 0b1, 0b1, 0x7
		case SXTBW:
			nBit, sf, imms = 0b0, 0b0, 0x7
		case SXTH:
			nBit, sf, imms = 0b1, 0b1, 0xf
		case SXTHW:
			nBit, sf, imms = 0b0, 0b0, 0xf
		case SXTW:
			nBit, sf, imms = 0b1, 0b1, 0x1f
		}

		buf.Append4Bytes(
			(srcRegBits<<5)|dstRegBits,
			imms<<2|(srcRegBits>>3),
			nBit<<6,
			sf<<7|opc<<5|0b10011,
		)
	default:
		return errorEncodingUnsupported(n)
	}
	return
}

func (a *AssemblerImpl) encodeLeftShiftedRegisterToRegister(buf asm.Buffer, n *nodeImpl) error {
	baseRegBits, err := intRegisterBits(n.srcReg)
	if err != nil {
		return err
	}
	shiftTargetRegBits, err := intRegisterBits(n.srcReg2)
	if err != nil {
		return err
	}
	dstRegBits, err := intRegisterBits(n.dstReg)
	if err != nil {
		return err
	}

	switch n.instruction {
	case ADD:
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_shift
		const logicalLeftShiftBits = 0b00
		if n.srcConst < 0 || n.srcConst > 64 {
			return fmt.Errorf("shift amount must fit in unsigned 6-bit integer (0-64) but got %d", n.srcConst)
		}
		shiftByte := byte(n.srcConst)
		buf.Append4Bytes(
			(baseRegBits<<5)|dstRegBits,
			(shiftByte<<2)|(baseRegBits>>3),
			(logicalLeftShiftBits<<6)|shiftTargetRegBits,
			0b1000_1011,
		)
		return err
	default:
		return errorEncodingUnsupported(n)
	}
}

func (a *AssemblerImpl) encodeTwoRegistersToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	switch inst := n.instruction; inst {
	case AND, ANDW, ORR, ORRW, EOR, EORW:
		// See "Logical (shifted register)" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en
		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)
		var sf, opc byte
		switch inst {
		case AND:
			sf, opc = 0b1, 0b00
		case ANDW:
			sf, opc = 0b0, 0b00
		case ORR:
			sf, opc = 0b1, 0b01
		case ORRW:
			sf, opc = 0b0, 0b01
		case EOR:
			sf, opc = 0b1, 0b10
		case EORW:
			sf, opc = 0b0, 0b10
		}
		buf.Append4Bytes(
			(srcReg2Bits<<5)|dstRegBits,
			srcReg2Bits>>3,
			srcRegBits,
			sf<<7|opc<<5|0b01010,
		)
	case ASR, ASRW, LSL, LSLW, LSR, LSRW, ROR, RORW:
		// See "Data-processing (2 source)" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en
		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)

		var sf, opcode byte
		switch inst {
		case ASR:
			sf, opcode = 0b1, 0b001010
		case ASRW:
			sf, opcode = 0b0, 0b001010
		case LSL:
			sf, opcode = 0b1, 0b001000
		case LSLW:
			sf, opcode = 0b0, 0b001000
		case LSR:
			sf, opcode = 0b1, 0b001001
		case LSRW:
			sf, opcode = 0b0, 0b001001
		case ROR:
			sf, opcode = 0b1, 0b001011
		case RORW:
			sf, opcode = 0b0, 0b001011
		}
		buf.Append4Bytes(
			(srcReg2Bits<<5)|dstRegBits,
			opcode<<2|(srcReg2Bits>>3),
			0b110_00000|srcRegBits,
			sf<<7|0b0_00_11010,
		)
	case SDIV, SDIVW, UDIV, UDIVW:
		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)

		// See "Data-processing (2 source)" in
		// https://developer.arm.com/documentation/ddi0602/2021-06/Index-by-Encoding/Data-Processing----Register?lang=en
		var sf, opcode byte
		switch inst {
		case SDIV:
			sf, opcode = 0b1, 0b000011
		case SDIVW:
			sf, opcode = 0b0, 0b000011
		case UDIV:
			sf, opcode = 0b1, 0b000010
		case UDIVW:
			sf, opcode = 0b0, 0b000010
		}

		buf.Append4Bytes(
			(srcReg2Bits<<5)|dstRegBits,
			opcode<<2|(srcReg2Bits>>3),
			0b110_00000|srcRegBits,
			sf<<7|0b0_00_11010,
		)
	case SUB, SUBW:
		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)

		// See "Add/subtract (shifted register)" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en
		var sf byte
		if inst == SUB {
			sf = 0b1
		}

		buf.Append4Bytes(
			(srcReg2Bits<<5)|dstRegBits,
			srcReg2Bits>>3,
			srcRegBits,
			sf<<7|0b0_10_01011,
		)
	case FSUBD, FSUBS:
		srcRegBits, srcReg2Bits, dstRegBits := registerBits(n.srcReg), registerBits(n.srcReg2), registerBits(n.dstReg)

		// See "Floating-point data-processing (2 source)" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
		var tp byte
		if inst == FSUBD {
			tp = 0b01
		}
		buf.Append4Bytes(
			(srcReg2Bits<<5)|dstRegBits,
			0b0011_10_00|(srcReg2Bits>>3),
			tp<<6|0b00_1_00000|srcRegBits,
			0b0_00_11110,
		)
	default:
		return errorEncodingUnsupported(n)
	}
	return
}

func (a *AssemblerImpl) encodeThreeRegistersToRegister(buf asm.Buffer, n *nodeImpl) error {
	switch n.instruction {
	case MSUB, MSUBW:
		// Dst = Src2 - (Src1 * Src3)
		// "Data-processing (3 source)" in:
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en
		src1RegBits, err := intRegisterBits(n.srcReg)
		if err != nil {
			return err
		}
		src2RegBits, err := intRegisterBits(n.srcReg2)
		if err != nil {
			return err
		}
		src3RegBits, err := intRegisterBits(n.dstReg)
		if err != nil {
			return err
		}
		dstRegBits, err := intRegisterBits(n.dstReg2)
		if err != nil {
			return err
		}

		var sf byte // sf is zero for MSUBW (32-bit MSUB).
		if n.instruction == MSUB {
			sf = 0b1
		}

		buf.Append4Bytes(
			(src3RegBits<<5)|dstRegBits,
			0b1_0000000|(src2RegBits<<2)|(src3RegBits>>3),
			src1RegBits,
			sf<<7|0b00_11011,
		)
		return nil
	default:
		return errorEncodingUnsupported(n)
	}
}

func (a *AssemblerImpl) encodeTwoRegistersToNone(buf asm.Buffer, n *nodeImpl) error {
	switch n.instruction {
	case CMPW, CMP:
		// A register-register compare is an alias of "SUBS src1, src2 -> zero register",
		// which can be encoded as SUBS (shifted register) with zero shift.
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Register?lang=en#addsub_shift
		src1RegBits, err := intRegisterBits(n.srcReg)
		if err != nil {
			return err
		}
		src2RegBits, err := intRegisterBits(n.srcReg2)
		if err != nil {
			return err
		}

		var op byte
		if n.instruction == CMP {
			op = 0b111
		} else {
			op = 0b011
		}

		buf.Append4Bytes(
			(src2RegBits<<5)|zeroRegisterBits,
			src2RegBits>>3,
			src1RegBits,
			0b01011|(op<<5),
		)
		return nil
	case FCMPS, FCMPD:
		// "Floating-point compare" section in:
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
		src1RegBits, err := vectorRegisterBits(n.srcReg)
		if err != nil {
			return err
		}
		src2RegBits, err := vectorRegisterBits(n.srcReg2)
		if err != nil {
			return err
		}

		var ftype byte // ftype is zero for FCMPS (single-precision float compare).
		if n.instruction == FCMPD {
			ftype = 0b01
		}
		buf.Append4Bytes(
			src2RegBits<<5,
			0b001000_00|(src2RegBits>>3),
			ftype<<6|0b1_00000|src1RegBits,
			0b000_11110,
		)
		return nil
	default:
		return errorEncodingUnsupported(n)
	}
}

func (a *AssemblerImpl) encodeRegisterAndConstToNone(buf asm.Buffer, n *nodeImpl) error {
	if n.instruction != CMP {
		return errorEncodingUnsupported(n)
	}

	// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/CMP--immediate---Compare--immediate---an-alias-of-SUBS--immediate--?lang=en
	if n.srcConst < 0 || n.srcConst > 4095 {
		return fmt.Errorf("immediate for CMP must fit in 0 to 4095 but got %d", n.srcConst)
	} else if n.srcReg == RegRZR {
		return errors.New("zero register is not supported for CMP (immediate)")
	}

	srcRegBits, err := intRegisterBits(n.srcReg)
	if err != nil {
		return err
	}

	buf.Append4Bytes(
		(srcRegBits<<5)|zeroRegisterBits,
		(byte(n.srcConst)<<2)|(srcRegBits>>3),
		byte(n.srcConst>>6),
		0b111_10001,
	)
	return nil
}

func fitInSigned9Bits(v int64) bool {
	return v >= -256 && v <= 255
}
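
// Worked examples for the offset forms handled below (hypothetical values):
// offset -16 fits the signed 9-bit "unscaled" form; offset 8 with an 8-byte
// datasize uses the scaled unsigned-immediate form (imm = 8/8 = 1); and
// offset 0x10008 exceeds the scaled range, so it takes the multi-instruction
// fallback in encodeLoadOrStoreWithConstOffset.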
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldst_regoff 1706 buf.Append4Bytes( 1707 (baseRegBits<<5)|targetRegBits, 1708 0b011_010_00|(baseRegBits>>3), 1709 opcode<<6|0b00_1_00000|offsetRegBits, 1710 size<<6|v<<2|0b00_111_0_00, 1711 ) 1712 } 1713 1714 // validateMemoryOffset validates whether the given offset can be encoded by this assembler. 1715 // In theory, the offset could be arbitrary, but for the simplicity of our homemade assembler, we limit the offset range 1716 // to what is sufficient to support the compiler. 1717 func validateMemoryOffset(offset int64) error { 1718 if offset > 255 && offset%4 != 0 { 1719 // This is because we only have large offsets for load/store with Wasm value stack or reading type IDs, and its offset 1720 // is always multiplied by 4 or 8 (== the size of uint32 or uint64 == the type of wasm.FunctionTypeID or value stack in Go) 1721 return fmt.Errorf("large memory offset (>255) must be a multiple of 4 but got %d", offset) 1722 } else if offset < -256 { // 9-bit signed integer's minimum = -2^8. 1723 return fmt.Errorf("negative memory offset must be larger than or equal to -256 but got %d", offset) 1724 } else if offset > 1<<31-1 { 1725 return fmt.Errorf("large memory offset must be less than %d but got %d", 1<<31-1, offset) 1726 } else { 1727 return nil 1728 } 1729 } 1730 1731 // encodeLoadOrStoreWithConstOffset encodes load/store instructions with the constant offset. 1732 // 1733 // Note: Encoding strategy intentionally matches the Go assembler: https://go.dev/doc/asm 1734 func (a *AssemblerImpl) encodeLoadOrStoreWithConstOffset( 1735 buf asm.Buffer, 1736 baseRegBits, targetRegBits byte, 1737 offset int64, 1738 opcode, size, v byte, 1739 datasize, datasizeLog2 int64, 1740 ) (err error) { 1741 if err = validateMemoryOffset(offset); err != nil { 1742 return 1743 } 1744 1745 if fitInSigned9Bits(offset) { 1746 // See "LDAPR/STLR (unscaled immediate)" 1747 // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldapstl_unscaled 1748 if offset < 0 || offset%datasize != 0 { 1749 // This case is encoded as one "unscaled signed load/store". 1750 buf.Append4Bytes( 1751 (baseRegBits<<5)|targetRegBits, 1752 byte(offset<<4)|(baseRegBits>>3), 1753 opcode<<6|(0b00_00_11111&byte(offset>>4)), 1754 size<<6|v<<2|0b00_1_11_0_00, 1755 ) 1756 return 1757 } 1758 } 1759 1760 // At this point we can assume that the offset is positive. 1761 // Plus, if it is a multiple of datasize, then it can be encoded as a single "unsigned immediate". 1762 if offset%datasize == 0 && 1763 offset < (1<<12)<<datasizeLog2 { 1764 m := offset / datasize 1765 buf.Append4Bytes( 1766 (baseRegBits<<5)|targetRegBits, 1767 (byte(m<<2))|(baseRegBits>>3), 1768 opcode<<6|0b00_111111&byte(m>>6), 1769 size<<6|v<<2|0b00_1_11_0_01, 1770 ) 1771 return 1772 } 1773 1774 // Otherwise, we need multiple instructions. 1775 tmpRegBits := registerBits(a.temporaryRegister) 1776 offset32 := int32(offset) 1777 1778 // Go's assembler adds a const into the const pool at this point, 1779 // regardless of its usage; e.g. if we enter the then block of the following if statement, 1780 // the const is not used but it is added into the const pool.
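// Illustrative example (the concrete numbers are ours, not from the references above): with offset = 0x12340 and datasize = 8 (datasizeLog2 = 3), the branch below computes hi = 0x10000, which fits in the 0xfff000 mask, and m = 0x468, so we emit "ADD tmp, base, #0x10, LSL #12" followed by the load/store with the scaled unsigned immediate 0x468.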
c := asm.NewStaticConst(make([]byte, 4)) 1782 binary.LittleEndian.PutUint32(c.Raw, uint32(offset)) 1783 a.pool.AddConst(c, uint64(buf.Len())) 1784 1785 // https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L3529-L3532 1786 // If the offset fits within 24 bits, we can encode it as one ADD plus the load/store with a scaled 12-bit immediate. 1787 hi := offset32 - (offset32 & (0xfff << uint(datasizeLog2))) 1788 if hi&^0xfff000 == 0 { 1789 var sfops byte = 0b100 1790 m := ((offset32 - hi) >> datasizeLog2) & 0xfff 1791 hi >>= 12 1792 1793 // https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L3534-L3535 1794 buf.Append4Bytes( 1795 (baseRegBits<<5)|tmpRegBits, 1796 (byte(hi)<<2)|(baseRegBits>>3), 1797 0b01<<6 /* shift by 12 */ |byte(hi>>6), 1798 sfops<<5|0b10001, 1799 ) 1800 1801 buf.Append4Bytes( 1802 (tmpRegBits<<5)|targetRegBits, 1803 (byte(m<<2))|(tmpRegBits>>3), 1804 opcode<<6|0b00_111111&byte(m>>6), 1805 size<<6|v<<2|0b00_1_11_0_01, 1806 ) 1807 } else { 1808 // In this case we load the const via ldr(literal) into the temporary register, 1809 // and the const itself is placed later in the binary (in the constant pool). 1810 loadLiteralOffsetInBinary := uint64(buf.Len()) 1811 1812 // First we emit the ldr(literal) with offset zero, as we don't yet know the const's placement in the binary. 1813 // https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDR--literal---Load-Register--literal-- 1814 buf.Append4Bytes(tmpRegBits, 0x0, 0x0, 0b00_011_0_00) 1815 1816 // Set the callback for the constant so that the offset is fixed up properly once the const's position is finalized. 1817 1818 c.AddOffsetFinalizedCallback(func(offsetOfConst uint64) { 1819 // ldr(literal) encodes the offset divided by 4. 1820 offset := (int(offsetOfConst) - int(loadLiteralOffsetInBinary)) / 4 1821 bin := buf.Bytes() 1822 bin[loadLiteralOffsetInBinary] |= byte(offset << 5) 1823 bin[loadLiteralOffsetInBinary+1] |= byte(offset >> 3) 1824 bin[loadLiteralOffsetInBinary+2] |= byte(offset >> 11) 1825 }) 1826 1827 // Then, load the constant with the register offset.
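// In other words, this branch emits the pair (illustrative syntax): "LDR tmp, <literal>" to fetch the 32-bit offset from the constant pool, then "LDR target, [base, tmp]" (or the STR counterpart for stores); the literal's imm19 field is patched by the callback above once the constant's position is finalized.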
// https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/LDR--register---Load-Register--register-- 1829 buf.Append4Bytes( 1830 (baseRegBits<<5)|targetRegBits, 1831 0b011_010_00|(baseRegBits>>3), 1832 opcode<<6|0b00_1_00000|tmpRegBits, 1833 size<<6|v<<2|0b00_111_0_00, 1834 ) 1835 } 1836 return 1837 } 1838 1839 func (a *AssemblerImpl) encodeRegisterToMemory(buf asm.Buffer, n *nodeImpl) (err error) { 1840 // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldst_regoff 1841 var ( 1842 size, v byte 1843 datasize, datasizeLog2 int64 1844 isTargetFloat bool 1845 ) 1846 switch n.instruction { 1847 case STRD: 1848 size, v, datasize, datasizeLog2 = 0b11, 0x0, 8, 3 1849 case STRW: 1850 size, v, datasize, datasizeLog2 = 0b10, 0x0, 4, 2 1851 case STRH: 1852 size, v, datasize, datasizeLog2 = 0b01, 0x0, 2, 1 1853 case STRB: 1854 size, v, datasize, datasizeLog2 = 0b00, 0x0, 1, 0 1855 case FSTRD: 1856 size, v, datasize, datasizeLog2, isTargetFloat = 0b11, 0x1, 8, 3, true 1857 case FSTRS: 1858 size, v, datasize, datasizeLog2, isTargetFloat = 0b10, 0x1, 4, 2, true 1859 default: 1860 return errorEncodingUnsupported(n) 1861 } 1862 1863 var srcRegBits byte 1864 if isTargetFloat { 1865 srcRegBits, err = vectorRegisterBits(n.srcReg) 1866 } else { 1867 srcRegBits, err = intRegisterBits(n.srcReg) 1868 } 1869 if err != nil { 1870 return 1871 } 1872 1873 baseRegBits, err := intRegisterBits(n.dstReg) 1874 if err != nil { 1875 return err 1876 } 1877 1878 const opcode = 0x00 // opcode for store instructions. 1879 if n.dstReg2 != asm.NilRegister { 1880 offsetRegBits, err := intRegisterBits(n.dstReg2) 1881 if err != nil { 1882 return err 1883 } 1884 a.encodeLoadOrStoreWithRegisterOffset(buf, baseRegBits, offsetRegBits, srcRegBits, opcode, size, v) 1885 } else { 1886 err = a.encodeLoadOrStoreWithConstOffset(buf, baseRegBits, srcRegBits, n.dstConst, opcode, size, v, datasize, datasizeLog2) 1887 } 1888 return 1889 } 1890 1891 func (a *AssemblerImpl) encodeADR(buf asm.Buffer, n *nodeImpl) (err error) { 1892 dstRegBits, err := intRegisterBits(n.dstReg) 1893 if err != nil { 1894 return err 1895 } 1896 1897 adrInstructionOffsetInBinary := uint64(buf.Len()) 1898 1899 // At this point, we don't yet know the target offset to read from, 1900 // so we emit the ADR instruction with 0 offset, and replace it later in the callback. 1901 // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADR--Form-PC-relative-address-?lang=en 1902 buf.Append4Bytes(dstRegBits, 0x0, 0x0, 0b10000) 1903 1904 // In this case, the ADR's target offset is the staticConst's initial address.
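// Illustrative example of the fix-up below (the offset value is assumed): if the constant ends up 0x104 bytes after the ADR, the callback writes immlo = 0x104 & 0b11 = 0b00 into instruction bits 29..30 and immhi = 0x104 >> 2 = 0x41 into bits 5..23.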
if sc := n.staticConst; sc != nil { 1906 a.pool.AddConst(sc, adrInstructionOffsetInBinary) 1907 sc.AddOffsetFinalizedCallback(func(offsetOfConst uint64) { 1908 adrInstructionBytes := buf.Bytes()[adrInstructionOffsetInBinary : adrInstructionOffsetInBinary+4] 1909 offset := int(offsetOfConst) - int(adrInstructionOffsetInBinary) 1910 1911 // See https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADR--Form-PC-relative-address-?lang=en 1912 adrInstructionBytes[3] |= byte(offset & 0b00000011 << 5) 1913 offset >>= 2 1914 adrInstructionBytes[0] |= byte(offset << 5) 1915 offset >>= 3 1916 adrInstructionBytes[1] |= byte(offset) 1917 offset >>= 8 1918 adrInstructionBytes[2] |= byte(offset) 1919 }) 1920 return 1921 } else { 1922 a.adrInstructionNodes = append(a.adrInstructionNodes, n) 1923 } 1924 return 1925 } 1926 1927 func (a *AssemblerImpl) finalizeADRInstructionNode(code []byte, n *nodeImpl) (err error) { 1928 // Find the target instruction node. 1929 targetNode := n 1930 for ; targetNode != nil; targetNode = targetNode.next { 1931 if targetNode.instruction == n.readInstructionAddressBeforeTargetInstruction { 1932 targetNode = targetNode.next 1933 break 1934 } 1935 } 1936 1937 if targetNode == nil { 1938 return fmt.Errorf("BUG: target instruction %s not found for ADR", InstructionName(n.readInstructionAddressBeforeTargetInstruction)) 1939 } 1940 1941 offset := targetNode.OffsetInBinary() - n.OffsetInBinary() 1942 if i64 := int64(offset); i64 >= 1<<20 || i64 < -1<<20 { 1943 // We could support offsets beyond the 20-bit range by special-casing them here, 1944 // but the 20-bit range should be enough for our implementation. If the necessity comes up, 1945 // we could add the special-casing here to support arbitrarily large offsets. 1946 return fmt.Errorf("BUG: too large offset for ADR: %#x", offset) 1947 } 1948 1949 adrInstructionBytes := code[n.OffsetInBinary() : n.OffsetInBinary()+4] 1950 // According to the binary format of ADR instruction: 1951 // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADR--Form-PC-relative-address-?lang=en 1952 adrInstructionBytes[3] |= byte(offset & 0b00000011 << 5) 1953 offset >>= 2 1954 adrInstructionBytes[0] |= byte(offset << 5) 1955 offset >>= 3 1956 adrInstructionBytes[1] |= byte(offset) 1957 offset >>= 8 1958 adrInstructionBytes[2] |= byte(offset) 1959 return nil 1960 } 1961 1962 func (a *AssemblerImpl) encodeMemoryToRegister(buf asm.Buffer, n *nodeImpl) (err error) { 1963 // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldst_regoff 1964 var ( 1965 size, v, opcode byte 1966 datasize, datasizeLog2 int64 1967 isTargetFloat bool 1968 ) 1969 switch n.instruction { 1970 case ADR: 1971 return a.encodeADR(buf, n) 1972 case FLDRD: 1973 size, v, datasize, datasizeLog2, opcode, isTargetFloat = 0b11, 0x1, 8, 3, 0b01, true 1974 case FLDRS: 1975 size, v, datasize, datasizeLog2, opcode, isTargetFloat = 0b10, 0x1, 4, 2, 0b01, true 1976 case LDRD: 1977 size, v, datasize, datasizeLog2, opcode = 0b11, 0x0, 8, 3, 0b01 1978 case LDRW: 1979 size, v, datasize, datasizeLog2, opcode = 0b10, 0x0, 4, 2, 0b01 1980 case LDRSHD: 1981 size, v, datasize, datasizeLog2, opcode = 0b01, 0x0, 2, 1, 0b10 1982 case LDRSHW: 1983 size, v, datasize, datasizeLog2, opcode = 0b01, 0x0, 2, 1, 0b11 1984 case LDRH: 1985 size, v, datasize, datasizeLog2, opcode = 0b01, 0x0, 2, 1, 0b01 1986 case LDRSBD: 1987 size, v, datasize, datasizeLog2, opcode = 0b00, 0x0, 1, 0, 0b10 1988 case LDRSBW: 1989 size, v, datasize, datasizeLog2,
opcode = 0b00, 0x0, 1, 0, 0b11 1990 case LDRB: 1991 size, v, datasize, datasizeLog2, opcode = 0b00, 0x0, 1, 0, 0b01 1992 case LDRSW: 1993 size, v, datasize, datasizeLog2, opcode = 0b10, 0x0, 4, 2, 0b10 1994 default: 1995 return errorEncodingUnsupported(n) 1996 } 1997 1998 var dstRegBits byte 1999 if isTargetFloat { 2000 dstRegBits, err = vectorRegisterBits(n.dstReg) 2001 } else { 2002 dstRegBits, err = intRegisterBits(n.dstReg) 2003 } 2004 if err != nil { 2005 return 2006 } 2007 baseRegBits, err := intRegisterBits(n.srcReg) 2008 if err != nil { 2009 return err 2010 } 2011 2012 if n.srcReg2 != asm.NilRegister { 2013 offsetRegBits, err := intRegisterBits(n.srcReg2) 2014 if err != nil { 2015 return err 2016 } 2017 a.encodeLoadOrStoreWithRegisterOffset(buf, baseRegBits, offsetRegBits, dstRegBits, opcode, 2018 size, v) 2019 } else { 2020 err = a.encodeLoadOrStoreWithConstOffset(buf, baseRegBits, dstRegBits, n.srcConst, opcode, 2021 size, v, datasize, datasizeLog2) 2022 } 2023 return 2024 } 2025 2026 // const16bitAligned checks whether the value fits within a single 16-bit aligned halfword. 2027 // If so, it returns the shift amount divided by 16; otherwise -1. 2028 func const16bitAligned(v int64) (ret int) { 2029 ret = -1 2030 for s := 0; s < 64; s += 16 { 2031 if (uint64(v) &^ (uint64(0xffff) << uint(s))) == 0 { 2032 ret = s / 16 2033 break 2034 } 2035 } 2036 return 2037 } 2038 2039 // isBitMaskImmediate determines if the value can be encoded as "bitmask immediate". 2040 // 2041 // Such an immediate is a 32-bit or 64-bit pattern viewed as a vector of identical elements of size e = 2, 4, 8, 16, 32, or 64 bits. 2042 // Each element contains the same sub-pattern: a single run of 1 to e-1 non-zero bits, rotated by 0 to e-1 bits. 2043 // 2044 // See https://developer.arm.com/documentation/dui0802/b/A64-General-Instructions/MOV--bitmask-immediate- 2045 func isBitMaskImmediate(x uint64) bool { 2046 // All zeros and all ones are not "bitmask immediate" by definition. 2047 if x == 0 || x == 0xffff_ffff_ffff_ffff { 2048 return false 2049 } 2050 2051 switch { 2052 case x != x>>32|x<<32: 2053 // e = 64 2054 case x != x>>16|x<<48: 2055 // e = 32 (x == x>>32|x<<32). 2056 // e.g. 0x00ff_ff00_00ff_ff00 2057 x = uint64(int32(x)) 2058 case x != x>>8|x<<56: 2059 // e = 16 (x == x>>16|x<<48). 2060 // e.g. 0x00ff_00ff_00ff_00ff 2061 x = uint64(int16(x)) 2062 case x != x>>4|x<<60: 2063 // e = 8 (x == x>>8|x<<56). 2064 // e.g. 0x0f0f_0f0f_0f0f_0f0f 2065 x = uint64(int8(x)) 2066 default: 2067 // e = 4 or 2. 2068 return true 2069 } 2070 return sequenceOfSetbits(x) || sequenceOfSetbits(^x) 2071 } 2072 2073 // sequenceOfSetbits returns true if the number's binary representation is a single contiguous sequence of set bits (1s). 2074 // For example: 0b1110 -> true, 0b1010 -> false 2075 func sequenceOfSetbits(x uint64) bool { 2076 y := getLowestBit(x) 2077 // If x is a sequence of set bits, this should result in a number 2078 // with only one set bit (i.e. a power of two).
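// Worked example: x = 0b1110 gives y = 0b0010 and x+y = 0b10000, a power of two, so the function returns true; x = 0b1010 gives x+y = 0b1100, which has two set bits, so it returns false.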
y += x 2080 return (y-1)&y == 0 2081 } 2082 2083 func getLowestBit(x uint64) uint64 { 2084 // See https://stackoverflow.com/questions/12247186/find-the-lowest-set-bit 2085 return x & (^x + 1) 2086 } 2087 2088 func (a *AssemblerImpl) addOrSub64BitRegisters(buf asm.Buffer, sfops byte, sp bool, dstRegBits, src1RegBits, src2RegBits byte) { 2089 // dstReg = src1Reg +/- src2Reg 2090 if sp { 2091 // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADD--extended-register---Add--extended-register--?lang=en 2092 buf.Append4Bytes( 2093 (src1RegBits<<5)|dstRegBits, 2094 0b011<<5|src1RegBits>>3, 2095 1<<5|src2RegBits, 2096 sfops<<5|0b01011, 2097 ) 2098 } else { 2099 buf.Append4Bytes( 2100 (src1RegBits<<5)|dstRegBits, 2101 src1RegBits>>3, 2102 src2RegBits, 2103 sfops<<5|0b01011, 2104 ) 2105 } 2106 } 2107 2108 func bitmaskImmediate(c uint64, is64bit bool) (immr, imms, N byte) { 2109 var size uint32 2110 switch { 2111 case c != c>>32|c<<32: 2112 size = 64 2113 case c != c>>16|c<<48: 2114 size = 32 2115 c = uint64(int32(c)) 2116 case c != c>>8|c<<56: 2117 size = 16 2118 c = uint64(int16(c)) 2119 case c != c>>4|c<<60: 2120 size = 8 2121 c = uint64(int8(c)) 2122 case c != c>>2|c<<62: 2123 size = 4 2124 c = uint64(int64(c<<60) >> 60) 2125 default: 2126 size = 2 2127 c = uint64(int64(c<<62) >> 62) 2128 } 2129 2130 neg := false 2131 if int64(c) < 0 { 2132 c = ^c 2133 neg = true 2134 } 2135 2136 onesSize, nonZeroPos := getOnesSequenceSize(c) 2137 if neg { 2138 nonZeroPos = onesSize + nonZeroPos 2139 onesSize = size - onesSize 2140 } 2141 2142 var mode byte = 32 2143 if is64bit { 2144 N, mode = 0b1, 64 2145 } 2146 2147 immr = byte((size - nonZeroPos) & (size - 1) & uint32(mode-1)) 2148 imms = byte((onesSize - 1) | 63&^(size<<1-1)) 2149 return 2150 } 2151 2152 func (a *AssemblerImpl) encodeConstToRegister(buf asm.Buffer, n *nodeImpl) (err error) { 2153 // Alias for readability.
c := n.srcConst 2155 2156 dstRegBits, err := intRegisterBits(n.dstReg) 2157 if err != nil { 2158 return err 2159 } 2160 2161 // See "Logical (immediate)" in 2162 // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Immediate 2163 switch n.instruction { 2164 case ANDIMM32: 2165 var sf, opc byte = 0b0, 0b00 2166 if !isBitMaskImmediate(uint64(c)) { 2167 err = fmt.Errorf("const %d must be valid bitmask immediate for %s", c, InstructionName(ANDIMM32)) 2168 return 2169 } 2170 immr, imms, N := bitmaskImmediate(uint64(c), false) 2171 buf.Append4Bytes( 2172 (dstRegBits<<5)|dstRegBits, 2173 imms<<2|dstRegBits>>3, 2174 N<<6|immr, 2175 sf<<7|opc<<5|0b10010, 2176 ) 2177 return 2178 case ANDIMM64: 2179 var sf, opc byte = 0b1, 0b00 2180 if !isBitMaskImmediate(uint64(c)) { 2181 err = fmt.Errorf("const %d must be valid bitmask immediate for %s", c, InstructionName(ANDIMM64)) 2182 return 2183 } 2184 immr, imms, N := bitmaskImmediate(uint64(c), true) 2185 buf.Append4Bytes( 2186 (dstRegBits<<5)|dstRegBits, 2187 imms<<2|dstRegBits>>3, 2188 N<<6|immr, 2189 sf<<7|opc<<5|0b10010, 2190 ) 2191 return 2192 } 2193 2194 switch inst := n.instruction; inst { 2195 case ADD, ADDS, SUB, SUBS: 2196 srcRegBits := dstRegBits 2197 if n.srcReg != asm.NilRegister { 2198 srcRegBits, err = intRegisterBits(n.srcReg) 2199 if err != nil { 2200 return err 2201 } 2202 } 2203 2204 var sfops byte 2205 if inst == ADD { 2206 sfops = 0b100 2207 } else if inst == ADDS { 2208 sfops = 0b101 2209 } else if inst == SUB { 2210 sfops = 0b110 2211 } else if inst == SUBS { 2212 sfops = 0b111 2213 } 2214 2215 isSP := n.srcReg == RegSP || n.dstReg == RegSP 2216 if c == 0 { 2217 // If the constant equals zero, we encode it as ADD (register) with the zero register. 2218 a.addOrSub64BitRegisters(buf, sfops, isSP, dstRegBits, srcRegBits, zeroRegisterBits) 2219 return 2220 } 2221 2222 if c >= 0 && (c <= 0xfff || (c&0xfff) == 0 && (uint64(c>>12) <= 0xfff)) { 2223 // If the const can be represented as "imm12" or "imm12 << 12": one instruction. 2224 // https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L2992 2225 2226 if c <= 0xfff { 2227 buf.Append4Bytes( 2228 (srcRegBits<<5)|dstRegBits, 2229 (byte(c)<<2)|(srcRegBits>>3), 2230 byte(c>>6), 2231 sfops<<5|0b10001, 2232 ) 2233 } else { 2234 c >>= 12 2235 buf.Append4Bytes( 2236 (srcRegBits<<5)|dstRegBits, 2237 (byte(c)<<2)|(srcRegBits>>3), 2238 0b01<<6 /* shift by 12 */ |byte(c>>6), 2239 sfops<<5|0b10001, 2240 ) 2241 } 2242 return 2243 } 2244 2245 if t := const16bitAligned(c); t >= 0 { 2246 // If the const fits within a single 16-bit aligned halfword, for example 0xffff, 0xffff_0000 or 0xffff_0000_0000_0000, 2247 // we can load it into the temporary register with a single MOVZ. 2248 // https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L4029 2249 tmpRegBits := registerBits(a.temporaryRegister) 2250 2251 // MOVZ $c, tmpReg with shifting. 2252 a.load16bitAlignedConst(buf, c>>(16*t), byte(t), tmpRegBits, false, true) 2253 2254 // ADD/SUB tmpReg, dstReg 2255 a.addOrSub64BitRegisters(buf, sfops, isSP, dstRegBits, srcRegBits, tmpRegBits) 2256 return 2257 } else if t := const16bitAligned(^c); t >= 0 { 2258 // Likewise, if the inverse of the const fits within a 16-bit aligned halfword, do the same with MOVN. 2259 // https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L4029 2260 tmpRegBits := registerBits(a.temporaryRegister) 2261 2262 // MOVN $c, tmpReg with shifting.
a.load16bitAlignedConst(buf, ^c>>(16*t), byte(t), tmpRegBits, true, true) 2264 2265 // ADD/SUB tmpReg, dstReg 2266 a.addOrSub64BitRegisters(buf, sfops, isSP, dstRegBits, srcRegBits, tmpRegBits) 2267 return 2268 } 2269 2270 if uc := uint64(c); isBitMaskImmediate(uc) { 2271 // If the const can be represented as "bitmask immediate", we load it via ORR into the temp register. 2272 // https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L6570-L6583 2273 tmpRegBits := registerBits(a.temporaryRegister) 2274 // ORR $c, tmpReg 2275 a.loadConstViaBitMaskImmediate(buf, uc, tmpRegBits, true) 2276 2277 // ADD/SUB tmpReg, dstReg 2278 a.addOrSub64BitRegisters(buf, sfops, isSP, dstRegBits, srcRegBits, tmpRegBits) 2279 return 2280 } 2281 2282 // If the value fits within 24 bits, then we emit two ADD/SUB instructions. 2283 if 0 <= c && c <= 0xffffff && inst != SUBS && inst != ADDS { 2284 // https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L3849-L3862 2285 buf.Append4Bytes( 2286 (dstRegBits<<5)|dstRegBits, 2287 (byte(c)<<2)|(dstRegBits>>3), 2288 byte(c&0xfff>>6), 2289 sfops<<5|0b10001, 2290 ) 2291 c = c >> 12 2292 buf.Append4Bytes( 2293 (dstRegBits<<5)|dstRegBits, 2294 (byte(c)<<2)|(dstRegBits>>3), 2295 0b01_000000 /* shift by 12 */ |byte(c>>6), 2296 sfops<<5|0b10001, 2297 ) 2298 return 2299 } 2300 2301 // https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L3163-L3203 2302 // Otherwise we use MOVZ/MOVN/MOVK sequences to load the const into the temporary register. 2303 tmpRegBits := registerBits(a.temporaryRegister) 2304 a.load64bitConst(buf, c, tmpRegBits) 2305 a.addOrSub64BitRegisters(buf, sfops, isSP, dstRegBits, srcRegBits, tmpRegBits) 2306 case MOVW: 2307 if c == 0 { 2308 buf.Append4Bytes( 2309 (zeroRegisterBits<<5)|dstRegBits, 2310 zeroRegisterBits>>3, 2311 0b000_00000|zeroRegisterBits, 2312 0b0_01_01010, 2313 ) 2314 return 2315 } 2316 2317 // Following the logic here: 2318 // https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1637 2319 c32 := uint32(c) 2320 ic := int64(c32) 2321 if ic >= 0 && (ic <= 0xfff || (ic&0xfff) == 0 && (uint64(ic>>12) <= 0xfff)) { 2322 if isBitMaskImmediate(uint64(c)) { 2323 a.loadConstViaBitMaskImmediate(buf, uint64(c), dstRegBits, false) 2324 return 2325 } 2326 } 2327 2328 if t := const16bitAligned(int64(c32)); t >= 0 { 2329 // If the const fits within a single 16-bit aligned halfword, for example 0xffff or 0xffff_0000, 2330 // we can load it with a single MOVZ. 2331 a.load16bitAlignedConst(buf, int64(c32)>>(16*t), byte(t), dstRegBits, false, false) 2332 } else if t := const16bitAligned(int64(^c32)); t >= 0 { 2333 // Likewise, if the inverse of the const fits within a 16-bit aligned halfword, do the same with MOVN. 2334 a.load16bitAlignedConst(buf, int64(^c32)>>(16*t), byte(t), dstRegBits, true, false) 2335 } else if isBitMaskImmediate(uint64(c)) { 2336 a.loadConstViaBitMaskImmediate(buf, uint64(c), dstRegBits, false) 2337 } else { 2338 // Otherwise, we use MOVZ and MOVK to load it.
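// For instance (an example constant of ours, not from the Go source): c32 = 0x12345678 is emitted as "MOVZ w_dst, #0x5678" followed by "MOVK w_dst, #0x1234, LSL #16".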
// https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L6623-L6630 2340 c16 := uint16(c32) 2341 // MOVZ: https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ 2342 buf.Append4Bytes( 2343 (byte(c16)<<5)|dstRegBits, 2344 byte(c16>>3), 2345 1<<7|byte(c16>>11), 2346 0b0_10_10010, 2347 ) 2348 // MOVK: https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVK 2349 c16 = uint16(c32 >> 16) 2350 if c16 != 0 { 2351 buf.Append4Bytes( 2352 (byte(c16)<<5)|dstRegBits, 2353 byte(c16>>3), 2354 1<<7|0b0_01_00000 /* shift by 16 */ |byte(c16>>11), 2355 0b0_11_10010, 2356 ) 2357 } 2358 } 2359 case MOVD: 2360 // Following the logic here: 2361 // https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L1798-L1852 2362 if c >= 0 && (c <= 0xfff || (c&0xfff) == 0 && (uint64(c>>12) <= 0xfff)) { 2363 if isBitMaskImmediate(uint64(c)) { 2364 a.loadConstViaBitMaskImmediate(buf, uint64(c), dstRegBits, true) 2365 return 2366 } 2367 } 2368 2369 if t := const16bitAligned(c); t >= 0 { 2370 // If the const fits within a single 16-bit aligned halfword, for example 0xffff, 0xffff_0000 or 0xffff_0000_0000_0000, 2371 // we can load it with a single MOVZ. 2372 a.load16bitAlignedConst(buf, c>>(16*t), byte(t), dstRegBits, false, true) 2373 } else if t := const16bitAligned(^c); t >= 0 { 2374 // Likewise, if the inverse of the const fits within a 16-bit aligned halfword, do the same with MOVN. 2375 a.load16bitAlignedConst(buf, (^c)>>(16*t), byte(t), dstRegBits, true, true) 2376 } else if isBitMaskImmediate(uint64(c)) { 2377 a.loadConstViaBitMaskImmediate(buf, uint64(c), dstRegBits, true) 2378 } else { 2379 a.load64bitConst(buf, c, dstRegBits) 2380 } 2381 case LSR: 2382 if c == 0 { 2383 err = errors.New("LSR with zero constant should be optimized out") 2384 return 2385 } else if c < 0 || c > 63 { 2386 err = fmt.Errorf("LSR requires immediate to be within 0 to 63, but got %d", c) 2387 return 2388 } 2389 2390 // LSR(immediate) is an alias of UBFM. 2391 // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LSR--immediate---Logical-Shift-Right--immediate---an-alias-of-UBFM-?lang=en 2392 buf.Append4Bytes( 2393 (dstRegBits<<5)|dstRegBits, 2394 0b111111_00|dstRegBits>>3, 2395 0b01_000000|byte(c), 2396 0b110_10011, 2397 ) 2398 case LSL: 2399 if c == 0 { 2400 err = errors.New("LSL with zero constant should be optimized out") 2401 return 2402 } else if c < 0 || c > 63 { 2403 err = fmt.Errorf("LSL requires immediate to be within 0 to 63, but got %d", c) 2404 return 2405 } 2406 2407 // LSL(immediate) is an alias of UBFM. 2408 // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LSL--immediate---Logical-Shift-Left--immediate---an-alias-of-UBFM- 2409 cb := byte(c) 2410 buf.Append4Bytes( 2411 (dstRegBits<<5)|dstRegBits, 2412 (0b111111-cb)<<2|dstRegBits>>3, 2413 0b01_000000|(64-cb), 2414 0b110_10011, 2415 ) 2416 2417 default: 2418 return errorEncodingUnsupported(n) 2419 } 2420 return 2421 } 2422 2423 func (a *AssemblerImpl) movk(buf asm.Buffer, v uint64, shiftNum int, dstRegBits byte) { 2424 // https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVK 2425 buf.Append4Bytes( 2426 (byte(v)<<5)|dstRegBits, 2427 byte(v>>3), 2428 1<<7|byte(shiftNum)<<5|(0b000_11111&byte(v>>11)), 2429 0b1_11_10010, 2430 ) 2431 } 2432 2433 func (a *AssemblerImpl) movz(buf asm.Buffer, v uint64, shiftNum int, dstRegBits byte) { 2434 //
https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ 2435 buf.Append4Bytes( 2436 (byte(v)<<5)|dstRegBits, 2437 byte(v>>3), 2438 1<<7|byte(shiftNum)<<5|(0b000_11111&byte(v>>11)), 2439 0b1_10_10010, 2440 ) 2441 } 2442 2443 func (a *AssemblerImpl) movn(buf asm.Buffer, v uint64, shiftNum int, dstRegBits byte) { 2444 // https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN 2445 buf.Append4Bytes( 2446 (byte(v)<<5)|dstRegBits, 2447 byte(v>>3), 2448 1<<7|byte(shiftNum)<<5|(0b000_11111&byte(v>>11)), 2449 0b1_00_10010, 2450 ) 2451 } 2452 2453 // load64bitConst loads a 64-bit constant into the register, following the same logic as the Go assembler 2454 // for deciding how to load large 64-bit consts. 2455 // 2456 // See https://github.com/golang/go/blob/release-branch.go1.15/src/cmd/internal/obj/arm64/asm7.go#L6632-L6759 2457 func (a *AssemblerImpl) load64bitConst(buf asm.Buffer, c int64, dstRegBits byte) { 2458 var bits [4]uint64 2459 var zeros, negs int 2460 for i := 0; i < 4; i++ { 2461 bits[i] = uint64((c >> uint(i*16)) & 0xffff) 2462 if v := bits[i]; v == 0 { 2463 zeros++ 2464 } else if v == 0xffff { 2465 negs++ 2466 } 2467 } 2468 2469 if zeros == 3 { 2470 // one MOVZ instruction. 2471 for i, v := range bits { 2472 if v != 0 { 2473 a.movz(buf, v, i, dstRegBits) 2474 } 2475 } 2476 } else if negs == 3 { 2477 // one MOVN instruction. 2478 for i, v := range bits { 2479 if v != 0xffff { 2480 v = ^v 2481 a.movn(buf, v, i, dstRegBits) 2482 } 2483 } 2484 } else if zeros == 2 { 2485 // one MOVZ then one MOVK. 2486 var movz bool 2487 for i, v := range bits { 2488 if !movz && v != 0 { // MOVZ. 2489 // https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ 2490 a.movz(buf, v, i, dstRegBits) 2491 movz = true 2492 } else if v != 0 { 2493 a.movk(buf, v, i, dstRegBits) 2494 } 2495 } 2496 2497 } else if negs == 2 { 2498 // one MOVN then one MOVK. 2499 var movn bool 2500 for i, v := range bits { // Emit MOVN. 2501 if !movn && v != 0xffff { 2502 v = ^v 2503 // https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN 2504 a.movn(buf, v, i, dstRegBits) 2505 movn = true 2506 } else if v != 0xffff { 2507 a.movk(buf, v, i, dstRegBits) 2508 } 2509 } 2510 2511 } else if zeros == 1 { 2512 // one MOVZ then two MOVK. 2513 var movz bool 2514 for i, v := range bits { 2515 if !movz && v != 0 { // MOVZ. 2516 // https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ 2517 a.movz(buf, v, i, dstRegBits) 2518 movz = true 2519 } else if v != 0 { 2520 a.movk(buf, v, i, dstRegBits) 2521 } 2522 } 2523 2524 } else if negs == 1 { 2525 // one MOVN then two MOVK. 2526 var movn bool 2527 for i, v := range bits { // Emit MOVN. 2528 if !movn && v != 0xffff { 2529 v = ^v 2530 // https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN 2531 a.movn(buf, v, i, dstRegBits) 2532 movn = true 2533 } else if v != 0xffff { 2534 a.movk(buf, v, i, dstRegBits) 2535 } 2536 } 2537 2538 } else { 2539 // one MOVZ then three MOVK. 2540 var movz bool 2541 for i, v := range bits { 2542 if !movz && v != 0 { // MOVZ.
// https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ 2544 a.movz(buf, v, i, dstRegBits) 2545 movz = true 2546 } else if v != 0 { 2547 a.movk(buf, v, i, dstRegBits) 2548 } 2549 } 2550 2551 } 2552 } 2553 2554 func (a *AssemblerImpl) load16bitAlignedConst(buf asm.Buffer, c int64, shiftNum byte, regBits byte, reverse bool, dst64bit bool) { 2555 var lastByte byte 2556 if reverse { 2557 // MOVN: https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVN 2558 lastByte = 0b0_00_10010 2559 } else { 2560 // MOVZ: https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/MOVZ 2561 lastByte = 0b0_10_10010 2562 } 2563 if dst64bit { 2564 lastByte |= 0b1 << 7 2565 } 2566 buf.Append4Bytes( 2567 (byte(c)<<5)|regBits, 2568 byte(c>>3), 2569 1<<7|(shiftNum<<5)|byte(c>>11), 2570 lastByte, 2571 ) 2572 } 2573 2574 // loadConstViaBitMaskImmediate loads the constant with ORR (bitmask immediate). 2575 // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ORR--immediate---Bitwise-OR--immediate--?lang=en 2576 func (a *AssemblerImpl) loadConstViaBitMaskImmediate(buf asm.Buffer, c uint64, regBits byte, dst64bit bool) { 2577 var size uint32 2578 switch { 2579 case c != c>>32|c<<32: 2580 size = 64 2581 case c != c>>16|c<<48: 2582 size = 32 2583 c = uint64(int32(c)) 2584 case c != c>>8|c<<56: 2585 size = 16 2586 c = uint64(int16(c)) 2587 case c != c>>4|c<<60: 2588 size = 8 2589 c = uint64(int8(c)) 2590 case c != c>>2|c<<62: 2591 size = 4 2592 c = uint64(int64(c<<60) >> 60) 2593 default: 2594 size = 2 2595 c = uint64(int64(c<<62) >> 62) 2596 } 2597 2598 neg := false 2599 if int64(c) < 0 { 2600 c = ^c 2601 neg = true 2602 } 2603 2604 onesSize, nonZeroPos := getOnesSequenceSize(c) 2605 if neg { 2606 nonZeroPos = onesSize + nonZeroPos 2607 onesSize = size - onesSize 2608 } 2609 2610 // See the following article for understanding the encoding.
https://dinfuehr.github.io/blog/encoding-of-immediate-values-on-aarch64/ 2612 var n byte 2613 mode := 32 2614 if dst64bit && size == 64 { 2615 n = 0b1 2616 mode = 64 2617 } 2618 2619 r := byte((size - nonZeroPos) & (size - 1) & uint32(mode-1)) 2620 s := byte((onesSize - 1) | 63&^(size<<1-1)) 2621 2622 var sf byte 2623 if dst64bit { 2624 sf = 0b1 2625 } 2626 buf.Append4Bytes( 2627 (zeroRegisterBits<<5)|regBits, 2628 s<<2|(zeroRegisterBits>>3), 2629 n<<6|r, 2630 sf<<7|0b0_01_10010, 2631 ) 2632 } 2633 2634 func getOnesSequenceSize(x uint64) (size, nonZeroPos uint32) { 2635 // Take 0b00011100 for example: 2636 y := getLowestBit(x) // = 0b0000100 2637 nonZeroPos = setBitPos(y) // = 2 2638 size = setBitPos(x+y) - nonZeroPos // = setBitPos(0b0100000) - 2 = 5 - 2 = 3 2639 return 2640 } 2641 2642 func setBitPos(x uint64) (ret uint32) { 2643 for ; ; ret++ { 2644 if x == 0b1 { 2645 break 2646 } 2647 x = x >> 1 2648 } 2649 return 2650 } 2651 2652 func checkArrangementIndexPair(arr VectorArrangement, index VectorIndex) (err error) { 2653 if arr == VectorArrangementNone { 2654 return nil 2655 } 2656 var valid bool 2657 switch arr { 2658 case VectorArrangement8B: 2659 valid = index < 8 2660 case VectorArrangement16B: 2661 valid = index < 16 2662 case VectorArrangement4H: 2663 valid = index < 4 2664 case VectorArrangement8H: 2665 valid = index < 8 2666 case VectorArrangement2S: 2667 valid = index < 2 2668 case VectorArrangement4S: 2669 valid = index < 4 2670 case VectorArrangement1D: 2671 valid = index < 1 2672 case VectorArrangement2D: 2673 valid = index < 2 2674 case VectorArrangementB: 2675 valid = index < 16 2676 case VectorArrangementH: 2677 valid = index < 8 2678 case VectorArrangementS: 2679 valid = index < 4 2680 case VectorArrangementD: 2681 valid = index < 2 2682 } 2683 if !valid { 2684 err = fmt.Errorf("invalid arrangement and index pair: %s[%d]", arr, index) 2685 } 2686 return 2687 } 2688 2689 func (a *AssemblerImpl) encodeMemoryToVectorRegister(buf asm.Buffer, n *nodeImpl) (err error) { 2690 srcBaseRegBits, err := intRegisterBits(n.srcReg) 2691 if err != nil { 2692 return err 2693 } 2694 2695 dstVectorRegBits, err := vectorRegisterBits(n.dstReg) 2696 if err != nil { 2697 return err 2698 } 2699 2700 switch n.instruction { 2701 case VMOV: // translated as LDR(immediate,SIMD&FP) 2702 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/LDR--immediate--SIMD-FP---Load-SIMD-FP-Register--immediate-offset--?lang=en 2703 var size, opcode byte 2704 var dataSize, dataSizeLog2 int64 2705 switch n.vectorArrangement { 2706 case VectorArrangementB: 2707 size, opcode, dataSize, dataSizeLog2 = 0b00, 0b01, 1, 0 2708 case VectorArrangementH: 2709 size, opcode, dataSize, dataSizeLog2 = 0b01, 0b01, 2, 1 2710 case VectorArrangementS: 2711 size, opcode, dataSize, dataSizeLog2 = 0b10, 0b01, 4, 2 2712 case VectorArrangementD: 2713 size, opcode, dataSize, dataSizeLog2 = 0b11, 0b01, 8, 3 2714 case VectorArrangementQ: 2715 size, opcode, dataSize, dataSizeLog2 = 0b00, 0b11, 16, 4 2716 } 2717 const v = 1 // v as in https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldst_pos 2718 if n.srcReg2 != asm.NilRegister { 2719 offsetRegBits, err := intRegisterBits(n.srcReg2) 2720 if err != nil { 2721 return err 2722 } 2723 a.encodeLoadOrStoreWithRegisterOffset(buf, srcBaseRegBits, offsetRegBits, dstVectorRegBits, opcode, size, v) 2724 } else { 2725 err = a.encodeLoadOrStoreWithConstOffset(buf, srcBaseRegBits, dstVectorRegBits, 2726 n.srcConst,
opcode, size, v, dataSize, dataSizeLog2) 2727 } 2728 case LD1R: 2729 if n.srcReg2 != asm.NilRegister || n.srcConst != 0 { 2730 return fmt.Errorf("offset for %s is not implemented", InstructionName(LD1R)) 2731 } 2732 2733 var size, q byte 2734 switch n.vectorArrangement { 2735 case VectorArrangement8B: 2736 size, q = 0b00, 0b0 2737 case VectorArrangement16B: 2738 size, q = 0b00, 0b1 2739 case VectorArrangement4H: 2740 size, q = 0b01, 0b0 2741 case VectorArrangement8H: 2742 size, q = 0b01, 0b1 2743 case VectorArrangement2S: 2744 size, q = 0b10, 0b0 2745 case VectorArrangement4S: 2746 size, q = 0b10, 0b1 2747 case VectorArrangement1D: 2748 size, q = 0b11, 0b0 2749 case VectorArrangement2D: 2750 size, q = 0b11, 0b1 2751 } 2752 2753 // No offset encoding. 2754 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/LD1R--Load-one-single-element-structure-and-Replicate-to-all-lanes--of-one-register--?lang=en#iclass_as_post_index 2755 buf.Append4Bytes( 2756 (srcBaseRegBits<<5)|dstVectorRegBits, 2757 0b11_000000|size<<2|srcBaseRegBits>>3, 2758 0b01_000000, 2759 q<<6|0b1101, 2760 ) 2761 default: 2762 return errorEncodingUnsupported(n) 2763 } 2764 return 2765 } 2766 2767 func arrangementSizeQ(arr VectorArrangement) (size, q byte) { 2768 switch arr { 2769 case VectorArrangement8B: 2770 size, q = 0b00, 0 2771 case VectorArrangement16B: 2772 size, q = 0b00, 1 2773 case VectorArrangement4H: 2774 size, q = 0b01, 0 2775 case VectorArrangement8H: 2776 size, q = 0b01, 1 2777 case VectorArrangement2S: 2778 size, q = 0b10, 0 2779 case VectorArrangement4S: 2780 size, q = 0b10, 1 2781 case VectorArrangement1D: 2782 size, q = 0b11, 0 2783 case VectorArrangement2D: 2784 size, q = 0b11, 1 2785 } 2786 return 2787 } 2788 2789 func (a *AssemblerImpl) encodeVectorRegisterToMemory(buf asm.Buffer, n *nodeImpl) (err error) { 2790 srcVectorRegBits, err := vectorRegisterBits(n.srcReg) 2791 if err != nil { 2792 return err 2793 } 2794 2795 dstBaseRegBits, err := intRegisterBits(n.dstReg) 2796 if err != nil { 2797 return err 2798 } 2799 2800 switch n.instruction { 2801 case VMOV: // translated as STR(immediate,SIMD&FP) 2802 // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/STR--immediate--SIMD-FP---Store-SIMD-FP-register--immediate-offset-- 2803 var size, opcode byte 2804 var dataSize, dataSizeLog2 int64 2805 switch n.vectorArrangement { 2806 case VectorArrangementB: 2807 size, opcode, dataSize, dataSizeLog2 = 0b00, 0b00, 1, 0 2808 case VectorArrangementH: 2809 size, opcode, dataSize, dataSizeLog2 = 0b01, 0b00, 2, 1 2810 case VectorArrangementS: 2811 size, opcode, dataSize, dataSizeLog2 = 0b10, 0b00, 4, 2 2812 case VectorArrangementD: 2813 size, opcode, dataSize, dataSizeLog2 = 0b11, 0b00, 8, 3 2814 case VectorArrangementQ: 2815 size, opcode, dataSize, dataSizeLog2 = 0b00, 0b10, 16, 4 2816 } 2817 const v = 1 // v as in https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Loads-and-Stores?lang=en#ldst_pos 2818 2819 if n.dstReg2 != asm.NilRegister { 2820 offsetRegBits, err := intRegisterBits(n.dstReg2) 2821 if err != nil { 2822 return err 2823 } 2824 a.encodeLoadOrStoreWithRegisterOffset(buf, dstBaseRegBits, offsetRegBits, srcVectorRegBits, opcode, size, v) 2825 } else { 2826 err = a.encodeLoadOrStoreWithConstOffset(buf, dstBaseRegBits, srcVectorRegBits, 2827 n.dstConst, opcode, size, v, dataSize, dataSizeLog2) 2828 } 2829 default: 2830 return errorEncodingUnsupported(n) 2831 } 2832 return 2833 } 2834 2835 func (a *AssemblerImpl) 
encodeStaticConstToVectorRegister(buf asm.Buffer, n *nodeImpl) (err error) { 2836 if n.instruction != VMOV { 2837 return errorEncodingUnsupported(n) 2838 } 2839 2840 dstRegBits, err := vectorRegisterBits(n.dstReg) 2841 if err != nil { 2842 return err 2843 } 2844 2845 // LDR (literal, SIMD&FP) 2846 // https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/LDR--literal--SIMD-FP---Load-SIMD-FP-Register--PC-relative-literal-- 2847 var opc byte 2848 var constLength int 2849 switch n.vectorArrangement { 2850 case VectorArrangementS: 2851 opc, constLength = 0b00, 4 2852 case VectorArrangementD: 2853 opc, constLength = 0b01, 8 2854 case VectorArrangementQ: 2855 opc, constLength = 0b10, 16 2856 } 2857 2858 loadLiteralOffsetInBinary := uint64(buf.Len()) 2859 a.pool.AddConst(n.staticConst, loadLiteralOffsetInBinary) 2860 2861 if len(n.staticConst.Raw) != constLength { 2862 return fmt.Errorf("invalid const length for %s: want %d but was %d", 2863 n.vectorArrangement, constLength, len(n.staticConst.Raw)) 2864 } 2865 2866 buf.Append4Bytes(dstRegBits, 0x0, 0x0, opc<<6|0b11100) 2867 n.staticConst.AddOffsetFinalizedCallback(func(offsetOfConst uint64) { 2868 // LDR (literal, SIMD&FP) encodes offset divided by 4. 2869 offset := (int(offsetOfConst) - int(loadLiteralOffsetInBinary)) / 4 2870 bin := buf.Bytes() 2871 bin[loadLiteralOffsetInBinary] |= byte(offset << 5) 2872 bin[loadLiteralOffsetInBinary+1] |= byte(offset >> 3) 2873 bin[loadLiteralOffsetInBinary+2] |= byte(offset >> 11) 2874 }) 2875 return 2876 } 2877 2878 // advancedSIMDTwoRegisterMisc holds information to encode instructions as "Advanced SIMD two-register miscellaneous" in 2879 // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en 2880 var advancedSIMDTwoRegisterMisc = map[asm.Instruction]struct { 2881 qAndSize map[VectorArrangement]qAndSize 2882 u, opcode byte 2883 }{ 2884 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/NOT--Bitwise-NOT--vector--?lang=en 2885 NOT: { 2886 u: 0b1, opcode: 0b00101, 2887 qAndSize: map[VectorArrangement]qAndSize{ 2888 VectorArrangement16B: {size: 0b00, q: 0b1}, 2889 VectorArrangement8B: {size: 0b00, q: 0b0}, 2890 }, 2891 }, 2892 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FNEG--vector---Floating-point-Negate--vector--?lang=en 2893 VFNEG: { 2894 u: 0b1, opcode: 0b01111, 2895 qAndSize: map[VectorArrangement]qAndSize{ 2896 VectorArrangement4S: {size: 0b10, q: 0b1}, 2897 VectorArrangement2S: {size: 0b10, q: 0b0}, 2898 VectorArrangement2D: {size: 0b11, q: 0b1}, 2899 }, 2900 }, 2901 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FABS--vector---Floating-point-Absolute-value--vector--?lang=en 2902 VFABS: {u: 0, opcode: 0b01111, qAndSize: map[VectorArrangement]qAndSize{ 2903 VectorArrangement2D: {size: 0b11, q: 0b1}, 2904 VectorArrangement4S: {size: 0b10, q: 0b1}, 2905 VectorArrangement2S: {size: 0b10, q: 0b0}, 2906 }}, 2907 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FSQRT--vector---Floating-point-Square-Root--vector--?lang=en 2908 VFSQRT: {u: 1, opcode: 0b11111, qAndSize: map[VectorArrangement]qAndSize{ 2909 VectorArrangement2D: {size: 0b11, q: 0b1}, 2910 VectorArrangement4S: {size: 0b10, q: 0b1}, 2911 VectorArrangement2S: {size: 0b10, q: 0b0}, 2912 }}, 2913 // 
https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTM--vector---Floating-point-Round-to-Integral--toward-Minus-infinity--vector--?lang=en 2914 VFRINTM: {u: 0, opcode: 0b11001, qAndSize: map[VectorArrangement]qAndSize{ 2915 VectorArrangement2D: {size: 0b01, q: 0b1}, 2916 VectorArrangement4S: {size: 0b00, q: 0b1}, 2917 VectorArrangement2S: {size: 0b00, q: 0b0}, 2918 }}, 2919 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTN--vector---Floating-point-Round-to-Integral--to-nearest-with-ties-to-even--vector--?lang=en 2920 VFRINTN: {u: 0, opcode: 0b11000, qAndSize: map[VectorArrangement]qAndSize{ 2921 VectorArrangement2D: {size: 0b01, q: 0b1}, 2922 VectorArrangement4S: {size: 0b00, q: 0b1}, 2923 VectorArrangement2S: {size: 0b00, q: 0b0}, 2924 }}, 2925 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTP--vector---Floating-point-Round-to-Integral--toward-Plus-infinity--vector--?lang=en 2926 VFRINTP: {u: 0, opcode: 0b11000, qAndSize: map[VectorArrangement]qAndSize{ 2927 VectorArrangement2D: {size: 0b11, q: 0b1}, 2928 VectorArrangement4S: {size: 0b10, q: 0b1}, 2929 VectorArrangement2S: {size: 0b10, q: 0b0}, 2930 }}, 2931 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FRINTZ--vector---Floating-point-Round-to-Integral--toward-Zero--vector--?lang=en 2932 VFRINTZ: {u: 0, opcode: 0b11001, qAndSize: map[VectorArrangement]qAndSize{ 2933 VectorArrangement2D: {size: 0b11, q: 0b1}, 2934 VectorArrangement4S: {size: 0b10, q: 0b1}, 2935 VectorArrangement2S: {size: 0b10, q: 0b0}, 2936 }}, 2937 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CNT--Population-Count-per-byte-?lang=en 2938 VCNT: {u: 0b0, opcode: 0b00101, qAndSize: map[VectorArrangement]qAndSize{ 2939 VectorArrangement8B: {size: 0b00, q: 0b0}, 2940 VectorArrangement16B: {size: 0b00, q: 0b1}, 2941 }}, 2942 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/NEG--vector---Negate--vector--?lang=en 2943 VNEG: {u: 0b1, opcode: 0b01011, qAndSize: defaultQAndSize}, 2944 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ABS--Absolute-value--vector--?lang=en 2945 VABS: {u: 0b0, opcode: 0b01011, qAndSize: defaultQAndSize}, 2946 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/REV64--Reverse-elements-in-64-bit-doublewords--vector--?lang=en 2947 REV64: {u: 0b0, opcode: 0b00000, qAndSize: defaultQAndSize}, 2948 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/XTN--XTN2--Extract-Narrow-?lang=en 2949 XTN: {u: 0b0, opcode: 0b10010, qAndSize: map[VectorArrangement]qAndSize{ 2950 VectorArrangement2D: {q: 0, size: 0b10}, 2951 VectorArrangement4S: {q: 0, size: 0b01}, 2952 VectorArrangement8H: {q: 0, size: 0b00}, 2953 }}, 2954 SHLL: {u: 0b1, opcode: 0b10011, qAndSize: map[VectorArrangement]qAndSize{ 2955 VectorArrangement8B: {q: 0b00, size: 0b00}, 2956 VectorArrangement4H: {q: 0b00, size: 0b01}, 2957 VectorArrangement2S: {q: 0b00, size: 0b10}, 2958 }}, 2959 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMEQ--zero---Compare-bitwise-Equal-to-zero--vector--?lang=en 2960 CMEQZERO: {u: 0b0, opcode: 0b01001, qAndSize: defaultQAndSize}, 2961 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SADDLP--Signed-Add-Long-Pairwise-?lang=en 2962 SADDLP: {u: 0b0, opcode: 0b00010, qAndSize: defaultQAndSize}, 2963 // 
https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UADDLP--Unsigned-Add-Long-Pairwise-?lang=en 2964 UADDLP: {u: 0b1, opcode: 0b00010, qAndSize: defaultQAndSize}, 2965 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTZS--vector--integer---Floating-point-Convert-to-Signed-integer--rounding-toward-Zero--vector--?lang=en 2966 VFCVTZS: {u: 0b0, opcode: 0b11011, qAndSize: map[VectorArrangement]qAndSize{ 2967 VectorArrangement4S: {size: 0b10, q: 0b1}, 2968 VectorArrangement2S: {size: 0b10, q: 0b0}, 2969 VectorArrangement2D: {size: 0b11, q: 0b1}, 2970 }}, 2971 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTZU--vector--integer---Floating-point-Convert-to-Unsigned-integer--rounding-toward-Zero--vector--?lang=en 2972 VFCVTZU: {u: 0b1, opcode: 0b11011, qAndSize: map[VectorArrangement]qAndSize{ 2973 VectorArrangement4S: {size: 0b10, q: 0b1}, 2974 VectorArrangement2S: {size: 0b10, q: 0b0}, 2975 VectorArrangement2D: {size: 0b11, q: 0b1}, 2976 }}, 2977 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQXTN--SQXTN2--Signed-saturating-extract-Narrow-?lang=en 2978 SQXTN: {u: 0b0, opcode: 0b10100, qAndSize: map[VectorArrangement]qAndSize{ 2979 VectorArrangement8B: {q: 0b0, size: 0b00}, 2980 VectorArrangement4H: {q: 0b0, size: 0b01}, 2981 VectorArrangement2S: {q: 0b0, size: 0b10}, 2982 }}, 2983 2984 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQXTN--SQXTN2--Signed-saturating-extract-Narrow-?lang=en 2985 SQXTN2: {u: 0b0, opcode: 0b10100, qAndSize: map[VectorArrangement]qAndSize{ 2986 VectorArrangement16B: {q: 0b1, size: 0b00}, 2987 VectorArrangement8H: {q: 0b1, size: 0b01}, 2988 VectorArrangement4S: {q: 0b1, size: 0b10}, 2989 }}, 2990 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UQXTN--UQXTN2--Unsigned-saturating-extract-Narrow-?lang=en 2991 UQXTN: {u: 0b1, opcode: 0b10100, qAndSize: defaultQAndSize}, 2992 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQXTUN--SQXTUN2--Signed-saturating-extract-Unsigned-Narrow-?lang=en 2993 SQXTUN: {u: 0b1, opcode: 0b10010, qAndSize: map[VectorArrangement]qAndSize{ 2994 VectorArrangement8B: {q: 0b0, size: 0b00}, 2995 VectorArrangement4H: {q: 0b0, size: 0b01}, 2996 VectorArrangement2S: {q: 0b0, size: 0b10}, 2997 }}, 2998 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQXTUN--SQXTUN2--Signed-saturating-extract-Unsigned-Narrow-?lang=en 2999 SQXTUN2: {u: 0b1, opcode: 0b10010, qAndSize: map[VectorArrangement]qAndSize{ 3000 VectorArrangement16B: {q: 0b1, size: 0b00}, 3001 VectorArrangement8H: {q: 0b1, size: 0b01}, 3002 VectorArrangement4S: {q: 0b1, size: 0b10}, 3003 }}, 3004 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SCVTF--vector--integer---Signed-integer-Convert-to-Floating-point--vector--?lang=en 3005 VSCVTF: {u: 0b0, opcode: 0b11101, qAndSize: map[VectorArrangement]qAndSize{ 3006 VectorArrangement2D: {q: 0b1, size: 0b01}, 3007 VectorArrangement4S: {q: 0b1, size: 0b00}, 3008 VectorArrangement2S: {q: 0b0, size: 0b00}, 3009 }}, 3010 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UCVTF--vector--integer---Unsigned-integer-Convert-to-Floating-point--vector--?lang=en 3011 VUCVTF: {u: 0b1, opcode: 0b11101, qAndSize: map[VectorArrangement]qAndSize{ 3012 VectorArrangement2D: {q: 0b1, size: 0b01}, 3013 VectorArrangement4S: {q: 0b1, size: 0b00}, 3014 
VectorArrangement2S: {q: 0b0, size: 0b00}, 3015 }}, 3016 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTL--FCVTL2--Floating-point-Convert-to-higher-precision-Long--vector--?lang=en 3017 FCVTL: {u: 0b0, opcode: 0b10111, qAndSize: map[VectorArrangement]qAndSize{ 3018 VectorArrangement2S: {size: 0b01, q: 0b0}, 3019 VectorArrangement4H: {size: 0b00, q: 0b0}, 3020 }}, 3021 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCVTN--FCVTN2--Floating-point-Convert-to-lower-precision-Narrow--vector--?lang=en 3022 FCVTN: {u: 0b0, opcode: 0b10110, qAndSize: map[VectorArrangement]qAndSize{ 3023 VectorArrangement2S: {size: 0b01, q: 0b0}, 3024 VectorArrangement4H: {size: 0b00, q: 0b0}, 3025 }}, 3026 } 3027 3028 // advancedSIMDThreeDifferent holds information to encode instructions as "Advanced SIMD three different" in 3029 // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en 3030 var advancedSIMDThreeDifferent = map[asm.Instruction]struct { 3031 qAndSize map[VectorArrangement]qAndSize 3032 u, opcode byte 3033 }{ 3034 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMLAL--UMLAL2--vector---Unsigned-Multiply-Add-Long--vector--?lang=en 3035 VUMLAL: {u: 0b1, opcode: 0b1000, qAndSize: map[VectorArrangement]qAndSize{ 3036 VectorArrangement2S: {q: 0b0, size: 0b10}, 3037 VectorArrangement4H: {q: 0b0, size: 0b01}, 3038 VectorArrangement8B: {q: 0b0, size: 0b00}, 3039 }}, 3040 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMULL--SMULL2--vector---Signed-Multiply-Long--vector--?lang=en 3041 SMULL: {u: 0b0, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{ 3042 VectorArrangement8B: {q: 0b0, size: 0b00}, 3043 VectorArrangement4H: {q: 0b0, size: 0b01}, 3044 VectorArrangement2S: {q: 0b0, size: 0b10}, 3045 }}, 3046 // https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMULL--SMULL2--vector---Signed-Multiply-Long--vector--?lang=en 3047 SMULL2: {u: 0b0, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{ 3048 VectorArrangement16B: {q: 0b1, size: 0b00}, 3049 VectorArrangement8H: {q: 0b1, size: 0b01}, 3050 VectorArrangement4S: {q: 0b1, size: 0b10}, 3051 }}, 3052 // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en 3053 UMULL: {u: 0b1, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{ 3054 VectorArrangement8B: {q: 0b0, size: 0b00}, 3055 VectorArrangement4H: {q: 0b0, size: 0b01}, 3056 VectorArrangement2S: {q: 0b0, size: 0b10}, 3057 }}, 3058 // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en 3059 UMULL2: {u: 0b1, opcode: 0b1100, qAndSize: map[VectorArrangement]qAndSize{ 3060 VectorArrangement16B: {q: 0b1, size: 0b00}, 3061 VectorArrangement8H: {q: 0b1, size: 0b01}, 3062 VectorArrangement4S: {q: 0b1, size: 0b10}, 3063 }}, 3064 } 3065 3066 // advancedSIMDThreeSame holds information to encode instructions as "Advanced SIMD three same" in 3067 // https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en 3068 var advancedSIMDThreeSame = map[asm.Instruction]struct { 3069 qAndSize map[VectorArrangement]qAndSize 3070 u, opcode byte 3071 }{ 3072 // 
https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/AND--vector---Bitwise-AND--vector--?lang=en
	VAND: {
		u: 0b0, opcode: 0b00011,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement16B: {size: 0b00, q: 0b1},
			VectorArrangement8B:  {size: 0b00, q: 0b0},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/BSL--Bitwise-Select-?lang=en
	BSL: {
		u: 0b1, opcode: 0b00011,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement16B: {size: 0b01, q: 0b1},
			VectorArrangement8B:  {size: 0b01, q: 0b0},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/EOR--vector---Bitwise-Exclusive-OR--vector--?lang=en
	EOR: {
		u: 0b1, opcode: 0b00011,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement16B: {size: 0b00, q: 0b1},
			VectorArrangement8B:  {size: 0b00, q: 0b0},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ORR--vector--register---Bitwise-inclusive-OR--vector--register--?lang=en
	VORR: {
		u: 0b0, opcode: 0b00011,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement16B: {size: 0b10, q: 0b1},
			VectorArrangement8B:  {size: 0b10, q: 0b0},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/BIC--vector--register---Bitwise-bit-Clear--vector--register--?lang=en
	BIC: {
		u: 0b0, opcode: 0b00011,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement16B: {size: 0b01, q: 0b1},
			VectorArrangement8B:  {size: 0b01, q: 0b0},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FADD--vector---Floating-point-Add--vector--?lang=en
	VFADDS: {
		u: 0b0, opcode: 0b11010,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement4S: {size: 0b00, q: 0b1},
			VectorArrangement2S: {size: 0b00, q: 0b0},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FADD--vector---Floating-point-Add--vector--?lang=en
	VFADDD: {
		u: 0b0, opcode: 0b11010,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement2D: {size: 0b01, q: 0b1},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FSUB--vector---Floating-point-Subtract--vector--?lang=en
	VFSUBS: {
		u: 0b0, opcode: 0b11010,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement4S: {size: 0b10, q: 0b1},
			VectorArrangement2S: {size: 0b10, q: 0b0},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FSUB--vector---Floating-point-Subtract--vector--?lang=en
	VFSUBD: {
		u: 0b0, opcode: 0b11010,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement2D: {size: 0b11, q: 0b1},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMAXP--Unsigned-Maximum-Pairwise-?lang=en
	UMAXP: {u: 0b1, opcode: 0b10100, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMEQ--register---Compare-bitwise-Equal--vector--?lang=en
	CMEQ: {u: 0b1, opcode: 0b10001, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/ADDP--vector-
	VADDP: {u: 0b0, opcode: 0b10111, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADD--vector---Add--vector--?lang=en
	VADD: {u: 0b0, opcode: 0b10000, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SUB--vector---Subtract--vector--?lang=en
	VSUB: {u: 0b1, opcode: 0b10000, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHL--Signed-Shift-Left--register--?lang=en
	SSHL: {u: 0b0, opcode: 0b01000, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHL--Signed-Shift-Left--register--?lang=en
	USHL: {u: 0b1, opcode: 0b01000, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMGT--register---Compare-signed-Greater-than--vector--?lang=en
	CMGT: {u: 0b0, opcode: 0b00110, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMHI--register---Compare-unsigned-Higher--vector--?lang=en
	CMHI: {u: 0b1, opcode: 0b00110, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMGE--register---Compare-signed-Greater-than-or-Equal--vector--?lang=en
	CMGE: {u: 0b0, opcode: 0b00111, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/CMHS--register---Compare-unsigned-Higher-or-Same--vector--?lang=en
	CMHS: {u: 0b1, opcode: 0b00111, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCMEQ--register---Floating-point-Compare-Equal--vector--?lang=en
	FCMEQ: {
		u: 0b0, opcode: 0b11100,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement4S: {size: 0b00, q: 0b1},
			VectorArrangement2S: {size: 0b00, q: 0b0},
			VectorArrangement2D: {size: 0b01, q: 0b1},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCMGT--register---Floating-point-Compare-Greater-than--vector--?lang=en
	FCMGT: {
		u: 0b1, opcode: 0b11100,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement4S: {size: 0b10, q: 0b1},
			VectorArrangement2S: {size: 0b10, q: 0b0},
			VectorArrangement2D: {size: 0b11, q: 0b1},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FCMGE--register---Floating-point-Compare-Greater-than-or-Equal--vector--?lang=en
	FCMGE: {
		u: 0b1, opcode: 0b11100,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement4S: {size: 0b00, q: 0b1},
			VectorArrangement2S: {size: 0b00, q: 0b0},
			VectorArrangement2D: {size: 0b01, q: 0b1},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FMIN--vector---Floating-point-minimum--vector--?lang=en
	VFMIN: {
		u: 0b0, opcode: 0b11110,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement4S: {size: 0b10, q: 0b1},
			VectorArrangement2S: {size: 0b10, q: 0b0},
			VectorArrangement2D: {size: 0b11, q: 0b1},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FMAX--vector---Floating-point-Maximum--vector--?lang=en
	VFMAX: {
		u: 0b0, opcode: 0b11110,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement4S: {size: 0b00, q: 0b1},
			VectorArrangement2S: {size: 0b00, q: 0b0},
			VectorArrangement2D: {size: 0b01, q: 0b1},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FMUL--vector---Floating-point-Multiply--vector--?lang=en
	VFMUL: {
		u: 0b1, opcode: 0b11011,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement4S: {size: 0b00, q: 0b1},
			VectorArrangement2S: {size: 0b00, q: 0b0},
			VectorArrangement2D: {size: 0b01, q: 0b1},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/FDIV--vector---Floating-point-Divide--vector--?lang=en
	VFDIV: {
		u: 0b1, opcode: 0b11111,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement4S: {size: 0b00, q: 0b1},
			VectorArrangement2S: {size: 0b00, q: 0b0},
			VectorArrangement2D: {size: 0b01, q: 0b1},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/MUL--vector---Multiply--vector--?lang=en
	VMUL: {u: 0b0, opcode: 0b10011, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQADD--Signed-saturating-Add-?lang=en
	VSQADD: {u: 0b0, opcode: 0b00001, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UQADD--Unsigned-saturating-Add-?lang=en
	VUQADD: {u: 0b1, opcode: 0b00001, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMIN--Signed-Minimum--vector--?lang=en
	SMIN: {u: 0b0, opcode: 0b01101, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMAX--Signed-Maximum--vector--?lang=en
	SMAX: {u: 0b0, opcode: 0b01100, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMIN--Unsigned-Minimum--vector--?lang=en
	UMIN: {u: 0b1, opcode: 0b01101, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMAX--Unsigned-Maximum--vector--?lang=en
	UMAX: {u: 0b1, opcode: 0b01100, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/URHADD--Unsigned-Rounding-Halving-Add-?lang=en
	URHADD: {u: 0b1, opcode: 0b00010, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SQSUB--Signed-saturating-Subtract-?lang=en
	VSQSUB: {u: 0b0, opcode: 0b00101, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UQSUB--Unsigned-saturating-Subtract-?lang=en
	VUQSUB: {u: 0b1, opcode: 0b00101, qAndSize: defaultQAndSize},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/BIT--Bitwise-Insert-if-True-?lang=en
	VBIT: {u: 0b1, opcode: 0b00011, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement8B:  {q: 0b0, size: 0b10},
		VectorArrangement16B: {q: 0b1, size: 0b10},
	}},
	SQRDMULH: {u: 0b1, opcode: 0b10110, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement4H: {q: 0b0, size: 0b01},
		VectorArrangement8H: {q: 0b1, size: 0b01},
		VectorArrangement2S: {q: 0b0, size: 0b10},
		VectorArrangement4S: {q: 0b1, size: 0b10},
	}},
}
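
// Worked example for illustration: an entry in the table above, combined with
// encodeAdvancedSIMDThreeSame defined later in this file, fully determines a
// "three same" instruction word. Assuming ADD V0.16B, V1.16B, V2.16B
// (Rd=0, Rn=1, Rm=2), VADD contributes u=0 and opcode=0b10000, and
// defaultQAndSize maps 16B to q=1/size=0b00, so the emitted word is
//
//	0|Q=1|U=0|01110|size=00|1|Rm=00010|opcode=10000|1|Rn=00001|Rd=00000 = 0x4e228420.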

// qAndSize is a pair of "Q" and "size" that appear in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
type qAndSize struct{ q, size byte }

// defaultQAndSize maps a vector arrangement to the default qAndSize which is encoded by many instructions.
var defaultQAndSize = map[VectorArrangement]qAndSize{
	VectorArrangement8B:  {size: 0b00, q: 0b0},
	VectorArrangement16B: {size: 0b00, q: 0b1},
	VectorArrangement4H:  {size: 0b01, q: 0b0},
	VectorArrangement8H:  {size: 0b01, q: 0b1},
	VectorArrangement2S:  {size: 0b10, q: 0b0},
	VectorArrangement4S:  {size: 0b10, q: 0b1},
	VectorArrangement1D:  {size: 0b11, q: 0b0},
	VectorArrangement2D:  {size: 0b11, q: 0b1},
}

// advancedSIMDAcrossLanes holds information to encode instructions as "Advanced SIMD across lanes" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
var advancedSIMDAcrossLanes = map[asm.Instruction]struct {
	qAndSize  map[VectorArrangement]qAndSize
	u, opcode byte
}{
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADDV--Add-across-Vector-?lang=en
	ADDV: {
		u: 0b0, opcode: 0b11011,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement16B: {size: 0b00, q: 0b1},
			VectorArrangement8B:  {size: 0b00, q: 0b0},
			VectorArrangement8H:  {size: 0b01, q: 0b1},
			VectorArrangement4H:  {size: 0b01, q: 0b0},
			VectorArrangement4S:  {size: 0b10, q: 0b1},
		},
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMINV--Unsigned-Minimum-across-Vector-?lang=en
	UMINV: {
		u: 0b1, opcode: 0b11010,
		qAndSize: map[VectorArrangement]qAndSize{
			VectorArrangement16B: {size: 0b00, q: 0b1},
			VectorArrangement8B:  {size: 0b00, q: 0b0},
			VectorArrangement8H:  {size: 0b01, q: 0b1},
			VectorArrangement4H:  {size: 0b01, q: 0b0},
			VectorArrangement4S:  {size: 0b10, q: 0b1},
		},
	},
	UADDLV: {u: 0b1, opcode: 0b00011, qAndSize: map[VectorArrangement]qAndSize{
		VectorArrangement16B: {size: 0b00, q: 0b1},
		VectorArrangement8B:  {size: 0b00, q: 0b0},
		VectorArrangement8H:  {size: 0b01, q: 0b1},
		VectorArrangement4H:  {size: 0b01, q: 0b0},
		VectorArrangement4S:  {size: 0b10, q: 0b1},
	}},
}

// advancedSIMDScalarPairwise holds information to encode instructions as "Advanced SIMD scalar pairwise" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
var advancedSIMDScalarPairwise = map[asm.Instruction]struct {
	size      map[VectorArrangement]byte
	u, opcode byte
}{
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/ADDP--scalar---Add-Pair-of-elements--scalar--?lang=en
	ADDP: {u: 0b0, opcode: 0b11011, size: map[VectorArrangement]byte{VectorArrangement2D: 0b11}},
}
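
// Note for illustration: across these tables, "size" selects the lane width
// (0b00=byte, 0b01=halfword, 0b10=word, 0b11=doubleword) and "q" selects the
// vector register width. For example, VectorArrangement8H (eight 16-bit lanes
// in a 128-bit register) maps to {size: 0b01, q: 0b1}, while the 64-bit 4H
// form keeps size=0b01 and clears q.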

// advancedSIMDCopy holds information to encode instructions as "Advanced SIMD copy" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
var advancedSIMDCopy = map[asm.Instruction]struct {
	// TODO: extract common implementation of resolver.
	resolver func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error)
	op       byte
}{
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/DUP--element---Duplicate-vector-element-to-vector-or-scalar-?lang=en
	DUPELEM: {op: 0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
		imm4 = 0b0000
		q = 0b1

		switch arr {
		case VectorArrangementB:
			imm5 |= 0b1
			imm5 |= byte(srcIndex) << 1
		case VectorArrangementH:
			imm5 |= 0b10
			imm5 |= byte(srcIndex) << 2
		case VectorArrangementS:
			imm5 |= 0b100
			imm5 |= byte(srcIndex) << 3
		case VectorArrangementD:
			imm5 |= 0b1000
			imm5 |= byte(srcIndex) << 4
		default:
			err = fmt.Errorf("unsupported arrangement for DUPELEM: %s", arr)
		}

		return
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/DUP--general---Duplicate-general-purpose-register-to-vector-?lang=en
	DUPGEN: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
		imm4 = 0b0001
		switch arr {
		case VectorArrangement8B:
			imm5 = 0b1
		case VectorArrangement16B:
			imm5 = 0b1
			q = 0b1
		case VectorArrangement4H:
			imm5 = 0b10
		case VectorArrangement8H:
			imm5 = 0b10
			q = 0b1
		case VectorArrangement2S:
			imm5 = 0b100
		case VectorArrangement4S:
			imm5 = 0b100
			q = 0b1
		case VectorArrangement2D:
			imm5 = 0b1000
			q = 0b1
		default:
			err = fmt.Errorf("unsupported arrangement for DUPGEN: %s", arr)
		}
		return
	}},
	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/INS--general---Insert-vector-element-from-general-purpose-register-?lang=en
	INSGEN: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
		imm4, q = 0b0011, 0b1
		switch arr {
		case VectorArrangementB:
			imm5 |= 0b1
			imm5 |= byte(dstIndex) << 1
		case VectorArrangementH:
			imm5 |= 0b10
			imm5 |= byte(dstIndex) << 2
		case VectorArrangementS:
			imm5 |= 0b100
			imm5 |= byte(dstIndex) << 3
		case VectorArrangementD:
			imm5 |= 0b1000
			imm5 |= byte(dstIndex) << 4
		default:
			err = fmt.Errorf("unsupported arrangement for INSGEN: %s", arr)
		}
		return
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/UMOV--Unsigned-Move-vector-element-to-general-purpose-register-?lang=en
	UMOV: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
		imm4 = 0b0111
		switch arr {
		case VectorArrangementB:
			imm5 |= 0b1
			imm5 |= byte(srcIndex) << 1
		case VectorArrangementH:
			imm5 |= 0b10
			imm5 |= byte(srcIndex) << 2
		case VectorArrangementS:
			imm5 |= 0b100
			imm5 |= byte(srcIndex) << 3
		case VectorArrangementD:
			imm5 |= 0b1000
			imm5 |= byte(srcIndex) << 4
			q = 0b1
		default:
			err = fmt.Errorf("unsupported arrangement for UMOV: %s", arr)
		}
		return
	}},

	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SMOV--Signed-Move-vector-element-to-general-purpose-register-?lang=en
	SMOV32: {op: 0b0, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
		imm4 = 0b0101
		switch arr {
		case VectorArrangementB:
			imm5 |= 0b1
			imm5 |= byte(srcIndex) << 1
		case VectorArrangementH:
			imm5 |= 0b10
			imm5 |= byte(srcIndex) << 2
		default:
			err = fmt.Errorf("unsupported arrangement for SMOV32: %s", arr)
		}
		return
	}},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/INS--element---Insert-vector-element-from-another-vector-element-?lang=en
	INSELEM: {op: 0b1, resolver: func(srcIndex, dstIndex VectorIndex, arr VectorArrangement) (imm5, imm4, q byte, err error) {
		q = 0b1
		switch arr {
		case VectorArrangementB:
			imm5 |= 0b1
			imm5 |= byte(dstIndex) << 1
			imm4 = byte(srcIndex)
		case VectorArrangementH:
			imm5 |= 0b10
			imm5 |= byte(dstIndex) << 2
			imm4 = byte(srcIndex) << 1
		case VectorArrangementS:
			imm5 |= 0b100
			imm5 |= byte(dstIndex) << 3
			imm4 = byte(srcIndex) << 2
		case VectorArrangementD:
			imm5 |= 0b1000
			imm5 |= byte(dstIndex) << 4
			imm4 = byte(srcIndex) << 3
		default:
			err = fmt.Errorf("unsupported arrangement for INSELEM: %s", arr)
		}
		return
	}},
}

// advancedSIMDTableLookup holds information to encode instructions as "Advanced SIMD table lookup" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
var advancedSIMDTableLookup = map[asm.Instruction]struct {
	q            map[VectorArrangement]byte
	op, op2, Len byte
}{
	TBL1: {op: 0, op2: 0, Len: 0b00, q: map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8B: 0b0}},
	TBL2: {op: 0, op2: 0, Len: 0b01, q: map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8B: 0b0}},
}
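
// Note for illustration: in the table-lookup encoding, "Len" holds the number
// of table registers minus one, so TBL1 (a one-register table) uses Len=0b00
// and TBL2 (a two-register table) uses Len=0b01, while q picks between the 8B
// and 16B forms of the index/result vectors.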

// advancedSIMDShiftByImmediate holds information to encode instructions as "Advanced SIMD shift by immediate" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
var advancedSIMDShiftByImmediate = map[asm.Instruction]struct {
	q           map[VectorArrangement]byte
	immResolver func(shiftAmount int64, arr VectorArrangement) (immh, immb byte, err error)
	U, opcode   byte
}{
	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SSHLL--SSHLL2--Signed-Shift-Left-Long--immediate--
	SSHLL: {
		U: 0b0, opcode: 0b10100,
		q:           map[VectorArrangement]byte{VectorArrangement8B: 0b0, VectorArrangement4H: 0b0, VectorArrangement2S: 0b0},
		immResolver: immResolverForSIMDSiftLeftByImmediate,
	},
	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/SSHLL--SSHLL2--Signed-Shift-Left-Long--immediate--
	SSHLL2: {
		U: 0b0, opcode: 0b10100,
		q:           map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8H: 0b1, VectorArrangement4S: 0b1},
		immResolver: immResolverForSIMDSiftLeftByImmediate,
	},
	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/USHLL--USHLL2--Unsigned-Shift-Left-Long--immediate--
	USHLL: {
		U: 0b1, opcode: 0b10100,
		q:           map[VectorArrangement]byte{VectorArrangement8B: 0b0, VectorArrangement4H: 0b0, VectorArrangement2S: 0b0},
		immResolver: immResolverForSIMDSiftLeftByImmediate,
	},
	// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/USHLL--USHLL2--Unsigned-Shift-Left-Long--immediate--
	USHLL2: {
		U: 0b1, opcode: 0b10100,
		q:           map[VectorArrangement]byte{VectorArrangement16B: 0b1, VectorArrangement8H: 0b1, VectorArrangement4S: 0b1},
		immResolver: immResolverForSIMDSiftLeftByImmediate,
	},
	// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/SSHR--Signed-Shift-Right--immediate--?lang=en
	SSHR: {
		U: 0b0, opcode: 0b00000,
		q: map[VectorArrangement]byte{
			VectorArrangement16B: 0b1, VectorArrangement8H: 0b1, VectorArrangement4S: 0b1, VectorArrangement2D: 0b1,
			VectorArrangement8B: 0b0, VectorArrangement4H: 0b0, VectorArrangement2S: 0b0,
		},
		immResolver: func(shiftAmount int64, arr VectorArrangement) (immh, immb byte, err error) {
			switch arr {
			case VectorArrangement16B, VectorArrangement8B:
				immh = 0b0001
				immb = 8 - byte(shiftAmount&0b111)
			case VectorArrangement8H, VectorArrangement4H:
				v := 16 - byte(shiftAmount&0b1111)
				immb = v & 0b111
				immh = 0b0010 | (v >> 3)
			case VectorArrangement4S, VectorArrangement2S:
				v := 32 - byte(shiftAmount&0b11111)
				immb = v & 0b111
				immh = 0b0100 | (v >> 3)
			case VectorArrangement2D:
				v := 64 - byte(shiftAmount&0b111111)
				immb = v & 0b111
				immh = 0b1000 | (v >> 3)
			default:
				err = fmt.Errorf("unsupported arrangement %s", arr)
			}
			return
		},
	},
}

// advancedSIMDPermute holds information to encode instructions as "Advanced SIMD permute" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
var advancedSIMDPermute = map[asm.Instruction]struct {
	opcode byte
}{
	ZIP1: {opcode: 0b011},
}

func immResolverForSIMDSiftLeftByImmediate(shiftAmount int64, arr VectorArrangement) (immh, immb byte, err error) {
	switch arr {
	case VectorArrangement16B, VectorArrangement8B:
		immb = byte(shiftAmount)
		immh = 0b0001
	case VectorArrangement8H, VectorArrangement4H:
		immb = byte(shiftAmount) & 0b111
		immh = 0b0010 | byte(shiftAmount>>3)
	case VectorArrangement4S, VectorArrangement2S:
		immb = byte(shiftAmount) & 0b111
		immh = 0b0100 | byte(shiftAmount>>3)
	default:
		err = fmt.Errorf("unsupported arrangement %s", arr)
	}
	return
}

// encodeAdvancedSIMDCopy encodes instruction as "Advanced SIMD copy" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
func (a *AssemblerImpl) encodeAdvancedSIMDCopy(buf asm.Buffer, srcRegBits, dstRegBits, op, imm5, imm4, q byte) {
	buf.Append4Bytes(
		(srcRegBits<<5)|dstRegBits,
		imm4<<3|0b1<<2|srcRegBits>>3,
		imm5,
		q<<6|op<<5|0b1110,
	)
}

// encodeAdvancedSIMDThreeSame encodes instruction as "Advanced SIMD three same" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
func (a *AssemblerImpl) encodeAdvancedSIMDThreeSame(buf asm.Buffer, src1, src2, dst, opcode, size, q, u byte) {
	buf.Append4Bytes(
		(src2<<5)|dst,
		opcode<<3|1<<2|src2>>3,
		size<<6|0b1<<5|src1,
		q<<6|u<<5|0b1110,
	)
}
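
// Layout note for illustration: Append4Bytes emits the 32-bit word in
// little-endian order. In the two encoders above, the first byte carries bits
// 7..0 of the word (Rd plus the low bits of the second register field), and
// the final byte carries bits 31..24, i.e. the Q and U (or op) flags followed
// by the fixed 0b1110 pattern that marks the Advanced SIMD encoding group.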

// encodeAdvancedSIMDThreeDifferent encodes instruction as "Advanced SIMD three different" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
func (a *AssemblerImpl) encodeAdvancedSIMDThreeDifferent(buf asm.Buffer, src1, src2, dst, opcode, size, q, u byte) {
	buf.Append4Bytes(
		(src2<<5)|dst,
		opcode<<4|src2>>3,
		size<<6|0b1<<5|src1,
		q<<6|u<<5|0b1110,
	)
}

// encodeAdvancedSIMDPermute encodes instruction as "Advanced SIMD permute" in
// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
func (a *AssemblerImpl) encodeAdvancedSIMDPermute(buf asm.Buffer, src1, src2, dst, opcode, size, q byte) {
	buf.Append4Bytes(
		(src2<<5)|dst,
		opcode<<4|0b1<<3|src2>>3,
		size<<6|src1,
		q<<6|0b1110,
	)
}
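
// Encoding note for illustration: "three different" differs from "three same"
// only in the opcode field of the second emitted byte: here the opcode is four
// bits wide at word bits 15..12 with bits 11..10 cleared, whereas "three same"
// packs a five-bit opcode at bits 15..11 with bit 10 set.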

func (a *AssemblerImpl) encodeVectorRegisterToVectorRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	var srcVectorRegBits byte
	if n.srcReg != RegRZR {
		srcVectorRegBits, err = vectorRegisterBits(n.srcReg)
	} else if n.instruction == CMEQZERO {
		// CMEQZERO has RegRZR as the src, and we apply the instruction to the same register as the destination.
		srcVectorRegBits, err = vectorRegisterBits(n.dstReg)
	}

	if err != nil {
		return err
	}

	dstVectorRegBits, err := vectorRegisterBits(n.dstReg)
	if err != nil {
		return err
	}

	if simdCopy, ok := advancedSIMDCopy[n.instruction]; ok {
		imm5, imm4, q, err := simdCopy.resolver(n.srcVectorIndex, n.dstVectorIndex, n.vectorArrangement)
		if err != nil {
			return err
		}
		a.encodeAdvancedSIMDCopy(buf, srcVectorRegBits, dstVectorRegBits, simdCopy.op, imm5, imm4, q)
		return nil
	}

	if scalarPairwise, ok := advancedSIMDScalarPairwise[n.instruction]; ok {
		// See "Advanced SIMD scalar pairwise" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
		size, ok := scalarPairwise.size[n.vectorArrangement]
		if !ok {
			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
		}
		buf.Append4Bytes(
			(srcVectorRegBits<<5)|dstVectorRegBits,
			scalarPairwise.opcode<<4|1<<3|srcVectorRegBits>>3,
			size<<6|0b11<<4|scalarPairwise.opcode>>4,
			0b1<<6|scalarPairwise.u<<5|0b11110,
		)
		return
	}

	if twoRegMisc, ok := advancedSIMDTwoRegisterMisc[n.instruction]; ok {
		// See "Advanced SIMD two-register miscellaneous" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
		qs, ok := twoRegMisc.qAndSize[n.vectorArrangement]
		if !ok {
			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
		}
		buf.Append4Bytes(
			(srcVectorRegBits<<5)|dstVectorRegBits,
			twoRegMisc.opcode<<4|0b1<<3|srcVectorRegBits>>3,
			qs.size<<6|0b1<<5|twoRegMisc.opcode>>4,
			qs.q<<6|twoRegMisc.u<<5|0b01110,
		)
		return nil
	}

	if threeSame, ok := advancedSIMDThreeSame[n.instruction]; ok {
		qs, ok := threeSame.qAndSize[n.vectorArrangement]
		if !ok {
			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
		}
		a.encodeAdvancedSIMDThreeSame(buf, srcVectorRegBits, dstVectorRegBits, dstVectorRegBits, threeSame.opcode, qs.size, qs.q, threeSame.u)
		return nil
	}

	if threeDifferent, ok := advancedSIMDThreeDifferent[n.instruction]; ok {
		qs, ok := threeDifferent.qAndSize[n.vectorArrangement]
		if !ok {
			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
		}
		a.encodeAdvancedSIMDThreeDifferent(buf, srcVectorRegBits, dstVectorRegBits, dstVectorRegBits, threeDifferent.opcode, qs.size, qs.q, threeDifferent.u)
		return nil
	}

	if acrossLanes, ok := advancedSIMDAcrossLanes[n.instruction]; ok {
		// See "Advanced SIMD across lanes" in
		// https://developer.arm.com/documentation/ddi0596/2021-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en
		qs, ok := acrossLanes.qAndSize[n.vectorArrangement]
		if !ok {
			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
		}
		buf.Append4Bytes(
			(srcVectorRegBits<<5)|dstVectorRegBits,
			acrossLanes.opcode<<4|0b1<<3|srcVectorRegBits>>3,
			qs.size<<6|0b11000<<1|acrossLanes.opcode>>4,
			qs.q<<6|acrossLanes.u<<5|0b01110,
		)
		return nil
	}

	if lookup, ok := advancedSIMDTableLookup[n.instruction]; ok {
		q, ok := lookup.q[n.vectorArrangement]
		if !ok {
			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
		}
		buf.Append4Bytes(
			(srcVectorRegBits<<5)|dstVectorRegBits,
			lookup.Len<<5|lookup.op<<4|srcVectorRegBits>>3,
			lookup.op2<<6|dstVectorRegBits,
			q<<6|0b1110,
		)
		return
	}

	if shiftByImmediate, ok := advancedSIMDShiftByImmediate[n.instruction]; ok {
		immh, immb, err := shiftByImmediate.immResolver(n.srcConst, n.vectorArrangement)
		if err != nil {
			return err
		}

		q, ok := shiftByImmediate.q[n.vectorArrangement]
		if !ok {
			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
		}

		buf.Append4Bytes(
			(srcVectorRegBits<<5)|dstVectorRegBits,
			shiftByImmediate.opcode<<3|0b1<<2|srcVectorRegBits>>3,
			immh<<3|immb,
			q<<6|shiftByImmediate.U<<5|0b1111,
		)
		return nil
	}

	if permute, ok := advancedSIMDPermute[n.instruction]; ok {
		size, q := arrangementSizeQ(n.vectorArrangement)
		a.encodeAdvancedSIMDPermute(buf, srcVectorRegBits, dstVectorRegBits, dstVectorRegBits, permute.opcode, size, q)
		return
	}
	return errorEncodingUnsupported(n)
}
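
// Worked example for illustration: in the shift-by-immediate branch above,
// assuming SSHR by 3 on a 4S arrangement, the resolver computes
// v = 32 - 3 = 29 (0b11101), hence immb = 0b101 and immh = 0b0100|0b11 = 0b0111.
// immh:immb = 61 then satisfies the architectural rule
// shift = (esize*2) - UInt(immh:immb) = 64 - 61 = 3.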

func (a *AssemblerImpl) encodeTwoVectorRegistersToVectorRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	var srcRegBits, srcRegBits2, dstRegBits byte
	srcRegBits, err = vectorRegisterBits(n.srcReg)
	if err != nil {
		return err
	}

	srcRegBits2, err = vectorRegisterBits(n.srcReg2)
	if err != nil {
		return err
	}

	dstRegBits, err = vectorRegisterBits(n.dstReg)
	if err != nil {
		return err
	}

	if threeSame, ok := advancedSIMDThreeSame[n.instruction]; ok {
		qs, ok := threeSame.qAndSize[n.vectorArrangement]
		if !ok {
			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
		}
		a.encodeAdvancedSIMDThreeSame(buf, srcRegBits, srcRegBits2, dstRegBits, threeSame.opcode, qs.size, qs.q, threeSame.u)
		return nil
	}

	if threeDifferent, ok := advancedSIMDThreeDifferent[n.instruction]; ok {
		qs, ok := threeDifferent.qAndSize[n.vectorArrangement]
		if !ok {
			return fmt.Errorf("unsupported vector arrangement %s for %s", n.vectorArrangement, InstructionName(n.instruction))
		}
		a.encodeAdvancedSIMDThreeDifferent(buf, srcRegBits, srcRegBits2, dstRegBits, threeDifferent.opcode, qs.size, qs.q, threeDifferent.u)
		return nil
	}

	if permute, ok := advancedSIMDPermute[n.instruction]; ok {
		size, q := arrangementSizeQ(n.vectorArrangement)
		a.encodeAdvancedSIMDPermute(buf, srcRegBits, srcRegBits2, dstRegBits, permute.opcode, size, q)
		return
	}

	if n.instruction == EXT {
		// EXT is the only instruction in "Advanced SIMD extract", so inline the encoding here.
		// https://developer.arm.com/documentation/ddi0596/2021-12/SIMD-FP-Instructions/EXT--Extract-vector-from-pair-of-vectors-?lang=en
		var q, imm4 byte
		switch n.vectorArrangement {
		case VectorArrangement16B:
			imm4 = 0b1111 & byte(n.srcConst)
			q = 0b1
		case VectorArrangement8B:
			imm4 = 0b111 & byte(n.srcConst)
		default:
			return fmt.Errorf("invalid arrangement %s for EXT", n.vectorArrangement)
		}
		buf.Append4Bytes(
			(srcRegBits2<<5)|dstRegBits,
			imm4<<3|srcRegBits2>>3,
			srcRegBits,
			q<<6|0b101110,
		)
		return
	}
	// Reaching here means the instruction matched none of the tables above.
	return errorEncodingUnsupported(n)
}

func (a *AssemblerImpl) encodeVectorRegisterToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	if err = checkArrangementIndexPair(n.vectorArrangement, n.srcVectorIndex); err != nil {
		return
	}

	srcVecRegBits, err := vectorRegisterBits(n.srcReg)
	if err != nil {
		return err
	}

	dstRegBits, err := intRegisterBits(n.dstReg)
	if err != nil {
		return err
	}

	if simdCopy, ok := advancedSIMDCopy[n.instruction]; ok {
		imm5, imm4, q, err := simdCopy.resolver(n.srcVectorIndex, n.dstVectorIndex, n.vectorArrangement)
		if err != nil {
			return err
		}
		a.encodeAdvancedSIMDCopy(buf, srcVecRegBits, dstRegBits, simdCopy.op, imm5, imm4, q)
		return nil
	}
	return errorEncodingUnsupported(n)
}

func (a *AssemblerImpl) encodeRegisterToVectorRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	srcRegBits, err := intRegisterBits(n.srcReg)
	if err != nil {
		return err
	}

	dstVectorRegBits, err := vectorRegisterBits(n.dstReg)
	if err != nil {
		return err
	}

	if simdCopy, ok := advancedSIMDCopy[n.instruction]; ok {
		imm5, imm4, q, err := simdCopy.resolver(n.srcVectorIndex, n.dstVectorIndex, n.vectorArrangement)
		if err != nil {
			return err
		}
		a.encodeAdvancedSIMDCopy(buf, srcRegBits, dstVectorRegBits, simdCopy.op, imm5, imm4, q)
		return nil
	}
	return errorEncodingUnsupported(n)
}

var zeroRegisterBits byte = 0b11111

func isIntRegister(r asm.Register) bool {
	return RegR0 <= r && r <= RegSP
}

func isVectorRegister(r asm.Register) bool {
	return RegV0 <= r && r <= RegV31
}

func isConditionalRegister(r asm.Register) bool {
	return RegCondEQ <= r && r <= RegCondNV
}
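
// Note for illustration: these predicates assume the register constants are
// declared in contiguous ranges, so a register's bit pattern can be derived by
// plain subtraction in the helpers below; e.g. isVectorRegister(RegV3) holds
// because RegV0 <= RegV3 <= RegV31, and byte(RegV3-RegV0) == 3.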

func intRegisterBits(r asm.Register) (ret byte, err error) {
	if !isIntRegister(r) {
		err = fmt.Errorf("%s is not integer", RegisterName(r))
	} else if r == RegSP {
		// SP has the same bit representations as RegRZR.
		r = RegRZR
	}
	ret = byte(r - RegR0)
	return
}

func vectorRegisterBits(r asm.Register) (ret byte, err error) {
	if !isVectorRegister(r) {
		err = fmt.Errorf("%s is not vector", RegisterName(r))
	} else {
		ret = byte(r - RegV0)
	}
	return
}

func registerBits(r asm.Register) (ret byte) {
	if isIntRegister(r) {
		if r == RegSP {
			// SP has the same bit representations as RegRZR.
			r = RegRZR
		}
		ret = byte(r - RegR0)
	} else {
		ret = byte(r - RegV0)
	}
	return
}
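
// Usage note for illustration: registerBits assumes its argument was already
// validated by the caller. Assuming RegRZR sits at offset 31 from RegR0 (as
// zeroRegisterBits suggests), registerBits(RegSP) yields 0b11111, the same
// encoding as RZR, while intRegisterBits(RegV0) instead reports an error
// because V0 is not an integer register.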