package amd64

import (
	"encoding/binary"
	"errors"
	"fmt"
	"math"

	"github.com/wasilibs/wazerox/internal/asm"
)

// nodeImpl implements asm.Node for amd64.
type nodeImpl struct {
	// jumpTarget holds the target node in the linked list for the jump-kind instruction.
	jumpTarget *nodeImpl

	// prev and next hold the prev/next node from this node in the assembled linked list.
	prev, next *nodeImpl

	// forwardJumpOrigins holds all the nodes trying to jump into this node as a
	// singly linked list. In other words, all the nodes with .jumpTarget == this.
	forwardJumpOrigins *nodeImpl

	// staticConst holds the static constant operand referenced by this instruction, if any.
	staticConst *asm.StaticConst

	// dstConst is the constant operand (or memory displacement) on the destination side.
	dstConst asm.ConstantValue
	// offsetInBinary is the offset of this instruction within the assembled binary.
	offsetInBinary asm.NodeOffsetInBinary
	// srcConst is the constant operand (or memory displacement) on the source side.
	srcConst asm.ConstantValue
	// instruction is the instruction kind this node encodes.
	instruction asm.Instruction

	// readInstructionAddressBeforeTargetInstruction holds the instruction right before the target of
	// read instruction address instruction. See asm.assemblerBase.CompileReadInstructionAddress.
	readInstructionAddressBeforeTargetInstruction asm.Instruction
	// flag is the bitset of nodeFlag properties of this node.
	flag nodeFlag
	// types describes the source/destination operand combination of this instruction.
	types operandTypes
	// srcReg and dstReg are the source/destination (or memory base) registers.
	srcReg, dstReg asm.Register
	// srcMemIndex and dstMemIndex are index registers for memory operands (asm.NilRegister when unused).
	srcMemIndex, dstMemIndex asm.Register
	// srcMemScale and dstMemScale are the scale factors applied to the index registers.
	srcMemScale, dstMemScale byte
	// arg is an extra instruction-specific byte argument, set via the Compile*WithArg methods.
	arg byte

	// staticConstReferrersAdded true if this node is already added into AssemblerImpl.staticConstReferrers.
	// Only used when staticConst is not nil. Through re-assembly, we might end up adding multiple times which causes unnecessary
	// allocations, so we use this flag to do it once.
	staticConstReferrersAdded bool
}

// nodeFlag is a bitset of encoding-time properties of a nodeImpl.
type nodeFlag byte

const (
	// nodeFlagInitializedForEncoding is always set to indicate that node is already initialized. Notably, this is used to judge
	// whether a jump is backward or forward before encoding.
	nodeFlagInitializedForEncoding nodeFlag = 1 << iota
	// nodeFlagBackwardJump is set on jump nodes whose target was already initialized before this node (a backward jump).
	nodeFlagBackwardJump
	// nodeFlagShortForwardJump is only used by forward branch jumps, which means .jumpTarget != nil and
	// the target node is encoded after this node. It is set initially, meaning that we first Encode all the jumps with jumpTarget
	// as short jumps (i.e. relative signed 8-bit integer offset jump) and try to Encode as small as possible;
	// it is cleared in resolveRelativeForwardJump if the displacement turns out not to fit in 8 bits.
	nodeFlagShortForwardJump
	// nodeFlagLock indicates the encoded instruction should include the LOCK prefix.
	nodeFlagLock
)

// isInitializedForEncoding reports whether initializeNodesForEncoding has processed this node.
func (n *nodeImpl) isInitializedForEncoding() bool {
	return n.flag&nodeFlagInitializedForEncoding != 0
}

// isJumpNode reports whether this node jumps to another node.
func (n *nodeImpl) isJumpNode() bool {
	return n.jumpTarget != nil
}

// isBackwardJump reports whether this node jumps to a node encoded before it.
func (n *nodeImpl) isBackwardJump() bool {
	return n.isJumpNode() && (n.flag&nodeFlagBackwardJump != 0)
}

// isForwardJump reports whether this node jumps to a node encoded after it.
func (n *nodeImpl) isForwardJump() bool {
	return n.isJumpNode() && (n.flag&nodeFlagBackwardJump == 0)
}

// isForwardShortJump reports whether this node is a forward jump currently encoded in the short (8-bit offset) form.
func (n *nodeImpl) isForwardShortJump() bool {
	return n.isForwardJump() && n.flag&nodeFlagShortForwardJump != 0
}

// isLock reports whether the encoded instruction must carry the LOCK prefix.
func (n *nodeImpl) isLock() bool {
	return n.flag&nodeFlagLock != 0
}

// AssignJumpTarget implements asm.Node.AssignJumpTarget.
func (n *nodeImpl) AssignJumpTarget(target asm.Node) {
	n.jumpTarget = target.(*nodeImpl)
}

// AssignDestinationConstant implements asm.Node.AssignDestinationConstant.
func (n *nodeImpl) AssignDestinationConstant(value asm.ConstantValue) {
	n.dstConst = value
}

// AssignSourceConstant implements asm.Node.AssignSourceConstant.
func (n *nodeImpl) AssignSourceConstant(value asm.ConstantValue) {
	n.srcConst = value
}

// OffsetInBinary implements asm.Node.OffsetInBinary.
func (n *nodeImpl) OffsetInBinary() asm.NodeOffsetInBinary {
	return n.offsetInBinary
}

// String implements fmt.Stringer.
//
// This is for debugging purpose, and the format is almost the same as the AT&T assembly syntax,
// meaning that this should look like "INSTRUCTION ${from}, ${to}" where each operand
// might be embraced by '[]' to represent the memory location.
func (n *nodeImpl) String() (ret string) {
	instName := InstructionName(n.instruction)
	switch n.types {
	case operandTypesNoneToNone:
		ret = instName
	case operandTypesNoneToRegister:
		ret = fmt.Sprintf("%s %s", instName, RegisterName(n.dstReg))
	case operandTypesNoneToMemory:
		if n.dstMemIndex != asm.NilRegister {
			ret = fmt.Sprintf("%s [%s + 0x%x + %s*0x%x]", instName,
				RegisterName(n.dstReg), n.dstConst, RegisterName(n.dstMemIndex), n.dstMemScale)
		} else {
			ret = fmt.Sprintf("%s [%s + 0x%x]", instName, RegisterName(n.dstReg), n.dstConst)
		}
	case operandTypesNoneToBranch:
		ret = fmt.Sprintf("%s {%v}", instName, n.jumpTarget)
	case operandTypesRegisterToNone:
		ret = fmt.Sprintf("%s %s", instName, RegisterName(n.srcReg))
	case operandTypesRegisterToRegister:
		ret = fmt.Sprintf("%s %s, %s", instName, RegisterName(n.srcReg), RegisterName(n.dstReg))
	case operandTypesRegisterToMemory:
		if n.dstMemIndex != asm.NilRegister {
			ret = fmt.Sprintf("%s %s, [%s + 0x%x + %s*0x%x]", instName, RegisterName(n.srcReg),
				RegisterName(n.dstReg), n.dstConst, RegisterName(n.dstMemIndex), n.dstMemScale)
		} else {
			ret = fmt.Sprintf("%s %s, [%s + 0x%x]", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.dstConst)
		}
	case operandTypesRegisterToConst:
		ret = fmt.Sprintf("%s %s, 0x%x", instName, RegisterName(n.srcReg), n.dstConst)
	case operandTypesMemoryToRegister:
		if n.srcMemIndex != asm.NilRegister {
			ret = fmt.Sprintf("%s [%s + %#x + %s*%#x], %s", instName,
				RegisterName(n.srcReg), n.srcConst, RegisterName(n.srcMemIndex), n.srcMemScale, RegisterName(n.dstReg))
		} else {
			ret = fmt.Sprintf("%s [%s + 0x%x], %s",
				instName, RegisterName(n.srcReg), n.srcConst, RegisterName(n.dstReg))
		}
	case operandTypesMemoryToConst:
		if n.srcMemIndex != asm.NilRegister {
			ret = fmt.Sprintf("%s [%s + %#x + %s*0x%x], 0x%x", instName,
				RegisterName(n.srcReg), n.srcConst, RegisterName(n.srcMemIndex), n.srcMemScale, n.dstConst)
		} else {
			ret = fmt.Sprintf("%s [%s + %#x], 0x%x", instName, RegisterName(n.srcReg), n.srcConst, n.dstConst)
		}
	case operandTypesConstToMemory:
		if n.dstMemIndex != asm.NilRegister {
			ret = fmt.Sprintf("%s 0x%x, [%s + 0x%x + %s*0x%x]", instName, n.srcConst,
				RegisterName(n.dstReg), n.dstConst, RegisterName(n.dstMemIndex), n.dstMemScale)
		} else {
			ret = fmt.Sprintf("%s 0x%x, [%s + 0x%x]", instName, n.srcConst, RegisterName(n.dstReg), n.dstConst)
		}
	case operandTypesConstToRegister:
		ret = fmt.Sprintf("%s 0x%x, %s", instName, n.srcConst, RegisterName(n.dstReg))
	case operandTypesStaticConstToRegister:
		ret = fmt.Sprintf("%s $%#x, %s", instName, n.staticConst.Raw, RegisterName(n.dstReg))
	case operandTypesRegisterToStaticConst:
		ret = fmt.Sprintf("%s %s, $%#x", instName, RegisterName(n.srcReg), n.staticConst.Raw)
	}
	return
}

// operandTypes describes the combination of source and destination operand kinds of a node.
type operandTypes byte

const (
	operandTypesNoneToNone operandTypes = iota
	operandTypesNoneToRegister
	operandTypesNoneToMemory
	operandTypesNoneToBranch
	operandTypesRegisterToNone
	operandTypesRegisterToRegister
	operandTypesRegisterToMemory
	operandTypesRegisterToConst
	operandTypesMemoryToRegister
	operandTypesMemoryToConst
	operandTypesConstToRegister
	operandTypesConstToMemory
	operandTypesStaticConstToRegister
	operandTypesRegisterToStaticConst
)

// String implements fmt.Stringer.
func (o operandTypes) String() (ret string) {
	switch o {
	case operandTypesNoneToNone:
		ret = "NoneToNone"
	case operandTypesNoneToRegister:
		ret = "NoneToRegister"
	case operandTypesNoneToMemory:
		ret = "NoneToMemory"
	case operandTypesNoneToBranch:
		ret = "NoneToBranch"
	case operandTypesRegisterToNone:
		ret = "RegisterToNone"
	case operandTypesRegisterToRegister:
		ret = "RegisterToRegister"
	case operandTypesRegisterToMemory:
		ret = "RegisterToMemory"
	case operandTypesRegisterToConst:
		ret = "RegisterToConst"
	case operandTypesMemoryToRegister:
		ret = "MemoryToRegister"
	case operandTypesMemoryToConst:
		ret = "MemoryToConst"
	case operandTypesConstToRegister:
		ret = "ConstToRegister"
	case operandTypesConstToMemory:
		ret = "ConstToMemory"
	case operandTypesStaticConstToRegister:
		ret = "StaticConstToRegister"
	case operandTypesRegisterToStaticConst:
		ret = "RegisterToStaticConst"
	}
	return
}

type (
	// AssemblerImpl implements Assembler.
	AssemblerImpl struct {
		// root and current are the head and tail of the linked list of nodes to assemble.
		root    *nodeImpl
		current *nodeImpl
		asm.BaseAssemblerImpl
		// readInstructionAddressNodes holds the nodes created via CompileReadInstructionAddress,
		// which require offset resolution after the binary layout is finalized (see Assemble).
		readInstructionAddressNodes []*nodeImpl

		// staticConstReferrers maintains the list of static const referrers which requires the
		// offset resolution after finalizing the binary layout.
		staticConstReferrers []staticConstReferrer

		// nodePool is the allocation pool all nodes of this assembler come from.
		nodePool nodePool
		// pool interns static constants referenced by instructions.
		pool asm.StaticConstPool

		// MaxDisplacementForConstantPool is fixed to defaultMaxDisplacementForConstantPool
		// but have it as an exported field here for testability.
		MaxDisplacementForConstantPool int

		// forceReAssemble is set during encoding when a short forward jump turns out to need
		// the long form, requiring another encoding pass (see Assemble).
		forceReAssemble bool
	}

	// staticConstReferrer represents a referrer of a asm.StaticConst.
	staticConstReferrer struct {
		n *nodeImpl
		// instLen is the encoded length of the instruction for `n`.
		instLen int
	}
)

// NewAssembler returns a fresh AssemblerImpl ready for use.
func NewAssembler() *AssemblerImpl {
	return &AssemblerImpl{
		nodePool:                       nodePool{index: nodePageSize},
		pool:                           asm.NewStaticConstPool(),
		MaxDisplacementForConstantPool: defaultMaxDisplacementForConstantPool,
	}
}

// nodePageSize is the number of nodeImpl values per allocation page in nodePool.
const nodePageSize = 128

type nodePage = [nodePageSize]nodeImpl

// nodePool is the central allocation pool for nodeImpl used by a single AssemblerImpl.
// This reduces the allocations over compilation by reusing AssemblerImpl.
type nodePool struct {
	pages []*nodePage
	// index is the position of the next free slot in the last page;
	// nodePageSize means the last page is full (or no page exists yet).
	index int
}

// allocNode allocates a new nodeImpl for use from the pool.
// This expands the pool if there is no space left for it.
func (n *nodePool) allocNode() *nodeImpl {
	if n.index == nodePageSize {
		if len(n.pages) == cap(n.pages) {
			n.pages = append(n.pages, new(nodePage))
		} else {
			// Reuse a previously-allocated page if reset() kept one around.
			i := len(n.pages)
			n.pages = n.pages[:i+1]
			if n.pages[i] == nil {
				n.pages[i] = new(nodePage)
			}
		}
		n.index = 0
	}
	ret := &n.pages[len(n.pages)-1][n.index]
	n.index++
	return ret
}

// reset zeroes every allocated node and marks all pages free for reuse,
// keeping the page allocations themselves alive.
func (n *nodePool) reset() {
	for _, ns := range n.pages {
		pages := ns[:]
		for i := range pages {
			pages[i] = nodeImpl{}
		}
	}
	n.pages = n.pages[:0]
	n.index = nodePageSize
}

// AllocateNOP implements asm.AssemblerBase.
func (a *AssemblerImpl) AllocateNOP() asm.Node {
	n := a.nodePool.allocNode()
	n.instruction = NOP
	n.types = operandTypesNoneToNone
	return n
}

// Add implements asm.AssemblerBase.
func (a *AssemblerImpl) Add(n asm.Node) {
	a.addNode(n.(*nodeImpl))
}

// Reset implements asm.AssemblerBase.
func (a *AssemblerImpl) Reset() {
	pool := a.pool
	pool.Reset()
	// Reuse the node pool and the backing arrays of all slices so their
	// capacity carries over to the next compilation.
	*a = AssemblerImpl{
		nodePool:                    a.nodePool,
		pool:                        pool,
		readInstructionAddressNodes: a.readInstructionAddressNodes[:0],
		staticConstReferrers:        a.staticConstReferrers[:0],
		BaseAssemblerImpl: asm.BaseAssemblerImpl{
			SetBranchTargetOnNextNodes: a.SetBranchTargetOnNextNodes[:0],
			JumpTableEntries:           a.JumpTableEntries[:0],
		},
	}
	a.nodePool.reset()
}

// newNode creates a new Node and appends it into the linked list.
func (a *AssemblerImpl) newNode(instruction asm.Instruction, types operandTypes) *nodeImpl {
	n := a.nodePool.allocNode()
	n.instruction = instruction
	n.types = types
	a.addNode(n)
	return n
}

// addNode appends the new node into the linked list.
func (a *AssemblerImpl) addNode(node *nodeImpl) {
	if a.root == nil {
		a.root = node
		a.current = node
	} else {
		parent := a.current
		parent.next = node
		node.prev = parent
		a.current = node
	}

	// Any pending branch origins now resolve to this freshly-added node.
	for _, o := range a.SetBranchTargetOnNextNodes {
		origin := o.(*nodeImpl)
		origin.jumpTarget = node
	}
	// Reuse the underlying slice to avoid re-allocations.
	a.SetBranchTargetOnNextNodes = a.SetBranchTargetOnNextNodes[:0]
}

// encodeNode encodes the given node into writer, dispatching on the node's operand types.
func (a *AssemblerImpl) encodeNode(buf asm.Buffer, n *nodeImpl) (err error) {
	switch n.types {
	case operandTypesNoneToNone:
		err = a.encodeNoneToNone(buf, n)
	case operandTypesNoneToRegister:
		err = a.encodeNoneToRegister(buf, n)
	case operandTypesNoneToMemory:
		err = a.encodeNoneToMemory(buf, n)
	case operandTypesNoneToBranch:
		// Branching operand can be encoded as relative jumps.
		err = a.encodeRelativeJump(buf, n)
	case operandTypesRegisterToNone:
		err = a.encodeRegisterToNone(buf, n)
	case operandTypesRegisterToRegister:
		err = a.encodeRegisterToRegister(buf, n)
	case operandTypesRegisterToMemory:
		err = a.encodeRegisterToMemory(buf, n)
	case operandTypesRegisterToConst:
		err = a.encodeRegisterToConst(buf, n)
	case operandTypesMemoryToRegister:
		err = a.encodeMemoryToRegister(buf, n)
	case operandTypesMemoryToConst:
		err = a.encodeMemoryToConst(buf, n)
	case operandTypesConstToRegister:
		err = a.encodeConstToRegister(buf, n)
	case operandTypesConstToMemory:
		err = a.encodeConstToMemory(buf, n)
	case operandTypesStaticConstToRegister:
		err = a.encodeStaticConstToRegister(buf, n)
	case operandTypesRegisterToStaticConst:
		err = a.encodeRegisterToStaticConst(buf, n)
	default:
		err = fmt.Errorf("encoder undefined for [%s] operand type", n.types)
	}
	if err != nil {
		err = fmt.Errorf("%w: %s", err, n) // Ensure the error is debuggable by including the string value of the node.
	}
	return
}

// Assemble implements asm.AssemblerBase
func (a *AssemblerImpl) Assemble(buf asm.Buffer) error {
	a.initializeNodesForEncoding()

	// Continue encoding until we are not forced to re-assemble, which happens when
	// a short relative jump ends up with an offset larger than the 8-bit length.
	for {
		err := a.encode(buf)
		if err != nil {
			return err
		}

		if !a.forceReAssemble {
			break
		} else {
			// We reset the length of buffer but don't delete the underlying slice since
			// the binary size will be roughly the same after reassembly.
			buf.Reset()
			// Reset the re-assemble flag in order to avoid the infinite loop!
			a.forceReAssemble = false
		}
	}

	code := buf.Bytes()
	// Patch the addresses for CompileReadInstructionAddress nodes now that offsets are final.
	for _, n := range a.readInstructionAddressNodes {
		if err := a.finalizeReadInstructionAddressNode(code, n); err != nil {
			return err
		}
	}

	// Now that we've finished the layout, fill out static consts offsets.
	for i := range a.staticConstReferrers {
		ref := &a.staticConstReferrers[i]
		n, instLen := ref.n, ref.instLen
		// Calculate the displacement between the RIP (the offset _after_ n) and the static constant.
		displacement := int(n.staticConst.OffsetInBinary) - int(n.OffsetInBinary()) - instLen
		// The offset must be stored at the 4 bytes from the tail of this n. See AssemblerImpl.encodeStaticConstImpl for detail.
		displacementOffsetInInstruction := n.OffsetInBinary() + uint64(instLen-4)
		binary.LittleEndian.PutUint32(code[displacementOffsetInInstruction:], uint32(int32(displacement)))
	}

	return a.FinalizeJumpTableEntry(code)
}

// initializeNodesForEncoding initializes nodeImpl.flag and determines whether all the jumps
// are forward or backward jumps.
func (a *AssemblerImpl) initializeNodesForEncoding() {
	for n := a.root; n != nil; n = n.next {
		n.flag |= nodeFlagInitializedForEncoding
		if target := n.jumpTarget; target != nil {
			if target.isInitializedForEncoding() {
				// This means the target exists behind.
				n.flag |= nodeFlagBackwardJump
			} else {
				// Otherwise, this is forward jump.
				// We start with assuming that the jump can be short (8-bit displacement).
				// If it doesn't fit, we change this flag in resolveRelativeForwardJump.
				n.flag |= nodeFlagShortForwardJump

				// If the target node is also the branching instruction, we replace the target with the NOP
				// node so that we can avoid the collision of the target.forwardJumpOrigins both as destination and origins.
				if target.types == operandTypesNoneToBranch {
					// Allocate the NOP node from the pool.
					nop := a.nodePool.allocNode()
					nop.instruction = NOP
					nop.types = operandTypesNoneToNone
					// Insert it between target.prev and target: [target.prev, target] -> [target.prev, nop, target]
					prev := target.prev
					nop.prev = prev
					prev.next = nop
					nop.next = target
					target.prev = nop
					n.jumpTarget = nop
					target = nop
				}

				// We add this node `n` into the end of the linked list (.forwardJumpOrigins) beginning from the `target.forwardJumpOrigins`.
				// Insert the current `n` as the head of the list.
				n.forwardJumpOrigins = target.forwardJumpOrigins
				target.forwardJumpOrigins = n
			}
		}
	}
}

// encode writes the encoded bytes for every node in the linked list into buf,
// inserting NOP padding where required and resolving forward jumps as their targets are reached.
func (a *AssemblerImpl) encode(buf asm.Buffer) error {
	for n := a.root; n != nil; n = n.next {
		// If an instruction needs NOP padding, we do so before encoding it.
		//
		// This is necessary to avoid Intel's jump erratum; see Section 2.1 of the
		// following document for when we have to pad NOP:
		// https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
		//
		// This logic used to be implemented in a function called maybeNOPPadding,
		// but the complexity of the logic made it impossible for the compiler to
		// inline. Since this function is on a hot code path, we inlined the
		// initial checks to skip the function call when instructions do not need
		// NOP padding.
		switch info := nopPaddingInfo[n.instruction]; {
		case info.jmp:
			if err := a.encodeJmpNOPPadding(buf, n); err != nil {
				return err
			}
		case info.onNextJmp:
			if err := a.encodeOnNextJmpNOPPAdding(buf, n); err != nil {
				return err
			}
		}

		// After the padding, we can finalize the offset of this instruction in the binary.
		n.offsetInBinary = uint64(buf.Len())

		if err := a.encodeNode(buf, n); err != nil {
			return err
		}

		if n.forwardJumpOrigins != nil {
			if err := a.resolveForwardRelativeJumps(buf, n); err != nil {
				return fmt.Errorf("invalid relative forward jumps: %w", err)
			}
		}

		a.maybeFlushConstants(buf, n.next == nil)
	}
	return nil
}

// nopPaddingInfo marks, per instruction, whether it is jump-kind (jmp) or may be
// macro-fused with a following conditional jump (onNextJmp) for the purpose of NOP padding.
var nopPaddingInfo = [instructionEnd]struct {
	jmp, onNextJmp bool
}{
	RET: {jmp: true},
	JMP: {jmp: true},
	JCC: {jmp: true},
	JCS: {jmp: true},
	JEQ: {jmp: true},
	JGE: {jmp: true},
	JGT: {jmp: true},
	JHI: {jmp: true},
	JLE: {jmp: true},
	JLS: {jmp: true},
	JLT: {jmp: true},
	JMI: {jmp: true},
	JNE: {jmp: true},
	JPC: {jmp: true},
	JPS: {jmp: true},
	// The possible fused jump instructions if the next node is a conditional jump instruction.
	CMPL:  {onNextJmp: true},
	CMPQ:  {onNextJmp: true},
	TESTL: {onNextJmp: true},
	TESTQ: {onNextJmp: true},
	ADDL:  {onNextJmp: true},
	ADDQ:  {onNextJmp: true},
	SUBL:  {onNextJmp: true},
	SUBQ:  {onNextJmp: true},
	ANDL:  {onNextJmp: true},
	ANDQ:  {onNextJmp: true},
	INCQ:  {onNextJmp: true},
	DECQ:  {onNextJmp: true},
}

// encodeJmpNOPPadding emits NOP padding before a jump-kind node when the jump
// would otherwise cross or end on a 32-byte boundary.
func (a *AssemblerImpl) encodeJmpNOPPadding(buf asm.Buffer, n *nodeImpl) error {
	// In order to know the instruction length before writing into the binary,
	// we try encoding it.
	prevLen := buf.Len()

	// Assign the temporary offset which may or may not be correct depending on the padding decision.
	n.offsetInBinary = uint64(prevLen)

	// Encode the node and get the instruction length.
	if err := a.encodeNode(buf, n); err != nil {
		return err
	}
	instructionLen := int32(buf.Len() - prevLen)

	// Revert the written bytes.
	buf.Truncate(prevLen)
	return a.encodeNOPPadding(buf, instructionLen)
}

// encodeOnNextJmpNOPPAdding emits NOP padding before an instruction that may be
// macro-fused with a following jump, using the fused length as the padding unit.
func (a *AssemblerImpl) encodeOnNextJmpNOPPAdding(buf asm.Buffer, n *nodeImpl) error {
	instructionLen, err := a.fusedInstructionLength(buf, n)
	if err != nil {
		return err
	}
	return a.encodeNOPPadding(buf, instructionLen)
}

// encodeNOPPadding appends NOP instructions if an instruction of the given length,
// placed at the current buffer position, would cross or end on a 32-byte boundary.
// This is necessary to avoid Intel's jump erratum:
// https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
func (a *AssemblerImpl) encodeNOPPadding(buf asm.Buffer, instructionLen int32) error {
	const boundaryInBytes int32 = 32
	const mask = boundaryInBytes - 1
	var padNum int
	currentPos := int32(buf.Len())
	if used := currentPos & mask; used+instructionLen >= boundaryInBytes {
		// Pad up to the next 32-byte boundary so the instruction starts right on it.
		padNum = int(boundaryInBytes - used)
	}
	a.padNOP(buf, padNum)
	return nil
}

// fusedInstructionLength returns the length of "macro fused instruction" if the
// instruction sequence starting from `n` can be fused by processor. Otherwise,
// returns zero.
func (a *AssemblerImpl) fusedInstructionLength(buf asm.Buffer, n *nodeImpl) (ret int32, err error) {
	// Find the next non-NOP instruction.
	next := n.next
	for ; next != nil && next.instruction == NOP; next = next.next {
	}

	if next == nil {
		return
	}

	inst, jmpInst := n.instruction, next.instruction

	if !nopPaddingInfo[jmpInst].jmp {
		// If the next instruction is not jump kind, the instruction will not be fused.
		return
	}

	// How to determine whether the instruction can be fused is described in
	// Section 3.4.2.2 of "Intel Optimization Manual":
	// https://www.intel.com/content/dam/doc/manual/64-ia-32-architectures-optimization-manual.pdf
	isTest := inst == TESTL || inst == TESTQ
	isCmp := inst == CMPQ || inst == CMPL
	isTestCmp := isTest || isCmp
	if isTestCmp && (n.types == operandTypesMemoryToConst || n.types == operandTypesConstToMemory) {
		// The manual says: "CMP and TEST can not be fused when comparing MEM-IMM".
		return
	}

	// Implement the decision according to the table 3-1 in the manual.
	isAnd := inst == ANDL || inst == ANDQ
	if !isTest && !isAnd {
		if jmpInst == JMI || jmpInst == JPL || jmpInst == JPS || jmpInst == JPC {
			// These jumps are only fused for TEST or AND.
			return
		}
		isAdd := inst == ADDL || inst == ADDQ
		isSub := inst == SUBL || inst == SUBQ
		if !isCmp && !isAdd && !isSub {
			if jmpInst == JCS || jmpInst == JCC || jmpInst == JHI || jmpInst == JLS {
				// These jumps are only fused for TEST, AND, CMP, ADD, or SUB.
				return
			}
		}
	}

	// Now the instruction is ensured to be fused by the processor.
	// In order to know the fused instruction length before writing into the binary,
	// we try encoding it.
	savedLen := uint64(buf.Len())

	// Encode the nodes into the buffer.
	if err = a.encodeNode(buf, n); err != nil {
		return
	}
	if err = a.encodeNode(buf, next); err != nil {
		return
	}

	ret = int32(uint64(buf.Len()) - savedLen)

	// Revert the written bytes.
	buf.Truncate(int(savedLen))
	return
}

// nopOpcodes is the multi byte NOP instructions table derived from section 5.8 "Code Padding with Operand-Size Override and Multibyte NOP"
// in "AMD Software Optimization Guide for AMD Family 15h Processors" https://www.amd.com/system/files/TechDocs/47414_15h_sw_opt_guide.pdf
var nopOpcodes = [][11]byte{
	{0x90},
	{0x66, 0x90},
	{0x0f, 0x1f, 0x00},
	{0x0f, 0x1f, 0x40, 0x00},
	{0x0f, 0x1f, 0x44, 0x00, 0x00},
	{0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
	{0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
	{0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
	{0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
	{0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
	{0x66, 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
}

// padNOP appends `num` bytes of NOPs, using the longest available multi-byte NOP for each chunk.
func (a *AssemblerImpl) padNOP(buf asm.Buffer, num int) {
	for num > 0 {
		singleNopNum := num
		if singleNopNum > len(nopOpcodes) {
			singleNopNum = len(nopOpcodes)
		}
		buf.AppendBytes(nopOpcodes[singleNopNum-1][:singleNopNum])
		num -= singleNopNum
	}
}

// CompileStandAlone implements the same method as documented on asm.AssemblerBase.
func (a *AssemblerImpl) CompileStandAlone(instruction asm.Instruction) asm.Node {
	return a.newNode(instruction, operandTypesNoneToNone)
}

// CompileConstToRegister implements the same method as documented on asm.AssemblerBase.
func (a *AssemblerImpl) CompileConstToRegister(
	instruction asm.Instruction,
	value asm.ConstantValue,
	destinationReg asm.Register,
) (inst asm.Node) {
	n := a.newNode(instruction, operandTypesConstToRegister)
	n.srcConst = value
	n.dstReg = destinationReg
	return n
}

// CompileRegisterToRegister implements the same method as documented on asm.AssemblerBase.
720 func (a *AssemblerImpl) CompileRegisterToRegister(instruction asm.Instruction, from, to asm.Register) { 721 n := a.newNode(instruction, operandTypesRegisterToRegister) 722 n.srcReg = from 723 n.dstReg = to 724 } 725 726 // CompileMemoryToRegister implements the same method as documented on asm.AssemblerBase. 727 func (a *AssemblerImpl) CompileMemoryToRegister( 728 instruction asm.Instruction, 729 sourceBaseReg asm.Register, 730 sourceOffsetConst asm.ConstantValue, 731 destinationReg asm.Register, 732 ) { 733 n := a.newNode(instruction, operandTypesMemoryToRegister) 734 n.srcReg = sourceBaseReg 735 n.srcConst = sourceOffsetConst 736 n.dstReg = destinationReg 737 } 738 739 // CompileRegisterToMemory implements the same method as documented on asm.AssemblerBase. 740 func (a *AssemblerImpl) CompileRegisterToMemory( 741 instruction asm.Instruction, 742 sourceRegister, destinationBaseRegister asm.Register, 743 destinationOffsetConst asm.ConstantValue, 744 ) { 745 n := a.newNode(instruction, operandTypesRegisterToMemory) 746 n.srcReg = sourceRegister 747 n.dstReg = destinationBaseRegister 748 n.dstConst = destinationOffsetConst 749 } 750 751 // CompileRegisterToMemoryWithIndexAndLock implements the same method as documented on asm.AssemblerBase. 752 func (a *AssemblerImpl) CompileRegisterToMemoryWithIndexAndLock( 753 instruction asm.Instruction, 754 srcReg asm.Register, 755 dstBaseReg asm.Register, 756 dstOffsetConst int64, 757 dstIndex asm.Register, 758 dstScale int16, 759 ) { 760 n := a.newNode(instruction, operandTypesRegisterToMemory) 761 n.srcReg = srcReg 762 n.dstReg = dstBaseReg 763 n.dstConst = dstOffsetConst 764 n.dstMemIndex = dstIndex 765 n.dstMemScale = byte(dstScale) 766 n.flag |= nodeFlagLock 767 } 768 769 // CompileJump implements the same method as documented on asm.AssemblerBase. 
770 func (a *AssemblerImpl) CompileJump(jmpInstruction asm.Instruction) asm.Node { 771 return a.newNode(jmpInstruction, operandTypesNoneToBranch) 772 } 773 774 // CompileJumpToMemory implements the same method as documented on asm.AssemblerBase. 775 func (a *AssemblerImpl) CompileJumpToMemory( 776 jmpInstruction asm.Instruction, 777 baseReg asm.Register, 778 offset asm.ConstantValue, 779 ) { 780 n := a.newNode(jmpInstruction, operandTypesNoneToMemory) 781 n.dstReg = baseReg 782 n.dstConst = offset 783 } 784 785 // CompileJumpToRegister implements the same method as documented on asm.AssemblerBase. 786 func (a *AssemblerImpl) CompileJumpToRegister(jmpInstruction asm.Instruction, reg asm.Register) { 787 n := a.newNode(jmpInstruction, operandTypesNoneToRegister) 788 n.dstReg = reg 789 } 790 791 // CompileReadInstructionAddress implements the same method as documented on asm.AssemblerBase. 792 func (a *AssemblerImpl) CompileReadInstructionAddress( 793 destinationRegister asm.Register, 794 beforeAcquisitionTargetInstruction asm.Instruction, 795 ) { 796 n := a.newNode(LEAQ, operandTypesMemoryToRegister) 797 n.dstReg = destinationRegister 798 n.readInstructionAddressBeforeTargetInstruction = beforeAcquisitionTargetInstruction 799 } 800 801 // CompileRegisterToRegisterWithArg implements the same method as documented on amd64.Assembler. 802 func (a *AssemblerImpl) CompileRegisterToRegisterWithArg( 803 instruction asm.Instruction, 804 from, to asm.Register, 805 arg byte, 806 ) { 807 n := a.newNode(instruction, operandTypesRegisterToRegister) 808 n.srcReg = from 809 n.dstReg = to 810 n.arg = arg 811 } 812 813 // CompileMemoryWithIndexToRegister implements the same method as documented on amd64.Assembler. 
814 func (a *AssemblerImpl) CompileMemoryWithIndexToRegister( 815 instruction asm.Instruction, 816 srcBaseReg asm.Register, 817 srcOffsetConst asm.ConstantValue, 818 srcIndex asm.Register, 819 srcScale int16, 820 dstReg asm.Register, 821 ) { 822 n := a.newNode(instruction, operandTypesMemoryToRegister) 823 n.srcReg = srcBaseReg 824 n.srcConst = srcOffsetConst 825 n.srcMemIndex = srcIndex 826 n.srcMemScale = byte(srcScale) 827 n.dstReg = dstReg 828 } 829 830 // CompileMemoryWithIndexAndArgToRegister implements the same method as documented on amd64.Assembler. 831 func (a *AssemblerImpl) CompileMemoryWithIndexAndArgToRegister( 832 instruction asm.Instruction, 833 srcBaseReg asm.Register, 834 srcOffsetConst asm.ConstantValue, 835 srcIndex asm.Register, 836 srcScale int16, 837 dstReg asm.Register, 838 arg byte, 839 ) { 840 n := a.newNode(instruction, operandTypesMemoryToRegister) 841 n.srcReg = srcBaseReg 842 n.srcConst = srcOffsetConst 843 n.srcMemIndex = srcIndex 844 n.srcMemScale = byte(srcScale) 845 n.dstReg = dstReg 846 n.arg = arg 847 } 848 849 // CompileRegisterToMemoryWithIndex implements the same method as documented on amd64.Assembler. 850 func (a *AssemblerImpl) CompileRegisterToMemoryWithIndex( 851 instruction asm.Instruction, 852 srcReg, dstBaseReg asm.Register, 853 dstOffsetConst asm.ConstantValue, 854 dstIndex asm.Register, 855 dstScale int16, 856 ) { 857 n := a.newNode(instruction, operandTypesRegisterToMemory) 858 n.srcReg = srcReg 859 n.dstReg = dstBaseReg 860 n.dstConst = dstOffsetConst 861 n.dstMemIndex = dstIndex 862 n.dstMemScale = byte(dstScale) 863 } 864 865 // CompileRegisterToMemoryWithIndexAndArg implements the same method as documented on amd64.Assembler. 
866 func (a *AssemblerImpl) CompileRegisterToMemoryWithIndexAndArg( 867 instruction asm.Instruction, 868 srcReg, dstBaseReg asm.Register, 869 dstOffsetConst asm.ConstantValue, 870 dstIndex asm.Register, 871 dstScale int16, 872 arg byte, 873 ) { 874 n := a.newNode(instruction, operandTypesRegisterToMemory) 875 n.srcReg = srcReg 876 n.dstReg = dstBaseReg 877 n.dstConst = dstOffsetConst 878 n.dstMemIndex = dstIndex 879 n.dstMemScale = byte(dstScale) 880 n.arg = arg 881 } 882 883 // CompileRegisterToConst implements the same method as documented on amd64.Assembler. 884 func (a *AssemblerImpl) CompileRegisterToConst( 885 instruction asm.Instruction, 886 srcRegister asm.Register, 887 value asm.ConstantValue, 888 ) asm.Node { 889 n := a.newNode(instruction, operandTypesRegisterToConst) 890 n.srcReg = srcRegister 891 n.dstConst = value 892 return n 893 } 894 895 // CompileRegisterToNone implements the same method as documented on amd64.Assembler. 896 func (a *AssemblerImpl) CompileRegisterToNone(instruction asm.Instruction, register asm.Register) { 897 n := a.newNode(instruction, operandTypesRegisterToNone) 898 n.srcReg = register 899 } 900 901 // CompileNoneToRegister implements the same method as documented on amd64.Assembler. 902 func (a *AssemblerImpl) CompileNoneToRegister(instruction asm.Instruction, register asm.Register) { 903 n := a.newNode(instruction, operandTypesNoneToRegister) 904 n.dstReg = register 905 } 906 907 // CompileNoneToMemory implements the same method as documented on amd64.Assembler. 908 func (a *AssemblerImpl) CompileNoneToMemory( 909 instruction asm.Instruction, 910 baseReg asm.Register, 911 offset asm.ConstantValue, 912 ) { 913 n := a.newNode(instruction, operandTypesNoneToMemory) 914 n.dstReg = baseReg 915 n.dstConst = offset 916 } 917 918 // CompileConstToMemory implements the same method as documented on amd64.Assembler. 
func (a *AssemblerImpl) CompileConstToMemory(
	instruction asm.Instruction,
	value asm.ConstantValue,
	dstbaseReg asm.Register,
	dstOffset asm.ConstantValue,
) asm.Node {
	// Destination operand is the memory location [dstbaseReg + dstOffset].
	n := a.newNode(instruction, operandTypesConstToMemory)
	n.srcConst = value
	n.dstReg = dstbaseReg
	n.dstConst = dstOffset
	return n
}

// CompileMemoryToConst implements the same method as documented on amd64.Assembler.
func (a *AssemblerImpl) CompileMemoryToConst(
	instruction asm.Instruction,
	srcBaseReg asm.Register,
	srcOffset, value asm.ConstantValue,
) asm.Node {
	// Source operand is the memory location [srcBaseReg + srcOffset].
	n := a.newNode(instruction, operandTypesMemoryToConst)
	n.srcReg = srcBaseReg
	n.srcConst = srcOffset
	n.dstConst = value
	return n
}

// errorEncodingUnsupported returns the error used by the encode* methods when the
// node's instruction has no encoding for the node's operand-type combination.
func errorEncodingUnsupported(n *nodeImpl) error {
	return fmt.Errorf("%s is unsupported for %s type", InstructionName(n.instruction), n.types)
}

// encodeNoneToNone encodes the instructions that take no operands (e.g. CDQ, RET, UD2).
func (a *AssemblerImpl) encodeNoneToNone(buf asm.Buffer, n *nodeImpl) (err error) {
	// Throughout the encoding methods, we use this pair of base offset and
	// code buffer to write instructions.
	//
	// The code buffer is allocated at the end of the current buffer to a size
	// large enough to hold all the bytes that may be written by the method.
	//
	// We use Go's append builtin to write to the buffer because it allows the
	// compiler to generate much better code than if we made calls to write
	// methods to mutate an encapsulated byte slice.
	//
	// At the end of the method, we truncate the buffer size back to the base
	// plus the length of the code buffer so the end of the buffer points right
	// after the last byte that was written.
963 base := buf.Len() 964 code := buf.Append(4)[:0] 965 966 switch n.instruction { 967 case CDQ: 968 // https://www.felixcloutier.com/x86/cwd:cdq:cqo 969 code = append(code, 0x99) 970 case CQO: 971 // https://www.felixcloutier.com/x86/cwd:cdq:cqo 972 code = append(code, rexPrefixW, 0x99) 973 case NOP: 974 // Simply optimize out the NOP instructions. 975 case RET: 976 // https://www.felixcloutier.com/x86/ret 977 code = append(code, 0xc3) 978 case UD2: 979 // https://mudongliang.github.io/x86/html/file_module_x86_id_318.html 980 code = append(code, 0x0f, 0x0b) 981 case REPMOVSQ: 982 code = append(code, 0xf3, rexPrefixW, 0xa5) 983 case REPSTOSQ: 984 code = append(code, 0xf3, rexPrefixW, 0xab) 985 case STD: 986 code = append(code, 0xfd) 987 case CLD: 988 code = append(code, 0xfc) 989 case MFENCE: 990 code = append(code, 0x0F, 0xAE, 0xF0) 991 default: 992 err = errorEncodingUnsupported(n) 993 } 994 995 buf.Truncate(base + len(code)) 996 return 997 } 998 999 func (a *AssemblerImpl) encodeNoneToRegister(buf asm.Buffer, n *nodeImpl) (err error) { 1000 regBits, prefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldRM) 1001 1002 // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM 1003 modRM := 0b11_000_000 | // Specifying that opeand is register. 1004 regBits 1005 var mandatoryPrefix byte 1006 switch n.instruction { 1007 case JMP: 1008 // JMP's opcode is defined as "FF /4" meaning that we have to have "4" 1009 // in 4-6th bits in the ModRM byte. https://www.felixcloutier.com/x86/jmp 1010 modRM |= 0b00_100_000 1011 case NEGQ: 1012 prefix |= rexPrefixW 1013 modRM |= 0b00_011_000 1014 case NEGL: 1015 modRM |= 0b00_011_000 1016 case NEGW: 1017 // Note: Need 0x66 to indicate that the operand size is 16-bit. 
1018 // https://wiki.osdev.org/X86-64_Instruction_Encoding#Operand-size_and_address-size_override_prefix 1019 mandatoryPrefix = 0x66 1020 modRM |= 0b00_011_000 1021 case NEGB: 1022 modRM |= 0b00_011_000 1023 // 1 byte register operands need default prefix for the following registers. 1024 if n.srcReg >= RegSP && n.srcReg <= RegDI { 1025 prefix |= rexPrefixDefault 1026 } 1027 case INCQ: 1028 prefix |= rexPrefixW 1029 case DECQ: 1030 prefix |= rexPrefixW 1031 modRM |= 0b00_001_000 1032 default: 1033 if RegSP <= n.dstReg && n.dstReg <= RegDI { 1034 // If the destination is one byte length register, we need to have the default prefix. 1035 // https: //wiki.osdev.org/X86-64_Instruction_Encoding#Registers 1036 prefix |= rexPrefixDefault 1037 } 1038 } 1039 1040 base := buf.Len() 1041 code := buf.Append(8)[:0] 1042 1043 if mandatoryPrefix != 0 { 1044 // https://wiki.osdev.org/X86-64_Instruction_Encoding#Mandatory_prefix 1045 code = append(code, mandatoryPrefix) 1046 } 1047 1048 if prefix != rexPrefixNone { 1049 // https://wiki.osdev.org/X86-64_Instruction_Encoding#Encoding 1050 code = append(code, prefix) 1051 } 1052 1053 switch n.instruction { 1054 case JMP: 1055 // https://www.felixcloutier.com/x86/jmp 1056 code = append(code, 0xff, modRM) 1057 case SETCC: 1058 // https://www.felixcloutier.com/x86/setcc 1059 code = append(code, 0x0f, 0x93, modRM) 1060 case SETCS: 1061 // https://www.felixcloutier.com/x86/setcc 1062 code = append(code, 0x0f, 0x92, modRM) 1063 case SETEQ: 1064 // https://www.felixcloutier.com/x86/setcc 1065 code = append(code, 0x0f, 0x94, modRM) 1066 case SETGE: 1067 // https://www.felixcloutier.com/x86/setcc 1068 code = append(code, 0x0f, 0x9d, modRM) 1069 case SETGT: 1070 // https://www.felixcloutier.com/x86/setcc 1071 code = append(code, 0x0f, 0x9f, modRM) 1072 case SETHI: 1073 // https://www.felixcloutier.com/x86/setcc 1074 code = append(code, 0x0f, 0x97, modRM) 1075 case SETLE: 1076 // https://www.felixcloutier.com/x86/setcc 1077 code = append(code, 
0x0f, 0x9e, modRM) 1078 case SETLS: 1079 // https://www.felixcloutier.com/x86/setcc 1080 code = append(code, 0x0f, 0x96, modRM) 1081 case SETLT: 1082 // https://www.felixcloutier.com/x86/setcc 1083 code = append(code, 0x0f, 0x9c, modRM) 1084 case SETNE: 1085 // https://www.felixcloutier.com/x86/setcc 1086 code = append(code, 0x0f, 0x95, modRM) 1087 case SETPC: 1088 // https://www.felixcloutier.com/x86/setcc 1089 code = append(code, 0x0f, 0x9b, modRM) 1090 case SETPS: 1091 // https://www.felixcloutier.com/x86/setcc 1092 code = append(code, 0x0f, 0x9a, modRM) 1093 case NEGQ, NEGL, NEGW: 1094 // https://www.felixcloutier.com/x86/neg 1095 code = append(code, 0xf7, modRM) 1096 case NEGB: 1097 // https://www.felixcloutier.com/x86/neg 1098 code = append(code, 0xf6, modRM) 1099 case INCQ: 1100 // https://www.felixcloutier.com/x86/inc 1101 code = append(code, 0xff, modRM) 1102 case DECQ: 1103 // https://www.felixcloutier.com/x86/dec 1104 code = append(code, 0xff, modRM) 1105 default: 1106 err = errorEncodingUnsupported(n) 1107 } 1108 1109 buf.Truncate(base + len(code)) 1110 return 1111 } 1112 1113 func (a *AssemblerImpl) encodeNoneToMemory(buf asm.Buffer, n *nodeImpl) (err error) { 1114 rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(true) 1115 if err != nil { 1116 return err 1117 } 1118 1119 var opcode byte 1120 switch n.instruction { 1121 case INCQ: 1122 // https://www.felixcloutier.com/x86/inc 1123 rexPrefix |= rexPrefixW 1124 opcode = 0xff 1125 case DECQ: 1126 // https://www.felixcloutier.com/x86/dec 1127 rexPrefix |= rexPrefixW 1128 modRM |= 0b00_001_000 // DEC needs "/1" extension in ModRM. 1129 opcode = 0xff 1130 case JMP: 1131 // https://www.felixcloutier.com/x86/jmp 1132 modRM |= 0b00_100_000 // JMP needs "/4" extension in ModRM. 
1133 opcode = 0xff 1134 default: 1135 return errorEncodingUnsupported(n) 1136 } 1137 1138 base := buf.Len() 1139 code := buf.Append(12)[:0] 1140 1141 if rexPrefix != rexPrefixNone { 1142 code = append(code, rexPrefix) 1143 } 1144 1145 code = append(code, opcode, modRM) 1146 1147 if sbiExist { 1148 code = append(code, sbi) 1149 } 1150 1151 if displacementWidth != 0 { 1152 code = appendConst(code, n.dstConst, displacementWidth) 1153 } 1154 1155 buf.Truncate(base + len(code)) 1156 return 1157 } 1158 1159 type relativeJumpOpcode struct{ short, long []byte } 1160 1161 func (o relativeJumpOpcode) instructionLen(short bool) int64 { 1162 if short { 1163 return int64(len(o.short)) + 1 // 1 byte = 8 bit offset 1164 } else { 1165 return int64(len(o.long)) + 4 // 4 byte = 32 bit offset 1166 } 1167 } 1168 1169 var relativeJumpOpcodes = [...]relativeJumpOpcode{ 1170 // https://www.felixcloutier.com/x86/jcc 1171 JCC: {short: []byte{0x73}, long: []byte{0x0f, 0x83}}, 1172 JCS: {short: []byte{0x72}, long: []byte{0x0f, 0x82}}, 1173 JEQ: {short: []byte{0x74}, long: []byte{0x0f, 0x84}}, 1174 JGE: {short: []byte{0x7d}, long: []byte{0x0f, 0x8d}}, 1175 JGT: {short: []byte{0x7f}, long: []byte{0x0f, 0x8f}}, 1176 JHI: {short: []byte{0x77}, long: []byte{0x0f, 0x87}}, 1177 JLE: {short: []byte{0x7e}, long: []byte{0x0f, 0x8e}}, 1178 JLS: {short: []byte{0x76}, long: []byte{0x0f, 0x86}}, 1179 JLT: {short: []byte{0x7c}, long: []byte{0x0f, 0x8c}}, 1180 JMI: {short: []byte{0x78}, long: []byte{0x0f, 0x88}}, 1181 JPL: {short: []byte{0x79}, long: []byte{0x0f, 0x89}}, 1182 JNE: {short: []byte{0x75}, long: []byte{0x0f, 0x85}}, 1183 JPC: {short: []byte{0x7b}, long: []byte{0x0f, 0x8b}}, 1184 JPS: {short: []byte{0x7a}, long: []byte{0x0f, 0x8a}}, 1185 // https://www.felixcloutier.com/x86/jmp 1186 JMP: {short: []byte{0xeb}, long: []byte{0xe9}}, 1187 } 1188 1189 func (a *AssemblerImpl) resolveForwardRelativeJumps(buf asm.Buffer, target *nodeImpl) (err error) { 1190 offsetInBinary := 
		int64(target.OffsetInBinary())
	origin := target.forwardJumpOrigins
	// Walk the singly linked list of all nodes jumping forward into `target`.
	for ; origin != nil; origin = origin.forwardJumpOrigins {
		shortJump := origin.isForwardShortJump()
		op := relativeJumpOpcodes[origin.instruction]
		instructionLen := op.instructionLen(shortJump)

		// Calculate the offset from the EIP (at the time of executing this jump instruction)
		// to the target instruction. This value is always >= 0 as here we only handle forward jumps.
		offset := offsetInBinary - (int64(origin.OffsetInBinary()) + instructionLen)
		if shortJump {
			if offset > math.MaxInt8 {
				// This forces reassemble in the outer loop inside AssemblerImpl.Assemble().
				a.forceReAssemble = true
				// From the next reAssemble phases, this forward jump will be encoded long jump and
				// allocate 32-bit offset bytes by default. This means that this `origin` node
				// will always enter the "long jump offset encoding" block below
				origin.flag ^= nodeFlagShortForwardJump
			} else {
				// Patch the single offset byte at the end of the short-jump encoding.
				buf.Bytes()[origin.OffsetInBinary()+uint64(instructionLen)-1] = byte(offset)
			}
		} else { // long jump offset encoding.
			if offset > math.MaxInt32 {
				return fmt.Errorf("too large jump offset %d for encoding %s", offset, InstructionName(origin.instruction))
			}
			// Patch the 4 little-endian offset bytes at the end of the long-jump encoding.
			binary.LittleEndian.PutUint32(buf.Bytes()[origin.OffsetInBinary()+uint64(instructionLen)-4:], uint32(offset))
		}
	}
	return nil
}

// encodeRelativeJump encodes n as a relative jump. Backward jumps are fully encoded
// here; forward jumps leave a zero offset that is later patched by
// resolveForwardRelativeJumps once the target's offset is known.
func (a *AssemblerImpl) encodeRelativeJump(buf asm.Buffer, n *nodeImpl) (err error) {
	if n.jumpTarget == nil {
		err = fmt.Errorf("jump target must not be nil for relative %s", InstructionName(n.instruction))
		return
	}

	op := relativeJumpOpcodes[n.instruction]
	var isShortJump bool
	// offsetOfEIP means the offset of EIP register at the time of executing this jump instruction.
	// Relative jump instructions can be encoded with the signed 8-bit or 32-bit integer offsets from the EIP.
	var offsetOfEIP int64 = 0 // We set zero and resolve later once the target instruction is encoded for forward jumps
	if n.isBackwardJump() {
		// If this is the backward jump, we can calculate the exact offset now.
		offsetOfJumpInstruction := int64(n.jumpTarget.OffsetInBinary()) - int64(n.OffsetInBinary())
		isShortJump = offsetOfJumpInstruction-2 >= math.MinInt8
		offsetOfEIP = offsetOfJumpInstruction - op.instructionLen(isShortJump)
	} else {
		// For forward jumps, we resolve the offset when we Encode the target node. See AssemblerImpl.ResolveForwardRelativeJumps.
		isShortJump = n.isForwardShortJump()
	}

	if offsetOfEIP < math.MinInt32 { // offsetOfEIP is always <= 0 as we don't calculate it for forward jump here.
		return fmt.Errorf("too large jump offset %d for encoding %s", offsetOfEIP, InstructionName(n.instruction))
	}

	base := buf.Len()
	code := buf.Append(6)[:0]

	if isShortJump {
		code = append(code, op.short...)
		code = append(code, byte(offsetOfEIP))
	} else {
		code = append(code, op.long...)
		code = appendUint32(code, uint32(offsetOfEIP))
	}

	buf.Truncate(base + len(code))
	return
}

// encodeRegisterToNone encodes the instructions whose single operand is the register
// held in n.srcReg (DIV, IDIV, MUL).
func (a *AssemblerImpl) encodeRegisterToNone(buf asm.Buffer, n *nodeImpl) (err error) {
	regBits, prefix := register3bits(n.srcReg, registerSpecifierPositionModRMFieldRM)

	// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
	modRM := 0b11_000_000 | // Specifying that operand is register.
1266 regBits 1267 1268 var opcode byte 1269 switch n.instruction { 1270 case DIVL: 1271 // https://www.felixcloutier.com/x86/div 1272 modRM |= 0b00_110_000 1273 opcode = 0xf7 1274 case DIVQ: 1275 // https://www.felixcloutier.com/x86/div 1276 prefix |= rexPrefixW 1277 modRM |= 0b00_110_000 1278 opcode = 0xf7 1279 case IDIVL: 1280 // https://www.felixcloutier.com/x86/idiv 1281 modRM |= 0b00_111_000 1282 opcode = 0xf7 1283 case IDIVQ: 1284 // https://www.felixcloutier.com/x86/idiv 1285 prefix |= rexPrefixW 1286 modRM |= 0b00_111_000 1287 opcode = 0xf7 1288 case MULL: 1289 // https://www.felixcloutier.com/x86/mul 1290 modRM |= 0b00_100_000 1291 opcode = 0xf7 1292 case MULQ: 1293 // https://www.felixcloutier.com/x86/mul 1294 prefix |= rexPrefixW 1295 modRM |= 0b00_100_000 1296 opcode = 0xf7 1297 default: 1298 err = errorEncodingUnsupported(n) 1299 } 1300 1301 base := buf.Len() 1302 code := buf.Append(3)[:0] 1303 1304 if prefix != rexPrefixNone { 1305 code = append(code, prefix) 1306 } 1307 1308 code = append(code, opcode, modRM) 1309 1310 buf.Truncate(base + len(code)) 1311 return 1312 } 1313 1314 var registerToRegisterOpcode = [instructionEnd]*struct { 1315 opcode []byte 1316 rPrefix rexPrefix 1317 mandatoryPrefix byte 1318 srcOnModRMReg bool 1319 isSrc8bit bool 1320 needArg bool 1321 }{ 1322 // https://www.felixcloutier.com/x86/add 1323 ADDL: {opcode: []byte{0x1}, srcOnModRMReg: true}, 1324 ADDQ: {opcode: []byte{0x1}, rPrefix: rexPrefixW, srcOnModRMReg: true}, 1325 // https://www.felixcloutier.com/x86/and 1326 ANDL: {opcode: []byte{0x21}, srcOnModRMReg: true}, 1327 ANDQ: {opcode: []byte{0x21}, rPrefix: rexPrefixW, srcOnModRMReg: true}, 1328 // https://www.felixcloutier.com/x86/cmp 1329 CMPL: {opcode: []byte{0x39}}, 1330 CMPQ: {opcode: []byte{0x39}, rPrefix: rexPrefixW}, 1331 // https://www.felixcloutier.com/x86/cmovcc 1332 CMOVQCS: {opcode: []byte{0x0f, 0x42}, rPrefix: rexPrefixW}, 1333 // https://www.felixcloutier.com/x86/addsd 1334 ADDSD: {mandatoryPrefix: 0xf2, 
opcode: []byte{0x0f, 0x58}}, 1335 // https://www.felixcloutier.com/x86/addss 1336 ADDSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x58}}, 1337 // https://www.felixcloutier.com/x86/addpd 1338 ANDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x54}}, 1339 // https://www.felixcloutier.com/x86/addps 1340 ANDPS: {opcode: []byte{0x0f, 0x54}}, 1341 // https://www.felixcloutier.com/x86/bsr 1342 BSRL: {opcode: []byte{0xf, 0xbd}}, 1343 BSRQ: {opcode: []byte{0xf, 0xbd}, rPrefix: rexPrefixW}, 1344 // https://www.felixcloutier.com/x86/comisd 1345 COMISD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x2f}}, 1346 // https://www.felixcloutier.com/x86/comiss 1347 COMISS: {opcode: []byte{0x0f, 0x2f}}, 1348 // https://www.felixcloutier.com/x86/cvtsd2ss 1349 CVTSD2SS: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5a}}, 1350 // https://www.felixcloutier.com/x86/cvtsi2sd 1351 CVTSL2SD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2a}}, 1352 // https://www.felixcloutier.com/x86/cvtsi2sd 1353 CVTSQ2SD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2a}, rPrefix: rexPrefixW}, 1354 // https://www.felixcloutier.com/x86/cvtsi2ss 1355 CVTSL2SS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2a}}, 1356 // https://www.felixcloutier.com/x86/cvtsi2ss 1357 CVTSQ2SS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2a}, rPrefix: rexPrefixW}, 1358 // https://www.felixcloutier.com/x86/cvtss2sd 1359 CVTSS2SD: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5a}}, 1360 // https://www.felixcloutier.com/x86/cvttsd2si 1361 CVTTSD2SL: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2c}}, 1362 CVTTSD2SQ: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2c}, rPrefix: rexPrefixW}, 1363 // https://www.felixcloutier.com/x86/cvttss2si 1364 CVTTSS2SL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2c}}, 1365 CVTTSS2SQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2c}, rPrefix: rexPrefixW}, 1366 // https://www.felixcloutier.com/x86/divsd 1367 DIVSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5e}}, 1368 // 
https://www.felixcloutier.com/x86/divss 1369 DIVSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5e}}, 1370 // https://www.felixcloutier.com/x86/lzcnt 1371 LZCNTL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbd}}, 1372 LZCNTQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbd}, rPrefix: rexPrefixW}, 1373 // https://www.felixcloutier.com/x86/maxsd 1374 MAXSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5f}}, 1375 // https://www.felixcloutier.com/x86/maxss 1376 MAXSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5f}}, 1377 // https://www.felixcloutier.com/x86/minsd 1378 MINSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5d}}, 1379 // https://www.felixcloutier.com/x86/minss 1380 MINSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5d}}, 1381 // https://www.felixcloutier.com/x86/movsx:movsxd 1382 MOVBLSX: {opcode: []byte{0x0f, 0xbe}, isSrc8bit: true}, 1383 // https://www.felixcloutier.com/x86/movzx 1384 MOVBLZX: {opcode: []byte{0x0f, 0xb6}, isSrc8bit: true}, 1385 // https://www.felixcloutier.com/x86/movzx 1386 MOVWLZX: {opcode: []byte{0x0f, 0xb7}, isSrc8bit: true}, 1387 // https://www.felixcloutier.com/x86/movsx:movsxd 1388 MOVBQSX: {opcode: []byte{0x0f, 0xbe}, rPrefix: rexPrefixW, isSrc8bit: true}, 1389 // https://www.felixcloutier.com/x86/movsx:movsxd 1390 MOVLQSX: {opcode: []byte{0x63}, rPrefix: rexPrefixW}, 1391 // https://www.felixcloutier.com/x86/movsx:movsxd 1392 MOVWQSX: {opcode: []byte{0x0f, 0xbf}, rPrefix: rexPrefixW}, 1393 // https://www.felixcloutier.com/x86/movsx:movsxd 1394 MOVWLSX: {opcode: []byte{0x0f, 0xbf}}, 1395 // https://www.felixcloutier.com/x86/imul 1396 IMULQ: {opcode: []byte{0x0f, 0xaf}, rPrefix: rexPrefixW}, 1397 // https://www.felixcloutier.com/x86/mulss 1398 MULSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x59}}, 1399 // https://www.felixcloutier.com/x86/mulsd 1400 MULSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x59}}, 1401 // https://www.felixcloutier.com/x86/or 1402 ORL: {opcode: []byte{0x09}, srcOnModRMReg: true}, 
1403 ORQ: {opcode: []byte{0x09}, rPrefix: rexPrefixW, srcOnModRMReg: true}, 1404 // https://www.felixcloutier.com/x86/orpd 1405 ORPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x56}}, 1406 // https://www.felixcloutier.com/x86/orps 1407 ORPS: {opcode: []byte{0x0f, 0x56}}, 1408 // https://www.felixcloutier.com/x86/popcnt 1409 POPCNTL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xb8}}, 1410 POPCNTQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xb8}, rPrefix: rexPrefixW}, 1411 // https://www.felixcloutier.com/x86/roundss 1412 ROUNDSS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0a}, needArg: true}, 1413 // https://www.felixcloutier.com/x86/roundsd 1414 ROUNDSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0b}, needArg: true}, 1415 // https://www.felixcloutier.com/x86/sqrtss 1416 SQRTSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x51}}, 1417 // https://www.felixcloutier.com/x86/sqrtsd 1418 SQRTSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x51}}, 1419 // https://www.felixcloutier.com/x86/sub 1420 SUBL: {opcode: []byte{0x29}, srcOnModRMReg: true}, 1421 SUBQ: {opcode: []byte{0x29}, rPrefix: rexPrefixW, srcOnModRMReg: true}, 1422 // https://www.felixcloutier.com/x86/subss 1423 SUBSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5c}}, 1424 // https://www.felixcloutier.com/x86/subsd 1425 SUBSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5c}}, 1426 // https://www.felixcloutier.com/x86/test 1427 TESTL: {opcode: []byte{0x85}, srcOnModRMReg: true}, 1428 TESTQ: {opcode: []byte{0x85}, rPrefix: rexPrefixW, srcOnModRMReg: true}, 1429 // https://www.felixcloutier.com/x86/tzcnt 1430 TZCNTL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbc}}, 1431 TZCNTQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbc}, rPrefix: rexPrefixW}, 1432 // https://www.felixcloutier.com/x86/ucomisd 1433 UCOMISD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x2e}}, 1434 // https://www.felixcloutier.com/x86/ucomiss 1435 UCOMISS: {opcode: []byte{0x0f, 0x2e}}, 1436 // 
https://www.felixcloutier.com/x86/xchg 1437 XCHGQ: {opcode: []byte{0x87}, rPrefix: rexPrefixW, srcOnModRMReg: true}, 1438 // https://www.felixcloutier.com/x86/xor 1439 XORL: {opcode: []byte{0x31}, srcOnModRMReg: true}, 1440 XORQ: {opcode: []byte{0x31}, rPrefix: rexPrefixW, srcOnModRMReg: true}, 1441 // https://www.felixcloutier.com/x86/xorpd 1442 XORPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x57}}, 1443 XORPS: {opcode: []byte{0x0f, 0x57}}, 1444 // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq 1445 PINSRB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x20}, needArg: true}, 1446 // https://www.felixcloutier.com/x86/pinsrw 1447 PINSRW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc4}, needArg: true}, 1448 // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq 1449 PINSRD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x22}, needArg: true}, 1450 // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq 1451 PINSRQ: {mandatoryPrefix: 0x66, rPrefix: rexPrefixW, opcode: []byte{0x0f, 0x3a, 0x22}, needArg: true}, 1452 // https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64 1453 MOVDQU: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x6f}}, 1454 // https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64 1455 MOVDQA: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x6f}}, 1456 // https://www.felixcloutier.com/x86/paddb:paddw:paddd:paddq 1457 PADDB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfc}}, 1458 PADDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfd}}, 1459 PADDD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfe}}, 1460 PADDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd4}}, 1461 // https://www.felixcloutier.com/x86/psubb:psubw:psubd 1462 PSUBB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf8}}, 1463 PSUBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf9}}, 1464 PSUBD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfa}}, 1465 // https://www.felixcloutier.com/x86/psubq 1466 PSUBQ: {mandatoryPrefix: 
0x66, opcode: []byte{0x0f, 0xfb}}, 1467 // https://www.felixcloutier.com/x86/addps 1468 ADDPS: {opcode: []byte{0x0f, 0x58}}, 1469 // https://www.felixcloutier.com/x86/addpd 1470 ADDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x58}}, 1471 // https://www.felixcloutier.com/x86/subps 1472 SUBPS: {opcode: []byte{0x0f, 0x5c}}, 1473 // https://www.felixcloutier.com/x86/subpd 1474 SUBPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5c}}, 1475 // https://www.felixcloutier.com/x86/pxor 1476 PXOR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xef}}, 1477 // https://www.felixcloutier.com/x86/pand 1478 PAND: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdb}}, 1479 // https://www.felixcloutier.com/x86/por 1480 POR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xeb}}, 1481 // https://www.felixcloutier.com/x86/pandn 1482 PANDN: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdf}}, 1483 // https://www.felixcloutier.com/x86/pshufb 1484 PSHUFB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x0}}, 1485 // https://www.felixcloutier.com/x86/pshufd 1486 PSHUFD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x70}, needArg: true}, 1487 // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq 1488 PEXTRB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x14}, needArg: true, srcOnModRMReg: true}, 1489 // https://www.felixcloutier.com/x86/pextrw 1490 PEXTRW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc5}, needArg: true}, 1491 // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq 1492 PEXTRD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x16}, needArg: true, srcOnModRMReg: true}, 1493 // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq 1494 PEXTRQ: {rPrefix: rexPrefixW, mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x16}, needArg: true, srcOnModRMReg: true}, 1495 // https://www.felixcloutier.com/x86/insertps 1496 INSERTPS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x21}, needArg: true}, 1497 // https://www.felixcloutier.com/x86/movlhps 1498 
MOVLHPS: {opcode: []byte{0x0f, 0x16}}, 1499 // https://www.felixcloutier.com/x86/ptest 1500 PTEST: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x17}}, 1501 // https://www.felixcloutier.com/x86/pcmpeqb:pcmpeqw:pcmpeqd 1502 PCMPEQB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x74}}, 1503 PCMPEQW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x75}}, 1504 PCMPEQD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x76}}, 1505 // https://www.felixcloutier.com/x86/pcmpeqq 1506 PCMPEQQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x29}}, 1507 // https://www.felixcloutier.com/x86/paddusb:paddusw 1508 PADDUSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdc}}, 1509 // https://www.felixcloutier.com/x86/movsd 1510 MOVSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x10}}, 1511 // https://www.felixcloutier.com/x86/packsswb:packssdw 1512 PACKSSWB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x63}}, 1513 // https://www.felixcloutier.com/x86/pmovmskb 1514 PMOVMSKB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd7}}, 1515 // https://www.felixcloutier.com/x86/movmskps 1516 MOVMSKPS: {opcode: []byte{0x0f, 0x50}}, 1517 // https://www.felixcloutier.com/x86/movmskpd 1518 MOVMSKPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x50}}, 1519 // https://www.felixcloutier.com/x86/psraw:psrad:psraq 1520 PSRAD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe2}}, 1521 // https://www.felixcloutier.com/x86/psraw:psrad:psraq 1522 PSRAW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe1}}, 1523 // https://www.felixcloutier.com/x86/psrlw:psrld:psrlq 1524 PSRLQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd3}}, 1525 // https://www.felixcloutier.com/x86/psrlw:psrld:psrlq 1526 PSRLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd2}}, 1527 // https://www.felixcloutier.com/x86/psrlw:psrld:psrlq 1528 PSRLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd1}}, 1529 // https://www.felixcloutier.com/x86/psllw:pslld:psllq 1530 PSLLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf1}}, 
1531 // https://www.felixcloutier.com/x86/psllw:pslld:psllq 1532 PSLLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf2}}, 1533 // https://www.felixcloutier.com/x86/psllw:pslld:psllq 1534 PSLLQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf3}}, 1535 // https://www.felixcloutier.com/x86/punpcklbw:punpcklwd:punpckldq:punpcklqdq 1536 PUNPCKLBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x60}}, 1537 // https://www.felixcloutier.com/x86/punpckhbw:punpckhwd:punpckhdq:punpckhqdq 1538 PUNPCKHBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x68}}, 1539 // https://www.felixcloutier.com/x86/cmpps 1540 CMPPS: {opcode: []byte{0x0f, 0xc2}, needArg: true}, 1541 // https://www.felixcloutier.com/x86/cmppd 1542 CMPPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc2}, needArg: true}, 1543 // https://www.felixcloutier.com/x86/pcmpgtq 1544 PCMPGTQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x37}}, 1545 // https://www.felixcloutier.com/x86/pcmpgtb:pcmpgtw:pcmpgtd 1546 PCMPGTD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x66}}, 1547 // https://www.felixcloutier.com/x86/pcmpgtb:pcmpgtw:pcmpgtd 1548 PCMPGTW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x65}}, 1549 // https://www.felixcloutier.com/x86/pcmpgtb:pcmpgtw:pcmpgtd 1550 PCMPGTB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x64}}, 1551 // https://www.felixcloutier.com/x86/pminsd:pminsq 1552 PMINSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x39}}, 1553 // https://www.felixcloutier.com/x86/pmaxsb:pmaxsw:pmaxsd:pmaxsq 1554 PMAXSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3d}}, 1555 // https://www.felixcloutier.com/x86/pmaxsb:pmaxsw:pmaxsd:pmaxsq 1556 PMAXSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xee}}, 1557 // https://www.felixcloutier.com/x86/pmaxsb:pmaxsw:pmaxsd:pmaxsq 1558 PMAXSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3c}}, 1559 // https://www.felixcloutier.com/x86/pminsb:pminsw 1560 PMINSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xea}}, 1561 // 
https://www.felixcloutier.com/x86/pminsb:pminsw 1562 PMINSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x38}}, 1563 // https://www.felixcloutier.com/x86/pminud:pminuq 1564 PMINUD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3b}}, 1565 // https://www.felixcloutier.com/x86/pminub:pminuw 1566 PMINUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3a}}, 1567 // https://www.felixcloutier.com/x86/pminub:pminuw 1568 PMINUB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xda}}, 1569 // https://www.felixcloutier.com/x86/pmaxud:pmaxuq 1570 PMAXUD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3f}}, 1571 // https://www.felixcloutier.com/x86/pmaxub:pmaxuw 1572 PMAXUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3e}}, 1573 // https://www.felixcloutier.com/x86/pmaxub:pmaxuw 1574 PMAXUB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xde}}, 1575 // https://www.felixcloutier.com/x86/pmullw 1576 PMULLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd5}}, 1577 // https://www.felixcloutier.com/x86/pmulld:pmullq 1578 PMULLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x40}}, 1579 // https://www.felixcloutier.com/x86/pmuludq 1580 PMULUDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf4}}, 1581 // https://www.felixcloutier.com/x86/psubsb:psubsw 1582 PSUBSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe8}}, 1583 // https://www.felixcloutier.com/x86/psubsb:psubsw 1584 PSUBSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe9}}, 1585 // https://www.felixcloutier.com/x86/psubusb:psubusw 1586 PSUBUSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd8}}, 1587 // https://www.felixcloutier.com/x86/psubusb:psubusw 1588 PSUBUSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd9}}, 1589 // https://www.felixcloutier.com/x86/paddsb:paddsw 1590 PADDSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xed}}, 1591 // https://www.felixcloutier.com/x86/paddsb:paddsw 1592 PADDSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xec}}, 1593 // 
https://www.felixcloutier.com/x86/paddusb:paddusw 1594 PADDUSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdd}}, 1595 // https://www.felixcloutier.com/x86/pavgb:pavgw 1596 PAVGB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe0}}, 1597 // https://www.felixcloutier.com/x86/pavgb:pavgw 1598 PAVGW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe3}}, 1599 // https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq 1600 PABSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1c}}, 1601 // https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq 1602 PABSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1d}}, 1603 // https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq 1604 PABSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1e}}, 1605 // https://www.felixcloutier.com/x86/blendvpd 1606 BLENDVPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x15}}, 1607 // https://www.felixcloutier.com/x86/maxpd 1608 MAXPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5f}}, 1609 // https://www.felixcloutier.com/x86/maxps 1610 MAXPS: {opcode: []byte{0x0f, 0x5f}}, 1611 // https://www.felixcloutier.com/x86/minpd 1612 MINPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5d}}, 1613 // https://www.felixcloutier.com/x86/minps 1614 MINPS: {opcode: []byte{0x0f, 0x5d}}, 1615 // https://www.felixcloutier.com/x86/andnpd 1616 ANDNPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x55}}, 1617 // https://www.felixcloutier.com/x86/andnps 1618 ANDNPS: {opcode: []byte{0x0f, 0x55}}, 1619 // https://www.felixcloutier.com/x86/mulps 1620 MULPS: {opcode: []byte{0x0f, 0x59}}, 1621 // https://www.felixcloutier.com/x86/mulpd 1622 MULPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x59}}, 1623 // https://www.felixcloutier.com/x86/divps 1624 DIVPS: {opcode: []byte{0x0f, 0x5e}}, 1625 // https://www.felixcloutier.com/x86/divpd 1626 DIVPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5e}}, 1627 // https://www.felixcloutier.com/x86/sqrtps 1628 SQRTPS: {opcode: []byte{0x0f, 
0x51}}, 1629 // https://www.felixcloutier.com/x86/sqrtpd 1630 SQRTPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x51}}, 1631 // https://www.felixcloutier.com/x86/roundps 1632 ROUNDPS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x08}, needArg: true}, 1633 // https://www.felixcloutier.com/x86/roundpd 1634 ROUNDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x09}, needArg: true}, 1635 // https://www.felixcloutier.com/x86/palignr 1636 PALIGNR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0f}, needArg: true}, 1637 // https://www.felixcloutier.com/x86/punpcklbw:punpcklwd:punpckldq:punpcklqdq 1638 PUNPCKLWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x61}}, 1639 // https://www.felixcloutier.com/x86/punpckhbw:punpckhwd:punpckhdq:punpckhqdq 1640 PUNPCKHWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x69}}, 1641 // https://www.felixcloutier.com/x86/pmulhuw 1642 PMULHUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe4}}, 1643 // https://www.felixcloutier.com/x86/pmuldq 1644 PMULDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x28}}, 1645 // https://www.felixcloutier.com/x86/pmulhrsw 1646 PMULHRSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x0b}}, 1647 // https://www.felixcloutier.com/x86/pmovsx 1648 PMOVSXBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x20}}, 1649 // https://www.felixcloutier.com/x86/pmovsx 1650 PMOVSXWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x23}}, 1651 // https://www.felixcloutier.com/x86/pmovsx 1652 PMOVSXDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x25}}, 1653 // https://www.felixcloutier.com/x86/pmovzx 1654 PMOVZXBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x30}}, 1655 // https://www.felixcloutier.com/x86/pmovzx 1656 PMOVZXWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x33}}, 1657 // https://www.felixcloutier.com/x86/pmovzx 1658 PMOVZXDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x35}}, 1659 // https://www.felixcloutier.com/x86/pmulhw 1660 
	PMULHW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe5}},
	// https://www.felixcloutier.com/x86/cmpps
	CMPEQPS: {opcode: []byte{0x0f, 0xc2}, needArg: true},
	// https://www.felixcloutier.com/x86/cmppd
	CMPEQPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc2}, needArg: true},
	// https://www.felixcloutier.com/x86/cvttps2dq
	CVTTPS2DQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5b}},
	// https://www.felixcloutier.com/x86/cvtdq2ps
	CVTDQ2PS: {opcode: []byte{0x0f, 0x5b}},
	// https://www.felixcloutier.com/x86/cvtdq2pd
	CVTDQ2PD: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xe6}},
	// https://www.felixcloutier.com/x86/cvtpd2ps
	CVTPD2PS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5a}},
	// https://www.felixcloutier.com/x86/cvtps2pd
	CVTPS2PD: {opcode: []byte{0x0f, 0x5a}},
	// https://www.felixcloutier.com/x86/movupd
	MOVUPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x10}},
	// https://www.felixcloutier.com/x86/shufps
	SHUFPS: {opcode: []byte{0x0f, 0xc6}, needArg: true},
	// https://www.felixcloutier.com/x86/pmaddwd
	PMADDWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf5}},
	// https://www.felixcloutier.com/x86/unpcklps
	UNPCKLPS: {opcode: []byte{0x0f, 0x14}},
	// https://www.felixcloutier.com/x86/packuswb
	PACKUSWB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x67}},
	// https://www.felixcloutier.com/x86/packsswb:packssdw
	PACKSSDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x6b}},
	// https://www.felixcloutier.com/x86/packusdw
	PACKUSDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x2b}},
	// https://www.felixcloutier.com/x86/pmaddubsw
	PMADDUBSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x04}},
	// https://www.felixcloutier.com/x86/cvttpd2dq
	CVTTPD2DQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe6}},
}

// registerToRegisterShiftOpcode maps shift/rotate instructions (whose shift
// amount lives implicitly in the CL register; see encodeRegisterToRegister)
// to their opcode, REX prefix and the "/n" ModRM:reg extension selecting the
// operation within the 0xd3 opcode group.
var registerToRegisterShiftOpcode = [instructionEnd]*struct {
	opcode         []byte
	rPrefix        rexPrefix
	modRMExtension byte
}{
	// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
	ROLL: {opcode: []byte{0xd3}},
	ROLQ: {opcode: []byte{0xd3}, rPrefix: rexPrefixW},
	RORL: {opcode: []byte{0xd3}, modRMExtension: 0b00_001_000},
	RORQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_001_000, rPrefix: rexPrefixW},
	// https://www.felixcloutier.com/x86/sal:sar:shl:shr
	SARL: {opcode: []byte{0xd3}, modRMExtension: 0b00_111_000},
	SARQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_111_000, rPrefix: rexPrefixW},
	SHLL: {opcode: []byte{0xd3}, modRMExtension: 0b00_100_000},
	SHLQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_100_000, rPrefix: rexPrefixW},
	SHRL: {opcode: []byte{0xd3}, modRMExtension: 0b00_101_000},
	SHRQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_101_000, rPrefix: rexPrefixW},
}

// encodeRegisterToRegister appends the machine code for the instruction n,
// whose source and destination operands are both registers, to buf.
// MOVL/MOVQ are special-cased because the opcode depends on whether each
// operand is a general-purpose or a vector (XMM) register; all other
// instructions are driven by the registerToRegisterOpcode and
// registerToRegisterShiftOpcode tables.
func (a *AssemblerImpl) encodeRegisterToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	// Alias for readability
	inst := n.instruction
	base := buf.Len()
	// Reserve the maximum instruction length up front, then truncate to the
	// actually-encoded length at the end.
	code := buf.Append(8)[:0]

	switch inst {
	case MOVL, MOVQ:
		var (
			opcode          []byte
			mandatoryPrefix byte
			srcOnModRMReg   bool
			rPrefix         rexPrefix
		)
		srcIsFloat, dstIsFloat := isVectorRegister(n.srcReg), isVectorRegister(n.dstReg)
		f2f := srcIsFloat && dstIsFloat
		if f2f {
			// https://www.felixcloutier.com/x86/movq
			opcode, mandatoryPrefix = []byte{0x0f, 0x7e}, 0xf3
		} else if srcIsFloat && !dstIsFloat {
			// https://www.felixcloutier.com/x86/movd:movq
			opcode, mandatoryPrefix, srcOnModRMReg = []byte{0x0f, 0x7e}, 0x66, true
		} else if !srcIsFloat && dstIsFloat {
			// https://www.felixcloutier.com/x86/movd:movq
			opcode, mandatoryPrefix, srcOnModRMReg = []byte{0x0f, 0x6e}, 0x66, false
		} else {
			// https://www.felixcloutier.com/x86/mov
			opcode, srcOnModRMReg = []byte{0x89}, true
		}

		rexPrefix, modRM, err := n.getRegisterToRegisterModRM(srcOnModRMReg)
		if err != nil {
			return err
		}
		rexPrefix |= rPrefix

		// XMM-to-XMM MOVQ (0xf3 0x0f 0x7e) already implies a 64-bit move, so
		// REX.W is only needed when a general-purpose register is involved.
		if inst == MOVQ && !f2f {
			rexPrefix |= rexPrefixW
		}
		if mandatoryPrefix != 0 {
			code = append(code, mandatoryPrefix)
		}
		if rexPrefix != rexPrefixNone {
			code = append(code, rexPrefix)
		}
		code = append(code, opcode...)
		code = append(code, modRM)
		buf.Truncate(base + len(code))
		return nil
	}

	if op := registerToRegisterOpcode[inst]; op != nil {
		rexPrefix, modRM, err := n.getRegisterToRegisterModRM(op.srcOnModRMReg)
		if err != nil {
			return err
		}
		rexPrefix |= op.rPrefix

		if op.isSrc8bit && RegSP <= n.srcReg && n.srcReg <= RegDI {
			// If an operand register is 8-bit length of SP, BP, DI, or SI register, we need to have the default prefix.
			// https://wiki.osdev.org/X86-64_Instruction_Encoding#Registers
			rexPrefix |= rexPrefixDefault
		}

		if op.mandatoryPrefix != 0 {
			code = append(code, op.mandatoryPrefix)
		}

		if rexPrefix != rexPrefixNone {
			code = append(code, rexPrefix)
		}
		code = append(code, op.opcode...)
		code = append(code, modRM)

		if op.needArg {
			code = append(code, n.arg)
		}
	} else if op := registerToRegisterShiftOpcode[inst]; op != nil {
		// Shift amount is implicitly CL; only the destination register is
		// encoded (in ModRM:rm), with the operation selected by the /n
		// extension in ModRM:reg.
		reg3bits, rexPrefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldRM)
		rexPrefix |= op.rPrefix
		if rexPrefix != rexPrefixNone {
			code = append(code, rexPrefix)
		}

		// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
		modRM := 0b11_000_000 |
			(op.modRMExtension) |
			reg3bits
		code = append(code, op.opcode...)
		code = append(code, modRM)
	} else {
		return errorEncodingUnsupported(n)
	}

	buf.Truncate(base + len(code))
	return nil
}

// encodeRegisterToMemory appends the machine code for the instruction n,
// whose source is a register and whose destination is a memory operand
// (base/index/scale/displacement taken from n), to buf.
// NOTE(review): CMPL/CMPQ here use opcode 0x3b (CMP r, r/m) rather than 0x39,
// i.e. the register is the minuend — this matches this assembler's operand
// convention; confirm against call sites before changing.
func (a *AssemblerImpl) encodeRegisterToMemory(buf asm.Buffer, n *nodeImpl) (err error) {
	rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(true)
	if err != nil {
		return err
	}

	var opcode []byte
	var mandatoryPrefix byte
	var isShiftInstruction bool
	var needArg bool
	switch n.instruction {
	case CMPL:
		// https://www.felixcloutier.com/x86/cmp
		opcode = []byte{0x3b}
	case CMPQ:
		// https://www.felixcloutier.com/x86/cmp
		rexPrefix |= rexPrefixW
		opcode = []byte{0x3b}
	case MOVB:
		// https://www.felixcloutier.com/x86/mov
		opcode = []byte{0x88}
		// 1 byte register operands need default prefix for the following registers.
		if n.srcReg >= RegSP && n.srcReg <= RegDI {
			rexPrefix |= rexPrefixDefault
		}
	case MOVL:
		if isVectorRegister(n.srcReg) {
			// https://www.felixcloutier.com/x86/movd:movq
			opcode = []byte{0x0f, 0x7e}
			mandatoryPrefix = 0x66
		} else {
			// https://www.felixcloutier.com/x86/mov
			opcode = []byte{0x89}
		}
	case MOVQ:
		if isVectorRegister(n.srcReg) {
			// https://www.felixcloutier.com/x86/movq
			opcode = []byte{0x0f, 0xd6}
			mandatoryPrefix = 0x66
		} else {
			// https://www.felixcloutier.com/x86/mov
			rexPrefix |= rexPrefixW
			opcode = []byte{0x89}
		}
	case MOVW:
		// https://www.felixcloutier.com/x86/mov
		// Note: Need 0x66 to indicate that the operand size is 16-bit.
		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Operand-size_and_address-size_override_prefix
		mandatoryPrefix = 0x66
		opcode = []byte{0x89}
	case SARL:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		modRM |= 0b00_111_000
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case SARQ:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		rexPrefix |= rexPrefixW
		modRM |= 0b00_111_000
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case SHLL:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		modRM |= 0b00_100_000
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case SHLQ:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		rexPrefix |= rexPrefixW
		modRM |= 0b00_100_000
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case SHRL:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		modRM |= 0b00_101_000
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case SHRQ:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		rexPrefix |= rexPrefixW
		modRM |= 0b00_101_000
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case ROLL:
		// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case ROLQ:
		// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
		rexPrefix |= rexPrefixW
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case RORL:
		// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
		modRM |= 0b00_001_000
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case RORQ:
		// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
		rexPrefix |= rexPrefixW
		opcode = []byte{0xd3}
		modRM |= 0b00_001_000
		isShiftInstruction = true
	case MOVDQU:
		// https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
		mandatoryPrefix = 0xf3
		opcode = []byte{0x0f, 0x7f}
	case PEXTRB: // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x3a, 0x14}
		needArg = true
	case PEXTRW: // https://www.felixcloutier.com/x86/pextrw
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x3a, 0x15}
		needArg = true
	case PEXTRD: // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x3a, 0x16}
		needArg = true
	case PEXTRQ: // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
		mandatoryPrefix = 0x66
		rexPrefix |= rexPrefixW // REX.W
		opcode = []byte{0x0f, 0x3a, 0x16}
		needArg = true
	case XCHGB:
		// https://www.felixcloutier.com/x86/xchg
		opcode = []byte{0x86}
		// 1 byte register operands need default prefix for the following registers.
		if n.srcReg >= RegSP && n.srcReg <= RegDI {
			rexPrefix |= rexPrefixDefault
		}
	case XCHGW:
		// https://www.felixcloutier.com/x86/xchg
		// Note: Need 0x66 to indicate that the operand size is 16-bit.
		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Operand-size_and_address-size_override_prefix
		mandatoryPrefix = 0x66
		opcode = []byte{0x87}
	case XCHGL:
		// https://www.felixcloutier.com/x86/xchg
		opcode = []byte{0x87}
	case XCHGQ:
		// https://www.felixcloutier.com/x86/xchg
		rexPrefix |= rexPrefixW
		opcode = []byte{0x87}
	case XADDB:
		// https://www.felixcloutier.com/x86/xadd
		opcode = []byte{0x0F, 0xC0}
		// 1 byte register operands need default prefix for the following registers.
		if n.srcReg >= RegSP && n.srcReg <= RegDI {
			rexPrefix |= rexPrefixDefault
		}
	case XADDW:
		// https://www.felixcloutier.com/x86/xadd
		// Note: Need 0x66 to indicate that the operand size is 16-bit.
		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Operand-size_and_address-size_override_prefix
		mandatoryPrefix = 0x66
		opcode = []byte{0x0F, 0xC1}
	case XADDL:
		// https://www.felixcloutier.com/x86/xadd
		opcode = []byte{0x0F, 0xC1}
	case XADDQ:
		// https://www.felixcloutier.com/x86/xadd
		rexPrefix |= rexPrefixW
		opcode = []byte{0x0F, 0xC1}
	case CMPXCHGB:
		// https://www.felixcloutier.com/x86/cmpxchg
		opcode = []byte{0x0F, 0xB0}
		// 1 byte register operands need default prefix for the following registers.
		if n.srcReg >= RegSP && n.srcReg <= RegDI {
			rexPrefix |= rexPrefixDefault
		}
	case CMPXCHGW:
		// https://www.felixcloutier.com/x86/cmpxchg
		// Note: Need 0x66 to indicate that the operand size is 16-bit.
		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Operand-size_and_address-size_override_prefix
		mandatoryPrefix = 0x66
		opcode = []byte{0x0F, 0xB1}
	case CMPXCHGL:
		// https://www.felixcloutier.com/x86/cmpxchg
		opcode = []byte{0x0F, 0xB1}
	case CMPXCHGQ:
		// https://www.felixcloutier.com/x86/cmpxchg
		rexPrefix |= rexPrefixW
		opcode = []byte{0x0F, 0xB1}
	default:
		return errorEncodingUnsupported(n)
	}

	if !isShiftInstruction {
		srcReg3Bits, prefix := register3bits(n.srcReg, registerSpecifierPositionModRMFieldReg)

		rexPrefix |= prefix
		modRM |= srcReg3Bits << 3 // Place the source register on ModRM:reg
	} else {
		// Variable shifts/rotates (0xd3 group) take their shift amount
		// implicitly from CL, so no register is placed on ModRM:reg.
		if n.srcReg != RegCX {
			return fmt.Errorf("shifting instruction %s require CX register as src but got %s", InstructionName(n.instruction), RegisterName(n.srcReg))
		}
	}

	base := buf.Len()
	// Reserve the maximum instruction length up front, then truncate below.
	code := buf.Append(16)[:0]

	if mandatoryPrefix != 0 {
		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Mandatory_prefix
		code = append(code, mandatoryPrefix)
	}

	if n.isLock() {
		code = append(code, lockPrefix)
	}

	if rexPrefix != rexPrefixNone {
		code = append(code, rexPrefix)
	}

	code = append(code, opcode...)
	code = append(code, modRM)

	if sbiExist {
		code = append(code, sbi)
	}

	if displacementWidth != 0 {
		code = appendConst(code, n.dstConst, displacementWidth)
	}

	if needArg {
		code = append(code, n.arg)
	}

	buf.Truncate(base + len(code))
	return
}

// encodeRegisterToConst appends the machine code for the instruction n,
// comparing the source register against the immediate n.dstConst, to buf.
// Only CMPL and CMPQ are currently supported.
func (a *AssemblerImpl) encodeRegisterToConst(buf asm.Buffer, n *nodeImpl) (err error) {
	regBits, prefix := register3bits(n.srcReg, registerSpecifierPositionModRMFieldRM)

	base := buf.Len()
	code := buf.Append(10)[:0]

	switch n.instruction {
	case CMPL, CMPQ:
		if n.instruction == CMPQ {
			prefix |= rexPrefixW
		}
		if prefix != rexPrefixNone {
			code = append(code, prefix)
		}
		is8bitConst := fitInSigned8bit(n.dstConst)
		// https://www.felixcloutier.com/x86/cmp
		if n.srcReg == RegAX && !is8bitConst {
			// Short form: CMP EAX/RAX, imm32 has a dedicated opcode without ModRM.
			code = append(code, 0x3d)
		} else {
			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
			modRM := 0b11_000_000 | // Specifying that operand is register.
				0b00_111_000 | // CMP with immediate needs "/7" extension.
				regBits
			if is8bitConst {
				code = append(code, 0x83, modRM)
			} else {
				code = append(code, 0x81, modRM)
			}
		}
	default:
		err = errorEncodingUnsupported(n)
	}

	// NOTE(review): the immediate is appended even when the switch above set
	// err; the buffer contents are only meaningful when err == nil.
	if fitInSigned8bit(n.dstConst) {
		code = append(code, byte(n.dstConst))
	} else {
		code = appendUint32(code, uint32(n.dstConst))
	}

	buf.Truncate(base + len(code))
	return
}

// finalizeReadInstructionAddressNode back-patches the 32-bit displacement of
// the RIP-relative LEAQ previously emitted by encodeReadInstructionAddress,
// once the binary offset of the target instruction is known.
func (a *AssemblerImpl) finalizeReadInstructionAddressNode(code []byte, n *nodeImpl) (err error) {
	// Find the target instruction node.
	// The target is the node immediately following the first node whose
	// instruction matches the registered "before target" marker instruction.
	targetNode := n
	for ; targetNode != nil; targetNode = targetNode.next {
		if targetNode.instruction == n.readInstructionAddressBeforeTargetInstruction {
			targetNode = targetNode.next
			break
		}
	}

	if targetNode == nil {
		return errors.New("BUG: target instruction not found for read instruction address")
	}

	offset := targetNode.OffsetInBinary() - (n.OffsetInBinary() + 7 /* 7 = the length of the LEAQ instruction */)
	// Offsets are unsigned, so a backward target (underflowed subtraction)
	// also trips this guard.
	if offset >= math.MaxInt32 {
		return errors.New("BUG: too large offset for LEAQ instruction")
	}

	// The displacement starts 3 bytes into the LEAQ (REX.W + opcode + ModRM).
	binary.LittleEndian.PutUint32(code[n.OffsetInBinary()+3:], uint32(int32(offset)))
	return nil
}

// encodeReadInstructionAddress emits "LEAQ [RIP + disp32], dstReg" with a zero
// placeholder displacement, and records n so finalizeReadInstructionAddressNode
// can patch the displacement after all offsets are assigned.
func (a *AssemblerImpl) encodeReadInstructionAddress(buf asm.Buffer, n *nodeImpl) error {
	dstReg3Bits, rexPrefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldReg)

	a.readInstructionAddressNodes = append(a.readInstructionAddressNodes, n)

	// https://www.felixcloutier.com/x86/lea
	opcode := byte(0x8d)
	rexPrefix |= rexPrefixW

	// https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing
	modRM := 0b00_000_101 | // Indicate "LEAQ [RIP + 32bit displacement], dstReg" encoding.
		(dstReg3Bits << 3) // Place the dstReg on ModRM:reg.

	code := buf.Append(7)
	code[0] = rexPrefix
	code[1] = opcode
	code[2] = modRM
	binary.LittleEndian.PutUint32(code[3:], 0) // Preserve
	return nil
}

// encodeMemoryToRegister appends the machine code for the instruction n,
// whose source is a memory operand and whose destination is a register, to
// buf. LEAQ nodes created by CompileReadInstructionAddress are delegated to
// encodeReadInstructionAddress.
func (a *AssemblerImpl) encodeMemoryToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	if n.instruction == LEAQ && n.readInstructionAddressBeforeTargetInstruction != NONE {
		return a.encodeReadInstructionAddress(buf, n)
	}

	rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(false)
	if err != nil {
		return err
	}

	dstReg3Bits, prefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldReg)
	rexPrefix |= prefix
	modRM |= dstReg3Bits << 3 // Place the destination register on ModRM:reg

	var mandatoryPrefix byte
	var opcode []byte
	var needArg bool

	switch n.instruction {
	case ADDL:
		// https://www.felixcloutier.com/x86/add
		opcode = []byte{0x03}
	case ADDQ:
		// https://www.felixcloutier.com/x86/add
		rexPrefix |= rexPrefixW
		opcode = []byte{0x03}
	case CMPL:
		// https://www.felixcloutier.com/x86/cmp
		opcode = []byte{0x39}
	case CMPQ:
		// https://www.felixcloutier.com/x86/cmp
		rexPrefix |= rexPrefixW
		opcode = []byte{0x39}
	case LEAQ:
		// https://www.felixcloutier.com/x86/lea
		rexPrefix |= rexPrefixW
		opcode = []byte{0x8d}
	case MOVBLSX:
		// https://www.felixcloutier.com/x86/movsx:movsxd
		opcode = []byte{0x0f, 0xbe}
	case MOVBLZX:
		// https://www.felixcloutier.com/x86/movzx
		opcode = []byte{0x0f, 0xb6}
	case MOVBQSX:
		// https://www.felixcloutier.com/x86/movsx:movsxd
		rexPrefix |= rexPrefixW
		opcode = []byte{0x0f, 0xbe}
	case MOVBQZX:
		// https://www.felixcloutier.com/x86/movzx
		rexPrefix |= rexPrefixW
		opcode = []byte{0x0f, 0xb6}
	case MOVLQSX:
		// https://www.felixcloutier.com/x86/movsx:movsxd
		rexPrefix |= rexPrefixW
		opcode = []byte{0x63}
	case MOVLQZX:
		// https://www.felixcloutier.com/x86/mov
		// Note: MOVLQZX means zero extending 32bit reg to 64-bit reg and
		// that is semantically equivalent to MOV 32bit to 32bit.
		opcode = []byte{0x8B}
	case MOVL:
		// Plain 32-bit load into a GPR, or MOVD into a vector register.
		if isVectorRegister(n.dstReg) {
			// https://www.felixcloutier.com/x86/movd:movq
			opcode = []byte{0x0f, 0x6e}
			mandatoryPrefix = 0x66
		} else {
			// https://www.felixcloutier.com/x86/mov
			opcode = []byte{0x8B}
		}
	case MOVQ:
		if isVectorRegister(n.dstReg) {
			// https://www.felixcloutier.com/x86/movq
			opcode = []byte{0x0f, 0x7e}
			mandatoryPrefix = 0xf3
		} else {
			// https://www.felixcloutier.com/x86/mov
			rexPrefix |= rexPrefixW
			opcode = []byte{0x8B}
		}
	case MOVWLSX:
		// https://www.felixcloutier.com/x86/movsx:movsxd
		opcode = []byte{0x0f, 0xbf}
	case MOVWLZX:
		// https://www.felixcloutier.com/x86/movzx
		opcode = []byte{0x0f, 0xb7}
	case MOVWQSX:
		// https://www.felixcloutier.com/x86/movsx:movsxd
		rexPrefix |= rexPrefixW
		opcode = []byte{0x0f, 0xbf}
	case MOVWQZX:
		// https://www.felixcloutier.com/x86/movzx
		rexPrefix |= rexPrefixW
		opcode = []byte{0x0f, 0xb7}
	case SUBQ:
		// https://www.felixcloutier.com/x86/sub
		rexPrefix |= rexPrefixW
		opcode = []byte{0x2b}
	case SUBSD:
		// https://www.felixcloutier.com/x86/subsd
		opcode = []byte{0x0f, 0x5c}
		mandatoryPrefix = 0xf2
	case SUBSS:
		// https://www.felixcloutier.com/x86/subss
		opcode = []byte{0x0f, 0x5c}
		mandatoryPrefix = 0xf3
	case UCOMISD:
		// https://www.felixcloutier.com/x86/ucomisd
		opcode = []byte{0x0f, 0x2e}
		mandatoryPrefix = 0x66
	case UCOMISS:
		// https://www.felixcloutier.com/x86/ucomiss
		opcode = []byte{0x0f, 0x2e}
	case MOVDQU:
		// https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
		mandatoryPrefix = 0xf3
		opcode = []byte{0x0f, 0x6f}
	case PMOVSXBW: // https://www.felixcloutier.com/x86/pmovsx
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x38, 0x20}
	case PMOVSXWD: // https://www.felixcloutier.com/x86/pmovsx
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x38, 0x23}
	case PMOVSXDQ: // https://www.felixcloutier.com/x86/pmovsx
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x38, 0x25}
	case PMOVZXBW: // https://www.felixcloutier.com/x86/pmovzx
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x38, 0x30}
	case PMOVZXWD: // https://www.felixcloutier.com/x86/pmovzx
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x38, 0x33}
	case PMOVZXDQ: // https://www.felixcloutier.com/x86/pmovzx
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x38, 0x35}
	case PINSRB: // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x3a, 0x20}
		needArg = true
	case PINSRW: // https://www.felixcloutier.com/x86/pinsrw
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0xc4}
		needArg = true
	case PINSRD: // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x3a, 0x22}
		needArg = true
	case PINSRQ: // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
		rexPrefix |= rexPrefixW
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x3a, 0x22}
		needArg = true
	default:
		return errorEncodingUnsupported(n)
	}

	base := buf.Len()
	// Reserve the maximum instruction length up front, then truncate below.
	code := buf.Append(16)[:0]

	if mandatoryPrefix != 0 {
		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Mandatory_prefix
		code = append(code, mandatoryPrefix)
	}

	if rexPrefix != rexPrefixNone {
		code = append(code, rexPrefix)
	}

	code = append(code, opcode...)
	code = append(code, modRM)

	if sbiExist {
		code = append(code, sbi)
	}

	if displacementWidth != 0 {
		code = appendConst(code, n.srcConst, displacementWidth)
	}

	if needArg {
		code = append(code, n.arg)
	}

	buf.Truncate(base + len(code))
	return
}

// encodeConstToRegister appends the machine code for the instruction n, whose
// source is the immediate n.srcConst and whose destination is a register, to
// buf. Vector-shift instructions require an XMM destination; everything else
// requires a general-purpose register, and the immediate width is validated
// per instruction before encoding.
func (a *AssemblerImpl) encodeConstToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	regBits, rexPrefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldRM)

	isFloatReg := isVectorRegister(n.dstReg)
	switch n.instruction {
	case PSLLD, PSLLQ, PSRLD, PSRLQ, PSRAW, PSRLW, PSLLW, PSRAD:
		if !isFloatReg {
			return fmt.Errorf("%s needs float register but got %s", InstructionName(n.instruction), RegisterName(n.dstReg))
		}
	default:
		if isFloatReg {
			return fmt.Errorf("%s needs int register but got %s", InstructionName(n.instruction), RegisterName(n.dstReg))
		}
	}

	// Immediate-range validation: only MOVQ may carry a full 64-bit constant.
	if n.instruction != MOVQ && !fitIn32bit(n.srcConst) {
		return fmt.Errorf("constant must fit in 32-bit integer for %s, but got %d", InstructionName(n.instruction), n.srcConst)
	} else if (n.instruction == SHLQ || n.instruction == SHRQ) && (n.srcConst < 0 || n.srcConst > math.MaxUint8) {
		return fmt.Errorf("constant must fit in positive 8-bit integer for %s, but got %d", InstructionName(n.instruction), n.srcConst)
	} else if (n.instruction == PSLLD ||
		n.instruction == PSLLQ ||
		n.instruction == PSRLD ||
		n.instruction == PSRLQ) && (n.srcConst < math.MinInt8 || n.srcConst > math.MaxInt8) {
		return fmt.Errorf("constant must fit in signed 8-bit integer for %s, but got %d", InstructionName(n.instruction), n.srcConst)
	}

	base := buf.Len()
	// Reserve the maximum instruction length up front, then truncate at the end.
	code := buf.Append(32)[:0]

	isSigned8bitConst := fitInSigned8bit(n.srcConst)
	switch inst := n.instruction; inst {
	case ADDQ:
		// https://www.felixcloutier.com/x86/add
		rexPrefix |= rexPrefixW
		if n.dstReg == RegAX && !isSigned8bitConst {
			// Short form: ADD RAX, imm32 has a dedicated opcode without ModRM.
			code = append(code, rexPrefix, 0x05)
		} else {
			modRM := 0b11_000_000 | // Specifying that operand is register.
				regBits
			if isSigned8bitConst {
				code = append(code, rexPrefix, 0x83, modRM)
			} else {
				code = append(code, rexPrefix, 0x81, modRM)
			}
		}
		if isSigned8bitConst {
			code = append(code, byte(n.srcConst))
		} else {
			code = appendUint32(code, uint32(n.srcConst))
		}
	case ANDQ:
		// https://www.felixcloutier.com/x86/and
		rexPrefix |= rexPrefixW
		if n.dstReg == RegAX && !isSigned8bitConst {
			// Short form: AND RAX, imm32 has a dedicated opcode without ModRM.
			code = append(code, rexPrefix, 0x25)
		} else {
			modRM := 0b11_000_000 | // Specifying that operand is register.
				0b00_100_000 | // AND with immediate needs "/4" extension.
				regBits
			if isSigned8bitConst {
				code = append(code, rexPrefix, 0x83, modRM)
			} else {
				code = append(code, rexPrefix, 0x81, modRM)
			}
		}
		if fitInSigned8bit(n.srcConst) {
			code = append(code, byte(n.srcConst))
		} else {
			code = appendUint32(code, uint32(n.srcConst))
		}
	case TESTQ:
		// https://www.felixcloutier.com/x86/test
		rexPrefix |= rexPrefixW
		if n.dstReg == RegAX && !isSigned8bitConst {
			code = append(code, rexPrefix, 0xa9)
		} else {
			modRM := 0b11_000_000 | // Specifying that operand is register
				regBits
			code = append(code, rexPrefix, 0xf7, modRM)
		}
		// TEST has no sign-extended imm8 form; always encode a 32-bit immediate.
		code = appendUint32(code, uint32(n.srcConst))
	case MOVL:
		// https://www.felixcloutier.com/x86/mov
		if rexPrefix != rexPrefixNone {
			code = append(code, rexPrefix)
		}
		code = append(code, 0xb8|regBits)
		code = appendUint32(code, uint32(n.srcConst))
	case MOVQ:
		// https://www.felixcloutier.com/x86/mov
		if fitIn32bit(n.srcConst) {
			if n.srcConst > math.MaxInt32 {
				// Cannot sign-extend a 32-bit immediate to reproduce this
				// value; use MOV r32, imm32, which zero-extends to 64 bits.
				if rexPrefix != rexPrefixNone {
					code = append(code, rexPrefix)
				}
				code = append(code, 0xb8|regBits)
			} else {
				// MOV r/m64, imm32 (sign-extended).
				rexPrefix |= rexPrefixW
				modRM := 0b11_000_000 | // Specifying that operand is register.
					regBits
				code = append(code, rexPrefix, 0xc7, modRM)
			}
			code = appendUint32(code, uint32(n.srcConst))
		} else {
			// Full 64-bit immediate: MOV r64, imm64.
			rexPrefix |= rexPrefixW
			code = append(code, rexPrefix, 0xb8|regBits)
			code = appendUint64(code, uint64(n.srcConst))
		}
	case SHLQ:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		rexPrefix |= rexPrefixW
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_100_000 | // SHL with immediate needs "/4" extension.
			regBits
		if n.srcConst == 1 {
			// Shift-by-one has a shorter dedicated encoding.
			code = append(code, rexPrefix, 0xd1, modRM)
		} else {
			code = append(code, rexPrefix, 0xc1, modRM, byte(n.srcConst))
		}
	case SHRQ:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		rexPrefix |= rexPrefixW
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_101_000 | // SHR with immediate needs "/5" extension.
			regBits
		if n.srcConst == 1 {
			// Shift-by-one has a shorter dedicated encoding.
			code = append(code, rexPrefix, 0xd1, modRM)
		} else {
			code = append(code, rexPrefix, 0xc1, modRM, byte(n.srcConst))
		}
	case PSLLD:
		// https://www.felixcloutier.com/x86/psllw:pslld:psllq
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_110_000 | // PSLL with immediate needs "/6" extension.
			regBits
		if rexPrefix != rexPrefixNone {
			code = append(code, 0x66, rexPrefix, 0x0f, 0x72, modRM, byte(n.srcConst))
		} else {
			code = append(code, 0x66, 0x0f, 0x72, modRM, byte(n.srcConst))
		}
	case PSLLQ:
		// https://www.felixcloutier.com/x86/psllw:pslld:psllq
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_110_000 | // PSLL with immediate needs "/6" extension.
			regBits
		if rexPrefix != rexPrefixNone {
			code = append(code, 0x66, rexPrefix, 0x0f, 0x73, modRM, byte(n.srcConst))
		} else {
			code = append(code, 0x66, 0x0f, 0x73, modRM, byte(n.srcConst))
		}
	case PSRLD:
		// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
		// https://www.felixcloutier.com/x86/psllw:pslld:psllq
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_010_000 | // PSRL with immediate needs "/2" extension.
			regBits
		if rexPrefix != rexPrefixNone {
			code = append(code, 0x66, rexPrefix, 0x0f, 0x72, modRM, byte(n.srcConst))
		} else {
			code = append(code, 0x66, 0x0f, 0x72, modRM, byte(n.srcConst))
		}
	case PSRLQ:
		// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_010_000 | // PSRL with immediate needs "/2" extension.
			regBits
		if rexPrefix != rexPrefixNone {
			code = append(code, 0x66, rexPrefix, 0x0f, 0x73, modRM, byte(n.srcConst))
		} else {
			code = append(code, 0x66, 0x0f, 0x73, modRM, byte(n.srcConst))
		}
	case PSRAW, PSRAD:
		// https://www.felixcloutier.com/x86/psraw:psrad:psraq
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_100_000 | // PSRAW with immediate needs "/4" extension.
			regBits
		code = append(code, 0x66)
		if rexPrefix != rexPrefixNone {
			code = append(code, rexPrefix)
		}

		var op byte
		if inst == PSRAD {
			op = 0x72
		} else { // PSRAW
			op = 0x71
		}

		code = append(code, 0x0f, op, modRM, byte(n.srcConst))
	case PSRLW:
		// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_010_000 | // PSRLW with immediate needs "/2" extension.
			regBits
		code = append(code, 0x66)
		if rexPrefix != rexPrefixNone {
			code = append(code, rexPrefix)
		}
		code = append(code, 0x0f, 0x71, modRM, byte(n.srcConst))
	case PSLLW:
		// https://www.felixcloutier.com/x86/psllw:pslld:psllq
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_110_000 | // PSLLW with immediate needs "/6" extension.
			regBits
		code = append(code, 0x66)
		if rexPrefix != rexPrefixNone {
			code = append(code, rexPrefix)
		}
		code = append(code, 0x0f, 0x71, modRM, byte(n.srcConst))
	case XORL, XORQ:
		// https://www.felixcloutier.com/x86/xor
		if inst == XORQ {
			rexPrefix |= rexPrefixW
		}
		if rexPrefix != rexPrefixNone {
			code = append(code, rexPrefix)
		}
		if n.dstReg == RegAX && !isSigned8bitConst {
			// Short form: XOR EAX/RAX, imm32 has a dedicated opcode without ModRM.
			code = append(code, 0x35)
		} else {
			modRM := 0b11_000_000 | // Specifying that operand is register.
				0b00_110_000 | // XOR with immediate needs "/6" extension.
				regBits
			if isSigned8bitConst {
				code = append(code, 0x83, modRM)
			} else {
				code = append(code, 0x81, modRM)
			}
		}
		if fitInSigned8bit(n.srcConst) {
			code = append(code, byte(n.srcConst))
		} else {
			code = appendUint32(code, uint32(n.srcConst))
		}
	default:
		err = errorEncodingUnsupported(n)
	}

	buf.Truncate(base + len(code))
	return
}

// encodeMemoryToConst appends the machine code for the instruction n,
// comparing a memory operand against the immediate n.dstConst, to buf.
// Only CMPL is currently supported; the memory displacement comes from
// n.srcConst while the immediate comes from n.dstConst.
func (a *AssemblerImpl) encodeMemoryToConst(buf asm.Buffer, n *nodeImpl) (err error) {
	if !fitIn32bit(n.dstConst) {
		return fmt.Errorf("too large target const %d for %s", n.dstConst, InstructionName(n.instruction))
	}

	rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(false)
	if err != nil {
		return err
	}

	// Alias for readability.
	c := n.dstConst

	var opcode, constWidth byte
	switch n.instruction {
	case CMPL:
		// https://www.felixcloutier.com/x86/cmp
		if fitInSigned8bit(c) {
			opcode = 0x83
			constWidth = 8
		} else {
			opcode = 0x81
			constWidth = 32
		}
		modRM |= 0b00_111_000 // CMP with immediate needs "/7" extension.
	default:
		return errorEncodingUnsupported(n)
	}

	base := buf.Len()
	// Reserve the maximum instruction length up front, then truncate below.
	code := buf.Append(20)[:0]

	if rexPrefix != rexPrefixNone {
		code = append(code, rexPrefix)
	}

	code = append(code, opcode, modRM)

	if sbiExist {
		code = append(code, sbi)
	}

	if displacementWidth != 0 {
		code = appendConst(code, n.srcConst, displacementWidth)
	}

	code = appendConst(code, c, constWidth)
	buf.Truncate(base + len(code))
	return
}

// encodeConstToMemory appends the machine code for the instruction n, whose
// source is an immediate and whose destination is a memory operand, to buf.
func (a *AssemblerImpl) encodeConstToMemory(buf asm.Buffer, n *nodeImpl) (err error) {
	rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(true)
	if err != nil {
		return err
	}

	// Alias for readability.
2622 inst := n.instruction 2623 c := n.srcConst 2624 2625 if inst == MOVB && !fitInSigned8bit(c) { 2626 return fmt.Errorf("too large load target const %d for MOVB", c) 2627 } else if !fitIn32bit(c) { 2628 return fmt.Errorf("too large load target const %d for %s", c, InstructionName(n.instruction)) 2629 } 2630 2631 var constWidth, opcode byte 2632 switch inst { 2633 case MOVB: 2634 opcode = 0xc6 2635 constWidth = 8 2636 case MOVL: 2637 opcode = 0xc7 2638 constWidth = 32 2639 case MOVQ: 2640 rexPrefix |= rexPrefixW 2641 opcode = 0xc7 2642 constWidth = 32 2643 default: 2644 return errorEncodingUnsupported(n) 2645 } 2646 2647 base := buf.Len() 2648 code := buf.Append(20)[:0] 2649 2650 if rexPrefix != rexPrefixNone { 2651 code = append(code, rexPrefix) 2652 } 2653 2654 code = append(code, opcode, modRM) 2655 2656 if sbiExist { 2657 code = append(code, sbi) 2658 } 2659 2660 if displacementWidth != 0 { 2661 code = appendConst(code, n.dstConst, displacementWidth) 2662 } 2663 2664 code = appendConst(code, c, constWidth) 2665 2666 buf.Truncate(base + len(code)) 2667 return 2668 } 2669 2670 func appendUint32(code []byte, v uint32) []byte { 2671 b := [4]byte{} 2672 binary.LittleEndian.PutUint32(b[:], uint32(v)) 2673 return append(code, b[:]...) 2674 } 2675 2676 func appendUint64(code []byte, v uint64) []byte { 2677 b := [8]byte{} 2678 binary.LittleEndian.PutUint64(b[:], uint64(v)) 2679 return append(code, b[:]...) 
}

// appendConst appends the constant v to code in little-endian order, using the
// given bit width: 8 emits one byte, 32 emits four bytes, anything else emits
// the full eight bytes.
func appendConst(code []byte, v int64, length byte) []byte {
	switch length {
	case 8:
		return append(code, byte(v))
	case 32:
		return appendUint32(code, uint32(v))
	default:
		return appendUint64(code, uint64(v))
	}
}

// getMemoryLocation computes the ModR/M, SIB and displacement encoding of this
// node's memory operand. If dstMem is true, the destination fields
// (dstReg/dstConst/dstMemIndex/dstMemScale) describe the memory location,
// otherwise the source fields do.
//
// Returned values:
//   - p: REX prefix bits required by the base/index registers (rexPrefixNone if none).
//   - modRM: the mod and r/m fields; callers OR in the reg/opcode-extension field.
//   - sbi, sbiExist: the SIB byte and whether it must be emitted.
//   - displacementWidth: 0, 8 or 32 — width of the displacement the caller must append.
func (n *nodeImpl) getMemoryLocation(dstMem bool) (p rexPrefix, modRM byte, sbi byte, sbiExist bool, displacementWidth byte, err error) {
	var baseReg, indexReg asm.Register
	var offset asm.ConstantValue
	var scale byte
	if dstMem {
		baseReg, offset, indexReg, scale = n.dstReg, n.dstConst, n.dstMemIndex, n.dstMemScale
	} else {
		baseReg, offset, indexReg, scale = n.srcReg, n.srcConst, n.srcMemIndex, n.srcMemScale
	}

	if !fitIn32bit(offset) {
		err = errors.New("offset does not fit in 32-bit integer")
		return
	}

	if baseReg == asm.NilRegister && indexReg != asm.NilRegister {
		// [(index*scale) + displacement] addressing is possible, but we haven't used it for now.
		// err is delivered via the final return of this function.
		err = errors.New("addressing without base register but with index is not implemented")
	} else if baseReg == asm.NilRegister {
		// Absolute [displacement] addressing: SIB with base=0b101/index=0b100 means
		// "no base, no index", and the 32-bit displacement is mandatory.
		modRM = 0b00_000_100 // Indicate that the memory location is specified by SIB.
		sbi, sbiExist = byte(0b00_100_101), true
		displacementWidth = 32
	} else if indexReg == asm.NilRegister {
		modRM, p = register3bits(baseReg, registerSpecifierPositionModRMFieldRM)

		// Create ModR/M byte so that this instruction takes [R/M + displacement] operand if displacement !=0
		// and otherwise [R/M].
		withoutDisplacement := offset == 0 &&
			// If the base register is R13 or BP, we must keep [R/M + displacement] even when the
			// displacement is zero, because the plain [R/M] form is not defined for these two registers.
			// https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing
			baseReg != RegR13 && baseReg != RegBP
		if withoutDisplacement {
			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
			modRM |= 0b00_000_000 // Specifying that operand is memory without displacement.
			displacementWidth = 0
		} else if fitInSigned8bit(offset) {
			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
			modRM |= 0b01_000_000 // Specifying that operand is memory + 8bit displacement.
			displacementWidth = 8
		} else {
			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
			modRM |= 0b10_000_000 // Specifying that operand is memory + 32bit displacement.
			displacementWidth = 32
		}

		// For the SP and R12 registers, the encoding is [SIB + displacement] if the const is
		// non-zero, otherwise [SIB].
		// https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing
		//
		// Therefore we emit the SIB byte before the const so that [SIB + displacement] ends up [register + displacement].
		// https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing_2
		if baseReg == RegSP || baseReg == RegR12 {
			sbi, sbiExist = byte(0b00_100_100), true
		}
	} else {
		if indexReg == RegSP {
			err = errors.New("SP cannot be used for SIB index")
			return
		}

		modRM = 0b00_000_100 // Indicate that the memory location is specified by SIB.

		withoutDisplacement := offset == 0 &&
			// For R13 and BP, base registers cannot be encoded with the "without displacement" mod (i.e. 0b00 mod).
			baseReg != RegR13 && baseReg != RegBP
		if withoutDisplacement {
			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
			modRM |= 0b00_000_000 // Specifying that operand is SIB without displacement.
			displacementWidth = 0
		} else if fitInSigned8bit(offset) {
			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
			modRM |= 0b01_000_000 // Specifying that operand is SIB + 8bit displacement.
			displacementWidth = 8
		} else {
			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
			modRM |= 0b10_000_000 // Specifying that operand is SIB + 32bit displacement.
			displacementWidth = 32
		}

		var baseRegBits byte
		baseRegBits, p = register3bits(baseReg, registerSpecifierPositionModRMFieldRM)

		var indexRegBits byte
		var indexRegPrefix rexPrefix
		indexRegBits, indexRegPrefix = register3bits(indexReg, registerSpecifierPositionSIBIndex)
		p |= indexRegPrefix

		// SIB byte layout: scale(2 bits) | index(3 bits) | base(3 bits).
		sbi, sbiExist = baseRegBits|(indexRegBits<<3), true
		switch scale {
		case 1:
			sbi |= 0b00_000_000
		case 2:
			sbi |= 0b01_000_000
		case 4:
			sbi |= 0b10_000_000
		case 8:
			sbi |= 0b11_000_000
		default:
			err = fmt.Errorf("scale in SIB must be one of 1, 2, 4, 8 but got %d", scale)
			return
		}

	}
	return
}

// getRegisterToRegisterModRM builds the REX prefix and ModR/M byte for a
// register-to-register instruction. If srcOnModRMReg is true, srcReg is placed
// in ModRM.reg and dstReg in ModRM.r/m; otherwise the placement is reversed.
//
// TODO: srcOnModRMReg can be deleted after golang-asm removal. This is necessary to match our implementation
// with golang-asm, but in practice, there are equivalent opcodes to always have src on ModRM:reg without ambiguity.
func (n *nodeImpl) getRegisterToRegisterModRM(srcOnModRMReg bool) (rexPrefix, modRM byte, err error) {
	var reg3bits, rm3bits byte
	if srcOnModRMReg {
		reg3bits, rexPrefix = register3bits(n.srcReg,
			// Indicate that srcReg will be specified by ModRM:reg.
			registerSpecifierPositionModRMFieldReg)

		var dstRexPrefix byte
		rm3bits, dstRexPrefix = register3bits(n.dstReg,
			// Indicate that dstReg will be specified by ModRM:r/m.
			registerSpecifierPositionModRMFieldRM)
		rexPrefix |= dstRexPrefix
	} else {
		rm3bits, rexPrefix = register3bits(n.srcReg,
			// Indicate that srcReg will be specified by ModRM:r/m.
			registerSpecifierPositionModRMFieldRM)

		var dstRexPrefix byte
		reg3bits, dstRexPrefix = register3bits(n.dstReg,
			// Indicate that dstReg will be specified by ModRM:reg.
			registerSpecifierPositionModRMFieldReg)
		rexPrefix |= dstRexPrefix
	}

	// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
	modRM = 0b11_000_000 | // Specifying that dst operand is register.
		(reg3bits << 3) |
		rm3bits

	return
}

// rexPrefix represents the REX prefix https://wiki.osdev.org/X86-64_Instruction_Encoding#REX_prefix
type rexPrefix = byte

// REX prefixes are independent of each other and can be combined with OR.
const (
	rexPrefixNone    rexPrefix = 0x0000_0000 // Indicates that the instruction doesn't need RexPrefix.
	rexPrefixDefault rexPrefix = 0b0100_0000
	rexPrefixW                 = 0b0000_1000 | rexPrefixDefault // REX.W
	rexPrefixR                 = 0b0000_0100 | rexPrefixDefault // REX.R
	rexPrefixX                 = 0b0000_0010 | rexPrefixDefault // REX.X
	rexPrefixB                 = 0b0000_0001 | rexPrefixDefault // REX.B
)

// lockPrefix represents the LOCK prefix https://wiki.osdev.org/X86-64_Instruction_Encoding#Legacy_Prefixes
const lockPrefix = 0xF0

// registerSpecifierPosition represents the position in the instruction bytes where an operand register is placed.
2852 type registerSpecifierPosition byte 2853 2854 const ( 2855 registerSpecifierPositionModRMFieldReg registerSpecifierPosition = iota 2856 registerSpecifierPositionModRMFieldRM 2857 registerSpecifierPositionSIBIndex 2858 ) 2859 2860 var regInfo = [...]struct { 2861 bits byte 2862 needRex bool 2863 }{ 2864 RegAX: {bits: 0b000}, 2865 RegCX: {bits: 0b001}, 2866 RegDX: {bits: 0b010}, 2867 RegBX: {bits: 0b011}, 2868 RegSP: {bits: 0b100}, 2869 RegBP: {bits: 0b101}, 2870 RegSI: {bits: 0b110}, 2871 RegDI: {bits: 0b111}, 2872 RegR8: {bits: 0b000, needRex: true}, 2873 RegR9: {bits: 0b001, needRex: true}, 2874 RegR10: {bits: 0b010, needRex: true}, 2875 RegR11: {bits: 0b011, needRex: true}, 2876 RegR12: {bits: 0b100, needRex: true}, 2877 RegR13: {bits: 0b101, needRex: true}, 2878 RegR14: {bits: 0b110, needRex: true}, 2879 RegR15: {bits: 0b111, needRex: true}, 2880 RegX0: {bits: 0b000}, 2881 RegX1: {bits: 0b001}, 2882 RegX2: {bits: 0b010}, 2883 RegX3: {bits: 0b011}, 2884 RegX4: {bits: 0b100}, 2885 RegX5: {bits: 0b101}, 2886 RegX6: {bits: 0b110}, 2887 RegX7: {bits: 0b111}, 2888 RegX8: {bits: 0b000, needRex: true}, 2889 RegX9: {bits: 0b001, needRex: true}, 2890 RegX10: {bits: 0b010, needRex: true}, 2891 RegX11: {bits: 0b011, needRex: true}, 2892 RegX12: {bits: 0b100, needRex: true}, 2893 RegX13: {bits: 0b101, needRex: true}, 2894 RegX14: {bits: 0b110, needRex: true}, 2895 RegX15: {bits: 0b111, needRex: true}, 2896 } 2897 2898 func register3bits( 2899 reg asm.Register, 2900 registerSpecifierPosition registerSpecifierPosition, 2901 ) (bits byte, prefix rexPrefix) { 2902 info := regInfo[reg] 2903 bits = info.bits 2904 if info.needRex { 2905 // https://wiki.osdev.org/X86-64_Instruction_Encoding#REX_prefix 2906 switch registerSpecifierPosition { 2907 case registerSpecifierPositionModRMFieldReg: 2908 prefix = rexPrefixR 2909 case registerSpecifierPositionModRMFieldRM: 2910 prefix = rexPrefixB 2911 case registerSpecifierPositionSIBIndex: 2912 prefix = rexPrefixX 2913 } 2914 } 2915 
return 2916 } 2917 2918 func fitIn32bit(v int64) bool { 2919 return math.MinInt32 <= v && v <= math.MaxUint32 2920 } 2921 2922 func fitInSigned8bit(v int64) bool { 2923 return math.MinInt8 <= v && v <= math.MaxInt8 2924 } 2925 2926 func isVectorRegister(r asm.Register) bool { 2927 return RegX0 <= r && r <= RegX15 2928 }