package amd64

import (
	"encoding/binary"
	"errors"
	"fmt"
	"math"

	"github.com/bananabytelabs/wazero/internal/asm"
)

// nodeImpl implements asm.Node for amd64.
type nodeImpl struct {
	// jumpTarget holds the target node in the linked list for the jump-kind instruction.
	jumpTarget *nodeImpl

	// prev and next hold the prev/next node from this node in the assembled linked list.
	prev, next *nodeImpl

	// forwardJumpOrigins hold all the nodes trying to jump into this node as a
	// singly linked list. In other words, all the nodes with .jumpTarget == this.
	forwardJumpOrigins *nodeImpl

	staticConst *asm.StaticConst

	dstConst       asm.ConstantValue
	offsetInBinary asm.NodeOffsetInBinary
	srcConst       asm.ConstantValue
	instruction    asm.Instruction

	// readInstructionAddressBeforeTargetInstruction holds the instruction right before the target of
	// read instruction address instruction. See asm.assemblerBase.CompileReadInstructionAddress.
	readInstructionAddressBeforeTargetInstruction asm.Instruction
	flag                                          nodeFlag
	types                                         operandTypes
	srcReg, dstReg                                asm.Register
	srcMemIndex, dstMemIndex                      asm.Register
	srcMemScale, dstMemScale                      byte
	arg                                           byte

	// staticConstReferrersAdded true if this node is already added into AssemblerImpl.staticConstReferrers.
	// Only used when staticConst is not nil. Through re-assembly, we might end up adding multiple times which causes unnecessary
	// allocations, so we use this flag to do it once.
	staticConstReferrersAdded bool
}

// nodeFlag is a bitset of per-node boolean properties used during encoding.
type nodeFlag byte

const (
	// nodeFlagInitializedForEncoding is always set to indicate that node is already initialized. Notably, this is used to judge
	// whether a jump is backward or forward before encoding.
	nodeFlagInitializedForEncoding nodeFlag = 1 << iota
	nodeFlagBackwardJump
	// nodeFlagShortForwardJump is set to false by default and only used by forward branch jumps, which means .jumpTarget != nil and
	// the target node is encoded after this node. False by default means that we Encode all the jumps with jumpTarget
	// as short jump (i.e. relative signed 8-bit integer offset jump) and try to Encode as small as possible.
	nodeFlagShortForwardJump
)

// isInitializedForEncoding reports whether this node was already visited by
// AssemblerImpl.initializeNodesForEncoding.
func (n *nodeImpl) isInitializedForEncoding() bool {
	return n.flag&nodeFlagInitializedForEncoding != 0
}

// isJumpNode reports whether this node is a jump-kind instruction with a target assigned.
func (n *nodeImpl) isJumpNode() bool {
	return n.jumpTarget != nil
}

// isBackwardJump reports whether this is a jump whose target was encoded before this node.
func (n *nodeImpl) isBackwardJump() bool {
	return n.isJumpNode() && (n.flag&nodeFlagBackwardJump != 0)
}

// isForwardJump reports whether this is a jump whose target is encoded after this node.
func (n *nodeImpl) isForwardJump() bool {
	return n.isJumpNode() && (n.flag&nodeFlagBackwardJump == 0)
}

// isForwardShortJump reports whether this forward jump is (still) assumed to fit
// in a signed 8-bit relative displacement.
func (n *nodeImpl) isForwardShortJump() bool {
	return n.isForwardJump() && n.flag&nodeFlagShortForwardJump != 0
}

// AssignJumpTarget implements asm.Node.AssignJumpTarget.
func (n *nodeImpl) AssignJumpTarget(target asm.Node) {
	n.jumpTarget = target.(*nodeImpl)
}

// AssignDestinationConstant implements asm.Node.AssignDestinationConstant.
func (n *nodeImpl) AssignDestinationConstant(value asm.ConstantValue) {
	n.dstConst = value
}

// AssignSourceConstant implements asm.Node.AssignSourceConstant.
func (n *nodeImpl) AssignSourceConstant(value asm.ConstantValue) {
	n.srcConst = value
}

// OffsetInBinary implements asm.Node.OffsetInBinary.
func (n *nodeImpl) OffsetInBinary() asm.NodeOffsetInBinary {
	return n.offsetInBinary
}

// String implements fmt.Stringer.
//
// This is for debugging purposes, and the format is almost the same as the AT&T assembly syntax,
// meaning that this should look like "INSTRUCTION ${from}, ${to}" where each operand
// might be embraced by '[]' to represent the memory location.
func (n *nodeImpl) String() (ret string) {
	instName := InstructionName(n.instruction)
	switch n.types {
	case operandTypesNoneToNone:
		ret = instName
	case operandTypesNoneToRegister:
		ret = fmt.Sprintf("%s %s", instName, RegisterName(n.dstReg))
	case operandTypesNoneToMemory:
		if n.dstMemIndex != asm.NilRegister {
			ret = fmt.Sprintf("%s [%s + 0x%x + %s*0x%x]", instName,
				RegisterName(n.dstReg), n.dstConst, RegisterName(n.dstMemIndex), n.dstMemScale)
		} else {
			ret = fmt.Sprintf("%s [%s + 0x%x]", instName, RegisterName(n.dstReg), n.dstConst)
		}
	case operandTypesNoneToBranch:
		ret = fmt.Sprintf("%s {%v}", instName, n.jumpTarget)
	case operandTypesRegisterToNone:
		ret = fmt.Sprintf("%s %s", instName, RegisterName(n.srcReg))
	case operandTypesRegisterToRegister:
		ret = fmt.Sprintf("%s %s, %s", instName, RegisterName(n.srcReg), RegisterName(n.dstReg))
	case operandTypesRegisterToMemory:
		if n.dstMemIndex != asm.NilRegister {
			ret = fmt.Sprintf("%s %s, [%s + 0x%x + %s*0x%x]", instName, RegisterName(n.srcReg),
				RegisterName(n.dstReg), n.dstConst, RegisterName(n.dstMemIndex), n.dstMemScale)
		} else {
			ret = fmt.Sprintf("%s %s, [%s + 0x%x]", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.dstConst)
		}
	case operandTypesRegisterToConst:
		ret = fmt.Sprintf("%s %s, 0x%x", instName, RegisterName(n.srcReg), n.dstConst)
	case operandTypesMemoryToRegister:
		if n.srcMemIndex != asm.NilRegister {
			ret = fmt.Sprintf("%s [%s + %#x + %s*%#x], %s", instName,
				RegisterName(n.srcReg), n.srcConst, RegisterName(n.srcMemIndex), n.srcMemScale, RegisterName(n.dstReg))
		} else {
			ret = fmt.Sprintf("%s [%s + 0x%x], %s",
				instName, RegisterName(n.srcReg), n.srcConst, RegisterName(n.dstReg))
		}
	case operandTypesMemoryToConst:
		if n.srcMemIndex != asm.NilRegister {
			ret = fmt.Sprintf("%s [%s + %#x + %s*0x%x], 0x%x", instName,
				RegisterName(n.srcReg), n.srcConst, RegisterName(n.srcMemIndex), n.srcMemScale, n.dstConst)
		} else {
			ret = fmt.Sprintf("%s [%s + %#x], 0x%x", instName, RegisterName(n.srcReg), n.srcConst, n.dstConst)
		}
	case operandTypesConstToMemory:
		if n.dstMemIndex != asm.NilRegister {
			ret = fmt.Sprintf("%s 0x%x, [%s + 0x%x + %s*0x%x]", instName, n.srcConst,
				RegisterName(n.dstReg), n.dstConst, RegisterName(n.dstMemIndex), n.dstMemScale)
		} else {
			ret = fmt.Sprintf("%s 0x%x, [%s + 0x%x]", instName, n.srcConst, RegisterName(n.dstReg), n.dstConst)
		}
	case operandTypesConstToRegister:
		ret = fmt.Sprintf("%s 0x%x, %s", instName, n.srcConst, RegisterName(n.dstReg))
	case operandTypesStaticConstToRegister:
		ret = fmt.Sprintf("%s $%#x, %s", instName, n.staticConst.Raw, RegisterName(n.dstReg))
	case operandTypesRegisterToStaticConst:
		ret = fmt.Sprintf("%s %s, $%#x", instName, RegisterName(n.srcReg), n.staticConst.Raw)
	}
	return
}

// operandTypes encodes the source/destination operand kinds of a node, which
// selects the encoder used in AssemblerImpl.encodeNode.
type operandTypes byte

const (
	operandTypesNoneToNone operandTypes = iota
	operandTypesNoneToRegister
	operandTypesNoneToMemory
	operandTypesNoneToBranch
	operandTypesRegisterToNone
	operandTypesRegisterToRegister
	operandTypesRegisterToMemory
	operandTypesRegisterToConst
	operandTypesMemoryToRegister
	operandTypesMemoryToConst
	operandTypesConstToRegister
	operandTypesConstToMemory
	operandTypesStaticConstToRegister
	operandTypesRegisterToStaticConst
)

// String implements fmt.Stringer
func (o operandTypes) String() (ret string) {
	switch o {
	case operandTypesNoneToNone:
		ret = "NoneToNone"
	case operandTypesNoneToRegister:
		ret = "NoneToRegister"
	case operandTypesNoneToMemory:
		ret = "NoneToMemory"
	case operandTypesNoneToBranch:
		ret = "NoneToBranch"
	case operandTypesRegisterToNone:
		ret = "RegisterToNone"
	case operandTypesRegisterToRegister:
		ret = "RegisterToRegister"
	case operandTypesRegisterToMemory:
		ret = "RegisterToMemory"
	case operandTypesRegisterToConst:
		ret = "RegisterToConst"
	case operandTypesMemoryToRegister:
		ret = "MemoryToRegister"
	case operandTypesMemoryToConst:
		ret = "MemoryToConst"
	case operandTypesConstToRegister:
		ret = "ConstToRegister"
	case operandTypesConstToMemory:
		ret = "ConstToMemory"
	case operandTypesStaticConstToRegister:
		ret = "StaticConstToRegister"
	case operandTypesRegisterToStaticConst:
		ret = "RegisterToStaticConst"
	}
	return
}

type (
	// AssemblerImpl implements Assembler.
	AssemblerImpl struct {
		root    *nodeImpl
		current *nodeImpl
		asm.BaseAssemblerImpl
		readInstructionAddressNodes []*nodeImpl

		// staticConstReferrers maintains the list of static const referrers which requires the
		// offset resolution after finalizing the binary layout.
		staticConstReferrers []staticConstReferrer

		nodePool nodePool
		pool     asm.StaticConstPool

		// MaxDisplacementForConstantPool is fixed to defaultMaxDisplacementForConstantPool
		// but have it as an exported field here for testability.
		MaxDisplacementForConstantPool int

		forceReAssemble bool
	}

	// staticConstReferrer represents a referrer of a asm.StaticConst.
	staticConstReferrer struct {
		n *nodeImpl
		// instLen is the encoded length of the instruction for `n`.
		instLen int
	}
)

// NewAssembler returns a fresh AssemblerImpl whose node pool starts empty
// (index at nodePageSize forces a page allocation on first use).
func NewAssembler() *AssemblerImpl {
	return &AssemblerImpl{
		nodePool:                       nodePool{index: nodePageSize},
		pool:                           asm.NewStaticConstPool(),
		MaxDisplacementForConstantPool: defaultMaxDisplacementForConstantPool,
	}
}

const nodePageSize = 128

type nodePage = [nodePageSize]nodeImpl

// nodePool is the central allocation pool for nodeImpl used by a single AssemblerImpl.
// This reduces the allocations over compilation by reusing AssemblerImpl.
type nodePool struct {
	pages []*nodePage
	index int
}

// allocNode allocates a new nodeImpl for use from the pool.
// This expands the pool if there is no space left for it.
func (n *nodePool) allocNode() *nodeImpl {
	if n.index == nodePageSize {
		if len(n.pages) == cap(n.pages) {
			n.pages = append(n.pages, new(nodePage))
		} else {
			// Grow into spare capacity, reusing a previously allocated page when present.
			i := len(n.pages)
			n.pages = n.pages[:i+1]
			if n.pages[i] == nil {
				n.pages[i] = new(nodePage)
			}
		}
		n.index = 0
	}
	ret := &n.pages[len(n.pages)-1][n.index]
	n.index++
	return ret
}

// reset zeroes out every node handed out so far and rewinds the pool so the
// backing pages can be reused by the next compilation.
func (n *nodePool) reset() {
	for _, ns := range n.pages {
		pages := ns[:]
		for i := range pages {
			pages[i] = nodeImpl{}
		}
	}
	n.pages = n.pages[:0]
	n.index = nodePageSize
}

// AllocateNOP implements asm.AssemblerBase.
func (a *AssemblerImpl) AllocateNOP() asm.Node {
	n := a.nodePool.allocNode()
	n.instruction = NOP
	n.types = operandTypesNoneToNone
	return n
}

// Add implements asm.AssemblerBase.
func (a *AssemblerImpl) Add(n asm.Node) {
	a.addNode(n.(*nodeImpl))
}

// Reset implements asm.AssemblerBase.
func (a *AssemblerImpl) Reset() {
	pool := a.pool
	pool.Reset()
	// Reinitialize in place, keeping the already-allocated slices (length zero,
	// capacity preserved) so that re-use avoids re-allocations.
	*a = AssemblerImpl{
		nodePool:                    a.nodePool,
		pool:                        pool,
		readInstructionAddressNodes: a.readInstructionAddressNodes[:0],
		staticConstReferrers:        a.staticConstReferrers[:0],
		BaseAssemblerImpl: asm.BaseAssemblerImpl{
			SetBranchTargetOnNextNodes: a.SetBranchTargetOnNextNodes[:0],
			JumpTableEntries:           a.JumpTableEntries[:0],
		},
	}
	a.nodePool.reset()
}

// newNode creates a new Node and appends it into the linked list.
func (a *AssemblerImpl) newNode(instruction asm.Instruction, types operandTypes) *nodeImpl {
	n := a.nodePool.allocNode()
	n.instruction = instruction
	n.types = types
	a.addNode(n)
	return n
}

// addNode appends the new node into the linked list.
func (a *AssemblerImpl) addNode(node *nodeImpl) {
	if a.root == nil {
		a.root = node
		a.current = node
	} else {
		parent := a.current
		parent.next = node
		node.prev = parent
		a.current = node
	}

	// Resolve any pending branch origins: they all target this newly added node.
	for _, o := range a.SetBranchTargetOnNextNodes {
		origin := o.(*nodeImpl)
		origin.jumpTarget = node
	}
	// Reuse the underlying slice to avoid re-allocations.
	a.SetBranchTargetOnNextNodes = a.SetBranchTargetOnNextNodes[:0]
}

// encodeNode encodes the given node into writer.
func (a *AssemblerImpl) encodeNode(buf asm.Buffer, n *nodeImpl) (err error) {
	switch n.types {
	case operandTypesNoneToNone:
		err = a.encodeNoneToNone(buf, n)
	case operandTypesNoneToRegister:
		err = a.encodeNoneToRegister(buf, n)
	case operandTypesNoneToMemory:
		err = a.encodeNoneToMemory(buf, n)
	case operandTypesNoneToBranch:
		// Branching operand can be encoded as relative jumps.
		err = a.encodeRelativeJump(buf, n)
	case operandTypesRegisterToNone:
		err = a.encodeRegisterToNone(buf, n)
	case operandTypesRegisterToRegister:
		err = a.encodeRegisterToRegister(buf, n)
	case operandTypesRegisterToMemory:
		err = a.encodeRegisterToMemory(buf, n)
	case operandTypesRegisterToConst:
		err = a.encodeRegisterToConst(buf, n)
	case operandTypesMemoryToRegister:
		err = a.encodeMemoryToRegister(buf, n)
	case operandTypesMemoryToConst:
		err = a.encodeMemoryToConst(buf, n)
	case operandTypesConstToRegister:
		err = a.encodeConstToRegister(buf, n)
	case operandTypesConstToMemory:
		err = a.encodeConstToMemory(buf, n)
	case operandTypesStaticConstToRegister:
		err = a.encodeStaticConstToRegister(buf, n)
	case operandTypesRegisterToStaticConst:
		err = a.encodeRegisterToStaticConst(buf, n)
	default:
		err = fmt.Errorf("encoder undefined for [%s] operand type", n.types)
	}
	if err != nil {
		err = fmt.Errorf("%w: %s", err, n) // Ensure the error is debuggable by including the string value of the node.
	}
	return
}

// Assemble implements asm.AssemblerBase
func (a *AssemblerImpl) Assemble(buf asm.Buffer) error {
	a.initializeNodesForEncoding()

	// Continue encoding until we are not forced to re-assemble which happens when
	// a short relative jump ends up the offset larger than 8-bit length.
	for {
		err := a.encode(buf)
		if err != nil {
			return err
		}

		if !a.forceReAssemble {
			break
		} else {
			// We reset the length of buffer but don't delete the underlying slice since
			// the binary size will be roughly the same after reassemble.
			buf.Reset()
			// Reset the re-assemble flag in order to avoid the infinite loop!
			a.forceReAssemble = false
		}
	}

	code := buf.Bytes()
	for _, n := range a.readInstructionAddressNodes {
		if err := a.finalizeReadInstructionAddressNode(code, n); err != nil {
			return err
		}
	}

	// Now that we've finished the layout, fill out static consts offsets.
	for i := range a.staticConstReferrers {
		ref := &a.staticConstReferrers[i]
		n, instLen := ref.n, ref.instLen
		// Calculate the displacement between the RIP (the offset _after_ n) and the static constant.
		displacement := int(n.staticConst.OffsetInBinary) - int(n.OffsetInBinary()) - instLen
		// The offset must be stored at the 4 bytes from the tail of this n. See AssemblerImpl.encodeStaticConstImpl for detail.
		displacementOffsetInInstruction := n.OffsetInBinary() + uint64(instLen-4)
		binary.LittleEndian.PutUint32(code[displacementOffsetInInstruction:], uint32(int32(displacement)))
	}

	return a.FinalizeJumpTableEntry(code)
}

// initializeNodesForEncoding initializes nodeImpl.flag and determine all the jumps
// are forward or backward jump.
func (a *AssemblerImpl) initializeNodesForEncoding() {
	for n := a.root; n != nil; n = n.next {
		n.flag |= nodeFlagInitializedForEncoding
		if target := n.jumpTarget; target != nil {
			if target.isInitializedForEncoding() {
				// This means the target exists behind.
				n.flag |= nodeFlagBackwardJump
			} else {
				// Otherwise, this is forward jump.
				// We start with assuming that the jump can be short (8-bit displacement).
				// If it doesn't fit, we change this flag in resolveRelativeForwardJump.
				n.flag |= nodeFlagShortForwardJump

				// If the target node is also the branching instruction, we replace the target with the NOP
				// node so that we can avoid the collision of the target.forwardJumpOrigins both as destination and origins.
				if target.types == operandTypesNoneToBranch {
					// Allocate the NOP node from the pool.
					nop := a.nodePool.allocNode()
					nop.instruction = NOP
					nop.types = operandTypesNoneToNone
					// Insert it between target.prev and target: [target.prev, target] -> [target.prev, nop, target]
					prev := target.prev
					nop.prev = prev
					prev.next = nop
					nop.next = target
					target.prev = nop
					n.jumpTarget = nop
					target = nop
				}

				// We add this node `n` into the end of the linked list (.forwardJumpOrigins) beginning from the `target.forwardJumpOrigins`.
				// Insert the current `n` as the head of the list.
				n.forwardJumpOrigins = target.forwardJumpOrigins
				target.forwardJumpOrigins = n
			}
		}
	}
}

func (a *AssemblerImpl) encode(buf asm.Buffer) error {
	for n := a.root; n != nil; n = n.next {
		// If an instruction needs NOP padding, we do so before encoding it.
		//
		// This is necessary to avoid Intel's jump erratum; see in Section 2.1
		// in for when we have to pad NOP:
		// https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
		//
		// This logic used to be implemented in a function called maybeNOPPadding,
		// but the complexity of the logic made it impossible for the compiler to
		// inline. Since this function is on a hot code path, we inlined the
		// initial checks to skip the function call when instructions do not need
		// NOP padding.
		switch info := nopPaddingInfo[n.instruction]; {
		case info.jmp:
			if err := a.encodeJmpNOPPadding(buf, n); err != nil {
				return err
			}
		case info.onNextJmp:
			if err := a.encodeOnNextJmpNOPPAdding(buf, n); err != nil {
				return err
			}
		}

		// After the padding, we can finalize the offset of this instruction in the binary.
		n.offsetInBinary = uint64(buf.Len())

		if err := a.encodeNode(buf, n); err != nil {
			return err
		}

		// If any earlier forward jumps target this node, their displacements can
		// now be resolved (and may force re-assembly if a short jump overflows).
		if n.forwardJumpOrigins != nil {
			if err := a.resolveForwardRelativeJumps(buf, n); err != nil {
				return fmt.Errorf("invalid relative forward jumps: %w", err)
			}
		}

		a.maybeFlushConstants(buf, n.next == nil)
	}
	return nil
}

// nopPaddingInfo records, per instruction, whether it needs NOP padding itself
// (jmp) or only when followed by a conditional jump it may fuse with (onNextJmp).
var nopPaddingInfo = [instructionEnd]struct {
	jmp, onNextJmp bool
}{
	RET: {jmp: true},
	JMP: {jmp: true},
	JCC: {jmp: true},
	JCS: {jmp: true},
	JEQ: {jmp: true},
	JGE: {jmp: true},
	JGT: {jmp: true},
	JHI: {jmp: true},
	JLE: {jmp: true},
	JLS: {jmp: true},
	JLT: {jmp: true},
	JMI: {jmp: true},
	JNE: {jmp: true},
	JPC: {jmp: true},
	JPS: {jmp: true},
	// The possible fused jump instructions if the next node is a conditional jump instruction.
	CMPL:  {onNextJmp: true},
	CMPQ:  {onNextJmp: true},
	TESTL: {onNextJmp: true},
	TESTQ: {onNextJmp: true},
	ADDL:  {onNextJmp: true},
	ADDQ:  {onNextJmp: true},
	SUBL:  {onNextJmp: true},
	SUBQ:  {onNextJmp: true},
	ANDL:  {onNextJmp: true},
	ANDQ:  {onNextJmp: true},
	INCQ:  {onNextJmp: true},
	DECQ:  {onNextJmp: true},
}

// encodeJmpNOPPadding emits the NOP padding (if any) required before the jump node `n`.
func (a *AssemblerImpl) encodeJmpNOPPadding(buf asm.Buffer, n *nodeImpl) error {
	// In order to know the instruction length before writing into the binary,
	// we try encoding it.
	prevLen := buf.Len()

	// Assign the temporary offset which may or may not be correct depending on the padding decision.
	n.offsetInBinary = uint64(prevLen)

	// Encode the node and get the instruction length.
	if err := a.encodeNode(buf, n); err != nil {
		return err
	}
	instructionLen := int32(buf.Len() - prevLen)

	// Revert the written bytes.
	buf.Truncate(prevLen)
	return a.encodeNOPPadding(buf, instructionLen)
}

// encodeOnNextJmpNOPPAdding emits the NOP padding (if any) required before a
// potentially macro-fused pair starting at `n`.
func (a *AssemblerImpl) encodeOnNextJmpNOPPAdding(buf asm.Buffer, n *nodeImpl) error {
	instructionLen, err := a.fusedInstructionLength(buf, n)
	if err != nil {
		return err
	}
	return a.encodeNOPPadding(buf, instructionLen)
}

// encodeNOPPadding maybe appends NOP instructions before the node `n`.
// This is necessary to avoid Intel's jump erratum:
// https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
func (a *AssemblerImpl) encodeNOPPadding(buf asm.Buffer, instructionLen int32) error {
	const boundaryInBytes int32 = 32
	const mask = boundaryInBytes - 1
	var padNum int
	currentPos := int32(buf.Len())
	// Pad up to the next 32-byte boundary when the instruction would cross or
	// end on the boundary.
	if used := currentPos & mask; used+instructionLen >= boundaryInBytes {
		padNum = int(boundaryInBytes - used)
	}
	a.padNOP(buf, padNum)
	return nil
}

// fusedInstructionLength returns the length of "macro fused instruction" if the
// instruction sequence starting from `n` can be fused by processor. Otherwise,
// returns zero.
func (a *AssemblerImpl) fusedInstructionLength(buf asm.Buffer, n *nodeImpl) (ret int32, err error) {
	// Find the next non-NOP instruction.
	next := n.next
	for ; next != nil && next.instruction == NOP; next = next.next {
	}

	if next == nil {
		return
	}

	inst, jmpInst := n.instruction, next.instruction

	if !nopPaddingInfo[jmpInst].jmp {
		// If the next instruction is not jump kind, the instruction will not be fused.
		return
	}

	// How to determine whether the instruction can be fused is described in
	// Section 3.4.2.2 of "Intel Optimization Manual":
	// https://www.intel.com/content/dam/doc/manual/64-ia-32-architectures-optimization-manual.pdf
	isTest := inst == TESTL || inst == TESTQ
	isCmp := inst == CMPQ || inst == CMPL
	isTestCmp := isTest || isCmp
	if isTestCmp && (n.types == operandTypesMemoryToConst || n.types == operandTypesConstToMemory) {
		// The manual says: "CMP and TEST can not be fused when comparing MEM-IMM".
		return
	}

	// Implement the decision according to the table 3-1 in the manual.
	isAnd := inst == ANDL || inst == ANDQ
	if !isTest && !isAnd {
		if jmpInst == JMI || jmpInst == JPL || jmpInst == JPS || jmpInst == JPC {
			// These jumps are only fused for TEST or AND.
			return
		}
		isAdd := inst == ADDL || inst == ADDQ
		isSub := inst == SUBL || inst == SUBQ
		if !isCmp && !isAdd && !isSub {
			if jmpInst == JCS || jmpInst == JCC || jmpInst == JHI || jmpInst == JLS {
				// These jumps are only fused for TEST, AND, CMP, ADD, or SUB.
				return
			}
		}
	}

	// Now the instruction is ensured to be fused by the processor.
	// In order to know the fused instruction length before writing into the binary,
	// we try encoding it.
	savedLen := uint64(buf.Len())

	// Encode the nodes into the buffer.
	if err = a.encodeNode(buf, n); err != nil {
		return
	}
	if err = a.encodeNode(buf, next); err != nil {
		return
	}

	ret = int32(uint64(buf.Len()) - savedLen)

	// Revert the written bytes.
	buf.Truncate(int(savedLen))
	return
}

// nopOpcodes is the multi byte NOP instructions table derived from section 5.8 "Code Padding with Operand-Size Override and Multibyte NOP"
// in "AMD Software Optimization Guide for AMD Family 15h Processors" https://www.amd.com/system/files/TechDocs/47414_15h_sw_opt_guide.pdf
var nopOpcodes = [][11]byte{
	{0x90},
	{0x66, 0x90},
	{0x0f, 0x1f, 0x00},
	{0x0f, 0x1f, 0x40, 0x00},
	{0x0f, 0x1f, 0x44, 0x00, 0x00},
	{0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
	{0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00},
	{0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
	{0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
	{0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
	{0x66, 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
}

// padNOP appends `num` bytes worth of NOPs, preferring the longest multi-byte
// NOP encodings available to minimize the instruction count.
func (a *AssemblerImpl) padNOP(buf asm.Buffer, num int) {
	for num > 0 {
		singleNopNum := num
		if singleNopNum > len(nopOpcodes) {
			singleNopNum = len(nopOpcodes)
		}
		buf.AppendBytes(nopOpcodes[singleNopNum-1][:singleNopNum])
		num -= singleNopNum
	}
}

// CompileStandAlone implements the same method as documented on asm.AssemblerBase.
func (a *AssemblerImpl) CompileStandAlone(instruction asm.Instruction) asm.Node {
	return a.newNode(instruction, operandTypesNoneToNone)
}

// CompileConstToRegister implements the same method as documented on asm.AssemblerBase.
func (a *AssemblerImpl) CompileConstToRegister(
	instruction asm.Instruction,
	value asm.ConstantValue,
	destinationReg asm.Register,
) (inst asm.Node) {
	n := a.newNode(instruction, operandTypesConstToRegister)
	n.srcConst = value
	n.dstReg = destinationReg
	return n
}

// CompileRegisterToRegister implements the same method as documented on asm.AssemblerBase.
714 func (a *AssemblerImpl) CompileRegisterToRegister(instruction asm.Instruction, from, to asm.Register) { 715 n := a.newNode(instruction, operandTypesRegisterToRegister) 716 n.srcReg = from 717 n.dstReg = to 718 } 719 720 // CompileMemoryToRegister implements the same method as documented on asm.AssemblerBase. 721 func (a *AssemblerImpl) CompileMemoryToRegister( 722 instruction asm.Instruction, 723 sourceBaseReg asm.Register, 724 sourceOffsetConst asm.ConstantValue, 725 destinationReg asm.Register, 726 ) { 727 n := a.newNode(instruction, operandTypesMemoryToRegister) 728 n.srcReg = sourceBaseReg 729 n.srcConst = sourceOffsetConst 730 n.dstReg = destinationReg 731 } 732 733 // CompileRegisterToMemory implements the same method as documented on asm.AssemblerBase. 734 func (a *AssemblerImpl) CompileRegisterToMemory( 735 instruction asm.Instruction, 736 sourceRegister, destinationBaseRegister asm.Register, 737 destinationOffsetConst asm.ConstantValue, 738 ) { 739 n := a.newNode(instruction, operandTypesRegisterToMemory) 740 n.srcReg = sourceRegister 741 n.dstReg = destinationBaseRegister 742 n.dstConst = destinationOffsetConst 743 } 744 745 // CompileJump implements the same method as documented on asm.AssemblerBase. 746 func (a *AssemblerImpl) CompileJump(jmpInstruction asm.Instruction) asm.Node { 747 return a.newNode(jmpInstruction, operandTypesNoneToBranch) 748 } 749 750 // CompileJumpToMemory implements the same method as documented on asm.AssemblerBase. 751 func (a *AssemblerImpl) CompileJumpToMemory( 752 jmpInstruction asm.Instruction, 753 baseReg asm.Register, 754 offset asm.ConstantValue, 755 ) { 756 n := a.newNode(jmpInstruction, operandTypesNoneToMemory) 757 n.dstReg = baseReg 758 n.dstConst = offset 759 } 760 761 // CompileJumpToRegister implements the same method as documented on asm.AssemblerBase. 
762 func (a *AssemblerImpl) CompileJumpToRegister(jmpInstruction asm.Instruction, reg asm.Register) { 763 n := a.newNode(jmpInstruction, operandTypesNoneToRegister) 764 n.dstReg = reg 765 } 766 767 // CompileReadInstructionAddress implements the same method as documented on asm.AssemblerBase. 768 func (a *AssemblerImpl) CompileReadInstructionAddress( 769 destinationRegister asm.Register, 770 beforeAcquisitionTargetInstruction asm.Instruction, 771 ) { 772 n := a.newNode(LEAQ, operandTypesMemoryToRegister) 773 n.dstReg = destinationRegister 774 n.readInstructionAddressBeforeTargetInstruction = beforeAcquisitionTargetInstruction 775 } 776 777 // CompileRegisterToRegisterWithArg implements the same method as documented on amd64.Assembler. 778 func (a *AssemblerImpl) CompileRegisterToRegisterWithArg( 779 instruction asm.Instruction, 780 from, to asm.Register, 781 arg byte, 782 ) { 783 n := a.newNode(instruction, operandTypesRegisterToRegister) 784 n.srcReg = from 785 n.dstReg = to 786 n.arg = arg 787 } 788 789 // CompileMemoryWithIndexToRegister implements the same method as documented on amd64.Assembler. 790 func (a *AssemblerImpl) CompileMemoryWithIndexToRegister( 791 instruction asm.Instruction, 792 srcBaseReg asm.Register, 793 srcOffsetConst asm.ConstantValue, 794 srcIndex asm.Register, 795 srcScale int16, 796 dstReg asm.Register, 797 ) { 798 n := a.newNode(instruction, operandTypesMemoryToRegister) 799 n.srcReg = srcBaseReg 800 n.srcConst = srcOffsetConst 801 n.srcMemIndex = srcIndex 802 n.srcMemScale = byte(srcScale) 803 n.dstReg = dstReg 804 } 805 806 // CompileMemoryWithIndexAndArgToRegister implements the same method as documented on amd64.Assembler. 
807 func (a *AssemblerImpl) CompileMemoryWithIndexAndArgToRegister( 808 instruction asm.Instruction, 809 srcBaseReg asm.Register, 810 srcOffsetConst asm.ConstantValue, 811 srcIndex asm.Register, 812 srcScale int16, 813 dstReg asm.Register, 814 arg byte, 815 ) { 816 n := a.newNode(instruction, operandTypesMemoryToRegister) 817 n.srcReg = srcBaseReg 818 n.srcConst = srcOffsetConst 819 n.srcMemIndex = srcIndex 820 n.srcMemScale = byte(srcScale) 821 n.dstReg = dstReg 822 n.arg = arg 823 } 824 825 // CompileRegisterToMemoryWithIndex implements the same method as documented on amd64.Assembler. 826 func (a *AssemblerImpl) CompileRegisterToMemoryWithIndex( 827 instruction asm.Instruction, 828 srcReg, dstBaseReg asm.Register, 829 dstOffsetConst asm.ConstantValue, 830 dstIndex asm.Register, 831 dstScale int16, 832 ) { 833 n := a.newNode(instruction, operandTypesRegisterToMemory) 834 n.srcReg = srcReg 835 n.dstReg = dstBaseReg 836 n.dstConst = dstOffsetConst 837 n.dstMemIndex = dstIndex 838 n.dstMemScale = byte(dstScale) 839 } 840 841 // CompileRegisterToMemoryWithIndexAndArg implements the same method as documented on amd64.Assembler. 842 func (a *AssemblerImpl) CompileRegisterToMemoryWithIndexAndArg( 843 instruction asm.Instruction, 844 srcReg, dstBaseReg asm.Register, 845 dstOffsetConst asm.ConstantValue, 846 dstIndex asm.Register, 847 dstScale int16, 848 arg byte, 849 ) { 850 n := a.newNode(instruction, operandTypesRegisterToMemory) 851 n.srcReg = srcReg 852 n.dstReg = dstBaseReg 853 n.dstConst = dstOffsetConst 854 n.dstMemIndex = dstIndex 855 n.dstMemScale = byte(dstScale) 856 n.arg = arg 857 } 858 859 // CompileRegisterToConst implements the same method as documented on amd64.Assembler. 
860 func (a *AssemblerImpl) CompileRegisterToConst( 861 instruction asm.Instruction, 862 srcRegister asm.Register, 863 value asm.ConstantValue, 864 ) asm.Node { 865 n := a.newNode(instruction, operandTypesRegisterToConst) 866 n.srcReg = srcRegister 867 n.dstConst = value 868 return n 869 } 870 871 // CompileRegisterToNone implements the same method as documented on amd64.Assembler. 872 func (a *AssemblerImpl) CompileRegisterToNone(instruction asm.Instruction, register asm.Register) { 873 n := a.newNode(instruction, operandTypesRegisterToNone) 874 n.srcReg = register 875 } 876 877 // CompileNoneToRegister implements the same method as documented on amd64.Assembler. 878 func (a *AssemblerImpl) CompileNoneToRegister(instruction asm.Instruction, register asm.Register) { 879 n := a.newNode(instruction, operandTypesNoneToRegister) 880 n.dstReg = register 881 } 882 883 // CompileNoneToMemory implements the same method as documented on amd64.Assembler. 884 func (a *AssemblerImpl) CompileNoneToMemory( 885 instruction asm.Instruction, 886 baseReg asm.Register, 887 offset asm.ConstantValue, 888 ) { 889 n := a.newNode(instruction, operandTypesNoneToMemory) 890 n.dstReg = baseReg 891 n.dstConst = offset 892 } 893 894 // CompileConstToMemory implements the same method as documented on amd64.Assembler. 895 func (a *AssemblerImpl) CompileConstToMemory( 896 instruction asm.Instruction, 897 value asm.ConstantValue, 898 dstbaseReg asm.Register, 899 dstOffset asm.ConstantValue, 900 ) asm.Node { 901 n := a.newNode(instruction, operandTypesConstToMemory) 902 n.srcConst = value 903 n.dstReg = dstbaseReg 904 n.dstConst = dstOffset 905 return n 906 } 907 908 // CompileMemoryToConst implements the same method as documented on amd64.Assembler. 
func (a *AssemblerImpl) CompileMemoryToConst(
	instruction asm.Instruction,
	srcBaseReg asm.Register,
	srcOffset, value asm.ConstantValue,
) asm.Node {
	n := a.newNode(instruction, operandTypesMemoryToConst)
	n.srcReg = srcBaseReg
	n.srcConst = srcOffset
	n.dstConst = value
	return n
}

// errorEncodingUnsupported returns the error raised when the given node's
// instruction cannot be encoded for its operand-type combination.
func errorEncodingUnsupported(n *nodeImpl) error {
	return fmt.Errorf("%s is unsupported for %s type", InstructionName(n.instruction), n.types)
}

// encodeNoneToNone encodes instructions that take no operands
// (e.g. CDQ, CQO, RET) into buf.
func (a *AssemblerImpl) encodeNoneToNone(buf asm.Buffer, n *nodeImpl) (err error) {
	// Throughout the encoding methods, we use this pair of base offset and
	// code buffer to write instructions.
	//
	// The code buffer is allocated at the end of the current buffer to a size
	// large enough to hold all the bytes that may be written by the method.
	//
	// We use Go's append builtin to write to the buffer because it allows the
	// compiler to generate much better code than if we made calls to write
	// methods to mutate an encapsulated byte slice.
	//
	// At the end of the method, we truncate the buffer size back to the base
	// plus the length of the code buffer so the end of the buffer points right
	// after the last byte that was written.
	base := buf.Len()
	code := buf.Append(4)[:0]

	switch n.instruction {
	case CDQ:
		// https://www.felixcloutier.com/x86/cwd:cdq:cqo
		code = append(code, 0x99)
	case CQO:
		// https://www.felixcloutier.com/x86/cwd:cdq:cqo
		code = append(code, rexPrefixW, 0x99)
	case NOP:
		// Simply optimize out the NOP instructions.
	case RET:
		// https://www.felixcloutier.com/x86/ret
		code = append(code, 0xc3)
	case UD2:
		// https://mudongliang.github.io/x86/html/file_module_x86_id_318.html
		code = append(code, 0x0f, 0x0b)
	case REPMOVSQ:
		// REP MOVSQ: https://www.felixcloutier.com/x86/movs:movsb:movsw:movsd:movsq
		code = append(code, 0xf3, rexPrefixW, 0xa5)
	case REPSTOSQ:
		// REP STOSQ: https://www.felixcloutier.com/x86/stos:stosb:stosw:stosd:stosq
		code = append(code, 0xf3, rexPrefixW, 0xab)
	case STD:
		// https://www.felixcloutier.com/x86/std
		code = append(code, 0xfd)
	case CLD:
		// https://www.felixcloutier.com/x86/cld
		code = append(code, 0xfc)
	default:
		err = errorEncodingUnsupported(n)
	}

	buf.Truncate(base + len(code))
	return
}

// encodeNoneToRegister encodes instructions whose sole operand is the
// destination register (e.g. JMP reg, SETcc, NEGQ, INCQ, DECQ) into buf.
func (a *AssemblerImpl) encodeNoneToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	regBits, prefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldRM)

	// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
	modRM := 0b11_000_000 | // Specifying that operand is register.
		regBits
	if n.instruction == JMP {
		// JMP's opcode is defined as "FF /4" meaning that we have to have "4"
		// in 4-6th bits in the ModRM byte. https://www.felixcloutier.com/x86/jmp
		modRM |= 0b00_100_000
	} else if n.instruction == NEGQ {
		prefix |= rexPrefixW
		modRM |= 0b00_011_000
	} else if n.instruction == INCQ {
		prefix |= rexPrefixW
	} else if n.instruction == DECQ {
		prefix |= rexPrefixW
		modRM |= 0b00_001_000
	} else {
		if RegSP <= n.dstReg && n.dstReg <= RegDI {
			// If the destination is one byte length register, we need to have the default prefix.
			// https://wiki.osdev.org/X86-64_Instruction_Encoding#Registers
			prefix |= rexPrefixDefault
		}
	}

	base := buf.Len()
	code := buf.Append(4)[:0]

	if prefix != rexPrefixNone {
		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Encoding
		code = append(code, prefix)
	}

	switch n.instruction {
	case JMP:
		// https://www.felixcloutier.com/x86/jmp
		code = append(code, 0xff, modRM)
	case SETCC:
		// https://www.felixcloutier.com/x86/setcc
		code = append(code, 0x0f, 0x93, modRM)
	case SETCS:
		// https://www.felixcloutier.com/x86/setcc
		code = append(code, 0x0f, 0x92, modRM)
	case SETEQ:
		// https://www.felixcloutier.com/x86/setcc
		code = append(code, 0x0f, 0x94, modRM)
	case SETGE:
		// https://www.felixcloutier.com/x86/setcc
		code = append(code, 0x0f, 0x9d, modRM)
	case SETGT:
		// https://www.felixcloutier.com/x86/setcc
		code = append(code, 0x0f, 0x9f, modRM)
	case SETHI:
		// https://www.felixcloutier.com/x86/setcc
		code = append(code, 0x0f, 0x97, modRM)
	case SETLE:
		// https://www.felixcloutier.com/x86/setcc
		code = append(code, 0x0f, 0x9e, modRM)
	case SETLS:
		// https://www.felixcloutier.com/x86/setcc
		code = append(code, 0x0f, 0x96, modRM)
	case SETLT:
		// https://www.felixcloutier.com/x86/setcc
		code = append(code, 0x0f, 0x9c, modRM)
	case SETNE:
		// https://www.felixcloutier.com/x86/setcc
		code = append(code, 0x0f, 0x95, modRM)
	case SETPC:
		// https://www.felixcloutier.com/x86/setcc
		code = append(code, 0x0f, 0x9b, modRM)
	case SETPS:
		// https://www.felixcloutier.com/x86/setcc
		code = append(code, 0x0f, 0x9a, modRM)
	case NEGQ:
		// https://www.felixcloutier.com/x86/neg
		code = append(code, 0xf7, modRM)
	case INCQ:
		// https://www.felixcloutier.com/x86/inc
		code = append(code, 0xff, modRM)
	case DECQ:
		// https://www.felixcloutier.com/x86/dec
		code = append(code, 0xff, modRM)
	default:
		err = errorEncodingUnsupported(n)
	}

	buf.Truncate(base + len(code))
	return
}

// encodeNoneToMemory encodes instructions whose sole operand is a memory
// location (INCQ/DECQ/JMP with a memory operand) into buf.
func (a *AssemblerImpl) encodeNoneToMemory(buf asm.Buffer, n *nodeImpl) (err error) {
	rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(true)
	if err != nil {
		return err
	}

	var opcode byte
	switch n.instruction {
	case INCQ:
		// https://www.felixcloutier.com/x86/inc
		rexPrefix |= rexPrefixW
		opcode = 0xff
	case DECQ:
		// https://www.felixcloutier.com/x86/dec
		rexPrefix |= rexPrefixW
		modRM |= 0b00_001_000 // DEC needs "/1" extension in ModRM.
		opcode = 0xff
	case JMP:
		// https://www.felixcloutier.com/x86/jmp
		modRM |= 0b00_100_000 // JMP needs "/4" extension in ModRM.
		opcode = 0xff
	default:
		return errorEncodingUnsupported(n)
	}

	base := buf.Len()
	code := buf.Append(12)[:0]

	if rexPrefix != rexPrefixNone {
		code = append(code, rexPrefix)
	}

	code = append(code, opcode, modRM)

	if sbiExist {
		code = append(code, sbi)
	}

	if displacementWidth != 0 {
		code = appendConst(code, n.dstConst, displacementWidth)
	}

	buf.Truncate(base + len(code))
	return
}

// relativeJumpOpcode holds the opcode bytes of a relative jump in both its
// short (8-bit offset) and long (32-bit offset) encodings.
type relativeJumpOpcode struct{ short, long []byte }

// instructionLen returns the total encoded length in bytes (opcode plus
// offset immediate) of the short or long form of this jump.
func (o relativeJumpOpcode) instructionLen(short bool) int64 {
	if short {
		return int64(len(o.short)) + 1 // 1 byte = 8 bit offset
	} else {
		return int64(len(o.long)) + 4 // 4 byte = 32 bit offset
	}
}

// relativeJumpOpcodes maps each jump instruction to its short/long opcode
// forms; indexed by the asm.Instruction value.
var relativeJumpOpcodes = [...]relativeJumpOpcode{
	// https://www.felixcloutier.com/x86/jcc
	JCC: {short: []byte{0x73}, long: []byte{0x0f, 0x83}},
	JCS: {short: []byte{0x72}, long: []byte{0x0f, 0x82}},
	JEQ: {short: []byte{0x74}, long: []byte{0x0f, 0x84}},
	JGE: {short: []byte{0x7d}, long: []byte{0x0f, 0x8d}},
	JGT: {short: []byte{0x7f}, long: []byte{0x0f, 0x8f}},
	JHI: {short: []byte{0x77}, long: []byte{0x0f, 0x87}},
	JLE: {short: []byte{0x7e}, long: []byte{0x0f, 0x8e}},
	JLS: {short: []byte{0x76}, long: []byte{0x0f, 0x86}},
	JLT: {short: []byte{0x7c}, long: []byte{0x0f, 0x8c}},
	JMI: {short: []byte{0x78}, long: []byte{0x0f, 0x88}},
	JPL: {short: []byte{0x79}, long: []byte{0x0f, 0x89}},
	JNE: {short: []byte{0x75}, long: []byte{0x0f, 0x85}},
	JPC: {short: []byte{0x7b}, long: []byte{0x0f, 0x8b}},
	JPS: {short: []byte{0x7a}, long: []byte{0x0f, 0x8a}},
	// https://www.felixcloutier.com/x86/jmp
	JMP: {short: []byte{0xeb}, long: []byte{0xe9}},
}

// resolveForwardRelativeJumps back-patches the relative offsets of all
// forward jumps targeting `target`, now that target's binary offset is known.
// It walks target's forwardJumpOrigins singly linked list.
func (a *AssemblerImpl) resolveForwardRelativeJumps(buf asm.Buffer, target *nodeImpl) (err error) {
	offsetInBinary := int64(target.OffsetInBinary())
	origin := target.forwardJumpOrigins
	for ; origin != nil; origin = origin.forwardJumpOrigins {
		shortJump := origin.isForwardShortJump()
		op := relativeJumpOpcodes[origin.instruction]
		instructionLen := op.instructionLen(shortJump)

		// Calculate the offset from the EIP (at the time of executing this jump instruction)
		// to the target instruction. This value is always >= 0 as here we only handle forward jumps.
		offset := offsetInBinary - (int64(origin.OffsetInBinary()) + instructionLen)
		if shortJump {
			if offset > math.MaxInt8 {
				// This forces reassemble in the outer loop inside AssemblerImpl.Assemble().
				a.forceReAssemble = true
				// From the next reAssemble phases, this forward jump will be encoded long jump and
				// allocate 32-bit offset bytes by default. This means that this `origin` node
				// will always enter the "long jump offset encoding" block below
				origin.flag ^= nodeFlagShortForwardJump
			} else {
				buf.Bytes()[origin.OffsetInBinary()+uint64(instructionLen)-1] = byte(offset)
			}
		} else { // long jump offset encoding.
			if offset > math.MaxInt32 {
				return fmt.Errorf("too large jump offset %d for encoding %s", offset, InstructionName(origin.instruction))
			}
			binary.LittleEndian.PutUint32(buf.Bytes()[origin.OffsetInBinary()+uint64(instructionLen)-4:], uint32(offset))
		}
	}
	return nil
}

// encodeRelativeJump encodes a relative jump node into buf. Backward jumps
// are encoded with their exact offset; forward jumps are emitted with a zero
// offset and patched later by resolveForwardRelativeJumps.
func (a *AssemblerImpl) encodeRelativeJump(buf asm.Buffer, n *nodeImpl) (err error) {
	if n.jumpTarget == nil {
		err = fmt.Errorf("jump target must not be nil for relative %s", InstructionName(n.instruction))
		return
	}

	op := relativeJumpOpcodes[n.instruction]
	var isShortJump bool
	// offsetOfEIP means the offset of EIP register at the time of executing this jump instruction.
	// Relative jump instructions can be encoded with the signed 8-bit or 32-bit integer offsets from the EIP.
	var offsetOfEIP int64 = 0 // We set zero and resolve later once the target instruction is encoded for forward jumps
	if n.isBackwardJump() {
		// If this is the backward jump, we can calculate the exact offset now.
		offsetOfJumpInstruction := int64(n.jumpTarget.OffsetInBinary()) - int64(n.OffsetInBinary())
		isShortJump = offsetOfJumpInstruction-2 >= math.MinInt8
		offsetOfEIP = offsetOfJumpInstruction - op.instructionLen(isShortJump)
	} else {
		// For forward jumps, we resolve the offset when we Encode the target node. See AssemblerImpl.ResolveForwardRelativeJumps.
		isShortJump = n.isForwardShortJump()
	}

	if offsetOfEIP < math.MinInt32 { // offsetOfEIP is always <= 0 as we don't calculate it for forward jump here.
		return fmt.Errorf("too large jump offset %d for encoding %s", offsetOfEIP, InstructionName(n.instruction))
	}

	base := buf.Len()
	code := buf.Append(6)[:0]

	if isShortJump {
		code = append(code, op.short...)
		code = append(code, byte(offsetOfEIP))
	} else {
		code = append(code, op.long...)
		code = appendUint32(code, uint32(offsetOfEIP))
	}

	buf.Truncate(base + len(code))
	return
}

// encodeRegisterToNone encodes instructions whose sole operand is the source
// register (DIV/IDIV/MUL families) into buf.
func (a *AssemblerImpl) encodeRegisterToNone(buf asm.Buffer, n *nodeImpl) (err error) {
	regBits, prefix := register3bits(n.srcReg, registerSpecifierPositionModRMFieldRM)

	// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
	modRM := 0b11_000_000 | // Specifying that operand is register.
		regBits

	var opcode byte
	switch n.instruction {
	case DIVL:
		// https://www.felixcloutier.com/x86/div
		modRM |= 0b00_110_000
		opcode = 0xf7
	case DIVQ:
		// https://www.felixcloutier.com/x86/div
		prefix |= rexPrefixW
		modRM |= 0b00_110_000
		opcode = 0xf7
	case IDIVL:
		// https://www.felixcloutier.com/x86/idiv
		modRM |= 0b00_111_000
		opcode = 0xf7
	case IDIVQ:
		// https://www.felixcloutier.com/x86/idiv
		prefix |= rexPrefixW
		modRM |= 0b00_111_000
		opcode = 0xf7
	case MULL:
		// https://www.felixcloutier.com/x86/mul
		modRM |= 0b00_100_000
		opcode = 0xf7
	case MULQ:
		// https://www.felixcloutier.com/x86/mul
		prefix |= rexPrefixW
		modRM |= 0b00_100_000
		opcode = 0xf7
	default:
		err = errorEncodingUnsupported(n)
	}

	base := buf.Len()
	code := buf.Append(3)[:0]

	if prefix != rexPrefixNone {
		code = append(code, prefix)
	}

	code = append(code, opcode, modRM)

	buf.Truncate(base + len(code))
	return
}

// registerToRegisterOpcode describes, per instruction, how to encode the
// register-to-register operand form. Indexed by asm.Instruction; unset
// entries are nil.
var registerToRegisterOpcode = [instructionEnd]*struct {
	opcode  []byte
	rPrefix rexPrefix
	mandatoryPrefix byte // emitted as a mandatory prefix byte (0x66/0xf2/0xf3) when non-zero — presumably before REX/opcode; confirm in encodeRegisterToRegister.
	srcOnModRMReg   bool // NOTE(review): presumably places the source register in ModRM.reg instead of ModRM.rm — confirm in encodeRegisterToRegister.
	isSrc8bit       bool // set for instructions whose source is an 8-bit register.
	needArg         bool // set for instructions that take an extra immediate byte (node.arg).
}{
	// https://www.felixcloutier.com/x86/add
	ADDL: {opcode: []byte{0x1}, srcOnModRMReg: true},
	ADDQ: {opcode: []byte{0x1}, rPrefix: rexPrefixW, srcOnModRMReg: true},
	// https://www.felixcloutier.com/x86/and
	ANDL: {opcode: []byte{0x21}, srcOnModRMReg: true},
	ANDQ: {opcode: []byte{0x21}, rPrefix: rexPrefixW, srcOnModRMReg: true},
	// https://www.felixcloutier.com/x86/cmp
	CMPL: {opcode: []byte{0x39}},
	CMPQ: {opcode: []byte{0x39}, rPrefix: rexPrefixW},
	// https://www.felixcloutier.com/x86/cmovcc
	CMOVQCS: {opcode: []byte{0x0f, 0x42}, rPrefix: rexPrefixW},
	// https://www.felixcloutier.com/x86/addsd
	ADDSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x58}},
	// https://www.felixcloutier.com/x86/addss
	ADDSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x58}},
	// https://www.felixcloutier.com/x86/andpd
	ANDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x54}},
	// https://www.felixcloutier.com/x86/andps
	ANDPS: {opcode: []byte{0x0f, 0x54}},
	// https://www.felixcloutier.com/x86/bsr
	BSRL: {opcode: []byte{0xf, 0xbd}},
	BSRQ: {opcode: []byte{0xf, 0xbd}, rPrefix: rexPrefixW},
	// https://www.felixcloutier.com/x86/comisd
	COMISD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x2f}},
	// https://www.felixcloutier.com/x86/comiss
	COMISS: {opcode: []byte{0x0f, 0x2f}},
	// https://www.felixcloutier.com/x86/cvtsd2ss
	CVTSD2SS: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5a}},
	// https://www.felixcloutier.com/x86/cvtsi2sd
	CVTSL2SD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2a}},
	// https://www.felixcloutier.com/x86/cvtsi2sd
	CVTSQ2SD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2a}, rPrefix: rexPrefixW},
	// https://www.felixcloutier.com/x86/cvtsi2ss
	CVTSL2SS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2a}},
	// https://www.felixcloutier.com/x86/cvtsi2ss
	CVTSQ2SS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2a}, rPrefix: rexPrefixW},
	// https://www.felixcloutier.com/x86/cvtss2sd
	CVTSS2SD: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5a}},
	// https://www.felixcloutier.com/x86/cvttsd2si
	CVTTSD2SL: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2c}},
	CVTTSD2SQ: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2c}, rPrefix: rexPrefixW},
	// https://www.felixcloutier.com/x86/cvttss2si
	CVTTSS2SL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2c}},
	CVTTSS2SQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2c}, rPrefix: rexPrefixW},
	// https://www.felixcloutier.com/x86/divsd
	DIVSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5e}},
	// https://www.felixcloutier.com/x86/divss
	DIVSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5e}},
	// https://www.felixcloutier.com/x86/lzcnt
	LZCNTL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbd}},
	LZCNTQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbd}, rPrefix: rexPrefixW},
	// https://www.felixcloutier.com/x86/maxsd
	MAXSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5f}},
	// https://www.felixcloutier.com/x86/maxss
	MAXSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5f}},
	// https://www.felixcloutier.com/x86/minsd
	MINSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5d}},
	// https://www.felixcloutier.com/x86/minss
	MINSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5d}},
	// https://www.felixcloutier.com/x86/movsx:movsxd
	MOVBLSX: {opcode: []byte{0x0f, 0xbe}, isSrc8bit: true},
	// https://www.felixcloutier.com/x86/movzx
	MOVBLZX: {opcode: []byte{0x0f, 0xb6}, isSrc8bit: true},
	// https://www.felixcloutier.com/x86/movzx
	MOVWLZX: {opcode: []byte{0x0f, 0xb7}, isSrc8bit: true},
	// https://www.felixcloutier.com/x86/movsx:movsxd
	MOVBQSX: {opcode: []byte{0x0f, 0xbe}, rPrefix: rexPrefixW, isSrc8bit: true},
	// https://www.felixcloutier.com/x86/movsx:movsxd
	MOVLQSX: {opcode: []byte{0x63}, rPrefix: rexPrefixW},
	// https://www.felixcloutier.com/x86/movsx:movsxd
	MOVWQSX: {opcode: []byte{0x0f, 0xbf}, rPrefix: rexPrefixW},
	// https://www.felixcloutier.com/x86/movsx:movsxd
	MOVWLSX: {opcode: []byte{0x0f, 0xbf}},
	// https://www.felixcloutier.com/x86/imul
	IMULQ: {opcode: []byte{0x0f, 0xaf}, rPrefix: rexPrefixW},
	// https://www.felixcloutier.com/x86/mulss
	MULSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x59}},
	// https://www.felixcloutier.com/x86/mulsd
	MULSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x59}},
	// https://www.felixcloutier.com/x86/or
	ORL: {opcode: []byte{0x09}, srcOnModRMReg: true},
	ORQ: {opcode: []byte{0x09}, rPrefix: rexPrefixW, srcOnModRMReg: true},
	// https://www.felixcloutier.com/x86/orpd
	ORPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x56}},
	// https://www.felixcloutier.com/x86/orps
	ORPS: {opcode: []byte{0x0f, 0x56}},
	// https://www.felixcloutier.com/x86/popcnt
	POPCNTL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xb8}},
	POPCNTQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xb8}, rPrefix: rexPrefixW},
	// https://www.felixcloutier.com/x86/roundss
	ROUNDSS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0a}, needArg: true},
	// https://www.felixcloutier.com/x86/roundsd
	ROUNDSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0b}, needArg: true},
	// https://www.felixcloutier.com/x86/sqrtss
	SQRTSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x51}},
	// https://www.felixcloutier.com/x86/sqrtsd
	SQRTSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x51}},
	// https://www.felixcloutier.com/x86/sub
	SUBL: {opcode: []byte{0x29}, srcOnModRMReg: true},
	SUBQ: {opcode: []byte{0x29}, rPrefix: rexPrefixW, srcOnModRMReg: true},
	// https://www.felixcloutier.com/x86/subss
	SUBSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5c}},
	// https://www.felixcloutier.com/x86/subsd
	SUBSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5c}},
	// https://www.felixcloutier.com/x86/test
	TESTL: {opcode: []byte{0x85}, srcOnModRMReg: true},
	TESTQ: {opcode: []byte{0x85}, rPrefix: rexPrefixW, srcOnModRMReg: true},
	// https://www.felixcloutier.com/x86/tzcnt
	TZCNTL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbc}},
	TZCNTQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbc}, rPrefix: rexPrefixW},
	// https://www.felixcloutier.com/x86/ucomisd
	UCOMISD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x2e}},
	// https://www.felixcloutier.com/x86/ucomiss
	UCOMISS: {opcode: []byte{0x0f, 0x2e}},
	// https://www.felixcloutier.com/x86/xchg
	XCHGQ: {opcode: []byte{0x87}, rPrefix: rexPrefixW, srcOnModRMReg: true},
	// https://www.felixcloutier.com/x86/xor
	XORL: {opcode: []byte{0x31}, srcOnModRMReg: true},
	XORQ: {opcode: []byte{0x31}, rPrefix: rexPrefixW, srcOnModRMReg: true},
	// https://www.felixcloutier.com/x86/xorpd
	XORPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x57}},
	XORPS: {opcode: []byte{0x0f, 0x57}},
	// https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
	PINSRB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x20}, needArg: true},
	// https://www.felixcloutier.com/x86/pinsrw
	PINSRW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc4}, needArg: true},
	// https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
	PINSRD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x22}, needArg: true},
	// https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
	PINSRQ: {mandatoryPrefix: 0x66, rPrefix: rexPrefixW, opcode: []byte{0x0f, 0x3a, 0x22}, needArg: true},
	// https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
	MOVDQU: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x6f}},
	// https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
	MOVDQA: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x6f}},
	// https://www.felixcloutier.com/x86/paddb:paddw:paddd:paddq
	PADDB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfc}},
	PADDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfd}},
	PADDD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfe}},
	PADDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd4}},
	// https://www.felixcloutier.com/x86/psubb:psubw:psubd
	PSUBB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf8}},
	PSUBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf9}},
	PSUBD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfa}},
	// https://www.felixcloutier.com/x86/psubq
	PSUBQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfb}},
	// https://www.felixcloutier.com/x86/addps
	ADDPS: {opcode: []byte{0x0f, 0x58}},
	// https://www.felixcloutier.com/x86/addpd
	ADDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x58}},
	// https://www.felixcloutier.com/x86/subps
	SUBPS: {opcode: []byte{0x0f, 0x5c}},
	// https://www.felixcloutier.com/x86/subpd
	SUBPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5c}},
	// https://www.felixcloutier.com/x86/pxor
	PXOR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xef}},
	// https://www.felixcloutier.com/x86/pand
	PAND: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdb}},
	// https://www.felixcloutier.com/x86/por
	POR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xeb}},
	// https://www.felixcloutier.com/x86/pandn
	PANDN: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdf}},
	// https://www.felixcloutier.com/x86/pshufb
	PSHUFB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x0}},
	// https://www.felixcloutier.com/x86/pshufd
	PSHUFD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x70}, needArg: true},
	// https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
	PEXTRB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x14}, needArg: true, srcOnModRMReg: true},
	// https://www.felixcloutier.com/x86/pextrw
	PEXTRW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc5}, needArg: true},
	// https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
	PEXTRD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x16}, needArg: true, srcOnModRMReg: true},
	// https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
	PEXTRQ: {rPrefix: rexPrefixW, mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x16}, needArg: true, srcOnModRMReg: true},
	// https://www.felixcloutier.com/x86/insertps
	INSERTPS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x21}, needArg: true},
	// https://www.felixcloutier.com/x86/movlhps
	MOVLHPS: {opcode: []byte{0x0f, 0x16}},
	// https://www.felixcloutier.com/x86/ptest
	PTEST: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x17}},
	// https://www.felixcloutier.com/x86/pcmpeqb:pcmpeqw:pcmpeqd
	PCMPEQB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x74}},
	PCMPEQW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x75}},
	PCMPEQD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x76}},
	// https://www.felixcloutier.com/x86/pcmpeqq
	PCMPEQQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x29}},
	// https://www.felixcloutier.com/x86/paddusb:paddusw
	PADDUSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdc}},
	// https://www.felixcloutier.com/x86/movsd
	MOVSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x10}},
	// https://www.felixcloutier.com/x86/packsswb:packssdw
	PACKSSWB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x63}},
	// https://www.felixcloutier.com/x86/pmovmskb
	PMOVMSKB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd7}},
	// https://www.felixcloutier.com/x86/movmskps
	MOVMSKPS: {opcode: []byte{0x0f, 0x50}},
	// https://www.felixcloutier.com/x86/movmskpd
	MOVMSKPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x50}},
	// https://www.felixcloutier.com/x86/psraw:psrad:psraq
	PSRAD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe2}},
	// https://www.felixcloutier.com/x86/psraw:psrad:psraq
	PSRAW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe1}},
	// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
	PSRLQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd3}},
	// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
	PSRLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd2}},
	// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
	PSRLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd1}},
	// https://www.felixcloutier.com/x86/psllw:pslld:psllq
	PSLLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf1}},
	// https://www.felixcloutier.com/x86/psllw:pslld:psllq
	PSLLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf2}},
	// https://www.felixcloutier.com/x86/psllw:pslld:psllq
	PSLLQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf3}},
	// https://www.felixcloutier.com/x86/punpcklbw:punpcklwd:punpckldq:punpcklqdq
	PUNPCKLBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x60}},
	// https://www.felixcloutier.com/x86/punpckhbw:punpckhwd:punpckhdq:punpckhqdq
	PUNPCKHBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x68}},
	// https://www.felixcloutier.com/x86/cmpps
	CMPPS: {opcode: []byte{0x0f, 0xc2}, needArg: true},
	// https://www.felixcloutier.com/x86/cmppd
	CMPPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc2}, needArg: true},
	// https://www.felixcloutier.com/x86/pcmpgtq
	PCMPGTQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x37}},
	// https://www.felixcloutier.com/x86/pcmpgtb:pcmpgtw:pcmpgtd
	PCMPGTD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x66}},
	// https://www.felixcloutier.com/x86/pcmpgtb:pcmpgtw:pcmpgtd
	PCMPGTW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x65}},
	// https://www.felixcloutier.com/x86/pcmpgtb:pcmpgtw:pcmpgtd
	PCMPGTB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x64}},
	// https://www.felixcloutier.com/x86/pminsd:pminsq
	PMINSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x39}},
	// https://www.felixcloutier.com/x86/pmaxsb:pmaxsw:pmaxsd:pmaxsq
	PMAXSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3d}},
	// https://www.felixcloutier.com/x86/pmaxsb:pmaxsw:pmaxsd:pmaxsq
	PMAXSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xee}},
	// https://www.felixcloutier.com/x86/pmaxsb:pmaxsw:pmaxsd:pmaxsq
	PMAXSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3c}},
	// https://www.felixcloutier.com/x86/pminsb:pminsw
	PMINSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xea}},
	// https://www.felixcloutier.com/x86/pminsb:pminsw
	PMINSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x38}},
	// https://www.felixcloutier.com/x86/pminud:pminuq
	PMINUD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3b}},
	// https://www.felixcloutier.com/x86/pminub:pminuw
	PMINUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3a}},
	// https://www.felixcloutier.com/x86/pminub:pminuw
	PMINUB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xda}},
	// https://www.felixcloutier.com/x86/pmaxud:pmaxuq
	PMAXUD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3f}},
	// https://www.felixcloutier.com/x86/pmaxub:pmaxuw
	PMAXUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3e}},
	// https://www.felixcloutier.com/x86/pmaxub:pmaxuw
	PMAXUB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xde}},
	// https://www.felixcloutier.com/x86/pmullw
	PMULLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd5}},
	// https://www.felixcloutier.com/x86/pmulld:pmullq
	PMULLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x40}},
	// https://www.felixcloutier.com/x86/pmuludq
	PMULUDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf4}},
	// https://www.felixcloutier.com/x86/psubsb:psubsw
	PSUBSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe8}},
	// https://www.felixcloutier.com/x86/psubsb:psubsw
	PSUBSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe9}},
	// https://www.felixcloutier.com/x86/psubusb:psubusw
	PSUBUSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd8}},
	// https://www.felixcloutier.com/x86/psubusb:psubusw
	PSUBUSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd9}},
	// https://www.felixcloutier.com/x86/paddsb:paddsw
	PADDSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xed}},
	// https://www.felixcloutier.com/x86/paddsb:paddsw
	PADDSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xec}},
	// https://www.felixcloutier.com/x86/paddusb:paddusw
	PADDUSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdd}},
	// https://www.felixcloutier.com/x86/pavgb:pavgw
	PAVGB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe0}},
	// https://www.felixcloutier.com/x86/pavgb:pavgw
	PAVGW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe3}},
	// https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq
	PABSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1c}},
	// https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq
	PABSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1d}},
	// https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq
	PABSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1e}},
	// https://www.felixcloutier.com/x86/blendvpd
	BLENDVPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x15}},
	// https://www.felixcloutier.com/x86/maxpd
	MAXPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5f}},
	// https://www.felixcloutier.com/x86/maxps
	MAXPS: {opcode: []byte{0x0f, 0x5f}},
	// https://www.felixcloutier.com/x86/minpd
1563 MINPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5d}}, 1564 // https://www.felixcloutier.com/x86/minps 1565 MINPS: {opcode: []byte{0x0f, 0x5d}}, 1566 // https://www.felixcloutier.com/x86/andnpd 1567 ANDNPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x55}}, 1568 // https://www.felixcloutier.com/x86/andnps 1569 ANDNPS: {opcode: []byte{0x0f, 0x55}}, 1570 // https://www.felixcloutier.com/x86/mulps 1571 MULPS: {opcode: []byte{0x0f, 0x59}}, 1572 // https://www.felixcloutier.com/x86/mulpd 1573 MULPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x59}}, 1574 // https://www.felixcloutier.com/x86/divps 1575 DIVPS: {opcode: []byte{0x0f, 0x5e}}, 1576 // https://www.felixcloutier.com/x86/divpd 1577 DIVPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5e}}, 1578 // https://www.felixcloutier.com/x86/sqrtps 1579 SQRTPS: {opcode: []byte{0x0f, 0x51}}, 1580 // https://www.felixcloutier.com/x86/sqrtpd 1581 SQRTPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x51}}, 1582 // https://www.felixcloutier.com/x86/roundps 1583 ROUNDPS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x08}, needArg: true}, 1584 // https://www.felixcloutier.com/x86/roundpd 1585 ROUNDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x09}, needArg: true}, 1586 // https://www.felixcloutier.com/x86/palignr 1587 PALIGNR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0f}, needArg: true}, 1588 // https://www.felixcloutier.com/x86/punpcklbw:punpcklwd:punpckldq:punpcklqdq 1589 PUNPCKLWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x61}}, 1590 // https://www.felixcloutier.com/x86/punpckhbw:punpckhwd:punpckhdq:punpckhqdq 1591 PUNPCKHWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x69}}, 1592 // https://www.felixcloutier.com/x86/pmulhuw 1593 PMULHUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe4}}, 1594 // https://www.felixcloutier.com/x86/pmuldq 1595 PMULDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x28}}, 1596 // https://www.felixcloutier.com/x86/pmulhrsw 1597 PMULHRSW: 
{mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x0b}}, 1598 // https://www.felixcloutier.com/x86/pmovsx 1599 PMOVSXBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x20}}, 1600 // https://www.felixcloutier.com/x86/pmovsx 1601 PMOVSXWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x23}}, 1602 // https://www.felixcloutier.com/x86/pmovsx 1603 PMOVSXDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x25}}, 1604 // https://www.felixcloutier.com/x86/pmovzx 1605 PMOVZXBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x30}}, 1606 // https://www.felixcloutier.com/x86/pmovzx 1607 PMOVZXWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x33}}, 1608 // https://www.felixcloutier.com/x86/pmovzx 1609 PMOVZXDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x35}}, 1610 // https://www.felixcloutier.com/x86/pmulhw 1611 PMULHW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe5}}, 1612 // https://www.felixcloutier.com/x86/cmpps 1613 CMPEQPS: {opcode: []byte{0x0f, 0xc2}, needArg: true}, 1614 // https://www.felixcloutier.com/x86/cmppd 1615 CMPEQPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc2}, needArg: true}, 1616 // https://www.felixcloutier.com/x86/cvttps2dq 1617 CVTTPS2DQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5b}}, 1618 // https://www.felixcloutier.com/x86/cvtdq2ps 1619 CVTDQ2PS: {opcode: []byte{0x0f, 0x5b}}, 1620 // https://www.felixcloutier.com/x86/cvtdq2pd 1621 CVTDQ2PD: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xe6}}, 1622 // https://www.felixcloutier.com/x86/cvtpd2ps 1623 CVTPD2PS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5a}}, 1624 // https://www.felixcloutier.com/x86/cvtps2pd 1625 CVTPS2PD: {opcode: []byte{0x0f, 0x5a}}, 1626 // https://www.felixcloutier.com/x86/movupd 1627 MOVUPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x10}}, 1628 // https://www.felixcloutier.com/x86/shufps 1629 SHUFPS: {opcode: []byte{0x0f, 0xc6}, needArg: true}, 1630 // https://www.felixcloutier.com/x86/pmaddwd 1631 PMADDWD: 
{mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf5}},
	// https://www.felixcloutier.com/x86/unpcklps
	UNPCKLPS: {opcode: []byte{0x0f, 0x14}},
	// https://www.felixcloutier.com/x86/packuswb
	PACKUSWB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x67}},
	// https://www.felixcloutier.com/x86/packsswb:packssdw
	PACKSSDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x6b}},
	// https://www.felixcloutier.com/x86/packusdw
	PACKUSDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x2b}},
	// https://www.felixcloutier.com/x86/pmaddubsw
	PMADDUBSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x04}},
	// https://www.felixcloutier.com/x86/cvttpd2dq
	CVTTPD2DQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe6}},
}

// registerToRegisterShiftOpcode maps shift/rotate instructions to the 0xd3
// opcode (shift/rotate r/m by CL) together with the ModRM "/n" extension bits
// and the REX prefix required for the 64-bit (Q-suffixed) variants.
var registerToRegisterShiftOpcode = [instructionEnd]*struct {
	opcode         []byte
	rPrefix        rexPrefix
	modRMExtension byte
}{
	// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
	ROLL: {opcode: []byte{0xd3}},
	ROLQ: {opcode: []byte{0xd3}, rPrefix: rexPrefixW},
	RORL: {opcode: []byte{0xd3}, modRMExtension: 0b00_001_000},
	RORQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_001_000, rPrefix: rexPrefixW},
	// https://www.felixcloutier.com/x86/sal:sar:shl:shr
	SARL: {opcode: []byte{0xd3}, modRMExtension: 0b00_111_000},
	SARQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_111_000, rPrefix: rexPrefixW},
	SHLL: {opcode: []byte{0xd3}, modRMExtension: 0b00_100_000},
	SHLQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_100_000, rPrefix: rexPrefixW},
	SHRL: {opcode: []byte{0xd3}, modRMExtension: 0b00_101_000},
	SHRQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_101_000, rPrefix: rexPrefixW},
}

// encodeRegisterToRegister encodes n as a register-to-register instruction and
// appends the resulting machine code to buf.
func (a *AssemblerImpl) encodeRegisterToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	// Alias for readability
	inst := n.instruction
	base := buf.Len()
	code := buf.Append(8)[:0]

	switch inst {
	case MOVL, MOVQ:
var (
			opcode          []byte
			mandatoryPrefix byte
			srcOnModRMReg   bool
			rPrefix         rexPrefix
		)
		// MOV has four distinct encodings depending on whether each operand
		// is a vector (XMM) or a general-purpose register.
		srcIsFloat, dstIsFloat := isVectorRegister(n.srcReg), isVectorRegister(n.dstReg)
		f2f := srcIsFloat && dstIsFloat
		if f2f {
			// XMM to XMM: https://www.felixcloutier.com/x86/movq
			opcode, mandatoryPrefix = []byte{0x0f, 0x7e}, 0xf3
		} else if srcIsFloat && !dstIsFloat {
			// XMM to GPR: https://www.felixcloutier.com/x86/movd:movq
			opcode, mandatoryPrefix, srcOnModRMReg = []byte{0x0f, 0x7e}, 0x66, true
		} else if !srcIsFloat && dstIsFloat {
			// GPR to XMM: https://www.felixcloutier.com/x86/movd:movq
			opcode, mandatoryPrefix, srcOnModRMReg = []byte{0x0f, 0x6e}, 0x66, false
		} else {
			// GPR to GPR: https://www.felixcloutier.com/x86/mov
			opcode, srcOnModRMReg = []byte{0x89}, true
		}

		rexPrefix, modRM, err := n.getRegisterToRegisterModRM(srcOnModRMReg)
		if err != nil {
			return err
		}
		rexPrefix |= rPrefix

		if inst == MOVQ && !f2f {
			rexPrefix |= rexPrefixW
		}
		if mandatoryPrefix != 0 {
			code = append(code, mandatoryPrefix)
		}
		if rexPrefix != rexPrefixNone {
			code = append(code, rexPrefix)
		}
		code = append(code, opcode...)
		code = append(code, modRM)
		buf.Truncate(base + len(code))
		return nil
	}

	if op := registerToRegisterOpcode[inst]; op != nil {
		rexPrefix, modRM, err := n.getRegisterToRegisterModRM(op.srcOnModRMReg)
		if err != nil {
			return err
		}
		rexPrefix |= op.rPrefix

		if op.isSrc8bit && RegSP <= n.srcReg && n.srcReg <= RegDI {
			// If an operand register is 8-bit length of SP, BP, DI, or SI register, we need to have the default prefix.
			// https://wiki.osdev.org/X86-64_Instruction_Encoding#Registers
			rexPrefix |= rexPrefixDefault
		}

		if op.mandatoryPrefix != 0 {
			code = append(code, op.mandatoryPrefix)
		}

		if rexPrefix != rexPrefixNone {
			code = append(code, rexPrefix)
		}
		code = append(code, op.opcode...)
		code = append(code, modRM)

		if op.needArg {
			code = append(code, n.arg)
		}
	} else if op := registerToRegisterShiftOpcode[inst]; op != nil {
		reg3bits, rexPrefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldRM)
		rexPrefix |= op.rPrefix
		if rexPrefix != rexPrefixNone {
			code = append(code, rexPrefix)
		}

		// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
		modRM := 0b11_000_000 |
			(op.modRMExtension) |
			reg3bits
		code = append(code, op.opcode...)
		code = append(code, modRM)
	} else {
		return errorEncodingUnsupported(n)
	}

	buf.Truncate(base + len(code))
	return nil
}

// encodeRegisterToMemory encodes n as a register-to-memory instruction and
// appends the resulting machine code to buf.
func (a *AssemblerImpl) encodeRegisterToMemory(buf asm.Buffer, n *nodeImpl) (err error) {
	rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(true)
	if err != nil {
		return err
	}

	var opcode []byte
	var mandatoryPrefix byte
	var isShiftInstruction bool
	var needArg bool
	switch n.instruction {
	case CMPL:
		// https://www.felixcloutier.com/x86/cmp
		opcode = []byte{0x3b}
	case CMPQ:
		// https://www.felixcloutier.com/x86/cmp
		rexPrefix |= rexPrefixW
		opcode = []byte{0x3b}
	case MOVB:
		// https://www.felixcloutier.com/x86/mov
		opcode = []byte{0x88}
		// 1 byte register operands need default prefix for the following registers.
if n.srcReg >= RegSP && n.srcReg <= RegDI {
			rexPrefix |= rexPrefixDefault
		}
	case MOVL:
		if isVectorRegister(n.srcReg) {
			// https://www.felixcloutier.com/x86/movd:movq
			opcode = []byte{0x0f, 0x7e}
			mandatoryPrefix = 0x66
		} else {
			// https://www.felixcloutier.com/x86/mov
			opcode = []byte{0x89}
		}
	case MOVQ:
		if isVectorRegister(n.srcReg) {
			// https://www.felixcloutier.com/x86/movq
			opcode = []byte{0x0f, 0xd6}
			mandatoryPrefix = 0x66
		} else {
			// https://www.felixcloutier.com/x86/mov
			rexPrefix |= rexPrefixW
			opcode = []byte{0x89}
		}
	case MOVW:
		// https://www.felixcloutier.com/x86/mov
		// Note: Need 0x66 to indicate that the operand size is 16-bit.
		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Operand-size_and_address-size_override_prefix
		mandatoryPrefix = 0x66
		opcode = []byte{0x89}
	case SARL:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		modRM |= 0b00_111_000
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case SARQ:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		rexPrefix |= rexPrefixW
		modRM |= 0b00_111_000
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case SHLL:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		modRM |= 0b00_100_000
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case SHLQ:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		rexPrefix |= rexPrefixW
		modRM |= 0b00_100_000
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case SHRL:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		modRM |= 0b00_101_000
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case SHRQ:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		rexPrefix |= rexPrefixW
		modRM |= 0b00_101_000
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case ROLL:
		// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case ROLQ:
		// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
		rexPrefix |= rexPrefixW
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case RORL:
		// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
		modRM |= 0b00_001_000
		opcode = []byte{0xd3}
		isShiftInstruction = true
	case RORQ:
		// https://www.felixcloutier.com/x86/rcl:rcr:rol:ror
		rexPrefix |= rexPrefixW
		opcode = []byte{0xd3}
		modRM |= 0b00_001_000
		isShiftInstruction = true
	case MOVDQU:
		// https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
		mandatoryPrefix = 0xf3
		opcode = []byte{0x0f, 0x7f}
	case PEXTRB: // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x3a, 0x14}
		needArg = true
	case PEXTRW: // https://www.felixcloutier.com/x86/pextrw
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x3a, 0x15}
		needArg = true
	case PEXTRD: // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x3a, 0x16}
		needArg = true
	case PEXTRQ: // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq
		mandatoryPrefix = 0x66
		rexPrefix |= rexPrefixW // REX.W
		opcode = []byte{0x0f, 0x3a, 0x16}
		needArg = true
	default:
		return errorEncodingUnsupported(n)
	}

	if !isShiftInstruction {
		srcReg3Bits, prefix := register3bits(n.srcReg, registerSpecifierPositionModRMFieldReg)

		rexPrefix |= prefix
		modRM |= srcReg3Bits << 3 // Place the source register on ModRM:reg
	} else {
		// Shift instructions with a variable amount take the count in CL only.
		if n.srcReg != RegCX {
			return fmt.Errorf("shifting instruction %s require CX register as src but got %s", InstructionName(n.instruction), RegisterName(n.srcReg))
		}
	}

	base := buf.Len()
	code := buf.Append(16)[:0]

	if mandatoryPrefix != 0 {
		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Mandatory_prefix
		code = append(code, mandatoryPrefix)
	}

	if rexPrefix != rexPrefixNone {
		code = append(code, rexPrefix)
	}

	code = append(code, opcode...)
	code = append(code, modRM)

	if sbiExist {
		code = append(code, sbi)
	}

	if displacementWidth != 0 {
		code = appendConst(code, n.dstConst, displacementWidth)
	}

	if needArg {
		code = append(code, n.arg)
	}

	buf.Truncate(base + len(code))
	return
}

// encodeRegisterToConst encodes n as an instruction comparing a register with
// an immediate constant, appending the machine code to buf.
func (a *AssemblerImpl) encodeRegisterToConst(buf asm.Buffer, n *nodeImpl) (err error) {
	regBits, prefix := register3bits(n.srcReg, registerSpecifierPositionModRMFieldRM)

	base := buf.Len()
	code := buf.Append(10)[:0]

	switch n.instruction {
	case CMPL, CMPQ:
		if n.instruction == CMPQ {
			prefix |= rexPrefixW
		}
		if prefix != rexPrefixNone {
			code = append(code, prefix)
		}
		is8bitConst := fitInSigned8bit(n.dstConst)
		// https://www.felixcloutier.com/x86/cmp
		if n.srcReg == RegAX && !is8bitConst {
			code = append(code, 0x3d)
		} else {
			// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
			modRM := 0b11_000_000 | // Specifying that operand is register.
				0b00_111_000 | // CMP with immediate needs "/7" extension.
				regBits
			if is8bitConst {
				code = append(code, 0x83, modRM)
			} else {
				code = append(code, 0x81, modRM)
			}
		}
	default:
		err = errorEncodingUnsupported(n)
	}

	// Append the immediate: one byte if it fits in a signed 8-bit, else 32-bit.
	if fitInSigned8bit(n.dstConst) {
		code = append(code, byte(n.dstConst))
	} else {
		code = appendUint32(code, uint32(n.dstConst))
	}

	buf.Truncate(base + len(code))
	return
}

// finalizeReadInstructionAddressNode patches the 32-bit displacement of the
// RIP-relative LEAQ previously emitted by encodeReadInstructionAddress, once
// the binary offsets of all nodes are known.
func (a *AssemblerImpl) finalizeReadInstructionAddressNode(code []byte, n *nodeImpl) (err error) {
	// Find the target instruction node.
	targetNode := n
	for ; targetNode != nil; targetNode = targetNode.next {
		if targetNode.instruction == n.readInstructionAddressBeforeTargetInstruction {
			targetNode = targetNode.next
			break
		}
	}

	if targetNode == nil {
		return errors.New("BUG: target instruction not found for read instruction address")
	}

	offset := targetNode.OffsetInBinary() - (n.OffsetInBinary() + 7 /* 7 = the length of the LEAQ instruction */)
	if offset >= math.MaxInt32 {
		return errors.New("BUG: too large offset for LEAQ instruction")
	}

	// The displacement starts 3 bytes in: REX prefix + opcode + ModRM.
	binary.LittleEndian.PutUint32(code[n.OffsetInBinary()+3:], uint32(int32(offset)))
	return nil
}

// encodeReadInstructionAddress emits "LEAQ [RIP + disp32], dstReg" with a zero
// displacement placeholder; the node is recorded so the displacement can be
// filled in later by finalizeReadInstructionAddressNode.
func (a *AssemblerImpl) encodeReadInstructionAddress(buf asm.Buffer, n *nodeImpl) error {
	dstReg3Bits, rexPrefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldReg)

	a.readInstructionAddressNodes = append(a.readInstructionAddressNodes, n)

	// https://www.felixcloutier.com/x86/lea
	opcode := byte(0x8d)
	rexPrefix |= rexPrefixW

	// https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing
	modRM := 0b00_000_101 | // Indicate "LEAQ [RIP + 32bit displacement], dstReg" encoding.
		(dstReg3Bits << 3) // Place the dstReg on ModRM:reg.
code := buf.Append(7)
	code[0] = rexPrefix
	code[1] = opcode
	code[2] = modRM
	binary.LittleEndian.PutUint32(code[3:], 0) // Preserve
	return nil
}

// encodeMemoryToRegister encodes n as a memory-to-register instruction and
// appends the resulting machine code to buf.
func (a *AssemblerImpl) encodeMemoryToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	if n.instruction == LEAQ && n.readInstructionAddressBeforeTargetInstruction != NONE {
		return a.encodeReadInstructionAddress(buf, n)
	}

	rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(false)
	if err != nil {
		return err
	}

	dstReg3Bits, prefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldReg)
	rexPrefix |= prefix
	modRM |= dstReg3Bits << 3 // Place the destination register on ModRM:reg

	var mandatoryPrefix byte
	var opcode []byte
	var needArg bool

	switch n.instruction {
	case ADDL:
		// https://www.felixcloutier.com/x86/add
		opcode = []byte{0x03}
	case ADDQ:
		// https://www.felixcloutier.com/x86/add
		rexPrefix |= rexPrefixW
		opcode = []byte{0x03}
	case CMPL:
		// https://www.felixcloutier.com/x86/cmp
		opcode = []byte{0x39}
	case CMPQ:
		// https://www.felixcloutier.com/x86/cmp
		rexPrefix |= rexPrefixW
		opcode = []byte{0x39}
	case LEAQ:
		// https://www.felixcloutier.com/x86/lea
		rexPrefix |= rexPrefixW
		opcode = []byte{0x8d}
	case MOVBLSX:
		// https://www.felixcloutier.com/x86/movsx:movsxd
		opcode = []byte{0x0f, 0xbe}
	case MOVBLZX:
		// https://www.felixcloutier.com/x86/movzx
		opcode = []byte{0x0f, 0xb6}
	case MOVBQSX:
		// https://www.felixcloutier.com/x86/movsx:movsxd
		rexPrefix |= rexPrefixW
		opcode = []byte{0x0f, 0xbe}
	case MOVBQZX:
		// https://www.felixcloutier.com/x86/movzx
		rexPrefix |= rexPrefixW
		opcode = []byte{0x0f, 0xb6}
	case MOVLQSX:
		// https://www.felixcloutier.com/x86/movsx:movsxd
		rexPrefix |= rexPrefixW
		opcode = []byte{0x63}
	case MOVLQZX:
		// https://www.felixcloutier.com/x86/mov
		// Note: MOVLQZX means zero extending 32bit reg to 64-bit reg and
		// that is semantically equivalent to MOV 32bit to 32bit.
		opcode = []byte{0x8B}
	case MOVL:
		// https://www.felixcloutier.com/x86/mov
		// Note: MOVLQZX means zero extending 32bit reg to 64-bit reg and
		// that is semantically equivalent to MOV 32bit to 32bit.
		if isVectorRegister(n.dstReg) {
			// https://www.felixcloutier.com/x86/movd:movq
			opcode = []byte{0x0f, 0x6e}
			mandatoryPrefix = 0x66
		} else {
			// https://www.felixcloutier.com/x86/mov
			opcode = []byte{0x8B}
		}
	case MOVQ:
		if isVectorRegister(n.dstReg) {
			// https://www.felixcloutier.com/x86/movq
			opcode = []byte{0x0f, 0x7e}
			mandatoryPrefix = 0xf3
		} else {
			// https://www.felixcloutier.com/x86/mov
			rexPrefix |= rexPrefixW
			opcode = []byte{0x8B}
		}
	case MOVWLSX:
		// https://www.felixcloutier.com/x86/movsx:movsxd
		opcode = []byte{0x0f, 0xbf}
	case MOVWLZX:
		// https://www.felixcloutier.com/x86/movzx
		opcode = []byte{0x0f, 0xb7}
	case MOVWQSX:
		// https://www.felixcloutier.com/x86/movsx:movsxd
		rexPrefix |= rexPrefixW
		opcode = []byte{0x0f, 0xbf}
	case MOVWQZX:
		// https://www.felixcloutier.com/x86/movzx
		rexPrefix |= rexPrefixW
		opcode = []byte{0x0f, 0xb7}
	case SUBQ:
		// https://www.felixcloutier.com/x86/sub
		rexPrefix |= rexPrefixW
		opcode = []byte{0x2b}
	case SUBSD:
		// https://www.felixcloutier.com/x86/subsd
		opcode = []byte{0x0f, 0x5c}
		mandatoryPrefix = 0xf2
	case SUBSS:
		// https://www.felixcloutier.com/x86/subss
		opcode = []byte{0x0f, 0x5c}
		mandatoryPrefix = 0xf3
	case UCOMISD:
		// https://www.felixcloutier.com/x86/ucomisd
		opcode = []byte{0x0f, 0x2e}
		mandatoryPrefix = 0x66
	case UCOMISS:
		// https://www.felixcloutier.com/x86/ucomiss
		opcode = []byte{0x0f, 0x2e}
	case MOVDQU:
		// https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64
		mandatoryPrefix = 0xf3
		opcode = []byte{0x0f, 0x6f}
	case PMOVSXBW: // https://www.felixcloutier.com/x86/pmovsx
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x38, 0x20}
	case PMOVSXWD: // https://www.felixcloutier.com/x86/pmovsx
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x38, 0x23}
	case PMOVSXDQ: // https://www.felixcloutier.com/x86/pmovsx
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x38, 0x25}
	case PMOVZXBW: // https://www.felixcloutier.com/x86/pmovzx
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x38, 0x30}
	case PMOVZXWD: // https://www.felixcloutier.com/x86/pmovzx
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x38, 0x33}
	case PMOVZXDQ: // https://www.felixcloutier.com/x86/pmovzx
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x38, 0x35}
	case PINSRB: // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x3a, 0x20}
		needArg = true
	case PINSRW: // https://www.felixcloutier.com/x86/pinsrw
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0xc4}
		needArg = true
	case PINSRD: // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x3a, 0x22}
		needArg = true
	case PINSRQ: // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq
		rexPrefix |= rexPrefixW
		mandatoryPrefix = 0x66
		opcode = []byte{0x0f, 0x3a, 0x22}
		needArg = true
	default:
		return errorEncodingUnsupported(n)
	}

	base := buf.Len()
	code := buf.Append(16)[:0]

	if mandatoryPrefix != 0 {
		// https://wiki.osdev.org/X86-64_Instruction_Encoding#Mandatory_prefix
		code = append(code, mandatoryPrefix)
	}

	if rexPrefix != rexPrefixNone {
		code = append(code, rexPrefix)
	}

	code = append(code, opcode...)
	code = append(code, modRM)

	if sbiExist {
		code = append(code, sbi)
	}

	if displacementWidth != 0 {
		code = appendConst(code, n.srcConst, displacementWidth)
	}

	if needArg {
		code = append(code, n.arg)
	}

	buf.Truncate(base + len(code))
	return
}

// encodeConstToRegister encodes n as an instruction with an immediate constant
// source and a register destination, appending the machine code to buf.
func (a *AssemblerImpl) encodeConstToRegister(buf asm.Buffer, n *nodeImpl) (err error) {
	regBits, rexPrefix := register3bits(n.dstReg, registerSpecifierPositionModRMFieldRM)

	// Vector-shift instructions need an XMM destination; everything else needs
	// a general-purpose register.
	isFloatReg := isVectorRegister(n.dstReg)
	switch n.instruction {
	case PSLLD, PSLLQ, PSRLD, PSRLQ, PSRAW, PSRLW, PSLLW, PSRAD:
		if !isFloatReg {
			return fmt.Errorf("%s needs float register but got %s", InstructionName(n.instruction), RegisterName(n.dstReg))
		}
	default:
		if isFloatReg {
			return fmt.Errorf("%s needs int register but got %s", InstructionName(n.instruction), RegisterName(n.dstReg))
		}
	}

	// Validate the immediate range per instruction.
	if n.instruction != MOVQ && !fitIn32bit(n.srcConst) {
		return fmt.Errorf("constant must fit in 32-bit integer for %s, but got %d", InstructionName(n.instruction), n.srcConst)
	} else if (n.instruction == SHLQ || n.instruction == SHRQ) && (n.srcConst < 0 || n.srcConst > math.MaxUint8) {
		return fmt.Errorf("constant must fit in positive 8-bit integer for %s, but got %d", InstructionName(n.instruction), n.srcConst)
	} else if (n.instruction == PSLLD ||
		n.instruction == PSLLQ ||
		n.instruction == PSRLD ||
		n.instruction == PSRLQ) && (n.srcConst < math.MinInt8 || n.srcConst > math.MaxInt8) {
		return fmt.Errorf("constant must fit in signed 8-bit integer for %s, but got %d", InstructionName(n.instruction), n.srcConst)
	}

	base := buf.Len()
	code := buf.Append(32)[:0]

	isSigned8bitConst := fitInSigned8bit(n.srcConst)
	switch inst := n.instruction; inst {
case ADDQ:
		// https://www.felixcloutier.com/x86/add
		rexPrefix |= rexPrefixW
		if n.dstReg == RegAX && !isSigned8bitConst {
			code = append(code, rexPrefix, 0x05)
		} else {
			modRM := 0b11_000_000 | // Specifying that operand is register.
				regBits
			if isSigned8bitConst {
				code = append(code, rexPrefix, 0x83, modRM)
			} else {
				code = append(code, rexPrefix, 0x81, modRM)
			}
		}
		if isSigned8bitConst {
			code = append(code, byte(n.srcConst))
		} else {
			code = appendUint32(code, uint32(n.srcConst))
		}
	case ANDQ:
		// https://www.felixcloutier.com/x86/and
		rexPrefix |= rexPrefixW
		if n.dstReg == RegAX && !isSigned8bitConst {
			code = append(code, rexPrefix, 0x25)
		} else {
			modRM := 0b11_000_000 | // Specifying that operand is register.
				0b00_100_000 | // AND with immediate needs "/4" extension.
				regBits
			if isSigned8bitConst {
				code = append(code, rexPrefix, 0x83, modRM)
			} else {
				code = append(code, rexPrefix, 0x81, modRM)
			}
		}
		if fitInSigned8bit(n.srcConst) {
			code = append(code, byte(n.srcConst))
		} else {
			code = appendUint32(code, uint32(n.srcConst))
		}
	case TESTQ:
		// https://www.felixcloutier.com/x86/test
		rexPrefix |= rexPrefixW
		if n.dstReg == RegAX && !isSigned8bitConst {
			code = append(code, rexPrefix, 0xa9)
		} else {
			modRM := 0b11_000_000 | // Specifying that operand is register
				regBits
			code = append(code, rexPrefix, 0xf7, modRM)
		}
		code = appendUint32(code, uint32(n.srcConst))
	case MOVL:
		// https://www.felixcloutier.com/x86/mov
		if rexPrefix != rexPrefixNone {
			code = append(code, rexPrefix)
		}
		code = append(code, 0xb8|regBits)
		code = appendUint32(code, uint32(n.srcConst))
	case MOVQ:
		// https://www.felixcloutier.com/x86/mov
		if fitIn32bit(n.srcConst) {
			if n.srcConst > math.MaxInt32 {
				if rexPrefix != rexPrefixNone {
					code = append(code, rexPrefix)
				}
				code = append(code, 0xb8|regBits)
			} else {
				rexPrefix |= rexPrefixW
				modRM := 0b11_000_000 | // Specifying that operand is register.
					regBits
				code = append(code, rexPrefix, 0xc7, modRM)
			}
			code = appendUint32(code, uint32(n.srcConst))
		} else {
			rexPrefix |= rexPrefixW
			code = append(code, rexPrefix, 0xb8|regBits)
			code = appendUint64(code, uint64(n.srcConst))
		}
	case SHLQ:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		rexPrefix |= rexPrefixW
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_100_000 | // SHL with immediate needs "/4" extension.
			regBits
		if n.srcConst == 1 {
			code = append(code, rexPrefix, 0xd1, modRM)
		} else {
			code = append(code, rexPrefix, 0xc1, modRM, byte(n.srcConst))
		}
	case SHRQ:
		// https://www.felixcloutier.com/x86/sal:sar:shl:shr
		rexPrefix |= rexPrefixW
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_101_000 | // SHR with immediate needs "/5" extension.
			regBits
		if n.srcConst == 1 {
			code = append(code, rexPrefix, 0xd1, modRM)
		} else {
			code = append(code, rexPrefix, 0xc1, modRM, byte(n.srcConst))
		}
	case PSLLD:
		// https://www.felixcloutier.com/x86/psllw:pslld:psllq
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_110_000 | // PSLL with immediate needs "/6" extension.
			regBits
		if rexPrefix != rexPrefixNone {
			code = append(code, 0x66, rexPrefix, 0x0f, 0x72, modRM, byte(n.srcConst))
		} else {
			code = append(code, 0x66, 0x0f, 0x72, modRM, byte(n.srcConst))
		}
	case PSLLQ:
		// https://www.felixcloutier.com/x86/psllw:pslld:psllq
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_110_000 | // PSLL with immediate needs "/6" extension.
			regBits
		if rexPrefix != rexPrefixNone {
			code = append(code, 0x66, rexPrefix, 0x0f, 0x73, modRM, byte(n.srcConst))
		} else {
			code = append(code, 0x66, 0x0f, 0x73, modRM, byte(n.srcConst))
		}
	case PSRLD:
		// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
		// https://www.felixcloutier.com/x86/psllw:pslld:psllq
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_010_000 | // PSRL with immediate needs "/2" extension.
			regBits
		if rexPrefix != rexPrefixNone {
			code = append(code, 0x66, rexPrefix, 0x0f, 0x72, modRM, byte(n.srcConst))
		} else {
			code = append(code, 0x66, 0x0f, 0x72, modRM, byte(n.srcConst))
		}
	case PSRLQ:
		// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_010_000 | // PSRL with immediate needs "/2" extension.
			regBits
		if rexPrefix != rexPrefixNone {
			code = append(code, 0x66, rexPrefix, 0x0f, 0x73, modRM, byte(n.srcConst))
		} else {
			code = append(code, 0x66, 0x0f, 0x73, modRM, byte(n.srcConst))
		}
	case PSRAW, PSRAD:
		// https://www.felixcloutier.com/x86/psraw:psrad:psraq
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_100_000 | // PSRAW with immediate needs "/4" extension.
			regBits
		code = append(code, 0x66)
		if rexPrefix != rexPrefixNone {
			code = append(code, rexPrefix)
		}

		var op byte
		if inst == PSRAD {
			op = 0x72
		} else { // PSRAW
			op = 0x71
		}

		code = append(code, 0x0f, op, modRM, byte(n.srcConst))
	case PSRLW:
		// https://www.felixcloutier.com/x86/psrlw:psrld:psrlq
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_010_000 | // PSRLW with immediate needs "/2" extension.
			regBits
		code = append(code, 0x66)
		if rexPrefix != rexPrefixNone {
			code = append(code, rexPrefix)
		}
		code = append(code, 0x0f, 0x71, modRM, byte(n.srcConst))
	case PSLLW:
		// https://www.felixcloutier.com/x86/psllw:pslld:psllq
		modRM := 0b11_000_000 | // Specifying that operand is register.
			0b00_110_000 | // PSLLW with immediate needs "/6" extension.
			regBits
		code = append(code, 0x66)
		if rexPrefix != rexPrefixNone {
			code = append(code, rexPrefix)
		}
		code = append(code, 0x0f, 0x71, modRM, byte(n.srcConst))
	case XORL, XORQ:
		// https://www.felixcloutier.com/x86/xor
		if inst == XORQ {
			rexPrefix |= rexPrefixW
		}
		if rexPrefix != rexPrefixNone {
			code = append(code, rexPrefix)
		}
		if n.dstReg == RegAX && !isSigned8bitConst {
			code = append(code, 0x35)
		} else {
			modRM := 0b11_000_000 | // Specifying that operand is register.
				0b00_110_000 | // XOR with immediate needs "/6" extension.
				regBits
			if isSigned8bitConst {
				code = append(code, 0x83, modRM)
			} else {
				code = append(code, 0x81, modRM)
			}
		}
		if fitInSigned8bit(n.srcConst) {
			code = append(code, byte(n.srcConst))
		} else {
			code = appendUint32(code, uint32(n.srcConst))
		}
	default:
		err = errorEncodingUnsupported(n)
	}

	buf.Truncate(base + len(code))
	return
}

// encodeMemoryToConst encodes n as an instruction with a memory operand and an
// immediate constant, appending the machine code to buf.
func (a *AssemblerImpl) encodeMemoryToConst(buf asm.Buffer, n *nodeImpl) (err error) {
	if !fitIn32bit(n.dstConst) {
		return fmt.Errorf("too large target const %d for %s", n.dstConst, InstructionName(n.instruction))
	}

	rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(false)
	if err != nil {
		return err
	}

	// Alias for readability.
c := n.dstConst

	var opcode, constWidth byte
	switch n.instruction {
	case CMPL:
		// https://www.felixcloutier.com/x86/cmp
		// 0x83 takes a sign-extended 8-bit immediate; 0x81 a 32-bit one.
		if fitInSigned8bit(c) {
			opcode = 0x83
			constWidth = 8
		} else {
			opcode = 0x81
			constWidth = 32
		}
		modRM |= 0b00_111_000
	default:
		return errorEncodingUnsupported(n)
	}

	base := buf.Len()
	code := buf.Append(20)[:0]

	if rexPrefix != rexPrefixNone {
		code = append(code, rexPrefix)
	}

	code = append(code, opcode, modRM)

	if sbiExist {
		code = append(code, sbi)
	}

	if displacementWidth != 0 {
		code = appendConst(code, n.srcConst, displacementWidth)
	}

	code = appendConst(code, c, constWidth)
	buf.Truncate(base + len(code))
	return
}

// encodeConstToMemory encodes n as an instruction storing an immediate
// constant into a memory location, appending the machine code to buf.
func (a *AssemblerImpl) encodeConstToMemory(buf asm.Buffer, n *nodeImpl) (err error) {
	rexPrefix, modRM, sbi, sbiExist, displacementWidth, err := n.getMemoryLocation(true)
	if err != nil {
		return err
	}

	// Alias for readability.
2509 inst := n.instruction 2510 c := n.srcConst 2511 2512 if inst == MOVB && !fitInSigned8bit(c) { 2513 return fmt.Errorf("too large load target const %d for MOVB", c) 2514 } else if !fitIn32bit(c) { 2515 return fmt.Errorf("too large load target const %d for %s", c, InstructionName(n.instruction)) 2516 } 2517 2518 var constWidth, opcode byte 2519 switch inst { 2520 case MOVB: 2521 opcode = 0xc6 2522 constWidth = 8 2523 case MOVL: 2524 opcode = 0xc7 2525 constWidth = 32 2526 case MOVQ: 2527 rexPrefix |= rexPrefixW 2528 opcode = 0xc7 2529 constWidth = 32 2530 default: 2531 return errorEncodingUnsupported(n) 2532 } 2533 2534 base := buf.Len() 2535 code := buf.Append(20)[:0] 2536 2537 if rexPrefix != rexPrefixNone { 2538 code = append(code, rexPrefix) 2539 } 2540 2541 code = append(code, opcode, modRM) 2542 2543 if sbiExist { 2544 code = append(code, sbi) 2545 } 2546 2547 if displacementWidth != 0 { 2548 code = appendConst(code, n.dstConst, displacementWidth) 2549 } 2550 2551 code = appendConst(code, c, constWidth) 2552 2553 buf.Truncate(base + len(code)) 2554 return 2555 } 2556 2557 func appendUint32(code []byte, v uint32) []byte { 2558 b := [4]byte{} 2559 binary.LittleEndian.PutUint32(b[:], uint32(v)) 2560 return append(code, b[:]...) 2561 } 2562 2563 func appendUint64(code []byte, v uint64) []byte { 2564 b := [8]byte{} 2565 binary.LittleEndian.PutUint64(b[:], uint64(v)) 2566 return append(code, b[:]...) 
2567 } 2568 2569 func appendConst(code []byte, v int64, length byte) []byte { 2570 switch length { 2571 case 8: 2572 return append(code, byte(v)) 2573 case 32: 2574 return appendUint32(code, uint32(v)) 2575 default: 2576 return appendUint64(code, uint64(v)) 2577 } 2578 } 2579 2580 func (n *nodeImpl) getMemoryLocation(dstMem bool) (p rexPrefix, modRM byte, sbi byte, sbiExist bool, displacementWidth byte, err error) { 2581 var baseReg, indexReg asm.Register 2582 var offset asm.ConstantValue 2583 var scale byte 2584 if dstMem { 2585 baseReg, offset, indexReg, scale = n.dstReg, n.dstConst, n.dstMemIndex, n.dstMemScale 2586 } else { 2587 baseReg, offset, indexReg, scale = n.srcReg, n.srcConst, n.srcMemIndex, n.srcMemScale 2588 } 2589 2590 if !fitIn32bit(offset) { 2591 err = errors.New("offset does not fit in 32-bit integer") 2592 return 2593 } 2594 2595 if baseReg == asm.NilRegister && indexReg != asm.NilRegister { 2596 // [(index*scale) + displacement] addressing is possible, but we haven't used it for now. 2597 err = errors.New("addressing without base register but with index is not implemented") 2598 } else if baseReg == asm.NilRegister { 2599 modRM = 0b00_000_100 // Indicate that the memory location is specified by SIB. 2600 sbi, sbiExist = byte(0b00_100_101), true 2601 displacementWidth = 32 2602 } else if indexReg == asm.NilRegister { 2603 modRM, p = register3bits(baseReg, registerSpecifierPositionModRMFieldRM) 2604 2605 // Create ModR/M byte so that this instruction takes [R/M + displacement] operand if displacement !=0 2606 // and otherwise [R/M]. 2607 withoutDisplacement := offset == 0 && 2608 // If the target register is R13 or BP, we have to keep [R/M + displacement] even if the value 2609 // is zero since it's not [R/M] operand is not defined for these two registers. 
2610 // https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing 2611 baseReg != RegR13 && baseReg != RegBP 2612 if withoutDisplacement { 2613 // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM 2614 modRM |= 0b00_000_000 // Specifying that operand is memory without displacement 2615 displacementWidth = 0 2616 } else if fitInSigned8bit(offset) { 2617 // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM 2618 modRM |= 0b01_000_000 // Specifying that operand is memory + 8bit displacement. 2619 displacementWidth = 8 2620 } else { 2621 // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM 2622 modRM |= 0b10_000_000 // Specifying that operand is memory + 32bit displacement. 2623 displacementWidth = 32 2624 } 2625 2626 // For SP and R12 register, we have [SIB + displacement] if the const is non-zero, otherwise [SIP]. 2627 // https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing 2628 // 2629 // Thefore we emit the SIB byte before the const so that [SIB + displacement] ends up [register + displacement]. 2630 // https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing_2 2631 if baseReg == RegSP || baseReg == RegR12 { 2632 sbi, sbiExist = byte(0b00_100_100), true 2633 } 2634 } else { 2635 if indexReg == RegSP { 2636 err = errors.New("SP cannot be used for SIB index") 2637 return 2638 } 2639 2640 modRM = 0b00_000_100 // Indicate that the memory location is specified by SIB. 2641 2642 withoutDisplacement := offset == 0 && 2643 // For R13 and BP, base registers cannot be encoded "without displacement" mod (i.e. 0b00 mod). 
2644 baseReg != RegR13 && baseReg != RegBP 2645 if withoutDisplacement { 2646 // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM 2647 modRM |= 0b00_000_000 // Specifying that operand is SIB without displacement 2648 displacementWidth = 0 2649 } else if fitInSigned8bit(offset) { 2650 // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM 2651 modRM |= 0b01_000_000 // Specifying that operand is SIB + 8bit displacement. 2652 displacementWidth = 8 2653 } else { 2654 // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM 2655 modRM |= 0b10_000_000 // Specifying that operand is SIB + 32bit displacement. 2656 displacementWidth = 32 2657 } 2658 2659 var baseRegBits byte 2660 baseRegBits, p = register3bits(baseReg, registerSpecifierPositionModRMFieldRM) 2661 2662 var indexRegBits byte 2663 var indexRegPrefix rexPrefix 2664 indexRegBits, indexRegPrefix = register3bits(indexReg, registerSpecifierPositionSIBIndex) 2665 p |= indexRegPrefix 2666 2667 sbi, sbiExist = baseRegBits|(indexRegBits<<3), true 2668 switch scale { 2669 case 1: 2670 sbi |= 0b00_000_000 2671 case 2: 2672 sbi |= 0b01_000_000 2673 case 4: 2674 sbi |= 0b10_000_000 2675 case 8: 2676 sbi |= 0b11_000_000 2677 default: 2678 err = fmt.Errorf("scale in SIB must be one of 1, 2, 4, 8 but got %d", scale) 2679 return 2680 } 2681 2682 } 2683 return 2684 } 2685 2686 // getRegisterToRegisterModRM does XXXX 2687 // 2688 // TODO: srcOnModRMReg can be deleted after golang-asm removal. This is necessary to match our implementation 2689 // with golang-asm, but in practice, there are equivalent opcodes to always have src on ModRM:reg without ambiguity. 2690 func (n *nodeImpl) getRegisterToRegisterModRM(srcOnModRMReg bool) (rexPrefix, modRM byte, err error) { 2691 var reg3bits, rm3bits byte 2692 if srcOnModRMReg { 2693 reg3bits, rexPrefix = register3bits(n.srcReg, 2694 // Indicate that srcReg will be specified by ModRM:reg. 
2695 registerSpecifierPositionModRMFieldReg) 2696 2697 var dstRexPrefix byte 2698 rm3bits, dstRexPrefix = register3bits(n.dstReg, 2699 // Indicate that dstReg will be specified by ModRM:r/m. 2700 registerSpecifierPositionModRMFieldRM) 2701 rexPrefix |= dstRexPrefix 2702 } else { 2703 rm3bits, rexPrefix = register3bits(n.srcReg, 2704 // Indicate that srcReg will be specified by ModRM:r/m. 2705 registerSpecifierPositionModRMFieldRM) 2706 2707 var dstRexPrefix byte 2708 reg3bits, dstRexPrefix = register3bits(n.dstReg, 2709 // Indicate that dstReg will be specified by ModRM:reg. 2710 registerSpecifierPositionModRMFieldReg) 2711 rexPrefix |= dstRexPrefix 2712 } 2713 2714 // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM 2715 modRM = 0b11_000_000 | // Specifying that dst operand is register. 2716 (reg3bits << 3) | 2717 rm3bits 2718 2719 return 2720 } 2721 2722 // RexPrefix represents REX prefix https://wiki.osdev.org/X86-64_Instruction_Encoding#REX_prefix 2723 type rexPrefix = byte 2724 2725 // REX prefixes are independent of each other and can be combined with OR. 2726 const ( 2727 rexPrefixNone rexPrefix = 0x0000_0000 // Indicates that the instruction doesn't need RexPrefix. 2728 rexPrefixDefault rexPrefix = 0b0100_0000 2729 rexPrefixW = 0b0000_1000 | rexPrefixDefault // REX.W 2730 rexPrefixR = 0b0000_0100 | rexPrefixDefault // REX.R 2731 rexPrefixX = 0b0000_0010 | rexPrefixDefault // REX.X 2732 rexPrefixB = 0b0000_0001 | rexPrefixDefault // REX.B 2733 ) 2734 2735 // registerSpecifierPosition represents the position in the instruction bytes where an operand register is placed. 
2736 type registerSpecifierPosition byte 2737 2738 const ( 2739 registerSpecifierPositionModRMFieldReg registerSpecifierPosition = iota 2740 registerSpecifierPositionModRMFieldRM 2741 registerSpecifierPositionSIBIndex 2742 ) 2743 2744 var regInfo = [...]struct { 2745 bits byte 2746 needRex bool 2747 }{ 2748 RegAX: {bits: 0b000}, 2749 RegCX: {bits: 0b001}, 2750 RegDX: {bits: 0b010}, 2751 RegBX: {bits: 0b011}, 2752 RegSP: {bits: 0b100}, 2753 RegBP: {bits: 0b101}, 2754 RegSI: {bits: 0b110}, 2755 RegDI: {bits: 0b111}, 2756 RegR8: {bits: 0b000, needRex: true}, 2757 RegR9: {bits: 0b001, needRex: true}, 2758 RegR10: {bits: 0b010, needRex: true}, 2759 RegR11: {bits: 0b011, needRex: true}, 2760 RegR12: {bits: 0b100, needRex: true}, 2761 RegR13: {bits: 0b101, needRex: true}, 2762 RegR14: {bits: 0b110, needRex: true}, 2763 RegR15: {bits: 0b111, needRex: true}, 2764 RegX0: {bits: 0b000}, 2765 RegX1: {bits: 0b001}, 2766 RegX2: {bits: 0b010}, 2767 RegX3: {bits: 0b011}, 2768 RegX4: {bits: 0b100}, 2769 RegX5: {bits: 0b101}, 2770 RegX6: {bits: 0b110}, 2771 RegX7: {bits: 0b111}, 2772 RegX8: {bits: 0b000, needRex: true}, 2773 RegX9: {bits: 0b001, needRex: true}, 2774 RegX10: {bits: 0b010, needRex: true}, 2775 RegX11: {bits: 0b011, needRex: true}, 2776 RegX12: {bits: 0b100, needRex: true}, 2777 RegX13: {bits: 0b101, needRex: true}, 2778 RegX14: {bits: 0b110, needRex: true}, 2779 RegX15: {bits: 0b111, needRex: true}, 2780 } 2781 2782 func register3bits( 2783 reg asm.Register, 2784 registerSpecifierPosition registerSpecifierPosition, 2785 ) (bits byte, prefix rexPrefix) { 2786 info := regInfo[reg] 2787 bits = info.bits 2788 if info.needRex { 2789 // https://wiki.osdev.org/X86-64_Instruction_Encoding#REX_prefix 2790 switch registerSpecifierPosition { 2791 case registerSpecifierPositionModRMFieldReg: 2792 prefix = rexPrefixR 2793 case registerSpecifierPositionModRMFieldRM: 2794 prefix = rexPrefixB 2795 case registerSpecifierPositionSIBIndex: 2796 prefix = rexPrefixX 2797 } 2798 } 2799 
return 2800 } 2801 2802 func fitIn32bit(v int64) bool { 2803 return math.MinInt32 <= v && v <= math.MaxUint32 2804 } 2805 2806 func fitInSigned8bit(v int64) bool { 2807 return math.MinInt8 <= v && v <= math.MaxInt8 2808 } 2809 2810 func isVectorRegister(r asm.Register) bool { 2811 return RegX0 <= r && r <= RegX15 2812 }