wa-lang.org/wazero@v1.0.2/internal/asm/amd64/impl.go (about) 1 package amd64 2 3 import ( 4 "bytes" 5 "encoding/binary" 6 "errors" 7 "fmt" 8 "math" 9 10 "wa-lang.org/wazero/internal/asm" 11 ) 12 13 // nodeImpl implements asm.Node for amd64. 14 type nodeImpl struct { 15 instruction asm.Instruction 16 17 offsetInBinaryField asm.NodeOffsetInBinary // Field suffix to dodge conflict with OffsetInBinary 18 19 // jumpTarget holds the target node in the linked for the jump-kind instruction. 20 jumpTarget *nodeImpl 21 flag nodeFlag 22 // next holds the next node from this node in the assembled linked list. 23 next *nodeImpl 24 25 types operandTypes 26 srcReg, dstReg asm.Register 27 srcConst, dstConst asm.ConstantValue 28 srcMemIndex, dstMemIndex asm.Register 29 srcMemScale, dstMemScale byte 30 31 arg byte 32 33 // readInstructionAddressBeforeTargetInstruction holds the instruction right before the target of 34 // read instruction address instruction. See asm.assemblerBase.CompileReadInstructionAddress. 35 readInstructionAddressBeforeTargetInstruction asm.Instruction 36 37 // jumpOrigins hold all the nodes trying to jump into this node. In other words, all the nodes with .jumpTarget == this. 38 jumpOrigins map[*nodeImpl]struct{} 39 40 staticConst *asm.StaticConst 41 } 42 43 type nodeFlag byte 44 45 const ( 46 // nodeFlagInitializedForEncoding is always set to indicate that node is already initialized. Notably, this is used to judge 47 // whether a jump is backward or forward before encoding. 48 nodeFlagInitializedForEncoding nodeFlag = 1 << iota 49 nodeFlagBackwardJump 50 // nodeFlagShortForwardJump is set to false by default and only used by forward branch jumps, which means .jumpTarget != nil and 51 // the target node is encoded after this node. False by default means that we Encode all the jumps with jumpTarget 52 // as short jump (i.e. relative signed 8-bit integer offset jump) and try to Encode as small as possible. 
53 nodeFlagShortForwardJump 54 ) 55 56 func (n *nodeImpl) isInitializedForEncoding() bool { 57 return n.flag&nodeFlagInitializedForEncoding != 0 58 } 59 60 func (n *nodeImpl) isJumpNode() bool { 61 return n.jumpTarget != nil 62 } 63 64 func (n *nodeImpl) isBackwardJump() bool { 65 return n.isJumpNode() && (n.flag&nodeFlagBackwardJump != 0) 66 } 67 68 func (n *nodeImpl) isForwardJump() bool { 69 return n.isJumpNode() && (n.flag&nodeFlagBackwardJump == 0) 70 } 71 72 func (n *nodeImpl) isForwardShortJump() bool { 73 return n.isForwardJump() && n.flag&nodeFlagShortForwardJump != 0 74 } 75 76 // AssignJumpTarget implements asm.Node.AssignJumpTarget. 77 func (n *nodeImpl) AssignJumpTarget(target asm.Node) { 78 n.jumpTarget = target.(*nodeImpl) 79 } 80 81 // AssignDestinationConstant implements asm.Node.AssignDestinationConstant. 82 func (n *nodeImpl) AssignDestinationConstant(value asm.ConstantValue) { 83 n.dstConst = value 84 } 85 86 // AssignSourceConstant implements asm.Node.AssignSourceConstant. 87 func (n *nodeImpl) AssignSourceConstant(value asm.ConstantValue) { 88 n.srcConst = value 89 } 90 91 // OffsetInBinary implements asm.Node.OffsetInBinary. 92 func (n *nodeImpl) OffsetInBinary() asm.NodeOffsetInBinary { 93 return n.offsetInBinaryField 94 } 95 96 // String implements fmt.Stringer. 97 // 98 // This is for debugging purpose, and the format is almost same as the AT&T assembly syntax, 99 // meaning that this should look like "INSTRUCTION ${from}, ${to}" where each operand 100 // might be embraced by '[]' to represent the memory location. 
101 func (n *nodeImpl) String() (ret string) { 102 instName := InstructionName(n.instruction) 103 switch n.types { 104 case operandTypesNoneToNone: 105 ret = instName 106 case operandTypesNoneToRegister: 107 ret = fmt.Sprintf("%s %s", instName, RegisterName(n.dstReg)) 108 case operandTypesNoneToMemory: 109 if n.dstMemIndex != asm.NilRegister { 110 ret = fmt.Sprintf("%s [%s + 0x%x + %s*0x%x]", instName, 111 RegisterName(n.dstReg), n.dstConst, RegisterName(n.dstMemIndex), n.dstMemScale) 112 } else { 113 ret = fmt.Sprintf("%s [%s + 0x%x]", instName, RegisterName(n.dstReg), n.dstConst) 114 } 115 case operandTypesNoneToBranch: 116 ret = fmt.Sprintf("%s {%v}", instName, n.jumpTarget) 117 case operandTypesRegisterToNone: 118 ret = fmt.Sprintf("%s %s", instName, RegisterName(n.srcReg)) 119 case operandTypesRegisterToRegister: 120 ret = fmt.Sprintf("%s %s, %s", instName, RegisterName(n.srcReg), RegisterName(n.dstReg)) 121 case operandTypesRegisterToMemory: 122 if n.dstMemIndex != asm.NilRegister { 123 ret = fmt.Sprintf("%s %s, [%s + 0x%x + %s*0x%x]", instName, RegisterName(n.srcReg), 124 RegisterName(n.dstReg), n.dstConst, RegisterName(n.dstMemIndex), n.dstMemScale) 125 } else { 126 ret = fmt.Sprintf("%s %s, [%s + 0x%x]", instName, RegisterName(n.srcReg), RegisterName(n.dstReg), n.dstConst) 127 } 128 case operandTypesRegisterToConst: 129 ret = fmt.Sprintf("%s %s, 0x%x", instName, RegisterName(n.srcReg), n.dstConst) 130 case operandTypesMemoryToRegister: 131 if n.srcMemIndex != asm.NilRegister { 132 ret = fmt.Sprintf("%s [%s + %#x + %s*%#x], %s", instName, 133 RegisterName(n.srcReg), n.srcConst, RegisterName(n.srcMemIndex), n.srcMemScale, RegisterName(n.dstReg)) 134 } else { 135 ret = fmt.Sprintf("%s [%s + 0x%x], %s", instName, RegisterName(n.srcReg), n.srcConst, RegisterName(n.dstReg)) 136 } 137 case operandTypesMemoryToConst: 138 if n.srcMemIndex != asm.NilRegister { 139 ret = fmt.Sprintf("%s [%s + %#x + %s*0x%x], 0x%x", instName, 140 RegisterName(n.srcReg), n.srcConst, 
RegisterName(n.srcMemIndex), n.srcMemScale, n.dstConst) 141 } else { 142 ret = fmt.Sprintf("%s [%s + %#x], 0x%x", instName, RegisterName(n.srcReg), n.srcConst, n.dstConst) 143 } 144 case operandTypesConstToMemory: 145 if n.dstMemIndex != asm.NilRegister { 146 ret = fmt.Sprintf("%s 0x%x, [%s + 0x%x + %s*0x%x]", instName, n.srcConst, 147 RegisterName(n.dstReg), n.dstConst, RegisterName(n.dstMemIndex), n.dstMemScale) 148 } else { 149 ret = fmt.Sprintf("%s 0x%x, [%s + 0x%x]", instName, n.srcConst, RegisterName(n.dstReg), n.dstConst) 150 } 151 case operandTypesConstToRegister: 152 ret = fmt.Sprintf("%s 0x%x, %s", instName, n.srcConst, RegisterName(n.dstReg)) 153 case operandTypesStaticConstToRegister: 154 ret = fmt.Sprintf("%s $%#x, %s", instName, n.staticConst.Raw, RegisterName(n.dstReg)) 155 case operandTypesRegisterToStaticConst: 156 ret = fmt.Sprintf("%s %s, $%#x", instName, RegisterName(n.srcReg), n.staticConst.Raw) 157 } 158 return 159 } 160 161 // operandType represents where an operand is placed for an instruction. 162 // Note: this is almost the same as obj.AddrType in GO assembler. 
type operandType byte

const (
	operandTypeNone operandType = iota
	operandTypeRegister
	operandTypeMemory
	operandTypeConst
	operandTypeStaticConst
	operandTypeBranch
)

// String implements fmt.Stringer.
//
// Known values map to a short lowercase name. Any other value is rendered as
// "operandType(N)" rather than the empty string, so that error messages which embed
// an unexpected operand type stay readable.
func (o operandType) String() (ret string) {
	switch o {
	case operandTypeNone:
		ret = "none"
	case operandTypeRegister:
		ret = "register"
	case operandTypeMemory:
		ret = "memory"
	case operandTypeConst:
		ret = "const"
	case operandTypeBranch:
		ret = "branch"
	case operandTypeStaticConst:
		ret = "static-const"
	default:
		ret = fmt.Sprintf("operandType(%d)", byte(o))
	}
	return
}

// operandTypes represents the only combinations of two operandTypes used by wazero:
// src is the placement of the source operand and dst the placement of the destination.
type operandTypes struct{ src, dst operandType }

// The operandTypes values below are named "<source>To<destination>".
var (
	operandTypesNoneToNone            = operandTypes{operandTypeNone, operandTypeNone}
	operandTypesNoneToRegister        = operandTypes{operandTypeNone, operandTypeRegister}
	operandTypesNoneToMemory          = operandTypes{operandTypeNone, operandTypeMemory}
	operandTypesNoneToBranch          = operandTypes{operandTypeNone, operandTypeBranch}
	operandTypesRegisterToNone        = operandTypes{operandTypeRegister, operandTypeNone}
	operandTypesRegisterToRegister    = operandTypes{operandTypeRegister, operandTypeRegister}
	operandTypesRegisterToMemory      = operandTypes{operandTypeRegister, operandTypeMemory}
	operandTypesRegisterToConst       = operandTypes{operandTypeRegister, operandTypeConst}
	operandTypesMemoryToRegister      = operandTypes{operandTypeMemory, operandTypeRegister}
	operandTypesMemoryToConst         = operandTypes{operandTypeMemory, operandTypeConst}
	operandTypesConstToRegister       = operandTypes{operandTypeConst, operandTypeRegister}
	operandTypesConstToMemory         = operandTypes{operandTypeConst, operandTypeMemory}
	operandTypesStaticConstToRegister = operandTypes{operandTypeStaticConst, operandTypeRegister}
	operandTypesRegisterToStaticConst = operandTypes{operandTypeRegister, operandTypeStaticConst}
)

// String implements fmt.Stringer
func (o operandTypes) String() string { 214 return fmt.Sprintf("from:%s,to:%s", o.src, o.dst) 215 } 216 217 // AssemblerImpl implements Assembler. 218 type AssemblerImpl struct { 219 asm.BaseAssemblerImpl 220 enablePadding bool 221 root, current *nodeImpl 222 nodeCount int 223 buf *bytes.Buffer 224 forceReAssemble bool 225 // MaxDisplacementForConstantPool is fixed to defaultMaxDisplacementForConstantPool 226 // but have it as an exported field here for testability. 227 MaxDisplacementForConstantPool int 228 229 pool *asm.StaticConstPool 230 } 231 232 func NewAssembler() *AssemblerImpl { 233 return &AssemblerImpl{ 234 buf: bytes.NewBuffer(nil), enablePadding: true, pool: asm.NewStaticConstPool(), 235 MaxDisplacementForConstantPool: defaultMaxDisplacementForConstantPool, 236 } 237 } 238 239 // newNode creates a new Node and appends it into the linked list. 240 func (a *AssemblerImpl) newNode(instruction asm.Instruction, types operandTypes) *nodeImpl { 241 n := &nodeImpl{ 242 instruction: instruction, 243 next: nil, 244 types: types, 245 jumpOrigins: map[*nodeImpl]struct{}{}, 246 } 247 a.addNode(n) 248 a.nodeCount++ 249 return n 250 } 251 252 // addNode appends the new node into the linked list. 253 func (a *AssemblerImpl) addNode(node *nodeImpl) { 254 if a.root == nil { 255 a.root = node 256 a.current = node 257 } else { 258 parent := a.current 259 parent.next = node 260 a.current = node 261 } 262 263 for _, o := range a.SetBranchTargetOnNextNodes { 264 origin := o.(*nodeImpl) 265 origin.jumpTarget = node 266 } 267 a.SetBranchTargetOnNextNodes = nil 268 } 269 270 // EncodeNode encodes the given node into writer. 
271 func (a *AssemblerImpl) EncodeNode(n *nodeImpl) (err error) { 272 switch n.types { 273 case operandTypesNoneToNone: 274 err = a.encodeNoneToNone(n) 275 case operandTypesNoneToRegister: 276 err = a.encodeNoneToRegister(n) 277 case operandTypesNoneToMemory: 278 err = a.encodeNoneToMemory(n) 279 case operandTypesNoneToBranch: 280 // Branching operand can be encoded as relative jumps. 281 err = a.encodeRelativeJump(n) 282 case operandTypesRegisterToNone: 283 err = a.encodeRegisterToNone(n) 284 case operandTypesRegisterToRegister: 285 err = a.encodeRegisterToRegister(n) 286 case operandTypesRegisterToMemory: 287 err = a.encodeRegisterToMemory(n) 288 case operandTypesRegisterToConst: 289 err = a.encodeRegisterToConst(n) 290 case operandTypesMemoryToRegister: 291 err = a.encodeMemoryToRegister(n) 292 case operandTypesConstToRegister: 293 err = a.encodeConstToRegister(n) 294 case operandTypesConstToMemory: 295 err = a.encodeConstToMemory(n) 296 case operandTypesMemoryToConst: 297 err = a.encodeMemoryToConst(n) 298 case operandTypesStaticConstToRegister: 299 err = a.encodeStaticConstToRegister(n) 300 case operandTypesRegisterToStaticConst: 301 err = a.encodeRegisterToStaticConst(n) 302 default: 303 err = fmt.Errorf("encoder undefined for [%s] operand type", n.types) 304 } 305 if err != nil { 306 err = fmt.Errorf("%w: %s", err, n) // Ensure the error is debuggable by including the string value of the node. 307 } 308 return 309 } 310 311 // Assemble implements asm.AssemblerBase 312 func (a *AssemblerImpl) Assemble() ([]byte, error) { 313 a.InitializeNodesForEncoding() 314 315 // Continue encoding until we are not forced to re-assemble which happens when 316 // a short relative jump ends up the offset larger than 8-bit length. 
317 for { 318 err := a.Encode() 319 if err != nil { 320 return nil, err 321 } 322 323 if !a.forceReAssemble { 324 break 325 } else { 326 // We reset the length of buffer but don't delete the underlying slice since 327 // the binary size will roughly the same after reassemble. 328 a.buf.Reset() 329 // Reset the re-assemble flag in order to avoid the infinite loop! 330 a.forceReAssemble = false 331 } 332 } 333 334 code := a.buf.Bytes() 335 for _, cb := range a.OnGenerateCallbacks { 336 if err := cb(code); err != nil { 337 return nil, err 338 } 339 } 340 return code, nil 341 } 342 343 // InitializeNodesForEncoding initializes nodeImpl.flag and determine all the jumps 344 // are forward or backward jump. 345 func (a *AssemblerImpl) InitializeNodesForEncoding() { 346 for n := a.root; n != nil; n = n.next { 347 n.flag |= nodeFlagInitializedForEncoding 348 if target := n.jumpTarget; target != nil { 349 if target.isInitializedForEncoding() { 350 // This means the target exists behind. 351 n.flag |= nodeFlagBackwardJump 352 } else { 353 // Otherwise, this is forward jump. 354 // We start with assuming that the jump can be short (8-bit displacement). 355 // If it doens't fit, we change this flag in resolveRelativeForwardJump. 356 n.flag |= nodeFlagShortForwardJump 357 } 358 } 359 } 360 361 // Roughly allocate the buffer by assuming an instruction has 5-bytes length on average. 362 a.buf.Grow(a.nodeCount * 5) 363 } 364 365 func (a *AssemblerImpl) Encode() (err error) { 366 for n := a.root; n != nil; n = n.next { 367 // If an instruction needs NOP padding, we do so before encoding it. 368 // https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf 369 if a.enablePadding { 370 if err = a.maybeNOPPadding(n); err != nil { 371 return 372 } 373 } 374 375 // After the padding, we can finalize the offset of this instruction in the binary. 
376 n.offsetInBinaryField = uint64(a.buf.Len()) 377 378 if err = a.EncodeNode(n); err != nil { 379 return 380 } 381 382 err = a.ResolveForwardRelativeJumps(n) 383 if err != nil { 384 err = fmt.Errorf("invalid relative forward jumps: %w", err) 385 break 386 } 387 388 a.maybeFlushConstants(n.next == nil) 389 } 390 return 391 } 392 393 // maybeNOPPadding maybe appends NOP instructions before the node `n`. 394 // This is necessary to avoid Intel's jump erratum: 395 // https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf 396 func (a *AssemblerImpl) maybeNOPPadding(n *nodeImpl) (err error) { 397 var instructionLen int32 398 399 // See in Section 2.1 in for when we have to pad NOP. 400 // https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf 401 switch n.instruction { 402 case RET, JMP, JCC, JCS, JEQ, JGE, JGT, JHI, JLE, JLS, JLT, JMI, JNE, JPC, JPS: 403 // In order to know the instruction length before writing into the binary, 404 // we try encoding it with the temporary buffer. 405 saved := a.buf 406 a.buf = bytes.NewBuffer(nil) 407 408 // Assign the temporary offset which may or may not be correct depending on the padding decision. 409 n.offsetInBinaryField = uint64(saved.Len()) 410 411 // Encode the node and get the instruction length. 412 if err = a.EncodeNode(n); err != nil { 413 return 414 } 415 instructionLen = int32(a.buf.Len()) 416 417 // Revert the temporary buffer. 418 a.buf = saved 419 case // The possible fused jump instructions if the next node is a conditional jump instruction. 
420 CMPL, CMPQ, TESTL, TESTQ, ADDL, ADDQ, SUBL, SUBQ, ANDL, ANDQ, INCQ, DECQ: 421 instructionLen, err = a.fusedInstructionLength(n) 422 if err != nil { 423 return err 424 } 425 } 426 427 if instructionLen == 0 { 428 return 429 } 430 431 const boundaryInBytes int32 = 32 432 const mask int32 = boundaryInBytes - 1 433 434 var padNum int 435 currentPos := int32(a.buf.Len()) 436 if used := currentPos & mask; used+instructionLen >= boundaryInBytes { 437 padNum = int(boundaryInBytes - used) 438 } 439 440 a.padNOP(padNum) 441 return 442 } 443 444 // fusedInstructionLength returns the length of "macro fused instruction" if the 445 // instruction sequence starting from `n` can be fused by processor. Otherwise, 446 // returns zero. 447 func (a *AssemblerImpl) fusedInstructionLength(n *nodeImpl) (ret int32, err error) { 448 // Find the next non-NOP instruction. 449 next := n.next 450 for ; next != nil && next.instruction == NOP; next = next.next { 451 } 452 453 if next == nil { 454 return 455 } 456 457 inst, jmpInst := n.instruction, next.instruction 458 459 if !(jmpInst == JCC || jmpInst == JCS || jmpInst == JEQ || jmpInst == JGE || jmpInst == JGT || 460 jmpInst == JHI || jmpInst == JLE || jmpInst == JLS || jmpInst == JLT || jmpInst == JMI || 461 jmpInst == JNE || jmpInst == JPC || jmpInst == JPS) { 462 // If the next instruction is not jump kind, the instruction will not be fused. 
463 return 464 } 465 466 // How to determine whether the instruction can be fused is described in 467 // Section 3.4.2.2 of "Intel Optimization Manual": 468 // https://www.intel.com/content/dam/doc/manual/64-ia-32-architectures-optimization-manual.pdf 469 isTest := inst == TESTL || inst == TESTQ 470 isCmp := inst == CMPQ || inst == CMPL 471 isTestCmp := isTest || isCmp 472 if isTestCmp && ((n.types.src == operandTypeMemory && n.types.dst == operandTypeConst) || 473 (n.types.src == operandTypeConst && n.types.dst == operandTypeMemory)) { 474 // The manual says: "CMP and TEST can not be fused when comparing MEM-IMM". 475 return 476 } 477 478 // Implement the decision according to the table 3-1 in the manual. 479 isAnd := inst == ANDL || inst == ANDQ 480 if !isTest && !isAnd { 481 if jmpInst == JMI || jmpInst == JPL || jmpInst == JPS || jmpInst == JPC { 482 // These jumps are only fused for TEST or AND. 483 return 484 } 485 isAdd := inst == ADDL || inst == ADDQ 486 isSub := inst == SUBL || inst == SUBQ 487 if !isCmp && !isAdd && !isSub { 488 if jmpInst == JCS || jmpInst == JCC || jmpInst == JHI || jmpInst == JLS { 489 // Thses jumpst are only fused for TEST, AND, CMP, ADD, or SUB. 490 return 491 } 492 } 493 } 494 495 // Now the instruction is ensured to be fused by the processor. 496 // In order to know the fused instruction length before writing into the binary, 497 // we try encoding it with the temporary buffer. 498 saved := a.buf 499 savedLen := uint64(saved.Len()) 500 a.buf = bytes.NewBuffer(nil) 501 502 for _, fused := range []*nodeImpl{n, next} { 503 // Assign the temporary offset which may or may not be correct depending on the padding decision. 504 fused.offsetInBinaryField = savedLen + uint64(a.buf.Len()) 505 506 // Encode the node into the temporary buffer. 507 if err = a.EncodeNode(fused); err != nil { 508 return 509 } 510 } 511 512 ret = int32(a.buf.Len()) 513 514 // Revert the temporary buffer. 
515 a.buf = saved 516 return 517 } 518 519 // nopOpcodes is the multi byte NOP instructions table derived from section 5.8 "Code Padding with Operand-Size Override and Multibyte NOP" 520 // in "AMD Software Optimization Guide for AMD Family 15h Processors" https://www.amd.com/system/files/TechDocs/47414_15h_sw_opt_guide.pdf 521 var nopOpcodes = [][11]byte{ 522 {0x90}, 523 {0x66, 0x90}, 524 {0x0f, 0x1f, 0x00}, 525 {0x0f, 0x1f, 0x40, 0x00}, 526 {0x0f, 0x1f, 0x44, 0x00, 0x00}, 527 {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00}, 528 {0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00}, 529 {0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, 530 {0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, 531 {0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, 532 {0x66, 0x66, 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, 533 } 534 535 func (a *AssemblerImpl) padNOP(num int) { 536 for num > 0 { 537 singleNopNum := num 538 if singleNopNum > len(nopOpcodes) { 539 singleNopNum = len(nopOpcodes) 540 } 541 a.buf.Write(nopOpcodes[singleNopNum-1][:singleNopNum]) 542 num -= singleNopNum 543 } 544 } 545 546 // CompileStandAlone implements the same method as documented on asm.AssemblerBase. 547 func (a *AssemblerImpl) CompileStandAlone(instruction asm.Instruction) asm.Node { 548 return a.newNode(instruction, operandTypesNoneToNone) 549 } 550 551 // CompileConstToRegister implements the same method as documented on asm.AssemblerBase. 552 func (a *AssemblerImpl) CompileConstToRegister( 553 instruction asm.Instruction, 554 value asm.ConstantValue, 555 destinationReg asm.Register, 556 ) (inst asm.Node) { 557 n := a.newNode(instruction, operandTypesConstToRegister) 558 n.srcConst = value 559 n.dstReg = destinationReg 560 return n 561 } 562 563 // CompileRegisterToRegister implements the same method as documented on asm.AssemblerBase. 
564 func (a *AssemblerImpl) CompileRegisterToRegister(instruction asm.Instruction, from, to asm.Register) { 565 n := a.newNode(instruction, operandTypesRegisterToRegister) 566 n.srcReg = from 567 n.dstReg = to 568 } 569 570 // CompileMemoryToRegister implements the same method as documented on asm.AssemblerBase. 571 func (a *AssemblerImpl) CompileMemoryToRegister( 572 instruction asm.Instruction, 573 sourceBaseReg asm.Register, 574 sourceOffsetConst asm.ConstantValue, 575 destinationReg asm.Register, 576 ) { 577 n := a.newNode(instruction, operandTypesMemoryToRegister) 578 n.srcReg = sourceBaseReg 579 n.srcConst = sourceOffsetConst 580 n.dstReg = destinationReg 581 } 582 583 // CompileRegisterToMemory implements the same method as documented on asm.AssemblerBase. 584 func (a *AssemblerImpl) CompileRegisterToMemory( 585 instruction asm.Instruction, 586 sourceRegister, destinationBaseRegister asm.Register, 587 destinationOffsetConst asm.ConstantValue, 588 ) { 589 n := a.newNode(instruction, operandTypesRegisterToMemory) 590 n.srcReg = sourceRegister 591 n.dstReg = destinationBaseRegister 592 n.dstConst = destinationOffsetConst 593 } 594 595 // CompileJump implements the same method as documented on asm.AssemblerBase. 596 func (a *AssemblerImpl) CompileJump(jmpInstruction asm.Instruction) asm.Node { 597 return a.newNode(jmpInstruction, operandTypesNoneToBranch) 598 } 599 600 // CompileJumpToMemory implements the same method as documented on asm.AssemblerBase. 601 func (a *AssemblerImpl) CompileJumpToMemory( 602 jmpInstruction asm.Instruction, 603 baseReg asm.Register, 604 offset asm.ConstantValue, 605 ) { 606 n := a.newNode(jmpInstruction, operandTypesNoneToMemory) 607 n.dstReg = baseReg 608 n.dstConst = offset 609 } 610 611 // CompileJumpToRegister implements the same method as documented on asm.AssemblerBase. 
612 func (a *AssemblerImpl) CompileJumpToRegister(jmpInstruction asm.Instruction, reg asm.Register) { 613 n := a.newNode(jmpInstruction, operandTypesNoneToRegister) 614 n.dstReg = reg 615 } 616 617 // CompileReadInstructionAddress implements the same method as documented on asm.AssemblerBase. 618 func (a *AssemblerImpl) CompileReadInstructionAddress( 619 destinationRegister asm.Register, 620 beforeAcquisitionTargetInstruction asm.Instruction, 621 ) { 622 n := a.newNode(LEAQ, operandTypesMemoryToRegister) 623 n.dstReg = destinationRegister 624 n.readInstructionAddressBeforeTargetInstruction = beforeAcquisitionTargetInstruction 625 } 626 627 // CompileRegisterToRegisterWithArg implements the same method as documented on amd64.Assembler. 628 func (a *AssemblerImpl) CompileRegisterToRegisterWithArg( 629 instruction asm.Instruction, 630 from, to asm.Register, 631 arg byte, 632 ) { 633 n := a.newNode(instruction, operandTypesRegisterToRegister) 634 n.srcReg = from 635 n.dstReg = to 636 n.arg = arg 637 } 638 639 // CompileMemoryWithIndexToRegister implements the same method as documented on amd64.Assembler. 640 func (a *AssemblerImpl) CompileMemoryWithIndexToRegister( 641 instruction asm.Instruction, 642 srcBaseReg asm.Register, 643 srcOffsetConst asm.ConstantValue, 644 srcIndex asm.Register, 645 srcScale int16, 646 dstReg asm.Register, 647 ) { 648 n := a.newNode(instruction, operandTypesMemoryToRegister) 649 n.srcReg = srcBaseReg 650 n.srcConst = srcOffsetConst 651 n.srcMemIndex = srcIndex 652 n.srcMemScale = byte(srcScale) 653 n.dstReg = dstReg 654 } 655 656 // CompileMemoryWithIndexAndArgToRegister implements the same method as documented on amd64.Assembler. 
657 func (a *AssemblerImpl) CompileMemoryWithIndexAndArgToRegister( 658 instruction asm.Instruction, 659 srcBaseReg asm.Register, 660 srcOffsetConst asm.ConstantValue, 661 srcIndex asm.Register, 662 srcScale int16, 663 dstReg asm.Register, 664 arg byte, 665 ) { 666 n := a.newNode(instruction, operandTypesMemoryToRegister) 667 n.srcReg = srcBaseReg 668 n.srcConst = srcOffsetConst 669 n.srcMemIndex = srcIndex 670 n.srcMemScale = byte(srcScale) 671 n.dstReg = dstReg 672 n.arg = arg 673 } 674 675 // CompileRegisterToMemoryWithIndex implements the same method as documented on amd64.Assembler. 676 func (a *AssemblerImpl) CompileRegisterToMemoryWithIndex( 677 instruction asm.Instruction, 678 srcReg, dstBaseReg asm.Register, 679 dstOffsetConst asm.ConstantValue, 680 dstIndex asm.Register, 681 dstScale int16, 682 ) { 683 n := a.newNode(instruction, operandTypesRegisterToMemory) 684 n.srcReg = srcReg 685 n.dstReg = dstBaseReg 686 n.dstConst = dstOffsetConst 687 n.dstMemIndex = dstIndex 688 n.dstMemScale = byte(dstScale) 689 } 690 691 // CompileRegisterToMemoryWithIndexAndArg implements the same method as documented on amd64.Assembler. 692 func (a *AssemblerImpl) CompileRegisterToMemoryWithIndexAndArg( 693 instruction asm.Instruction, 694 srcReg, dstBaseReg asm.Register, 695 dstOffsetConst asm.ConstantValue, 696 dstIndex asm.Register, 697 dstScale int16, 698 arg byte, 699 ) { 700 n := a.newNode(instruction, operandTypesRegisterToMemory) 701 n.srcReg = srcReg 702 n.dstReg = dstBaseReg 703 n.dstConst = dstOffsetConst 704 n.dstMemIndex = dstIndex 705 n.dstMemScale = byte(dstScale) 706 n.arg = arg 707 } 708 709 // CompileRegisterToConst implements the same method as documented on amd64.Assembler. 
710 func (a *AssemblerImpl) CompileRegisterToConst( 711 instruction asm.Instruction, 712 srcRegister asm.Register, 713 value asm.ConstantValue, 714 ) asm.Node { 715 n := a.newNode(instruction, operandTypesRegisterToConst) 716 n.srcReg = srcRegister 717 n.dstConst = value 718 return n 719 } 720 721 // CompileRegisterToNone implements the same method as documented on amd64.Assembler. 722 func (a *AssemblerImpl) CompileRegisterToNone(instruction asm.Instruction, register asm.Register) { 723 n := a.newNode(instruction, operandTypesRegisterToNone) 724 n.srcReg = register 725 } 726 727 // CompileNoneToRegister implements the same method as documented on amd64.Assembler. 728 func (a *AssemblerImpl) CompileNoneToRegister(instruction asm.Instruction, register asm.Register) { 729 n := a.newNode(instruction, operandTypesNoneToRegister) 730 n.dstReg = register 731 } 732 733 // CompileNoneToMemory implements the same method as documented on amd64.Assembler. 734 func (a *AssemblerImpl) CompileNoneToMemory( 735 instruction asm.Instruction, 736 baseReg asm.Register, 737 offset asm.ConstantValue, 738 ) { 739 n := a.newNode(instruction, operandTypesNoneToMemory) 740 n.dstReg = baseReg 741 n.dstConst = offset 742 } 743 744 // CompileConstToMemory implements the same method as documented on amd64.Assembler. 745 func (a *AssemblerImpl) CompileConstToMemory( 746 instruction asm.Instruction, 747 value asm.ConstantValue, 748 dstbaseReg asm.Register, 749 dstOffset asm.ConstantValue, 750 ) asm.Node { 751 n := a.newNode(instruction, operandTypesConstToMemory) 752 n.srcConst = value 753 n.dstReg = dstbaseReg 754 n.dstConst = dstOffset 755 return n 756 } 757 758 // CompileMemoryToConst implements the same method as documented on amd64.Assembler. 
759 func (a *AssemblerImpl) CompileMemoryToConst( 760 instruction asm.Instruction, 761 srcBaseReg asm.Register, 762 srcOffset, value asm.ConstantValue, 763 ) asm.Node { 764 n := a.newNode(instruction, operandTypesMemoryToConst) 765 n.srcReg = srcBaseReg 766 n.srcConst = srcOffset 767 n.dstConst = value 768 return n 769 } 770 771 func errorEncodingUnsupported(n *nodeImpl) error { 772 return fmt.Errorf("%s is unsupported for %s type", InstructionName(n.instruction), n.types) 773 } 774 775 func (a *AssemblerImpl) encodeNoneToNone(n *nodeImpl) (err error) { 776 switch n.instruction { 777 case CDQ: 778 // https://www.felixcloutier.com/x86/cwd:cdq:cqo 779 err = a.buf.WriteByte(0x99) 780 case CQO: 781 // https://www.felixcloutier.com/x86/cwd:cdq:cqo 782 _, err = a.buf.Write([]byte{RexPrefixW, 0x99}) 783 case NOP: 784 // Simply optimize out the NOP instructions. 785 case RET: 786 // https://www.felixcloutier.com/x86/ret 787 err = a.buf.WriteByte(0xc3) 788 case UD2: 789 // https://mudongliang.github.io/x86/html/file_module_x86_id_318.html 790 _, err = a.buf.Write([]byte{0x0f, 0x0b}) 791 case REPMOVSQ: 792 _, err = a.buf.Write([]byte{0xf3, RexPrefixW, 0xa5}) 793 case REPSTOSQ: 794 _, err = a.buf.Write([]byte{0xf3, RexPrefixW, 0xab}) 795 case STD: 796 _, err = a.buf.Write([]byte{0xfd}) 797 case CLD: 798 _, err = a.buf.Write([]byte{0xfc}) 799 default: 800 err = errorEncodingUnsupported(n) 801 } 802 return 803 } 804 805 func (a *AssemblerImpl) encodeNoneToRegister(n *nodeImpl) (err error) { 806 regBits, prefix, err := register3bits(n.dstReg, registerSpecifierPositionModRMFieldRM) 807 if err != nil { 808 return err 809 } 810 811 // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM 812 modRM := 0b11_000_000 | // Specifying that opeand is register. 813 regBits 814 if n.instruction == JMP { 815 // JMP's opcode is defined as "FF /4" meaning that we have to have "4" 816 // in 4-6th bits in the ModRM byte. 
https://www.felixcloutier.com/x86/jmp 817 modRM |= 0b00_100_000 818 } else if n.instruction == NEGQ { 819 prefix |= RexPrefixW 820 modRM |= 0b00_011_000 821 } else if n.instruction == INCQ { 822 prefix |= RexPrefixW 823 } else if n.instruction == DECQ { 824 prefix |= RexPrefixW 825 modRM |= 0b00_001_000 826 } else { 827 if RegSP <= n.dstReg && n.dstReg <= RegDI { 828 // If the destination is one byte length register, we need to have the default prefix. 829 // https: //wiki.osdev.org/X86-64_Instruction_Encoding#Registers 830 prefix |= RexPrefixDefault 831 } 832 } 833 834 if prefix != RexPrefixNone { 835 // https://wiki.osdev.org/X86-64_Instruction_Encoding#Encoding 836 if err = a.buf.WriteByte(prefix); err != nil { 837 return 838 } 839 } 840 841 switch n.instruction { 842 case JMP: 843 // https://www.felixcloutier.com/x86/jmp 844 _, err = a.buf.Write([]byte{0xff, modRM}) 845 case SETCC: 846 // https://www.felixcloutier.com/x86/setcc 847 _, err = a.buf.Write([]byte{0x0f, 0x93, modRM}) 848 case SETCS: 849 // https://www.felixcloutier.com/x86/setcc 850 _, err = a.buf.Write([]byte{0x0f, 0x92, modRM}) 851 case SETEQ: 852 // https://www.felixcloutier.com/x86/setcc 853 _, err = a.buf.Write([]byte{0x0f, 0x94, modRM}) 854 case SETGE: 855 // https://www.felixcloutier.com/x86/setcc 856 _, err = a.buf.Write([]byte{0x0f, 0x9d, modRM}) 857 case SETGT: 858 // https://www.felixcloutier.com/x86/setcc 859 _, err = a.buf.Write([]byte{0x0f, 0x9f, modRM}) 860 case SETHI: 861 // https://www.felixcloutier.com/x86/setcc 862 _, err = a.buf.Write([]byte{0x0f, 0x97, modRM}) 863 case SETLE: 864 // https://www.felixcloutier.com/x86/setcc 865 _, err = a.buf.Write([]byte{0x0f, 0x9e, modRM}) 866 case SETLS: 867 // https://www.felixcloutier.com/x86/setcc 868 _, err = a.buf.Write([]byte{0x0f, 0x96, modRM}) 869 case SETLT: 870 // https://www.felixcloutier.com/x86/setcc 871 _, err = a.buf.Write([]byte{0x0f, 0x9c, modRM}) 872 case SETNE: 873 // https://www.felixcloutier.com/x86/setcc 874 _, err = 
a.buf.Write([]byte{0x0f, 0x95, modRM})
	case SETPC:
		// https://www.felixcloutier.com/x86/setcc
		_, err = a.buf.Write([]byte{0x0f, 0x9b, modRM})
	case SETPS:
		// https://www.felixcloutier.com/x86/setcc
		_, err = a.buf.Write([]byte{0x0f, 0x9a, modRM})
	case NEGQ:
		// https://www.felixcloutier.com/x86/neg
		_, err = a.buf.Write([]byte{0xf7, modRM})
	case INCQ:
		// https://www.felixcloutier.com/x86/inc
		_, err = a.buf.Write([]byte{0xff, modRM})
	case DECQ:
		// https://www.felixcloutier.com/x86/dec
		_, err = a.buf.Write([]byte{0xff, modRM})
	default:
		err = errorEncodingUnsupported(n)
	}
	return
}

// encodeNoneToMemory encodes an instruction whose only operand is a memory location
// (INCQ, DECQ and the indirect JMP) and appends the resulting bytes to a.buf.
//
// The memory operand is described by n.GetMemoryLocation, which yields the REX prefix bits,
// the ModRM byte, an optional SIB byte and the width of the displacement taken from n.dstConst.
// An error is returned when the memory location cannot be resolved or when n.instruction
// is not one of the supported instructions; in the latter case nothing is written to a.buf.
func (a *AssemblerImpl) encodeNoneToMemory(n *nodeImpl) (err error) {
	prefix, modRM, sbi, displacementWidth, err := n.GetMemoryLocation()
	if err != nil {
		return err
	}

	var opcode byte
	switch n.instruction {
	case INCQ:
		// https://www.felixcloutier.com/x86/inc
		prefix |= RexPrefixW
		opcode = 0xff
	case DECQ:
		// https://www.felixcloutier.com/x86/dec
		prefix |= RexPrefixW
		modRM |= 0b00_001_000 // DEC needs "/1" extension in ModRM.
		opcode = 0xff
	case JMP:
		// https://www.felixcloutier.com/x86/jmp
		modRM |= 0b00_100_000 // JMP needs "/4" extension in ModRM.
		opcode = 0xff
	default:
		return errorEncodingUnsupported(n)
	}

	// The REX prefix is only emitted when at least one of its bits is required.
	if prefix != RexPrefixNone {
		a.buf.WriteByte(prefix)
	}

	a.buf.Write([]byte{opcode, modRM})

	// The SIB byte, when present, immediately follows ModRM.
	if sbi != nil {
		a.buf.WriteByte(*sbi)
	}

	// The displacement comes last and is taken from the destination constant.
	if displacementWidth != 0 {
		a.WriteConst(n.dstConst, displacementWidth)
	}
	return
}

// relativeJumpOpcode holds the opcode bytes of a relative jump instruction for both of its encodings:
// short (followed by a signed 8-bit offset) and long (followed by a signed 32-bit offset).
type relativeJumpOpcode struct{ short, long []byte }

// instructionLen returns the total encoded length in bytes (opcode plus offset operand)
// of the short form when short is true, and of the long form otherwise.
func (o relativeJumpOpcode) instructionLen(short bool) int64 {
	if short {
		return int64(len(o.short)) + 1 // 1 byte = 8 bit offset
	}
	return int64(len(o.long)) + 4 // 4 byte = 32 bit offset
}

// relativeJumpOpcodes maps each relative jump instruction to the opcodes of its short and long encodings.
var relativeJumpOpcodes = map[asm.Instruction]relativeJumpOpcode{
	// https://www.felixcloutier.com/x86/jcc
	JCC: {short: []byte{0x73}, long: []byte{0x0f, 0x83}},
	JCS: {short: []byte{0x72}, long: []byte{0x0f, 0x82}},
	JEQ: {short: []byte{0x74}, long: []byte{0x0f, 0x84}},
	JGE: {short: []byte{0x7d}, long: []byte{0x0f, 0x8d}},
	JGT: {short: []byte{0x7f}, long: []byte{0x0f, 0x8f}},
	JHI: {short: []byte{0x77}, long: []byte{0x0f, 0x87}},
	JLE: {short: []byte{0x7e}, long: []byte{0x0f, 0x8e}},
	JLS: {short: []byte{0x76}, long: []byte{0x0f, 0x86}},
	JLT: {short: []byte{0x7c}, long: []byte{0x0f, 0x8c}},
	JMI: {short: []byte{0x78}, long: []byte{0x0f, 0x88}},
	JPL: {short: []byte{0x79}, long: []byte{0x0f, 0x89}},
	JNE: {short: []byte{0x75}, long: []byte{0x0f, 0x85}},
	JPC: {short: []byte{0x7b}, long: []byte{0x0f, 0x8b}},
	JPS: {short: []byte{0x7a}, long: []byte{0x0f, 0x8a}},
	// https://www.felixcloutier.com/x86/jmp
	JMP: {short: []byte{0xeb}, long: []byte{0xe9}},
}

// ResolveForwardRelativeJumps patches the offset operand of every jump registered in
// target.jumpOrigins, using target's offset in the binary.
//
// For an origin encoded as a short jump whose offset does not fit in a signed 8-bit integer,
// nothing is patched; instead a.forceReAssemble is set and the origin's short-jump flag is flipped
// so that it is encoded as a long jump on the next assemble pass.
// An error is returned when a long jump offset does not fit in a signed 32-bit integer.
func (a *AssemblerImpl) ResolveForwardRelativeJumps(target *nodeImpl) (err error) {
	offsetInBinary := int64(target.OffsetInBinary())
	for origin := range target.jumpOrigins {
		shortJump := origin.isForwardShortJump()
		op := relativeJumpOpcodes[origin.instruction]
		instructionLen := op.instructionLen(shortJump)

		// Calculate the offset from the EIP (at the time of executing this jump instruction)
		// to the target instruction. This value is always >= 0 as here we only handle forward jumps.
		offset := offsetInBinary - (int64(origin.OffsetInBinary()) + instructionLen)
		if shortJump {
			if offset > math.MaxInt8 {
				// This forces reassemble in the outer loop inside AssemblerImpl.Assemble().
				a.forceReAssemble = true
				// From the next reAssemble phases, this forward jump will be encoded long jump and
				// allocate 32-bit offset bytes by default. This means that this `origin` node
				// will always enter the "long jump offset encoding" block below
				origin.flag ^= nodeFlagShortForwardJump
			} else {
				// The 8-bit offset occupies the last byte of the already-encoded jump instruction.
				a.buf.Bytes()[origin.OffsetInBinary()+uint64(instructionLen)-1] = byte(offset)
			}
		} else { // long jump offset encoding.
			if offset > math.MaxInt32 {
				return fmt.Errorf("too large jump offset %d for encoding %s", offset, InstructionName(origin.instruction))
			}
			// The 32-bit offset occupies the last four bytes of the already-encoded jump instruction.
			binary.LittleEndian.PutUint32(a.buf.Bytes()[origin.OffsetInBinary()+uint64(instructionLen)-4:], uint32(offset))
		}
	}
	return nil
}

// encodeRelativeJump encodes the relative jump n (one of the keys of relativeJumpOpcodes) into a.buf.
//
// Backward jumps are encoded with their final offset, using the short form when the offset fits
// in a signed 8-bit integer. Forward jumps are registered in the target's jumpOrigins and encoded
// with a zero placeholder offset, which is patched later by ResolveForwardRelativeJumps.
// An error is returned when n has no jump target, when the instruction is not a supported
// relative jump, or when a backward offset does not fit in a signed 32-bit integer.
func (a *AssemblerImpl) encodeRelativeJump(n *nodeImpl) (err error) {
	if n.jumpTarget == nil {
		err = fmt.Errorf("jump target must not be nil for relative %s", InstructionName(n.instruction))
		return
	}

	op, ok := relativeJumpOpcodes[n.instruction]
	if !ok {
		return errorEncodingUnsupported(n)
	}

	var isShortJump bool
	// offsetOfEIP means the offset of EIP register at the time of executing this jump instruction.
	// Relative jump instructions can be encoded with the signed 8-bit or 32-bit integer offsets from the EIP.
	var offsetOfEIP int64 = 0 // We set zero and resolve later once the target instruction is encoded for forward jumps
	if n.isBackwardJump() {
		// If this is the backward jump, we can calculate the exact offset now.
		offsetOfJumpInstruction := int64(n.jumpTarget.OffsetInBinary()) - int64(n.OffsetInBinary())
		// Every short form in relativeJumpOpcodes is two bytes long (one opcode byte plus the 8-bit offset),
		// hence the "-2" when checking whether the EIP-relative offset fits in a signed 8-bit integer.
		isShortJump = offsetOfJumpInstruction-2 >= math.MinInt8
		offsetOfEIP = offsetOfJumpInstruction - op.instructionLen(isShortJump)
	} else {
		// For forward jumps, we resolve the offset when we Encode the target node. See AssemblerImpl.ResolveForwardRelativeJumps.
		n.jumpTarget.jumpOrigins[n] = struct{}{}
		isShortJump = n.isForwardShortJump()
	}

	if offsetOfEIP < math.MinInt32 { // offsetOfEIP is always <= 0 as we don't calculate it for forward jump here.
		return fmt.Errorf("too large jump offset %d for encoding %s", offsetOfEIP, InstructionName(n.instruction))
	}

	if isShortJump {
		a.buf.Write(op.short)
		a.WriteConst(offsetOfEIP, 8)
	} else {
		a.buf.Write(op.long)
		a.WriteConst(offsetOfEIP, 32)
	}
	return
}

// encodeRegisterToNone encodes an instruction whose only operand is the source register n.srcReg
// (DIVL, DIVQ, IDIVL, IDIVQ, MULL and MULQ) and appends the resulting bytes to a.buf.
//
// An error is returned when the register cannot be encoded or when n.instruction is not supported.
// In both cases nothing is written to a.buf.
func (a *AssemblerImpl) encodeRegisterToNone(n *nodeImpl) (err error) {
	regBits, prefix, err := register3bits(n.srcReg, registerSpecifierPositionModRMFieldRM)
	if err != nil {
		return err
	}

	// https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM
	modRM := 0b11_000_000 | // Specifying that operand is register.
		regBits

	var opcode byte
	switch n.instruction {
	case DIVL:
		// https://www.felixcloutier.com/x86/div
		modRM |= 0b00_110_000
		opcode = 0xf7
	case DIVQ:
		// https://www.felixcloutier.com/x86/div
		prefix |= RexPrefixW
		modRM |= 0b00_110_000
		opcode = 0xf7
	case IDIVL:
		// https://www.felixcloutier.com/x86/idiv
		modRM |= 0b00_111_000
		opcode = 0xf7
	case IDIVQ:
		// https://www.felixcloutier.com/x86/idiv
		prefix |= RexPrefixW
		modRM |= 0b00_111_000
		opcode = 0xf7
	case MULL:
		// https://www.felixcloutier.com/x86/mul
		modRM |= 0b00_100_000
		opcode = 0xf7
	case MULQ:
		// https://www.felixcloutier.com/x86/mul
		prefix |= RexPrefixW
		modRM |= 0b00_100_000
		opcode = 0xf7
	default:
		// Bail out before touching the buffer so that an unsupported instruction
		// does not leave a bogus zero opcode in the output.
		return errorEncodingUnsupported(n)
	}

	// The REX prefix is only emitted when at least one of its bits is required.
	if prefix != RexPrefixNone {
		a.buf.WriteByte(prefix)
	}

	a.buf.Write([]byte{opcode, modRM})
	return nil
}

var registerToRegisterOpcode = map[asm.Instruction]struct {
	opcode                           []byte
	rPrefix                          RexPrefix
	mandatoryPrefix                  byte
	srcOnModRMReg                    bool
	isSrc8bit                        bool
	needArg                          bool
	requireSrcFloat, requireDstFloat bool
}{
	// https://www.felixcloutier.com/x86/add
	ADDL: {opcode: []byte{0x1}, srcOnModRMReg: true},
	ADDQ: {opcode: []byte{0x1}, rPrefix: RexPrefixW, srcOnModRMReg: true},
	// https://www.felixcloutier.com/x86/and
	ANDL: {opcode: []byte{0x21}, srcOnModRMReg: true},
	ANDQ: {opcode: []byte{0x21}, rPrefix: RexPrefixW, srcOnModRMReg: true},
	// https://www.felixcloutier.com/x86/cmp
	CMPL: {opcode: []byte{0x39}},
	CMPQ: {opcode: []byte{0x39}, rPrefix: RexPrefixW},
	// https://www.felixcloutier.com/x86/cmovcc
	CMOVQCS: {opcode: []byte{0x0f, 0x42}, rPrefix: RexPrefixW},
	// https://www.felixcloutier.com/x86/addsd
	ADDSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x58}, requireSrcFloat: true,
requireDstFloat: true}, 1111 // https://www.felixcloutier.com/x86/addss 1112 ADDSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x58}, requireSrcFloat: true, requireDstFloat: true}, 1113 // https://www.felixcloutier.com/x86/addpd 1114 ANDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x54}, requireSrcFloat: true, requireDstFloat: true}, 1115 // https://www.felixcloutier.com/x86/addps 1116 ANDPS: {opcode: []byte{0x0f, 0x54}, requireSrcFloat: true, requireDstFloat: true}, 1117 // https://www.felixcloutier.com/x86/bsr 1118 BSRL: {opcode: []byte{0xf, 0xbd}}, 1119 BSRQ: {opcode: []byte{0xf, 0xbd}, rPrefix: RexPrefixW}, 1120 // https://www.felixcloutier.com/x86/comisd 1121 COMISD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x2f}, requireSrcFloat: true, requireDstFloat: true}, 1122 // https://www.felixcloutier.com/x86/comiss 1123 COMISS: {opcode: []byte{0x0f, 0x2f}, requireSrcFloat: true, requireDstFloat: true}, 1124 // https://www.felixcloutier.com/x86/cvtsd2ss 1125 CVTSD2SS: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5a}, requireSrcFloat: true, requireDstFloat: true}, 1126 // https://www.felixcloutier.com/x86/cvtsi2sd 1127 CVTSL2SD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2a}, requireDstFloat: true}, 1128 // https://www.felixcloutier.com/x86/cvtsi2sd 1129 CVTSQ2SD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2a}, rPrefix: RexPrefixW, requireDstFloat: true}, 1130 // https://www.felixcloutier.com/x86/cvtsi2ss 1131 CVTSL2SS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2a}, requireDstFloat: true}, 1132 // https://www.felixcloutier.com/x86/cvtsi2ss 1133 CVTSQ2SS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2a}, rPrefix: RexPrefixW, requireDstFloat: true}, 1134 // https://www.felixcloutier.com/x86/cvtss2sd 1135 CVTSS2SD: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5a}, requireSrcFloat: true, requireDstFloat: true}, 1136 // https://www.felixcloutier.com/x86/cvttsd2si 1137 CVTTSD2SL: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2c}, 
requireSrcFloat: true}, 1138 CVTTSD2SQ: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x2c}, rPrefix: RexPrefixW, requireSrcFloat: true}, 1139 // https://www.felixcloutier.com/x86/cvttss2si 1140 CVTTSS2SL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2c}, requireSrcFloat: true}, 1141 CVTTSS2SQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x2c}, rPrefix: RexPrefixW, requireSrcFloat: true}, 1142 // https://www.felixcloutier.com/x86/divsd 1143 DIVSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5e}, requireSrcFloat: true, requireDstFloat: true}, 1144 // https://www.felixcloutier.com/x86/divss 1145 DIVSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5e}, requireSrcFloat: true, requireDstFloat: true}, 1146 // https://www.felixcloutier.com/x86/lzcnt 1147 LZCNTL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbd}}, 1148 LZCNTQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbd}, rPrefix: RexPrefixW}, 1149 // https://www.felixcloutier.com/x86/maxsd 1150 MAXSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5f}, requireSrcFloat: true, requireDstFloat: true}, 1151 // https://www.felixcloutier.com/x86/maxss 1152 MAXSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5f}, requireSrcFloat: true, requireDstFloat: true}, 1153 // https://www.felixcloutier.com/x86/minsd 1154 MINSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5d}, requireSrcFloat: true, requireDstFloat: true}, 1155 // https://www.felixcloutier.com/x86/minss 1156 MINSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5d}, requireSrcFloat: true, requireDstFloat: true}, 1157 // https://www.felixcloutier.com/x86/movsx:movsxd 1158 MOVBLSX: {opcode: []byte{0x0f, 0xbe}, isSrc8bit: true}, 1159 // https://www.felixcloutier.com/x86/movzx 1160 MOVBLZX: {opcode: []byte{0x0f, 0xb6}, isSrc8bit: true}, 1161 // https://www.felixcloutier.com/x86/movzx 1162 MOVWLZX: {opcode: []byte{0x0f, 0xb7}, isSrc8bit: true}, 1163 // https://www.felixcloutier.com/x86/movsx:movsxd 1164 MOVBQSX: {opcode: []byte{0x0f, 0xbe}, rPrefix: 
RexPrefixW, isSrc8bit: true}, 1165 // https://www.felixcloutier.com/x86/movsx:movsxd 1166 MOVLQSX: {opcode: []byte{0x63}, rPrefix: RexPrefixW}, 1167 // https://www.felixcloutier.com/x86/movsx:movsxd 1168 MOVWQSX: {opcode: []byte{0x0f, 0xbf}, rPrefix: RexPrefixW}, 1169 // https://www.felixcloutier.com/x86/movsx:movsxd 1170 MOVWLSX: {opcode: []byte{0x0f, 0xbf}}, 1171 // https://www.felixcloutier.com/x86/imul 1172 IMULQ: {opcode: []byte{0x0f, 0xaf}, rPrefix: RexPrefixW}, 1173 // https://www.felixcloutier.com/x86/mulss 1174 MULSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x59}, requireSrcFloat: true, requireDstFloat: true}, 1175 // https://www.felixcloutier.com/x86/mulsd 1176 MULSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x59}, requireSrcFloat: true, requireDstFloat: true}, 1177 // https://www.felixcloutier.com/x86/or 1178 ORL: {opcode: []byte{0x09}, srcOnModRMReg: true}, 1179 ORQ: {opcode: []byte{0x09}, rPrefix: RexPrefixW, srcOnModRMReg: true}, 1180 // https://www.felixcloutier.com/x86/orpd 1181 ORPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x56}, requireSrcFloat: true, requireDstFloat: true}, 1182 // https://www.felixcloutier.com/x86/orps 1183 ORPS: {opcode: []byte{0x0f, 0x56}, requireSrcFloat: true, requireDstFloat: true}, 1184 // https://www.felixcloutier.com/x86/popcnt 1185 POPCNTL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xb8}}, 1186 POPCNTQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xb8}, rPrefix: RexPrefixW}, 1187 // https://www.felixcloutier.com/x86/roundss 1188 ROUNDSS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0a}, needArg: true, requireSrcFloat: true, requireDstFloat: true}, 1189 // https://www.felixcloutier.com/x86/roundsd 1190 ROUNDSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0b}, needArg: true, requireSrcFloat: true, requireDstFloat: true}, 1191 // https://www.felixcloutier.com/x86/sqrtss 1192 SQRTSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x51}, requireSrcFloat: true, requireDstFloat: true}, 1193 // 
https://www.felixcloutier.com/x86/sqrtsd 1194 SQRTSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x51}, requireSrcFloat: true, requireDstFloat: true}, 1195 // https://www.felixcloutier.com/x86/sub 1196 SUBL: {opcode: []byte{0x29}, srcOnModRMReg: true}, 1197 SUBQ: {opcode: []byte{0x29}, rPrefix: RexPrefixW, srcOnModRMReg: true}, 1198 // https://www.felixcloutier.com/x86/subss 1199 SUBSS: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5c}, requireSrcFloat: true, requireDstFloat: true}, 1200 // https://www.felixcloutier.com/x86/subsd 1201 SUBSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x5c}, requireSrcFloat: true, requireDstFloat: true}, 1202 // https://www.felixcloutier.com/x86/test 1203 TESTL: {opcode: []byte{0x85}, srcOnModRMReg: true}, 1204 TESTQ: {opcode: []byte{0x85}, rPrefix: RexPrefixW, srcOnModRMReg: true}, 1205 // https://www.felixcloutier.com/x86/tzcnt 1206 TZCNTL: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbc}}, 1207 TZCNTQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xbc}, rPrefix: RexPrefixW}, 1208 // https://www.felixcloutier.com/x86/ucomisd 1209 UCOMISD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x2e}, requireSrcFloat: true, requireDstFloat: true}, 1210 // https://www.felixcloutier.com/x86/ucomiss 1211 UCOMISS: {opcode: []byte{0x0f, 0x2e}, requireSrcFloat: true, requireDstFloat: true}, 1212 // https://www.felixcloutier.com/x86/xchg 1213 XCHGQ: {opcode: []byte{0x87}, rPrefix: RexPrefixW, srcOnModRMReg: true}, 1214 // https://www.felixcloutier.com/x86/xor 1215 XORL: {opcode: []byte{0x31}, srcOnModRMReg: true}, 1216 XORQ: {opcode: []byte{0x31}, rPrefix: RexPrefixW, srcOnModRMReg: true}, 1217 // https://www.felixcloutier.com/x86/xorpd 1218 XORPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x57}, requireSrcFloat: true, requireDstFloat: true}, 1219 XORPS: {opcode: []byte{0x0f, 0x57}, requireSrcFloat: true, requireDstFloat: true}, 1220 // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq 1221 PINSRB: {mandatoryPrefix: 0x66, opcode: 
[]byte{0x0f, 0x3a, 0x20}, requireSrcFloat: false, requireDstFloat: true, needArg: true}, 1222 // https://www.felixcloutier.com/x86/pinsrw 1223 PINSRW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc4}, requireSrcFloat: false, requireDstFloat: true, needArg: true}, 1224 // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq 1225 PINSRD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x22}, requireSrcFloat: false, requireDstFloat: true, needArg: true}, 1226 // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq 1227 PINSRQ: {mandatoryPrefix: 0x66, rPrefix: RexPrefixW, opcode: []byte{0x0f, 0x3a, 0x22}, requireSrcFloat: false, requireDstFloat: true, needArg: true}, 1228 // https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64 1229 MOVDQU: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x6f}, requireSrcFloat: true, requireDstFloat: true}, 1230 // https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64 1231 MOVDQA: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x6f}, requireSrcFloat: true, requireDstFloat: true}, 1232 // https://www.felixcloutier.com/x86/paddb:paddw:paddd:paddq 1233 PADDB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfc}, requireSrcFloat: true, requireDstFloat: true}, 1234 PADDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfd}, requireSrcFloat: true, requireDstFloat: true}, 1235 PADDD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfe}, requireSrcFloat: true, requireDstFloat: true}, 1236 PADDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd4}, requireSrcFloat: true, requireDstFloat: true}, 1237 // https://www.felixcloutier.com/x86/psubb:psubw:psubd 1238 PSUBB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf8}, requireSrcFloat: true, requireDstFloat: true}, 1239 PSUBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf9}, requireSrcFloat: true, requireDstFloat: true}, 1240 PSUBD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfa}, requireSrcFloat: true, requireDstFloat: true}, 1241 // 
https://www.felixcloutier.com/x86/psubq 1242 PSUBQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xfb}, requireSrcFloat: true, requireDstFloat: true}, 1243 // https://www.felixcloutier.com/x86/addps 1244 ADDPS: {opcode: []byte{0x0f, 0x58}, requireSrcFloat: true, requireDstFloat: true}, 1245 // https://www.felixcloutier.com/x86/addpd 1246 ADDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x58}, requireSrcFloat: true, requireDstFloat: true}, 1247 // https://www.felixcloutier.com/x86/subps 1248 SUBPS: {opcode: []byte{0x0f, 0x5c}, requireSrcFloat: true, requireDstFloat: true}, 1249 // https://www.felixcloutier.com/x86/subpd 1250 SUBPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5c}, requireSrcFloat: true, requireDstFloat: true}, 1251 // https://www.felixcloutier.com/x86/pxor 1252 PXOR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xef}, requireSrcFloat: true, requireDstFloat: true}, 1253 // https://www.felixcloutier.com/x86/pand 1254 PAND: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdb}, requireSrcFloat: true, requireDstFloat: true}, 1255 // https://www.felixcloutier.com/x86/por 1256 POR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xeb}, requireSrcFloat: true, requireDstFloat: true}, 1257 // https://www.felixcloutier.com/x86/pandn 1258 PANDN: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdf}, requireSrcFloat: true, requireDstFloat: true}, 1259 // https://www.felixcloutier.com/x86/pshufb 1260 PSHUFB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x0}, requireSrcFloat: true, requireDstFloat: true}, 1261 // https://www.felixcloutier.com/x86/pshufd 1262 PSHUFD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x70}, requireSrcFloat: true, requireDstFloat: true, needArg: true}, 1263 // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq 1264 PEXTRB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x14}, requireSrcFloat: true, requireDstFloat: false, needArg: true, srcOnModRMReg: true}, 1265 // https://www.felixcloutier.com/x86/pextrw 1266 PEXTRW: 
{mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc5}, requireSrcFloat: true, requireDstFloat: false, needArg: true}, 1267 // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq 1268 PEXTRD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x16}, requireSrcFloat: true, requireDstFloat: false, needArg: true, srcOnModRMReg: true}, 1269 // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq 1270 PEXTRQ: {rPrefix: RexPrefixW, mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x16}, requireSrcFloat: true, requireDstFloat: false, needArg: true, srcOnModRMReg: true}, 1271 // https://www.felixcloutier.com/x86/insertps 1272 INSERTPS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x21}, requireSrcFloat: true, requireDstFloat: true, needArg: true}, 1273 // https://www.felixcloutier.com/x86/movlhps 1274 MOVLHPS: {opcode: []byte{0x0f, 0x16}, requireSrcFloat: true, requireDstFloat: true}, 1275 // https://www.felixcloutier.com/x86/ptest 1276 PTEST: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x17}, requireSrcFloat: true, requireDstFloat: true}, 1277 // https://www.felixcloutier.com/x86/pcmpeqb:pcmpeqw:pcmpeqd 1278 PCMPEQB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x74}, requireSrcFloat: true, requireDstFloat: true}, 1279 PCMPEQW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x75}, requireSrcFloat: true, requireDstFloat: true}, 1280 PCMPEQD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x76}, requireSrcFloat: true, requireDstFloat: true}, 1281 // https://www.felixcloutier.com/x86/pcmpeqq 1282 PCMPEQQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x29}, requireSrcFloat: true, requireDstFloat: true}, 1283 // https://www.felixcloutier.com/x86/paddusb:paddusw 1284 PADDUSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdc}, requireSrcFloat: true, requireDstFloat: true}, 1285 // https://www.felixcloutier.com/x86/movsd 1286 MOVSD: {mandatoryPrefix: 0xf2, opcode: []byte{0x0f, 0x10}, requireSrcFloat: true, requireDstFloat: true}, 1287 // 
https://www.felixcloutier.com/x86/packsswb:packssdw 1288 PACKSSWB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x63}, requireSrcFloat: true, requireDstFloat: true}, 1289 // https://www.felixcloutier.com/x86/pmovmskb 1290 PMOVMSKB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd7}, requireSrcFloat: true, requireDstFloat: false}, 1291 // https://www.felixcloutier.com/x86/movmskps 1292 MOVMSKPS: {opcode: []byte{0x0f, 0x50}, requireSrcFloat: true, requireDstFloat: false}, 1293 // https://www.felixcloutier.com/x86/movmskpd 1294 MOVMSKPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x50}, requireSrcFloat: true, requireDstFloat: false}, 1295 // https://www.felixcloutier.com/x86/psraw:psrad:psraq 1296 PSRAD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe2}, requireSrcFloat: true, requireDstFloat: true}, 1297 // https://www.felixcloutier.com/x86/psraw:psrad:psraq 1298 PSRAW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe1}, requireSrcFloat: true, requireDstFloat: true}, 1299 // https://www.felixcloutier.com/x86/psrlw:psrld:psrlq 1300 PSRLQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd3}, requireSrcFloat: true, requireDstFloat: true}, 1301 // https://www.felixcloutier.com/x86/psrlw:psrld:psrlq 1302 PSRLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd2}, requireSrcFloat: true, requireDstFloat: true}, 1303 // https://www.felixcloutier.com/x86/psrlw:psrld:psrlq 1304 PSRLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd1}, requireSrcFloat: true, requireDstFloat: true}, 1305 // https://www.felixcloutier.com/x86/psllw:pslld:psllq 1306 PSLLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf1}, requireSrcFloat: true, requireDstFloat: true}, 1307 // https://www.felixcloutier.com/x86/psllw:pslld:psllq 1308 PSLLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf2}, requireSrcFloat: true, requireDstFloat: true}, 1309 // https://www.felixcloutier.com/x86/psllw:pslld:psllq 1310 PSLLQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf3}, requireSrcFloat: true, 
requireDstFloat: true}, 1311 // https://www.felixcloutier.com/x86/punpcklbw:punpcklwd:punpckldq:punpcklqdq 1312 PUNPCKLBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x60}, requireSrcFloat: true, requireDstFloat: true}, 1313 // https://www.felixcloutier.com/x86/punpckhbw:punpckhwd:punpckhdq:punpckhqdq 1314 PUNPCKHBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x68}, requireSrcFloat: true, requireDstFloat: true}, 1315 // https://www.felixcloutier.com/x86/cmpps 1316 CMPPS: {opcode: []byte{0x0f, 0xc2}, requireSrcFloat: true, requireDstFloat: true, needArg: true}, 1317 // https://www.felixcloutier.com/x86/cmppd 1318 CMPPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc2}, requireSrcFloat: true, requireDstFloat: true, needArg: true}, 1319 // https://www.felixcloutier.com/x86/pcmpgtq 1320 PCMPGTQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x37}, requireSrcFloat: true, requireDstFloat: true}, 1321 // https://www.felixcloutier.com/x86/pcmpgtb:pcmpgtw:pcmpgtd 1322 PCMPGTD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x66}, requireSrcFloat: true, requireDstFloat: true}, 1323 // https://www.felixcloutier.com/x86/pcmpgtb:pcmpgtw:pcmpgtd 1324 PCMPGTW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x65}, requireSrcFloat: true, requireDstFloat: true}, 1325 // https://www.felixcloutier.com/x86/pcmpgtb:pcmpgtw:pcmpgtd 1326 PCMPGTB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x64}, requireSrcFloat: true, requireDstFloat: true}, 1327 // https://www.felixcloutier.com/x86/pminsd:pminsq 1328 PMINSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x39}, requireSrcFloat: true, requireDstFloat: true}, 1329 // https://www.felixcloutier.com/x86/pmaxsb:pmaxsw:pmaxsd:pmaxsq 1330 PMAXSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3d}, requireSrcFloat: true, requireDstFloat: true}, 1331 // https://www.felixcloutier.com/x86/pmaxsb:pmaxsw:pmaxsd:pmaxsq 1332 PMAXSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xee}, requireSrcFloat: true, requireDstFloat: true}, 1333 
// https://www.felixcloutier.com/x86/pmaxsb:pmaxsw:pmaxsd:pmaxsq 1334 PMAXSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3c}, requireSrcFloat: true, requireDstFloat: true}, 1335 // https://www.felixcloutier.com/x86/pminsb:pminsw 1336 PMINSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xea}, requireSrcFloat: true, requireDstFloat: true}, 1337 // https://www.felixcloutier.com/x86/pminsb:pminsw 1338 PMINSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x38}, requireSrcFloat: true, requireDstFloat: true}, 1339 // https://www.felixcloutier.com/x86/pminud:pminuq 1340 PMINUD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3b}, requireSrcFloat: true, requireDstFloat: true}, 1341 // https://www.felixcloutier.com/x86/pminub:pminuw 1342 PMINUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3a}, requireSrcFloat: true, requireDstFloat: true}, 1343 // https://www.felixcloutier.com/x86/pminub:pminuw 1344 PMINUB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xda}, requireSrcFloat: true, requireDstFloat: true}, 1345 // https://www.felixcloutier.com/x86/pmaxud:pmaxuq 1346 PMAXUD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3f}, requireSrcFloat: true, requireDstFloat: true}, 1347 // https://www.felixcloutier.com/x86/pmaxub:pmaxuw 1348 PMAXUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x3e}, requireSrcFloat: true, requireDstFloat: true}, 1349 // https://www.felixcloutier.com/x86/pmaxub:pmaxuw 1350 PMAXUB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xde}, requireSrcFloat: true, requireDstFloat: true}, 1351 // https://www.felixcloutier.com/x86/pmullw 1352 PMULLW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd5}, requireSrcFloat: true, requireDstFloat: true}, 1353 // https://www.felixcloutier.com/x86/pmulld:pmullq 1354 PMULLD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x40}, requireSrcFloat: true, requireDstFloat: true}, 1355 // https://www.felixcloutier.com/x86/pmuludq 1356 PMULUDQ: {mandatoryPrefix: 0x66, opcode: 
[]byte{0x0f, 0xf4}, requireSrcFloat: true, requireDstFloat: true}, 1357 // https://www.felixcloutier.com/x86/psubsb:psubsw 1358 PSUBSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe8}, requireSrcFloat: true, requireDstFloat: true}, 1359 // https://www.felixcloutier.com/x86/psubsb:psubsw 1360 PSUBSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe9}, requireSrcFloat: true, requireDstFloat: true}, 1361 // https://www.felixcloutier.com/x86/psubusb:psubusw 1362 PSUBUSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd8}, requireSrcFloat: true, requireDstFloat: true}, 1363 // https://www.felixcloutier.com/x86/psubusb:psubusw 1364 PSUBUSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xd9}, requireSrcFloat: true, requireDstFloat: true}, 1365 // https://www.felixcloutier.com/x86/paddsb:paddsw 1366 PADDSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xed}, requireSrcFloat: true, requireDstFloat: true}, 1367 // https://www.felixcloutier.com/x86/paddsb:paddsw 1368 PADDSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xec}, requireSrcFloat: true, requireDstFloat: true}, 1369 // https://www.felixcloutier.com/x86/paddusb:paddusw 1370 PADDUSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xdd}, requireSrcFloat: true, requireDstFloat: true}, 1371 // https://www.felixcloutier.com/x86/pavgb:pavgw 1372 PAVGB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe0}, requireSrcFloat: true, requireDstFloat: true}, 1373 // https://www.felixcloutier.com/x86/pavgb:pavgw 1374 PAVGW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe3}, requireSrcFloat: true, requireDstFloat: true}, 1375 // https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq 1376 PABSB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1c}, requireSrcFloat: true, requireDstFloat: true}, 1377 // https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq 1378 PABSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1d}, requireSrcFloat: true, requireDstFloat: true}, 1379 // 
https://www.felixcloutier.com/x86/pabsb:pabsw:pabsd:pabsq 1380 PABSD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x1e}, requireSrcFloat: true, requireDstFloat: true}, 1381 // https://www.felixcloutier.com/x86/blendvpd 1382 BLENDVPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x15}, requireSrcFloat: true, requireDstFloat: true}, 1383 // https://www.felixcloutier.com/x86/maxpd 1384 MAXPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5f}, requireSrcFloat: true, requireDstFloat: true}, 1385 // https://www.felixcloutier.com/x86/maxps 1386 MAXPS: {opcode: []byte{0x0f, 0x5f}, requireSrcFloat: true, requireDstFloat: true}, 1387 // https://www.felixcloutier.com/x86/minpd 1388 MINPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5d}, requireSrcFloat: true, requireDstFloat: true}, 1389 // https://www.felixcloutier.com/x86/minps 1390 MINPS: {opcode: []byte{0x0f, 0x5d}, requireSrcFloat: true, requireDstFloat: true}, 1391 // https://www.felixcloutier.com/x86/andnpd 1392 ANDNPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x55}, requireSrcFloat: true, requireDstFloat: true}, 1393 // https://www.felixcloutier.com/x86/andnps 1394 ANDNPS: {opcode: []byte{0x0f, 0x55}, requireSrcFloat: true, requireDstFloat: true}, 1395 // https://www.felixcloutier.com/x86/mulps 1396 MULPS: {opcode: []byte{0x0f, 0x59}, requireSrcFloat: true, requireDstFloat: true}, 1397 // https://www.felixcloutier.com/x86/mulpd 1398 MULPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x59}, requireSrcFloat: true, requireDstFloat: true}, 1399 // https://www.felixcloutier.com/x86/divps 1400 DIVPS: {opcode: []byte{0x0f, 0x5e}, requireSrcFloat: true, requireDstFloat: true}, 1401 // https://www.felixcloutier.com/x86/divpd 1402 DIVPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5e}, requireSrcFloat: true, requireDstFloat: true}, 1403 // https://www.felixcloutier.com/x86/sqrtps 1404 SQRTPS: {opcode: []byte{0x0f, 0x51}, requireSrcFloat: true, requireDstFloat: true}, 1405 // 
https://www.felixcloutier.com/x86/sqrtpd 1406 SQRTPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x51}, requireSrcFloat: true, requireDstFloat: true}, 1407 // https://www.felixcloutier.com/x86/roundps 1408 ROUNDPS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x08}, requireSrcFloat: true, requireDstFloat: true, needArg: true}, 1409 // https://www.felixcloutier.com/x86/roundpd 1410 ROUNDPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x09}, requireSrcFloat: true, requireDstFloat: true, needArg: true}, 1411 // https://www.felixcloutier.com/x86/palignr 1412 PALIGNR: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x3a, 0x0f}, requireSrcFloat: true, requireDstFloat: true, needArg: true}, 1413 // https://www.felixcloutier.com/x86/punpcklbw:punpcklwd:punpckldq:punpcklqdq 1414 PUNPCKLWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x61}, requireSrcFloat: true, requireDstFloat: true}, 1415 // https://www.felixcloutier.com/x86/punpckhbw:punpckhwd:punpckhdq:punpckhqdq 1416 PUNPCKHWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x69}, requireSrcFloat: true, requireDstFloat: true}, 1417 // https://www.felixcloutier.com/x86/pmulhuw 1418 PMULHUW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe4}, requireSrcFloat: true, requireDstFloat: true}, 1419 // https://www.felixcloutier.com/x86/pmuldq 1420 PMULDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x28}, requireSrcFloat: true, requireDstFloat: true}, 1421 // https://www.felixcloutier.com/x86/pmulhrsw 1422 PMULHRSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x0b}, requireSrcFloat: true, requireDstFloat: true}, 1423 // https://www.felixcloutier.com/x86/pmovsx 1424 PMOVSXBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x20}, requireSrcFloat: true, requireDstFloat: true}, 1425 // https://www.felixcloutier.com/x86/pmovsx 1426 PMOVSXWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x23}, requireSrcFloat: true, requireDstFloat: true}, 1427 // https://www.felixcloutier.com/x86/pmovsx 1428 
PMOVSXDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x25}, requireSrcFloat: true, requireDstFloat: true}, 1429 // https://www.felixcloutier.com/x86/pmovzx 1430 PMOVZXBW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x30}, requireSrcFloat: true, requireDstFloat: true}, 1431 // https://www.felixcloutier.com/x86/pmovzx 1432 PMOVZXWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x33}, requireSrcFloat: true, requireDstFloat: true}, 1433 // https://www.felixcloutier.com/x86/pmovzx 1434 PMOVZXDQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x35}, requireSrcFloat: true, requireDstFloat: true}, 1435 // https://www.felixcloutier.com/x86/pmulhw 1436 PMULHW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe5}, requireSrcFloat: true, requireDstFloat: true}, 1437 // https://www.felixcloutier.com/x86/cmpps 1438 CMPEQPS: {opcode: []byte{0x0f, 0xc2}, requireSrcFloat: true, requireDstFloat: true, needArg: true}, 1439 // https://www.felixcloutier.com/x86/cmppd 1440 CMPEQPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xc2}, requireSrcFloat: true, requireDstFloat: true, needArg: true}, 1441 // https://www.felixcloutier.com/x86/cvttps2dq 1442 CVTTPS2DQ: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0x5b}, requireSrcFloat: true, requireDstFloat: true}, 1443 // https://www.felixcloutier.com/x86/cvtdq2ps 1444 CVTDQ2PS: {opcode: []byte{0x0f, 0x5b}, requireSrcFloat: true, requireDstFloat: true}, 1445 // https://www.felixcloutier.com/x86/cvtdq2pd 1446 CVTDQ2PD: {mandatoryPrefix: 0xf3, opcode: []byte{0x0f, 0xe6}, requireSrcFloat: true, requireDstFloat: true}, 1447 // https://www.felixcloutier.com/x86/cvtpd2ps 1448 CVTPD2PS: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x5a}, requireSrcFloat: true, requireDstFloat: true}, 1449 // https://www.felixcloutier.com/x86/cvtps2pd 1450 CVTPS2PD: {opcode: []byte{0x0f, 0x5a}, requireSrcFloat: true, requireDstFloat: true}, 1451 // https://www.felixcloutier.com/x86/movupd 1452 MOVUPD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 
0x10}, requireSrcFloat: true, requireDstFloat: true}, 1453 // https://www.felixcloutier.com/x86/shufps 1454 SHUFPS: {opcode: []byte{0x0f, 0xc6}, requireSrcFloat: true, requireDstFloat: true, needArg: true}, 1455 // https://www.felixcloutier.com/x86/pmaddwd 1456 PMADDWD: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xf5}, requireSrcFloat: true, requireDstFloat: true}, 1457 // https://www.felixcloutier.com/x86/unpcklps 1458 UNPCKLPS: {opcode: []byte{0x0f, 0x14}, requireSrcFloat: true, requireDstFloat: true}, 1459 // https://www.felixcloutier.com/x86/packuswb 1460 PACKUSWB: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x67}, requireSrcFloat: true, requireDstFloat: true}, 1461 // https://www.felixcloutier.com/x86/packsswb:packssdw 1462 PACKSSDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x6b}, requireSrcFloat: true, requireDstFloat: true}, 1463 // https://www.felixcloutier.com/x86/packusdw 1464 PACKUSDW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x2b}, requireSrcFloat: true, requireDstFloat: true}, 1465 // https://www.felixcloutier.com/x86/pmaddubsw 1466 PMADDUBSW: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0x38, 0x04}, requireSrcFloat: true, requireDstFloat: true}, 1467 // https://www.felixcloutier.com/x86/cvttpd2dq 1468 CVTTPD2DQ: {mandatoryPrefix: 0x66, opcode: []byte{0x0f, 0xe6}, requireDstFloat: true, requireSrcFloat: true}, 1469 } 1470 1471 var RegisterToRegisterShiftOpcode = map[asm.Instruction]struct { 1472 opcode []byte 1473 rPrefix RexPrefix 1474 modRMExtension byte 1475 }{ 1476 // https://www.felixcloutier.com/x86/rcl:rcr:rol:ror 1477 ROLL: {opcode: []byte{0xd3}}, 1478 ROLQ: {opcode: []byte{0xd3}, rPrefix: RexPrefixW}, 1479 RORL: {opcode: []byte{0xd3}, modRMExtension: 0b00_001_000}, 1480 RORQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_001_000, rPrefix: RexPrefixW}, 1481 // https://www.felixcloutier.com/x86/sal:sar:shl:shr 1482 SARL: {opcode: []byte{0xd3}, modRMExtension: 0b00_111_000}, 1483 SARQ: {opcode: []byte{0xd3}, modRMExtension: 
0b00_111_000, rPrefix: RexPrefixW}, 1484 SHLL: {opcode: []byte{0xd3}, modRMExtension: 0b00_100_000}, 1485 SHLQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_100_000, rPrefix: RexPrefixW}, 1486 SHRL: {opcode: []byte{0xd3}, modRMExtension: 0b00_101_000}, 1487 SHRQ: {opcode: []byte{0xd3}, modRMExtension: 0b00_101_000, rPrefix: RexPrefixW}, 1488 } 1489 1490 type registerToRegisterMOVOpcode struct { 1491 opcode []byte 1492 mandatoryPrefix byte 1493 srcOnModRMReg bool 1494 rPrefix RexPrefix 1495 } 1496 1497 var registerToRegisterMOVOpcodes = map[asm.Instruction]struct { 1498 i2i, i2f, f2i, f2f registerToRegisterMOVOpcode 1499 }{ 1500 MOVL: { 1501 // https://www.felixcloutier.com/x86/mov 1502 i2i: registerToRegisterMOVOpcode{opcode: []byte{0x89}, srcOnModRMReg: true}, 1503 // https://www.felixcloutier.com/x86/movd:movq 1504 i2f: registerToRegisterMOVOpcode{opcode: []byte{0x0f, 0x6e}, mandatoryPrefix: 0x66, srcOnModRMReg: false}, 1505 f2i: registerToRegisterMOVOpcode{opcode: []byte{0x0f, 0x7e}, mandatoryPrefix: 0x66, srcOnModRMReg: true}, 1506 }, 1507 MOVQ: { 1508 // https://www.felixcloutier.com/x86/mov 1509 i2i: registerToRegisterMOVOpcode{opcode: []byte{0x89}, srcOnModRMReg: true, rPrefix: RexPrefixW}, 1510 // https://www.felixcloutier.com/x86/movd:movq 1511 i2f: registerToRegisterMOVOpcode{opcode: []byte{0x0f, 0x6e}, mandatoryPrefix: 0x66, srcOnModRMReg: false, rPrefix: RexPrefixW}, 1512 f2i: registerToRegisterMOVOpcode{opcode: []byte{0x0f, 0x7e}, mandatoryPrefix: 0x66, srcOnModRMReg: true, rPrefix: RexPrefixW}, 1513 // https://www.felixcloutier.com/x86/movq 1514 f2f: registerToRegisterMOVOpcode{opcode: []byte{0x0f, 0x7e}, mandatoryPrefix: 0xf3}, 1515 }, 1516 } 1517 1518 func (a *AssemblerImpl) encodeRegisterToRegister(n *nodeImpl) (err error) { 1519 // Alias for readability 1520 inst := n.instruction 1521 1522 if op, ok := registerToRegisterMOVOpcodes[inst]; ok { 1523 var opcode registerToRegisterMOVOpcode 1524 srcIsFloat, dstIsFloat := IsVectorRegister(n.srcReg), 
IsVectorRegister(n.dstReg) 1525 if srcIsFloat && dstIsFloat { 1526 if inst == MOVL { 1527 return errors.New("MOVL for float to float is undefined") 1528 } 1529 opcode = op.f2f 1530 } else if srcIsFloat && !dstIsFloat { 1531 opcode = op.f2i 1532 } else if !srcIsFloat && dstIsFloat { 1533 opcode = op.i2f 1534 } else { 1535 opcode = op.i2i 1536 } 1537 1538 rexPrefix, modRM, err := n.GetRegisterToRegisterModRM(opcode.srcOnModRMReg) 1539 if err != nil { 1540 return err 1541 } 1542 rexPrefix |= opcode.rPrefix 1543 1544 if opcode.mandatoryPrefix != 0 { 1545 a.buf.WriteByte(opcode.mandatoryPrefix) 1546 } 1547 1548 if rexPrefix != RexPrefixNone { 1549 a.buf.WriteByte(rexPrefix) 1550 } 1551 a.buf.Write(opcode.opcode) 1552 1553 a.buf.WriteByte(modRM) 1554 return nil 1555 } else if op, ok := registerToRegisterOpcode[inst]; ok { 1556 srcIsFloat, dstIsFloat := IsVectorRegister(n.srcReg), IsVectorRegister(n.dstReg) 1557 if op.requireSrcFloat && !srcIsFloat { 1558 return fmt.Errorf("%s require float src register but got %s", InstructionName(inst), RegisterName(n.srcReg)) 1559 } else if op.requireDstFloat && !dstIsFloat { 1560 return fmt.Errorf("%s require float dst register but got %s", InstructionName(inst), RegisterName(n.dstReg)) 1561 } else if !op.requireSrcFloat && srcIsFloat { 1562 return fmt.Errorf("%s require integer src register but got %s", InstructionName(inst), RegisterName(n.srcReg)) 1563 } else if !op.requireDstFloat && dstIsFloat { 1564 return fmt.Errorf("%s require integer dst register but got %s", InstructionName(inst), RegisterName(n.dstReg)) 1565 } 1566 1567 rexPrefix, modRM, err := n.GetRegisterToRegisterModRM(op.srcOnModRMReg) 1568 if err != nil { 1569 return err 1570 } 1571 rexPrefix |= op.rPrefix 1572 1573 if op.isSrc8bit && RegSP <= n.srcReg && n.srcReg <= RegDI { 1574 // If an operand register is 8-bit length of SP, BP, DI, or SI register, we need to have the default prefix. 
1575 // https: //wiki.osdev.org/X86-64_Instruction_Encoding#Registers 1576 rexPrefix |= RexPrefixDefault 1577 } 1578 1579 if op.mandatoryPrefix != 0 { 1580 a.buf.WriteByte(op.mandatoryPrefix) 1581 } 1582 1583 if rexPrefix != RexPrefixNone { 1584 a.buf.WriteByte(rexPrefix) 1585 } 1586 a.buf.Write(op.opcode) 1587 1588 a.buf.WriteByte(modRM) 1589 1590 if op.needArg { 1591 a.WriteConst(int64(n.arg), 8) 1592 } 1593 return nil 1594 } else if op, ok := RegisterToRegisterShiftOpcode[inst]; ok { 1595 if n.srcReg != RegCX { 1596 return fmt.Errorf("shifting instruction %s require CX register as src but got %s", InstructionName(inst), RegisterName(n.srcReg)) 1597 } else if IsVectorRegister(n.dstReg) { 1598 return fmt.Errorf("shifting instruction %s require integer register as dst but got %s", InstructionName(inst), RegisterName(n.srcReg)) 1599 } 1600 1601 reg3bits, rexPrefix, err := register3bits(n.dstReg, registerSpecifierPositionModRMFieldRM) 1602 if err != nil { 1603 return err 1604 } 1605 1606 rexPrefix |= op.rPrefix 1607 if rexPrefix != RexPrefixNone { 1608 a.buf.WriteByte(rexPrefix) 1609 } 1610 1611 // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM 1612 modRM := 0b11_000_000 | 1613 (op.modRMExtension) | 1614 reg3bits 1615 a.buf.Write(append(op.opcode, modRM)) 1616 return nil 1617 } else { 1618 return errorEncodingUnsupported(n) 1619 } 1620 } 1621 1622 func (a *AssemblerImpl) encodeRegisterToMemory(n *nodeImpl) (err error) { 1623 rexPrefix, modRM, sbi, displacementWidth, err := n.GetMemoryLocation() 1624 if err != nil { 1625 return err 1626 } 1627 1628 var opcode []byte 1629 var mandatoryPrefix byte 1630 var isShiftInstruction bool 1631 var needArg bool 1632 switch n.instruction { 1633 case CMPL: 1634 // https://www.felixcloutier.com/x86/cmp 1635 opcode = []byte{0x3b} 1636 case CMPQ: 1637 // https://www.felixcloutier.com/x86/cmp 1638 rexPrefix |= RexPrefixW 1639 opcode = []byte{0x3b} 1640 case MOVB: 1641 // https://www.felixcloutier.com/x86/mov 1642 opcode = 
[]byte{0x88} 1643 // 1 byte register operands need default prefix for the following registers. 1644 if n.srcReg >= RegSP && n.srcReg <= RegDI { 1645 rexPrefix |= RexPrefixDefault 1646 } 1647 case MOVL: 1648 if IsVectorRegister(n.srcReg) { 1649 // https://www.felixcloutier.com/x86/movd:movq 1650 opcode = []byte{0x0f, 0x7e} 1651 mandatoryPrefix = 0x66 1652 } else { 1653 // https://www.felixcloutier.com/x86/mov 1654 opcode = []byte{0x89} 1655 } 1656 case MOVQ: 1657 if IsVectorRegister(n.srcReg) { 1658 // https://www.felixcloutier.com/x86/movq 1659 opcode = []byte{0x0f, 0xd6} 1660 mandatoryPrefix = 0x66 1661 } else { 1662 // https://www.felixcloutier.com/x86/mov 1663 rexPrefix |= RexPrefixW 1664 opcode = []byte{0x89} 1665 } 1666 case MOVW: 1667 // https://www.felixcloutier.com/x86/mov 1668 // Note: Need 0x66 to indicate that the operand size is 16-bit. 1669 // https://wiki.osdev.org/X86-64_Instruction_Encoding#Operand-size_and_address-size_override_prefix 1670 mandatoryPrefix = 0x66 1671 opcode = []byte{0x89} 1672 case SARL: 1673 // https://www.felixcloutier.com/x86/sal:sar:shl:shr 1674 modRM |= 0b00_111_000 1675 opcode = []byte{0xd3} 1676 isShiftInstruction = true 1677 case SARQ: 1678 // https://www.felixcloutier.com/x86/sal:sar:shl:shr 1679 rexPrefix |= RexPrefixW 1680 modRM |= 0b00_111_000 1681 opcode = []byte{0xd3} 1682 isShiftInstruction = true 1683 case SHLL: 1684 // https://www.felixcloutier.com/x86/sal:sar:shl:shr 1685 modRM |= 0b00_100_000 1686 opcode = []byte{0xd3} 1687 isShiftInstruction = true 1688 case SHLQ: 1689 // https://www.felixcloutier.com/x86/sal:sar:shl:shr 1690 rexPrefix |= RexPrefixW 1691 modRM |= 0b00_100_000 1692 opcode = []byte{0xd3} 1693 isShiftInstruction = true 1694 case SHRL: 1695 // https://www.felixcloutier.com/x86/sal:sar:shl:shr 1696 modRM |= 0b00_101_000 1697 opcode = []byte{0xd3} 1698 isShiftInstruction = true 1699 case SHRQ: 1700 // https://www.felixcloutier.com/x86/sal:sar:shl:shr 1701 rexPrefix |= RexPrefixW 1702 modRM |= 
0b00_101_000 1703 opcode = []byte{0xd3} 1704 isShiftInstruction = true 1705 case ROLL: 1706 // https://www.felixcloutier.com/x86/rcl:rcr:rol:ror 1707 opcode = []byte{0xd3} 1708 isShiftInstruction = true 1709 case ROLQ: 1710 // https://www.felixcloutier.com/x86/rcl:rcr:rol:ror 1711 rexPrefix |= RexPrefixW 1712 opcode = []byte{0xd3} 1713 isShiftInstruction = true 1714 case RORL: 1715 // https://www.felixcloutier.com/x86/rcl:rcr:rol:ror 1716 modRM |= 0b00_001_000 1717 opcode = []byte{0xd3} 1718 isShiftInstruction = true 1719 case RORQ: 1720 // https://www.felixcloutier.com/x86/rcl:rcr:rol:ror 1721 rexPrefix |= RexPrefixW 1722 opcode = []byte{0xd3} 1723 modRM |= 0b00_001_000 1724 isShiftInstruction = true 1725 case MOVDQU: 1726 // https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64 1727 mandatoryPrefix = 0xf3 1728 opcode = []byte{0x0f, 0x7f} 1729 case PEXTRB: // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq 1730 mandatoryPrefix = 0x66 1731 opcode = []byte{0x0f, 0x3a, 0x14} 1732 needArg = true 1733 case PEXTRW: // https://www.felixcloutier.com/x86/pextrw 1734 mandatoryPrefix = 0x66 1735 opcode = []byte{0x0f, 0x3a, 0x15} 1736 needArg = true 1737 case PEXTRD: // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq 1738 mandatoryPrefix = 0x66 1739 opcode = []byte{0x0f, 0x3a, 0x16} 1740 needArg = true 1741 case PEXTRQ: // https://www.felixcloutier.com/x86/pextrb:pextrd:pextrq 1742 mandatoryPrefix = 0x66 1743 rexPrefix |= RexPrefixW // REX.W 1744 opcode = []byte{0x0f, 0x3a, 0x16} 1745 needArg = true 1746 default: 1747 return errorEncodingUnsupported(n) 1748 } 1749 1750 if !isShiftInstruction { 1751 srcReg3Bits, prefix, err := register3bits(n.srcReg, registerSpecifierPositionModRMFieldReg) 1752 if err != nil { 1753 return err 1754 } 1755 1756 rexPrefix |= prefix 1757 modRM |= srcReg3Bits << 3 // Place the source register on ModRM:reg 1758 } else { 1759 if n.srcReg != RegCX { 1760 return fmt.Errorf("shifting instruction %s require CX 
register as src but got %s", InstructionName(n.instruction), RegisterName(n.srcReg)) 1761 } 1762 } 1763 1764 if mandatoryPrefix != 0 { 1765 // https://wiki.osdev.org/X86-64_Instruction_Encoding#Mandatory_prefix 1766 a.buf.WriteByte(mandatoryPrefix) 1767 } 1768 1769 if rexPrefix != RexPrefixNone { 1770 a.buf.WriteByte(rexPrefix) 1771 } 1772 1773 a.buf.Write(opcode) 1774 1775 a.buf.WriteByte(modRM) 1776 1777 if sbi != nil { 1778 a.buf.WriteByte(*sbi) 1779 } 1780 1781 if displacementWidth != 0 { 1782 a.WriteConst(n.dstConst, displacementWidth) 1783 } 1784 1785 if needArg { 1786 a.WriteConst(int64(n.arg), 8) 1787 } 1788 return 1789 } 1790 1791 func (a *AssemblerImpl) encodeRegisterToConst(n *nodeImpl) (err error) { 1792 regBits, prefix, err := register3bits(n.srcReg, registerSpecifierPositionModRMFieldRM) 1793 if err != nil { 1794 return err 1795 } 1796 1797 switch n.instruction { 1798 case CMPL, CMPQ: 1799 if n.instruction == CMPQ { 1800 prefix |= RexPrefixW 1801 } 1802 if prefix != RexPrefixNone { 1803 a.buf.WriteByte(prefix) 1804 } 1805 is8bitConst := fitInSigned8bit(n.dstConst) 1806 // https://www.felixcloutier.com/x86/cmp 1807 if n.srcReg == RegAX && !is8bitConst { 1808 a.buf.Write([]byte{0x3d}) 1809 } else { 1810 // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM 1811 modRM := 0b11_000_000 | // Specifying that opeand is register. 1812 0b00_111_000 | // CMP with immediate needs "/7" extension. 
1813 regBits 1814 if is8bitConst { 1815 a.buf.Write([]byte{0x83, modRM}) 1816 } else { 1817 a.buf.Write([]byte{0x81, modRM}) 1818 } 1819 } 1820 default: 1821 err = errorEncodingUnsupported(n) 1822 } 1823 1824 if fitInSigned8bit(n.dstConst) { 1825 a.WriteConst(n.dstConst, 8) 1826 } else { 1827 a.WriteConst(n.dstConst, 32) 1828 } 1829 return 1830 } 1831 1832 func (a *AssemblerImpl) encodeReadInstructionAddress(n *nodeImpl) error { 1833 dstReg3Bits, rexPrefix, err := register3bits(n.dstReg, registerSpecifierPositionModRMFieldReg) 1834 if err != nil { 1835 return err 1836 } 1837 1838 a.AddOnGenerateCallBack(func(code []byte) error { 1839 // Find the target instruction node. 1840 targetNode := n 1841 for ; targetNode != nil; targetNode = targetNode.next { 1842 if targetNode.instruction == n.readInstructionAddressBeforeTargetInstruction { 1843 targetNode = targetNode.next 1844 break 1845 } 1846 } 1847 1848 if targetNode == nil { 1849 return errors.New("BUG: target instruction not found for read instruction address") 1850 } 1851 1852 offset := targetNode.OffsetInBinary() - (n.OffsetInBinary() + 7 /* 7 = the length of the LEAQ instruction */) 1853 if offset >= math.MaxInt32 { 1854 return errors.New("BUG: too large offset for LEAQ instruction") 1855 } 1856 1857 binary.LittleEndian.PutUint32(code[n.OffsetInBinary()+3:], uint32(int32(offset))) 1858 return nil 1859 }) 1860 1861 // https://www.felixcloutier.com/x86/lea 1862 opcode := byte(0x8d) 1863 rexPrefix |= RexPrefixW 1864 1865 // https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing 1866 modRM := 0b00_000_101 | // Indicate "LEAQ [RIP + 32bit displacement], dstReg" encoding. 1867 (dstReg3Bits << 3) // Place the dstReg on ModRM:reg. 
1868 1869 a.buf.Write([]byte{rexPrefix, opcode, modRM}) 1870 a.WriteConst(int64(0), 32) // Preserve 1871 return nil 1872 } 1873 1874 func (a *AssemblerImpl) encodeMemoryToRegister(n *nodeImpl) (err error) { 1875 if n.instruction == LEAQ && n.readInstructionAddressBeforeTargetInstruction != NONE { 1876 return a.encodeReadInstructionAddress(n) 1877 } 1878 1879 rexPrefix, modRM, sbi, displacementWidth, err := n.GetMemoryLocation() 1880 if err != nil { 1881 return err 1882 } 1883 1884 dstReg3Bits, prefix, err := register3bits(n.dstReg, registerSpecifierPositionModRMFieldReg) 1885 if err != nil { 1886 return err 1887 } 1888 1889 rexPrefix |= prefix 1890 modRM |= dstReg3Bits << 3 // Place the destination register on ModRM:reg 1891 1892 var mandatoryPrefix byte 1893 var opcode []byte 1894 var needArg bool 1895 switch n.instruction { 1896 case ADDL: 1897 // https://www.felixcloutier.com/x86/add 1898 opcode = []byte{0x03} 1899 case ADDQ: 1900 // https://www.felixcloutier.com/x86/add 1901 rexPrefix |= RexPrefixW 1902 opcode = []byte{0x03} 1903 case CMPL: 1904 // https://www.felixcloutier.com/x86/cmp 1905 opcode = []byte{0x39} 1906 case CMPQ: 1907 // https://www.felixcloutier.com/x86/cmp 1908 rexPrefix |= RexPrefixW 1909 opcode = []byte{0x39} 1910 case LEAQ: 1911 // https://www.felixcloutier.com/x86/lea 1912 rexPrefix |= RexPrefixW 1913 opcode = []byte{0x8d} 1914 case MOVBLSX: 1915 // https://www.felixcloutier.com/x86/movsx:movsxd 1916 opcode = []byte{0x0f, 0xbe} 1917 case MOVBLZX: 1918 // https://www.felixcloutier.com/x86/movzx 1919 opcode = []byte{0x0f, 0xb6} 1920 case MOVBQSX: 1921 // https://www.felixcloutier.com/x86/movsx:movsxd 1922 rexPrefix |= RexPrefixW 1923 opcode = []byte{0x0f, 0xbe} 1924 case MOVBQZX: 1925 // https://www.felixcloutier.com/x86/movzx 1926 rexPrefix |= RexPrefixW 1927 opcode = []byte{0x0f, 0xb6} 1928 case MOVLQSX: 1929 // https://www.felixcloutier.com/x86/movsx:movsxd 1930 rexPrefix |= RexPrefixW 1931 opcode = []byte{0x63} 1932 case MOVLQZX: 1933 // 
https://www.felixcloutier.com/x86/mov 1934 // Note: MOVLQZX means zero extending 32bit reg to 64-bit reg and 1935 // that is semantically equivalent to MOV 32bit to 32bit. 1936 opcode = []byte{0x8B} 1937 case MOVL: 1938 // https://www.felixcloutier.com/x86/mov 1939 // Note: MOVLQZX means zero extending 32bit reg to 64-bit reg and 1940 // that is semantically equivalent to MOV 32bit to 32bit. 1941 if IsVectorRegister(n.dstReg) { 1942 // https://www.felixcloutier.com/x86/movd:movq 1943 opcode = []byte{0x0f, 0x6e} 1944 mandatoryPrefix = 0x66 1945 } else { 1946 // https://www.felixcloutier.com/x86/mov 1947 opcode = []byte{0x8B} 1948 } 1949 case MOVQ: 1950 if IsVectorRegister(n.dstReg) { 1951 // https://www.felixcloutier.com/x86/movq 1952 opcode = []byte{0x0f, 0x7e} 1953 mandatoryPrefix = 0xf3 1954 } else { 1955 // https://www.felixcloutier.com/x86/mov 1956 rexPrefix |= RexPrefixW 1957 opcode = []byte{0x8B} 1958 } 1959 case MOVWLSX: 1960 // https://www.felixcloutier.com/x86/movsx:movsxd 1961 opcode = []byte{0x0f, 0xbf} 1962 case MOVWLZX: 1963 // https://www.felixcloutier.com/x86/movzx 1964 opcode = []byte{0x0f, 0xb7} 1965 case MOVWQSX: 1966 // https://www.felixcloutier.com/x86/movsx:movsxd 1967 rexPrefix |= RexPrefixW 1968 opcode = []byte{0x0f, 0xbf} 1969 case MOVWQZX: 1970 // https://www.felixcloutier.com/x86/movzx 1971 rexPrefix |= RexPrefixW 1972 opcode = []byte{0x0f, 0xb7} 1973 case SUBQ: 1974 // https://www.felixcloutier.com/x86/sub 1975 rexPrefix |= RexPrefixW 1976 opcode = []byte{0x2b} 1977 case SUBSD: 1978 // https://www.felixcloutier.com/x86/subsd 1979 opcode = []byte{0x0f, 0x5c} 1980 mandatoryPrefix = 0xf2 1981 case SUBSS: 1982 // https://www.felixcloutier.com/x86/subss 1983 opcode = []byte{0x0f, 0x5c} 1984 mandatoryPrefix = 0xf3 1985 case UCOMISD: 1986 // https://www.felixcloutier.com/x86/ucomisd 1987 opcode = []byte{0x0f, 0x2e} 1988 mandatoryPrefix = 0x66 1989 case UCOMISS: 1990 // https://www.felixcloutier.com/x86/ucomiss 1991 opcode = []byte{0x0f, 0x2e} 
1992 case MOVDQU: 1993 // https://www.felixcloutier.com/x86/movdqu:vmovdqu8:vmovdqu16:vmovdqu32:vmovdqu64 1994 mandatoryPrefix = 0xf3 1995 opcode = []byte{0x0f, 0x6f} 1996 case PMOVSXBW: // https://www.felixcloutier.com/x86/pmovsx 1997 mandatoryPrefix = 0x66 1998 opcode = []byte{0x0f, 0x38, 0x20} 1999 case PMOVSXWD: // https://www.felixcloutier.com/x86/pmovsx 2000 mandatoryPrefix = 0x66 2001 opcode = []byte{0x0f, 0x38, 0x23} 2002 case PMOVSXDQ: // https://www.felixcloutier.com/x86/pmovsx 2003 mandatoryPrefix = 0x66 2004 opcode = []byte{0x0f, 0x38, 0x25} 2005 case PMOVZXBW: // https://www.felixcloutier.com/x86/pmovzx 2006 mandatoryPrefix = 0x66 2007 opcode = []byte{0x0f, 0x38, 0x30} 2008 case PMOVZXWD: // https://www.felixcloutier.com/x86/pmovzx 2009 mandatoryPrefix = 0x66 2010 opcode = []byte{0x0f, 0x38, 0x33} 2011 case PMOVZXDQ: // https://www.felixcloutier.com/x86/pmovzx 2012 mandatoryPrefix = 0x66 2013 opcode = []byte{0x0f, 0x38, 0x35} 2014 case PINSRB: // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq 2015 mandatoryPrefix = 0x66 2016 opcode = []byte{0x0f, 0x3a, 0x20} 2017 needArg = true 2018 case PINSRW: // https://www.felixcloutier.com/x86/pinsrw 2019 mandatoryPrefix = 0x66 2020 opcode = []byte{0x0f, 0xc4} 2021 needArg = true 2022 case PINSRD: // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq 2023 mandatoryPrefix = 0x66 2024 opcode = []byte{0x0f, 0x3a, 0x22} 2025 needArg = true 2026 case PINSRQ: // https://www.felixcloutier.com/x86/pinsrb:pinsrd:pinsrq 2027 rexPrefix |= RexPrefixW 2028 mandatoryPrefix = 0x66 2029 opcode = []byte{0x0f, 0x3a, 0x22} 2030 needArg = true 2031 default: 2032 return errorEncodingUnsupported(n) 2033 } 2034 2035 if mandatoryPrefix != 0 { 2036 // https://wiki.osdev.org/X86-64_Instruction_Encoding#Mandatory_prefix 2037 a.buf.WriteByte(mandatoryPrefix) 2038 } 2039 2040 if rexPrefix != RexPrefixNone { 2041 a.buf.WriteByte(rexPrefix) 2042 } 2043 2044 a.buf.Write(opcode) 2045 2046 a.buf.WriteByte(modRM) 2047 2048 if sbi != nil 
{ 2049 a.buf.WriteByte(*sbi) 2050 } 2051 2052 if displacementWidth != 0 { 2053 a.WriteConst(n.srcConst, displacementWidth) 2054 } 2055 2056 if needArg { 2057 a.WriteConst(int64(n.arg), 8) 2058 } 2059 return 2060 } 2061 2062 func (a *AssemblerImpl) encodeConstToRegister(n *nodeImpl) (err error) { 2063 regBits, rexPrefix, err := register3bits(n.dstReg, registerSpecifierPositionModRMFieldRM) 2064 if err != nil { 2065 return err 2066 } 2067 2068 isFloatReg := IsVectorRegister(n.dstReg) 2069 switch n.instruction { 2070 case PSLLD, PSLLQ, PSRLD, PSRLQ, PSRAW, PSRLW, PSLLW, PSRAD: 2071 if !isFloatReg { 2072 return fmt.Errorf("%s needs float register but got %s", InstructionName(n.instruction), RegisterName(n.dstReg)) 2073 } 2074 default: 2075 if isFloatReg { 2076 return fmt.Errorf("%s needs int register but got %s", InstructionName(n.instruction), RegisterName(n.dstReg)) 2077 } 2078 } 2079 2080 if n.instruction != MOVQ && !FitIn32bit(n.srcConst) { 2081 return fmt.Errorf("constant must fit in 32-bit integer for %s, but got %d", InstructionName(n.instruction), n.srcConst) 2082 } else if (n.instruction == SHLQ || n.instruction == SHRQ) && (n.srcConst < 0 || n.srcConst > math.MaxUint8) { 2083 return fmt.Errorf("constant must fit in positive 8-bit integer for %s, but got %d", InstructionName(n.instruction), n.srcConst) 2084 } else if (n.instruction == PSLLD || 2085 n.instruction == PSLLQ || 2086 n.instruction == PSRLD || 2087 n.instruction == PSRLQ) && (n.srcConst < math.MinInt8 || n.srcConst > math.MaxInt8) { 2088 return fmt.Errorf("constant must fit in signed 8-bit integer for %s, but got %d", InstructionName(n.instruction), n.srcConst) 2089 } 2090 2091 isSigned8bitConst := fitInSigned8bit(n.srcConst) 2092 switch inst := n.instruction; inst { 2093 case ADDQ: 2094 // https://www.felixcloutier.com/x86/add 2095 rexPrefix |= RexPrefixW 2096 if n.dstReg == RegAX && !isSigned8bitConst { 2097 a.buf.Write([]byte{rexPrefix, 0x05}) 2098 } else { 2099 modRM := 0b11_000_000 | // 
Specifying that opeand is register. 2100 regBits 2101 if isSigned8bitConst { 2102 a.buf.Write([]byte{rexPrefix, 0x83, modRM}) 2103 } else { 2104 a.buf.Write([]byte{rexPrefix, 0x81, modRM}) 2105 } 2106 } 2107 if isSigned8bitConst { 2108 a.WriteConst(n.srcConst, 8) 2109 } else { 2110 a.WriteConst(n.srcConst, 32) 2111 } 2112 case ANDQ: 2113 // https://www.felixcloutier.com/x86/and 2114 rexPrefix |= RexPrefixW 2115 if n.dstReg == RegAX && !isSigned8bitConst { 2116 a.buf.Write([]byte{rexPrefix, 0x25}) 2117 } else { 2118 modRM := 0b11_000_000 | // Specifying that opeand is register. 2119 0b00_100_000 | // AND with immediate needs "/4" extension. 2120 regBits 2121 if isSigned8bitConst { 2122 a.buf.Write([]byte{rexPrefix, 0x83, modRM}) 2123 } else { 2124 a.buf.Write([]byte{rexPrefix, 0x81, modRM}) 2125 } 2126 } 2127 if fitInSigned8bit(n.srcConst) { 2128 a.WriteConst(n.srcConst, 8) 2129 } else { 2130 a.WriteConst(n.srcConst, 32) 2131 } 2132 case TESTQ: 2133 // https://www.felixcloutier.com/x86/test 2134 rexPrefix |= RexPrefixW 2135 if n.dstReg == RegAX && !isSigned8bitConst { 2136 a.buf.Write([]byte{rexPrefix, 0xa9}) 2137 } else { 2138 modRM := 0b11_000_000 | // Specifying that operand is register 2139 regBits 2140 a.buf.Write([]byte{rexPrefix, 0xf7, modRM}) 2141 } 2142 a.WriteConst(n.srcConst, 32) 2143 case MOVL: 2144 // https://www.felixcloutier.com/x86/mov 2145 if rexPrefix != RexPrefixNone { 2146 a.buf.WriteByte(rexPrefix) 2147 } 2148 a.buf.Write([]byte{0xb8 | regBits}) 2149 a.WriteConst(n.srcConst, 32) 2150 case MOVQ: 2151 // https://www.felixcloutier.com/x86/mov 2152 if FitIn32bit(n.srcConst) { 2153 if n.srcConst > math.MaxInt32 { 2154 if rexPrefix != RexPrefixNone { 2155 a.buf.WriteByte(rexPrefix) 2156 } 2157 a.buf.Write([]byte{0xb8 | regBits}) 2158 } else { 2159 rexPrefix |= RexPrefixW 2160 modRM := 0b11_000_000 | // Specifying that opeand is register. 
2161 regBits 2162 a.buf.Write([]byte{rexPrefix, 0xc7, modRM}) 2163 } 2164 a.WriteConst(n.srcConst, 32) 2165 } else { 2166 rexPrefix |= RexPrefixW 2167 a.buf.Write([]byte{rexPrefix, 0xb8 | regBits}) 2168 a.WriteConst(n.srcConst, 64) 2169 } 2170 case SHLQ: 2171 // https://www.felixcloutier.com/x86/sal:sar:shl:shr 2172 rexPrefix |= RexPrefixW 2173 modRM := 0b11_000_000 | // Specifying that opeand is register. 2174 0b00_100_000 | // SHL with immediate needs "/4" extension. 2175 regBits 2176 if n.srcConst == 1 { 2177 a.buf.Write([]byte{rexPrefix, 0xd1, modRM}) 2178 } else { 2179 a.buf.Write([]byte{rexPrefix, 0xc1, modRM}) 2180 a.WriteConst(n.srcConst, 8) 2181 } 2182 case SHRQ: 2183 // https://www.felixcloutier.com/x86/sal:sar:shl:shr 2184 rexPrefix |= RexPrefixW 2185 modRM := 0b11_000_000 | // Specifying that opeand is register. 2186 0b00_101_000 | // SHR with immediate needs "/5" extension. 2187 regBits 2188 if n.srcConst == 1 { 2189 a.buf.Write([]byte{rexPrefix, 0xd1, modRM}) 2190 } else { 2191 a.buf.Write([]byte{rexPrefix, 0xc1, modRM}) 2192 a.WriteConst(n.srcConst, 8) 2193 } 2194 case PSLLD: 2195 // https://www.felixcloutier.com/x86/psllw:pslld:psllq 2196 modRM := 0b11_000_000 | // Specifying that opeand is register. 2197 0b00_110_000 | // PSLL with immediate needs "/6" extension. 2198 regBits 2199 if rexPrefix != RexPrefixNone { 2200 a.buf.Write([]byte{0x66, rexPrefix, 0x0f, 0x72, modRM}) 2201 a.WriteConst(n.srcConst, 8) 2202 } else { 2203 a.buf.Write([]byte{0x66, 0x0f, 0x72, modRM}) 2204 a.WriteConst(n.srcConst, 8) 2205 } 2206 case PSLLQ: 2207 // https://www.felixcloutier.com/x86/psllw:pslld:psllq 2208 modRM := 0b11_000_000 | // Specifying that opeand is register. 2209 0b00_110_000 | // PSLL with immediate needs "/6" extension. 
2210 regBits 2211 if rexPrefix != RexPrefixNone { 2212 a.buf.Write([]byte{0x66, rexPrefix, 0x0f, 0x73, modRM}) 2213 a.WriteConst(n.srcConst, 8) 2214 } else { 2215 a.buf.Write([]byte{0x66, 0x0f, 0x73, modRM}) 2216 a.WriteConst(n.srcConst, 8) 2217 } 2218 case PSRLD: 2219 // https://www.felixcloutier.com/x86/psrlw:psrld:psrlq 2220 // https://www.felixcloutier.com/x86/psllw:pslld:psllq 2221 modRM := 0b11_000_000 | // Specifying that operand is register. 2222 0b00_010_000 | // PSRL with immediate needs "/2" extension. 2223 regBits 2224 if rexPrefix != RexPrefixNone { 2225 a.buf.Write([]byte{0x66, rexPrefix, 0x0f, 0x72, modRM}) 2226 a.WriteConst(n.srcConst, 8) 2227 } else { 2228 a.buf.Write([]byte{0x66, 0x0f, 0x72, modRM}) 2229 a.WriteConst(n.srcConst, 8) 2230 } 2231 case PSRLQ: 2232 // https://www.felixcloutier.com/x86/psrlw:psrld:psrlq 2233 modRM := 0b11_000_000 | // Specifying that operand is register. 2234 0b00_010_000 | // PSRL with immediate needs "/2" extension. 2235 regBits 2236 if rexPrefix != RexPrefixNone { 2237 a.buf.Write([]byte{0x66, rexPrefix, 0x0f, 0x73, modRM}) 2238 a.WriteConst(n.srcConst, 8) 2239 } else { 2240 a.buf.Write([]byte{0x66, 0x0f, 0x73, modRM}) 2241 a.WriteConst(n.srcConst, 8) 2242 } 2243 case PSRAW, PSRAD: 2244 // https://www.felixcloutier.com/x86/psraw:psrad:psraq 2245 modRM := 0b11_000_000 | // Specifying that operand is register. 2246 0b00_100_000 | // PSRAW with immediate needs "/4" extension. 2247 regBits 2248 a.buf.WriteByte(0x66) 2249 if rexPrefix != RexPrefixNone { 2250 a.buf.WriteByte(rexPrefix) 2251 } 2252 2253 var op byte 2254 if inst == PSRAD { 2255 op = 0x72 2256 } else { // PSRAW 2257 op = 0x71 2258 } 2259 2260 a.buf.Write([]byte{0x0f, op, modRM}) 2261 a.WriteConst(n.srcConst, 8) 2262 case PSRLW: 2263 // https://www.felixcloutier.com/x86/psrlw:psrld:psrlq 2264 modRM := 0b11_000_000 | // Specifying that operand is register. 2265 0b00_010_000 | // PSRLW with immediate needs "/2" extension. 
2266 regBits 2267 a.buf.WriteByte(0x66) 2268 if rexPrefix != RexPrefixNone { 2269 a.buf.WriteByte(rexPrefix) 2270 } 2271 a.buf.Write([]byte{0x0f, 0x71, modRM}) 2272 a.WriteConst(n.srcConst, 8) 2273 case PSLLW: 2274 // https://www.felixcloutier.com/x86/psllw:pslld:psllq 2275 modRM := 0b11_000_000 | // Specifying that operand is register. 2276 0b00_110_000 | // PSLLW with immediate needs "/6" extension. 2277 regBits 2278 a.buf.WriteByte(0x66) 2279 if rexPrefix != RexPrefixNone { 2280 a.buf.WriteByte(rexPrefix) 2281 } 2282 a.buf.Write([]byte{0x0f, 0x71, modRM}) 2283 a.WriteConst(n.srcConst, 8) 2284 case XORL, XORQ: 2285 // https://www.felixcloutier.com/x86/xor 2286 if inst == XORQ { 2287 rexPrefix |= RexPrefixW 2288 } 2289 if rexPrefix != RexPrefixNone { 2290 a.buf.WriteByte(rexPrefix) 2291 } 2292 if n.dstReg == RegAX && !isSigned8bitConst { 2293 a.buf.Write([]byte{0x35}) 2294 } else { 2295 modRM := 0b11_000_000 | // Specifying that opeand is register. 2296 0b00_110_000 | // XOR with immediate needs "/6" extension. 2297 regBits 2298 if isSigned8bitConst { 2299 a.buf.Write([]byte{0x83, modRM}) 2300 } else { 2301 a.buf.Write([]byte{0x81, modRM}) 2302 } 2303 } 2304 if fitInSigned8bit(n.srcConst) { 2305 a.WriteConst(n.srcConst, 8) 2306 } else { 2307 a.WriteConst(n.srcConst, 32) 2308 } 2309 default: 2310 err = errorEncodingUnsupported(n) 2311 } 2312 return 2313 } 2314 2315 func (a *AssemblerImpl) encodeMemoryToConst(n *nodeImpl) (err error) { 2316 if !FitIn32bit(n.dstConst) { 2317 return fmt.Errorf("too large target const %d for %s", n.dstConst, InstructionName(n.instruction)) 2318 } 2319 2320 rexPrefix, modRM, sbi, displacementWidth, err := n.GetMemoryLocation() 2321 if err != nil { 2322 return err 2323 } 2324 2325 // Alias for readability. 
2326 c := n.dstConst 2327 2328 var opcode, constWidth byte 2329 switch n.instruction { 2330 case CMPL: 2331 // https://www.felixcloutier.com/x86/cmp 2332 if fitInSigned8bit(c) { 2333 opcode = 0x83 2334 constWidth = 8 2335 } else { 2336 opcode = 0x81 2337 constWidth = 32 2338 } 2339 modRM |= 0b00_111_000 2340 default: 2341 return errorEncodingUnsupported(n) 2342 } 2343 2344 if rexPrefix != RexPrefixNone { 2345 a.buf.WriteByte(rexPrefix) 2346 } 2347 2348 a.buf.Write([]byte{opcode, modRM}) 2349 2350 if sbi != nil { 2351 a.buf.WriteByte(*sbi) 2352 } 2353 2354 if displacementWidth != 0 { 2355 a.WriteConst(n.srcConst, displacementWidth) 2356 } 2357 2358 a.WriteConst(c, constWidth) 2359 return 2360 } 2361 2362 func (a *AssemblerImpl) encodeConstToMemory(n *nodeImpl) (err error) { 2363 rexPrefix, modRM, sbi, displacementWidth, err := n.GetMemoryLocation() 2364 if err != nil { 2365 return err 2366 } 2367 2368 // Alias for readability. 2369 inst := n.instruction 2370 c := n.srcConst 2371 2372 if inst == MOVB && !fitInSigned8bit(c) { 2373 return fmt.Errorf("too large load target const %d for MOVB", c) 2374 } else if !FitIn32bit(c) { 2375 return fmt.Errorf("too large load target const %d for %s", c, InstructionName(n.instruction)) 2376 } 2377 2378 var constWidth, opcode byte 2379 switch inst { 2380 case MOVB: 2381 opcode = 0xc6 2382 constWidth = 8 2383 case MOVL: 2384 opcode = 0xc7 2385 constWidth = 32 2386 case MOVQ: 2387 rexPrefix |= RexPrefixW 2388 opcode = 0xc7 2389 constWidth = 32 2390 default: 2391 return errorEncodingUnsupported(n) 2392 } 2393 2394 if rexPrefix != RexPrefixNone { 2395 a.buf.WriteByte(rexPrefix) 2396 } 2397 2398 a.buf.Write([]byte{opcode, modRM}) 2399 2400 if sbi != nil { 2401 a.buf.WriteByte(*sbi) 2402 } 2403 2404 if displacementWidth != 0 { 2405 a.WriteConst(n.dstConst, displacementWidth) 2406 } 2407 2408 a.WriteConst(c, constWidth) 2409 return 2410 } 2411 2412 func (a *AssemblerImpl) WriteConst(v int64, length byte) { 2413 switch length { 2414 case 8: 
2415 a.buf.WriteByte(byte(int8(v))) 2416 case 32: 2417 // TODO: any way to directly put little endian bytes into bytes.Buffer? 2418 offsetBytes := make([]byte, 4) 2419 binary.LittleEndian.PutUint32(offsetBytes, uint32(int32(v))) 2420 a.buf.Write(offsetBytes) 2421 case 64: 2422 // TODO: any way to directly put little endian bytes into bytes.Buffer? 2423 offsetBytes := make([]byte, 8) 2424 binary.LittleEndian.PutUint64(offsetBytes, uint64(v)) 2425 a.buf.Write(offsetBytes) 2426 default: 2427 panic("BUG: length must be one of 8, 32 or 64") 2428 } 2429 } 2430 2431 func (n *nodeImpl) GetMemoryLocation() (p RexPrefix, modRM byte, sbi *byte, displacementWidth byte, err error) { 2432 var baseReg, indexReg asm.Register 2433 var offset asm.ConstantValue 2434 var scale byte 2435 if n.types.dst == operandTypeMemory { 2436 baseReg, offset, indexReg, scale = n.dstReg, n.dstConst, n.dstMemIndex, n.dstMemScale 2437 } else if n.types.src == operandTypeMemory { 2438 baseReg, offset, indexReg, scale = n.srcReg, n.srcConst, n.srcMemIndex, n.srcMemScale 2439 } else { 2440 err = fmt.Errorf("memory location is not supported for %s", n.types) 2441 return 2442 } 2443 2444 if !FitIn32bit(offset) { 2445 err = errors.New("offset does not fit in 32-bit integer") 2446 return 2447 } 2448 2449 if baseReg == asm.NilRegister && indexReg != asm.NilRegister { 2450 // [(index*scale) + displacement] addressing is possible, but we haven't used it for now. 2451 err = errors.New("addressing without base register but with index is not implemented") 2452 } else if baseReg == asm.NilRegister { 2453 modRM = 0b00_000_100 // Indicate that the memory location is specified by SIB. 
2454 sbiValue := byte(0b00_100_101) 2455 sbi = &sbiValue 2456 displacementWidth = 32 2457 } else if indexReg == asm.NilRegister { 2458 modRM, p, err = register3bits(baseReg, registerSpecifierPositionModRMFieldRM) 2459 if err != nil { 2460 return 2461 } 2462 2463 // Create ModR/M byte so that this instruction takes [R/M + displacement] operand if displacement !=0 2464 // and otherwise [R/M]. 2465 withoutDisplacement := offset == 0 && 2466 // If the target register is R13 or BP, we have to keep [R/M + displacement] even if the value 2467 // is zero since it's not [R/M] operand is not defined for these two registers. 2468 // https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing 2469 baseReg != RegR13 && baseReg != RegBP 2470 if withoutDisplacement { 2471 // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM 2472 modRM |= 0b00_000_000 // Specifying that operand is memory without displacement 2473 displacementWidth = 0 2474 } else if fitInSigned8bit(offset) { 2475 // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM 2476 modRM |= 0b01_000_000 // Specifying that operand is memory + 8bit displacement. 2477 displacementWidth = 8 2478 } else { 2479 // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM 2480 modRM |= 0b10_000_000 // Specifying that operand is memory + 32bit displacement. 2481 displacementWidth = 32 2482 } 2483 2484 // For SP and R12 register, we have [SIB + displacement] if the const is non-zero, otherwise [SIP]. 2485 // https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing 2486 // 2487 // Thefore we emit the SIB byte before the const so that [SIB + displacement] ends up [register + displacement]. 
2488 // https://wiki.osdev.org/X86-64_Instruction_Encoding#32.2F64-bit_addressing_2 2489 if baseReg == RegSP || baseReg == RegR12 { 2490 sbiValue := byte(0b00_100_100) 2491 sbi = &sbiValue 2492 } 2493 } else { 2494 if indexReg == RegSP { 2495 err = errors.New("SP cannot be used for SIB index") 2496 return 2497 } 2498 2499 modRM = 0b00_000_100 // Indicate that the memory location is specified by SIB. 2500 2501 withoutDisplacement := offset == 0 && 2502 // For R13 and BP, base registers cannot be encoded "without displacement" mod (i.e. 0b00 mod). 2503 baseReg != RegR13 && baseReg != RegBP 2504 if withoutDisplacement { 2505 // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM 2506 modRM |= 0b00_000_000 // Specifying that operand is SIB without displacement 2507 displacementWidth = 0 2508 } else if fitInSigned8bit(offset) { 2509 // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM 2510 modRM |= 0b01_000_000 // Specifying that operand is SIB + 8bit displacement. 2511 displacementWidth = 8 2512 } else { 2513 // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM 2514 modRM |= 0b10_000_000 // Specifying that operand is SIB + 32bit displacement. 
2515 displacementWidth = 32 2516 } 2517 2518 var baseRegBits byte 2519 baseRegBits, p, err = register3bits(baseReg, registerSpecifierPositionModRMFieldRM) 2520 if err != nil { 2521 return 2522 } 2523 2524 var indexRegBits byte 2525 var indexRegPrefix RexPrefix 2526 indexRegBits, indexRegPrefix, err = register3bits(indexReg, registerSpecifierPositionSIBIndex) 2527 if err != nil { 2528 return 2529 } 2530 p |= indexRegPrefix 2531 2532 sbiValue := baseRegBits | (indexRegBits << 3) 2533 switch scale { 2534 case 1: 2535 sbiValue |= 0b00_000_000 2536 case 2: 2537 sbiValue |= 0b01_000_000 2538 case 4: 2539 sbiValue |= 0b10_000_000 2540 case 8: 2541 sbiValue |= 0b11_000_000 2542 default: 2543 err = fmt.Errorf("scale in SIB must be one of 1, 2, 4, 8 but got %d", scale) 2544 return 2545 } 2546 2547 sbi = &sbiValue 2548 } 2549 return 2550 } 2551 2552 // GetRegisterToRegisterModRM does XXXX 2553 // 2554 // TODO: srcOnModRMReg can be deleted after golang-asm removal. This is necessary to match our implementation 2555 // with golang-asm, but in practice, there are equivalent opcodes to always have src on ModRM:reg without ambiguity. 2556 func (n *nodeImpl) GetRegisterToRegisterModRM(srcOnModRMReg bool) (RexPrefix, modRM byte, err error) { 2557 var reg3bits, rm3bits byte 2558 if srcOnModRMReg { 2559 reg3bits, RexPrefix, err = register3bits(n.srcReg, 2560 // Indicate that srcReg will be specified by ModRM:reg. 2561 registerSpecifierPositionModRMFieldReg) 2562 if err != nil { 2563 return 2564 } 2565 2566 var dstRexPrefix byte 2567 rm3bits, dstRexPrefix, err = register3bits(n.dstReg, 2568 // Indicate that dstReg will be specified by ModRM:r/m. 2569 registerSpecifierPositionModRMFieldRM) 2570 if err != nil { 2571 return 2572 } 2573 RexPrefix |= dstRexPrefix 2574 } else { 2575 rm3bits, RexPrefix, err = register3bits(n.srcReg, 2576 // Indicate that srcReg will be specified by ModRM:r/m. 
2577 registerSpecifierPositionModRMFieldRM) 2578 if err != nil { 2579 return 2580 } 2581 2582 var dstRexPrefix byte 2583 reg3bits, dstRexPrefix, err = register3bits(n.dstReg, 2584 // Indicate that dstReg will be specified by ModRM:reg. 2585 registerSpecifierPositionModRMFieldReg) 2586 if err != nil { 2587 return 2588 } 2589 RexPrefix |= dstRexPrefix 2590 } 2591 2592 // https://wiki.osdev.org/X86-64_Instruction_Encoding#ModR.2FM 2593 modRM = 0b11_000_000 | // Specifying that dst operand is register. 2594 (reg3bits << 3) | 2595 rm3bits 2596 2597 return 2598 } 2599 2600 // RexPrefix represents REX prefix https://wiki.osdev.org/X86-64_Instruction_Encoding#REX_prefix 2601 type RexPrefix = byte 2602 2603 // REX prefixes are independent of each other and can be combined with OR. 2604 const ( 2605 RexPrefixNone RexPrefix = 0x0000_0000 // Indicates that the instruction doesn't need RexPrefix. 2606 RexPrefixDefault RexPrefix = 0b0100_0000 2607 RexPrefixW = 0b0000_1000 | RexPrefixDefault // REX.W 2608 RexPrefixR = 0b0000_0100 | RexPrefixDefault // REX.R 2609 RexPrefixX = 0b0000_0010 | RexPrefixDefault // REX.X 2610 RexPrefixB = 0b0000_0001 | RexPrefixDefault // REX.B 2611 ) 2612 2613 // registerSpecifierPosition represents the position in the instruction bytes where an operand register is placed. 
2614 type registerSpecifierPosition byte 2615 2616 const ( 2617 registerSpecifierPositionModRMFieldReg registerSpecifierPosition = iota 2618 registerSpecifierPositionModRMFieldRM 2619 registerSpecifierPositionSIBIndex 2620 ) 2621 2622 func register3bits( 2623 reg asm.Register, 2624 registerSpecifierPosition registerSpecifierPosition, 2625 ) (bits byte, prefix RexPrefix, err error) { 2626 prefix = RexPrefixNone 2627 if RegR8 <= reg && reg <= RegR15 || RegX8 <= reg && reg <= RegX15 { 2628 // https://wiki.osdev.org/X86-64_Instruction_Encoding#REX_prefix 2629 switch registerSpecifierPosition { 2630 case registerSpecifierPositionModRMFieldReg: 2631 prefix = RexPrefixR 2632 case registerSpecifierPositionModRMFieldRM: 2633 prefix = RexPrefixB 2634 case registerSpecifierPositionSIBIndex: 2635 prefix = RexPrefixX 2636 } 2637 } 2638 2639 // https://wiki.osdev.org/X86-64_Instruction_Encoding#Registers 2640 switch reg { 2641 case RegAX, RegR8, RegX0, RegX8: 2642 bits = 0b000 2643 case RegCX, RegR9, RegX1, RegX9: 2644 bits = 0b001 2645 case RegDX, RegR10, RegX2, RegX10: 2646 bits = 0b010 2647 case RegBX, RegR11, RegX3, RegX11: 2648 bits = 0b011 2649 case RegSP, RegR12, RegX4, RegX12: 2650 bits = 0b100 2651 case RegBP, RegR13, RegX5, RegX13: 2652 bits = 0b101 2653 case RegSI, RegR14, RegX6, RegX14: 2654 bits = 0b110 2655 case RegDI, RegR15, RegX7, RegX15: 2656 bits = 0b111 2657 default: 2658 err = fmt.Errorf("invalid register [%s]", RegisterName(reg)) 2659 } 2660 return 2661 } 2662 2663 func FitIn32bit(v int64) bool { 2664 return math.MinInt32 <= v && v <= math.MaxUint32 2665 } 2666 2667 func fitInSigned8bit(v int64) bool { 2668 return math.MinInt8 <= v && v <= math.MaxInt8 2669 } 2670 2671 func IsVectorRegister(r asm.Register) bool { 2672 return RegX0 <= r && r <= RegX15 2673 }