gitee.com/leisunstar/runtime@v0.0.0-20200521203717-5cef3e7b53f9/virtcontainers/qemu.go (about) 1 // Copyright (c) 2016 Intel Corporation 2 // 3 // SPDX-License-Identifier: Apache-2.0 4 // 5 6 package virtcontainers 7 8 import ( 9 "bufio" 10 "context" 11 "encoding/hex" 12 "encoding/json" 13 "fmt" 14 "io/ioutil" 15 "math" 16 "net" 17 "os" 18 "os/exec" 19 "path/filepath" 20 "strconv" 21 "strings" 22 "sync" 23 "syscall" 24 "time" 25 "unsafe" 26 27 govmmQemu "github.com/intel/govmm/qemu" 28 "github.com/opencontainers/selinux/go-selinux/label" 29 "github.com/opentracing/opentracing-go" 30 "github.com/pkg/errors" 31 "github.com/sirupsen/logrus" 32 "golang.org/x/sys/unix" 33 34 "github.com/kata-containers/runtime/virtcontainers/device/config" 35 persistapi "github.com/kata-containers/runtime/virtcontainers/persist/api" 36 "github.com/kata-containers/runtime/virtcontainers/pkg/uuid" 37 "github.com/kata-containers/runtime/virtcontainers/types" 38 "github.com/kata-containers/runtime/virtcontainers/utils" 39 ) 40 41 // romFile is the file name of the ROM that can be used for virtio-pci devices. 42 // If this file name is empty, this means we expect the firmware used by Qemu, 43 // such as SeaBIOS or OVMF for instance, to handle this directly. 44 const romFile = "" 45 46 // disable-modern is a option to QEMU that will fall back to using 0.9 version 47 // of virtio. Since moving to QEMU4.0, we can start using virtio 1.0 version. 48 // Default value is false. 49 const defaultDisableModern = false 50 51 type qmpChannel struct { 52 sync.Mutex 53 ctx context.Context 54 path string 55 qmp *govmmQemu.QMP 56 disconn chan struct{} 57 } 58 59 // CPUDevice represents a CPU device which was hot-added in a running VM 60 type CPUDevice struct { 61 // ID is used to identify this CPU in the hypervisor options. 
62 ID string 63 } 64 65 // QemuState keeps Qemu's state 66 type QemuState struct { 67 Bridges []types.Bridge 68 // HotpluggedCPUs is the list of CPUs that were hot-added 69 HotpluggedVCPUs []CPUDevice 70 HotpluggedMemory int 71 UUID string 72 HotplugVFIOOnRootBus bool 73 VirtiofsdPid int 74 PCIeRootPort int 75 } 76 77 // qemu is an Hypervisor interface implementation for the Linux qemu hypervisor. 78 type qemu struct { 79 id string 80 81 config HypervisorConfig 82 83 qmpMonitorCh qmpChannel 84 85 qemuConfig govmmQemu.Config 86 87 state QemuState 88 89 arch qemuArch 90 91 // fds is a list of file descriptors inherited by QEMU process 92 // they'll be closed once QEMU process is running 93 fds []*os.File 94 95 ctx context.Context 96 97 nvdimmCount int 98 99 stopped bool 100 101 store persistapi.PersistDriver 102 } 103 104 const ( 105 consoleSocket = "console.sock" 106 qmpSocket = "qmp.sock" 107 vhostFSSocket = "vhost-fs.sock" 108 109 qmpCapErrMsg = "Failed to negoatiate QMP capabilities" 110 qmpExecCatCmd = "exec:cat" 111 112 scsiControllerID = "scsi0" 113 rngID = "rng0" 114 vsockKernelOption = "agent.use_vsock" 115 fallbackFileBackedMemDir = "/dev/shm" 116 ) 117 118 var qemuMajorVersion int 119 var qemuMinorVersion int 120 121 // agnostic list of kernel parameters 122 var defaultKernelParameters = []Param{ 123 {"panic", "1"}, 124 } 125 126 type qmpLogger struct { 127 logger *logrus.Entry 128 } 129 130 func newQMPLogger() qmpLogger { 131 return qmpLogger{ 132 logger: virtLog.WithField("subsystem", "qmp"), 133 } 134 } 135 136 func (l qmpLogger) V(level int32) bool { 137 return level != 0 138 } 139 140 func (l qmpLogger) Infof(format string, v ...interface{}) { 141 l.logger.Infof(format, v...) 142 } 143 144 func (l qmpLogger) Warningf(format string, v ...interface{}) { 145 l.logger.Warnf(format, v...) 146 } 147 148 func (l qmpLogger) Errorf(format string, v ...interface{}) { 149 l.logger.Errorf(format, v...) 
150 } 151 152 // Logger returns a logrus logger appropriate for logging qemu messages 153 func (q *qemu) Logger() *logrus.Entry { 154 return virtLog.WithField("subsystem", "qemu") 155 } 156 157 func (q *qemu) kernelParameters() string { 158 // get a list of arch kernel parameters 159 params := q.arch.kernelParameters(q.config.Debug) 160 161 // use default parameters 162 params = append(params, defaultKernelParameters...) 163 164 // set the maximum number of vCPUs 165 params = append(params, Param{"nr_cpus", fmt.Sprintf("%d", q.config.DefaultMaxVCPUs)}) 166 167 // Add a kernel param to indicate if vsock is being used. 168 // This will be consumed by the agent to determine if it needs to listen on 169 // a serial or vsock channel 170 params = append(params, Param{vsockKernelOption, strconv.FormatBool(q.config.UseVSock)}) 171 172 // add the params specified by the provided config. As the kernel 173 // honours the last parameter value set and since the config-provided 174 // params are added here, they will take priority over the defaults. 175 params = append(params, q.config.KernelParams...) 
176 177 paramsStr := SerializeParams(params, "=") 178 179 return strings.Join(paramsStr, " ") 180 } 181 182 // Adds all capabilities supported by qemu implementation of hypervisor interface 183 func (q *qemu) capabilities() types.Capabilities { 184 span, _ := q.trace("capabilities") 185 defer span.Finish() 186 187 return q.arch.capabilities() 188 } 189 190 func (q *qemu) hypervisorConfig() HypervisorConfig { 191 return q.config 192 } 193 194 // get the QEMU binary path 195 func (q *qemu) qemuPath() (string, error) { 196 p, err := q.config.HypervisorAssetPath() 197 if err != nil { 198 return "", err 199 } 200 201 if p == "" { 202 p, err = q.arch.qemuPath() 203 if err != nil { 204 return "", err 205 } 206 } 207 208 if _, err = os.Stat(p); os.IsNotExist(err) { 209 return "", fmt.Errorf("QEMU path (%s) does not exist", p) 210 } 211 212 return p, nil 213 } 214 215 func (q *qemu) trace(name string) (opentracing.Span, context.Context) { 216 if q.ctx == nil { 217 q.Logger().WithField("type", "bug").Error("trace called before context set") 218 q.ctx = context.Background() 219 } 220 221 span, ctx := opentracing.StartSpanFromContext(q.ctx, name) 222 223 span.SetTag("subsystem", "hypervisor") 224 span.SetTag("type", "qemu") 225 226 return span, ctx 227 } 228 229 // setup sets the Qemu structure up. 
230 func (q *qemu) setup(id string, hypervisorConfig *HypervisorConfig) error { 231 span, _ := q.trace("setup") 232 defer span.Finish() 233 234 err := hypervisorConfig.valid() 235 if err != nil { 236 return err 237 } 238 239 q.id = id 240 q.config = *hypervisorConfig 241 q.arch = newQemuArch(q.config) 242 243 initrdPath, err := q.config.InitrdAssetPath() 244 if err != nil { 245 return err 246 } 247 imagePath, err := q.config.ImageAssetPath() 248 if err != nil { 249 return err 250 } 251 if initrdPath == "" && imagePath != "" && !q.config.DisableImageNvdimm { 252 q.nvdimmCount = 1 253 } else { 254 q.nvdimmCount = 0 255 } 256 257 var create bool 258 if q.state.UUID == "" { 259 create = true 260 } 261 262 q.arch.setBridges(q.state.Bridges) 263 264 if create { 265 q.Logger().Debug("Creating bridges") 266 q.arch.bridges(q.config.DefaultBridges) 267 268 q.Logger().Debug("Creating UUID") 269 q.state.UUID = uuid.Generate().String() 270 271 q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus 272 q.state.PCIeRootPort = int(q.config.PCIeRootPort) 273 274 // The path might already exist, but in case of VM templating, 275 // we have to create it since the sandbox has not created it yet. 
276 if err = os.MkdirAll(filepath.Join(q.store.RunStoragePath(), id), DirMode); err != nil { 277 return err 278 } 279 } 280 281 nested, err := RunningOnVMM(procCPUInfo) 282 if err != nil { 283 return err 284 } 285 286 if !q.config.DisableNestingChecks && nested { 287 q.arch.enableNestingChecks() 288 } else { 289 q.Logger().WithField("inside-vm", fmt.Sprintf("%t", nested)).Debug("Disable nesting environment checks") 290 q.arch.disableNestingChecks() 291 } 292 293 if !q.config.DisableVhostNet { 294 q.arch.enableVhostNet() 295 } else { 296 q.Logger().Debug("Disable vhost_net") 297 q.arch.disableVhostNet() 298 } 299 300 return nil 301 } 302 303 func (q *qemu) cpuTopology() govmmQemu.SMP { 304 return q.arch.cpuTopology(q.config.NumVCPUs, q.config.DefaultMaxVCPUs) 305 } 306 307 func (q *qemu) hostMemMB() (uint64, error) { 308 hostMemKb, err := getHostMemorySizeKb(procMemInfo) 309 if err != nil { 310 return 0, fmt.Errorf("Unable to read memory info: %s", err) 311 } 312 if hostMemKb == 0 { 313 return 0, fmt.Errorf("Error host memory size 0") 314 } 315 316 return hostMemKb / 1024, nil 317 } 318 319 func (q *qemu) memoryTopology() (govmmQemu.Memory, error) { 320 hostMemMb, err := q.hostMemMB() 321 if err != nil { 322 return govmmQemu.Memory{}, err 323 } 324 325 memMb := uint64(q.config.MemorySize) 326 327 return q.arch.memoryTopology(memMb, hostMemMb, uint8(q.config.MemSlots)), nil 328 } 329 330 func (q *qemu) qmpSocketPath(id string) (string, error) { 331 return utils.BuildSocketPath(q.store.RunVMStoragePath(), id, qmpSocket) 332 } 333 334 func (q *qemu) getQemuMachine() (govmmQemu.Machine, error) { 335 machine, err := q.arch.machine() 336 if err != nil { 337 return govmmQemu.Machine{}, err 338 } 339 340 accelerators := q.config.MachineAccelerators 341 if accelerators != "" { 342 if !strings.HasPrefix(accelerators, ",") { 343 accelerators = fmt.Sprintf(",%s", accelerators) 344 } 345 machine.Options += accelerators 346 } 347 348 return machine, nil 349 } 350 351 func (q 
*qemu) appendImage(devices []govmmQemu.Device) ([]govmmQemu.Device, error) { 352 imagePath, err := q.config.ImageAssetPath() 353 if err != nil { 354 return nil, err 355 } 356 357 if imagePath != "" { 358 devices, err = q.arch.appendImage(devices, imagePath) 359 if err != nil { 360 return nil, err 361 } 362 } 363 364 return devices, nil 365 } 366 367 func (q *qemu) createQmpSocket() ([]govmmQemu.QMPSocket, error) { 368 monitorSockPath, err := q.qmpSocketPath(q.id) 369 if err != nil { 370 return nil, err 371 } 372 373 q.qmpMonitorCh = qmpChannel{ 374 ctx: q.ctx, 375 path: monitorSockPath, 376 } 377 378 return []govmmQemu.QMPSocket{ 379 { 380 Type: "unix", 381 Name: q.qmpMonitorCh.path, 382 Server: true, 383 NoWait: true, 384 }, 385 }, nil 386 } 387 388 func (q *qemu) buildDevices(initrdPath string) ([]govmmQemu.Device, *govmmQemu.IOThread, error) { 389 var devices []govmmQemu.Device 390 391 console, err := q.getSandboxConsole(q.id) 392 if err != nil { 393 return nil, nil, err 394 } 395 396 // Add bridges before any other devices. 
This way we make sure that 397 // bridge gets the first available PCI address i.e bridgePCIStartAddr 398 devices = q.arch.appendBridges(devices) 399 400 devices, err = q.arch.appendConsole(devices, console) 401 if err != nil { 402 return nil, nil, err 403 } 404 405 if initrdPath == "" { 406 devices, err = q.appendImage(devices) 407 if err != nil { 408 return nil, nil, err 409 } 410 } 411 412 var ioThread *govmmQemu.IOThread 413 if q.config.BlockDeviceDriver == config.VirtioSCSI { 414 return q.arch.appendSCSIController(devices, q.config.EnableIOThreads) 415 } 416 417 return devices, ioThread, nil 418 419 } 420 421 func (q *qemu) setupTemplate(knobs *govmmQemu.Knobs, memory *govmmQemu.Memory) govmmQemu.Incoming { 422 incoming := govmmQemu.Incoming{} 423 424 if q.config.BootToBeTemplate || q.config.BootFromTemplate { 425 knobs.FileBackedMem = true 426 memory.Path = q.config.MemoryPath 427 428 if q.config.BootToBeTemplate { 429 knobs.MemShared = true 430 } 431 432 if q.config.BootFromTemplate { 433 incoming.MigrationType = govmmQemu.MigrationDefer 434 } 435 } 436 437 return incoming 438 } 439 440 func (q *qemu) setupFileBackedMem(knobs *govmmQemu.Knobs, memory *govmmQemu.Memory) { 441 var target string 442 if q.config.FileBackedMemRootDir != "" { 443 target = q.config.FileBackedMemRootDir 444 } else { 445 target = fallbackFileBackedMemDir 446 } 447 if _, err := os.Stat(target); err != nil { 448 q.Logger().WithError(err).Error("File backed memory location does not exist") 449 return 450 } 451 452 knobs.FileBackedMem = true 453 knobs.MemShared = true 454 memory.Path = target 455 } 456 457 // createSandbox is the Hypervisor sandbox creation implementation for govmmQemu. 
458 func (q *qemu) createSandbox(ctx context.Context, id string, networkNS NetworkNamespace, hypervisorConfig *HypervisorConfig, stateful bool) error { 459 // Save the tracing context 460 q.ctx = ctx 461 462 span, _ := q.trace("createSandbox") 463 defer span.Finish() 464 465 if err := q.setup(id, hypervisorConfig); err != nil { 466 return err 467 } 468 469 machine, err := q.getQemuMachine() 470 if err != nil { 471 return err 472 } 473 474 smp := q.cpuTopology() 475 476 memory, err := q.memoryTopology() 477 if err != nil { 478 return err 479 } 480 481 knobs := govmmQemu.Knobs{ 482 NoUserConfig: true, 483 NoDefaults: true, 484 NoGraphic: true, 485 Daemonize: true, 486 MemPrealloc: q.config.MemPrealloc, 487 HugePages: q.config.HugePages, 488 Realtime: q.config.Realtime, 489 Mlock: q.config.Mlock, 490 } 491 492 kernelPath, err := q.config.KernelAssetPath() 493 if err != nil { 494 return err 495 } 496 497 initrdPath, err := q.config.InitrdAssetPath() 498 if err != nil { 499 return err 500 } 501 502 kernel := govmmQemu.Kernel{ 503 Path: kernelPath, 504 InitrdPath: initrdPath, 505 Params: q.kernelParameters(), 506 } 507 508 incoming := q.setupTemplate(&knobs, &memory) 509 510 // With the current implementations, VM templating will not work with file 511 // based memory (stand-alone) or virtiofs. This is because VM templating 512 // builds the first VM with file-backed memory and shared=on and the 513 // subsequent ones with shared=off. virtio-fs always requires shared=on for 514 // memory. 
515 if q.config.SharedFS == config.VirtioFS || q.config.FileBackedMemRootDir != "" { 516 if !(q.config.BootToBeTemplate || q.config.BootFromTemplate) { 517 q.setupFileBackedMem(&knobs, &memory) 518 } else { 519 return errors.New("VM templating has been enabled with either virtio-fs or file backed memory and this configuration will not work") 520 } 521 if q.config.HugePages { 522 knobs.MemPrealloc = true 523 } 524 } 525 526 // Vhost-user-blk/scsi process which can improve performance, like SPDK, 527 // requires shared-on hugepage to work with Qemu. 528 if q.config.EnableVhostUserStore { 529 if !q.config.HugePages { 530 return errors.New("Vhost-user-blk/scsi is enabled without HugePages. This configuration will not work") 531 } 532 knobs.MemShared = true 533 } 534 535 rtc := govmmQemu.RTC{ 536 Base: "utc", 537 DriftFix: "slew", 538 } 539 540 if q.state.UUID == "" { 541 return fmt.Errorf("UUID should not be empty") 542 } 543 544 qmpSockets, err := q.createQmpSocket() 545 if err != nil { 546 return err 547 } 548 549 devices, ioThread, err := q.buildDevices(initrdPath) 550 if err != nil { 551 return err 552 } 553 554 cpuModel := q.arch.cpuModel() 555 cpuModel += "," + q.config.CPUFeatures 556 557 firmwarePath, err := q.config.FirmwareAssetPath() 558 if err != nil { 559 return err 560 } 561 562 qemuPath, err := q.qemuPath() 563 if err != nil { 564 return err 565 } 566 567 qemuConfig := govmmQemu.Config{ 568 Name: fmt.Sprintf("sandbox-%s", q.id), 569 UUID: q.state.UUID, 570 Path: qemuPath, 571 Ctx: q.qmpMonitorCh.ctx, 572 Machine: machine, 573 SMP: smp, 574 Memory: memory, 575 Devices: devices, 576 CPUModel: cpuModel, 577 Kernel: kernel, 578 RTC: rtc, 579 QMPSockets: qmpSockets, 580 Knobs: knobs, 581 Incoming: incoming, 582 VGA: "none", 583 GlobalParam: "kvm-pit.lost_tick_policy=discard", 584 Bios: firmwarePath, 585 PidFile: filepath.Join(q.store.RunVMStoragePath(), q.id, "pid"), 586 } 587 588 if ioThread != nil { 589 qemuConfig.IOThreads = []govmmQemu.IOThread{*ioThread} 
590 } 591 // Add RNG device to hypervisor 592 rngDev := config.RNGDev{ 593 ID: rngID, 594 Filename: q.config.EntropySource, 595 } 596 qemuConfig.Devices, err = q.arch.appendRNGDevice(qemuConfig.Devices, rngDev) 597 if err != nil { 598 return err 599 } 600 601 // Add PCIe Root Port devices to hypervisor 602 // The pcie.0 bus do not support hot-plug, but PCIe device can be hot-plugged into PCIe Root Port. 603 // For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt 604 if hypervisorConfig.PCIeRootPort > 0 { 605 qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, hypervisorConfig.PCIeRootPort) 606 } 607 608 q.qemuConfig = qemuConfig 609 610 return nil 611 } 612 613 func (q *qemu) vhostFSSocketPath(id string) (string, error) { 614 return utils.BuildSocketPath(q.store.RunVMStoragePath(), id, vhostFSSocket) 615 } 616 617 func (q *qemu) virtiofsdArgs(fd uintptr) []string { 618 // The daemon will terminate when the vhost-user socket 619 // connection with QEMU closes. Therefore we do not keep track 620 // of this child process after returning from this function. 621 sourcePath := filepath.Join(kataHostSharedDir(), q.id) 622 args := []string{ 623 fmt.Sprintf("--fd=%v", fd), 624 "-o", "source=" + sourcePath, 625 "-o", "cache=" + q.config.VirtioFSCache, 626 "--syslog", "-o", "no_posix_lock"} 627 if q.config.Debug { 628 args = append(args, "-d") 629 } else { 630 args = append(args, "-f") 631 } 632 633 if len(q.config.VirtioFSExtraArgs) != 0 { 634 args = append(args, q.config.VirtioFSExtraArgs...) 
635 } 636 return args 637 } 638 639 func (q *qemu) setupVirtiofsd() (err error) { 640 var listener *net.UnixListener 641 var fd *os.File 642 643 if _, err = os.Stat(q.config.VirtioFSDaemon); os.IsNotExist(err) { 644 return fmt.Errorf("virtiofsd path (%s) does not exist", q.config.VirtioFSDaemon) 645 } 646 647 sockPath, err := q.vhostFSSocketPath(q.id) 648 if err != nil { 649 return err 650 } 651 652 listener, err = net.ListenUnix("unix", &net.UnixAddr{ 653 Name: sockPath, 654 Net: "unix", 655 }) 656 if err != nil { 657 return err 658 } 659 listener.SetUnlinkOnClose(false) 660 661 fd, err = listener.File() 662 listener.Close() // no longer needed since fd is a dup 663 listener = nil 664 if err != nil { 665 return err 666 } 667 defer fd.Close() 668 669 const sockFd = 3 // Cmd.ExtraFiles[] fds are numbered starting from 3 670 cmd := exec.Command(q.config.VirtioFSDaemon, q.virtiofsdArgs(sockFd)...) 671 cmd.ExtraFiles = append(cmd.ExtraFiles, fd) 672 stderr, err := cmd.StderrPipe() 673 if err != nil { 674 return err 675 } 676 677 err = cmd.Start() 678 if err == nil { 679 q.state.VirtiofsdPid = cmd.Process.Pid 680 } 681 fd.Close() 682 683 // Monitor virtiofsd's stderr and stop sandbox if virtiofsd quits 684 go func() { 685 scanner := bufio.NewScanner(stderr) 686 for scanner.Scan() { 687 q.Logger().WithField("source", "virtiofsd").Info(scanner.Text()) 688 } 689 q.Logger().Info("virtiofsd quits") 690 // Wait to release resources of virtiofsd process 691 cmd.Process.Wait() 692 q.stopSandbox() 693 }() 694 return err 695 } 696 697 func (q *qemu) getMemArgs() (bool, string, string, error) { 698 share := false 699 target := "" 700 memoryBack := "memory-backend-ram" 701 702 if q.qemuConfig.Knobs.HugePages { 703 // we are setting all the bits that govmm sets when hugepages are enabled. 
704 // https://github.com/intel/govmm/blob/master/qemu/qemu.go#L1677 705 target = "/dev/hugepages" 706 memoryBack = "memory-backend-file" 707 share = true 708 } else { 709 if q.config.EnableVhostUserStore { 710 // Vhost-user-blk/scsi process which can improve performance, like SPDK, 711 // requires shared-on hugepage to work with Qemu. 712 return share, target, "", fmt.Errorf("Vhost-user-blk/scsi requires hugepage memory") 713 } 714 715 if q.config.SharedFS == config.VirtioFS || q.config.FileBackedMemRootDir != "" { 716 target = q.qemuConfig.Memory.Path 717 memoryBack = "memory-backend-file" 718 } 719 } 720 721 if q.qemuConfig.Knobs.MemShared { 722 share = true 723 } 724 725 return share, target, memoryBack, nil 726 } 727 728 func (q *qemu) setupVirtioMem() error { 729 maxMem, err := q.hostMemMB() 730 if err != nil { 731 return err 732 } 733 // 1024 is size for nvdimm 734 sizeMB := int(maxMem) - int(q.config.MemorySize) 735 736 share, target, memoryBack, err := q.getMemArgs() 737 if err != nil { 738 return err 739 } 740 741 err = q.qmpSetup() 742 if err != nil { 743 return err 744 } 745 err = q.qmpMonitorCh.qmp.ExecMemdevAdd(q.qmpMonitorCh.ctx, memoryBack, "virtiomem", target, sizeMB, share, "virtio-mem-pci", "virtiomem0") 746 if err == nil { 747 q.config.VirtioMem = true 748 q.Logger().Infof("Setup %dMB virtio-mem-pci success", sizeMB) 749 } else { 750 help := "" 751 if strings.Contains(err.Error(), "Cannot allocate memory") { 752 help = ". Please use command \"echo 1 > /proc/sys/vm/overcommit_memory\" handle it." 753 } 754 err = fmt.Errorf("Add %dMB virtio-mem-pci fail %s%s", sizeMB, err.Error(), help) 755 } 756 757 return err 758 } 759 760 // startSandbox will start the Sandbox's VM. 
761 func (q *qemu) startSandbox(timeout int) error { 762 span, _ := q.trace("startSandbox") 763 defer span.Finish() 764 765 if q.config.Debug { 766 params := q.arch.kernelParameters(q.config.Debug) 767 strParams := SerializeParams(params, "=") 768 formatted := strings.Join(strParams, " ") 769 770 // The name of this field matches a similar one generated by 771 // the runtime and allows users to identify which parameters 772 // are set here, which come from the runtime and which are set 773 // by the user. 774 q.Logger().WithField("default-kernel-parameters", formatted).Debug() 775 } 776 777 defer func() { 778 for _, fd := range q.fds { 779 if err := fd.Close(); err != nil { 780 q.Logger().WithError(err).Error("After launching Qemu") 781 } 782 } 783 q.fds = []*os.File{} 784 }() 785 786 vmPath := filepath.Join(q.store.RunVMStoragePath(), q.id) 787 err := os.MkdirAll(vmPath, DirMode) 788 if err != nil { 789 return err 790 } 791 // append logfile only on debug 792 if q.config.Debug { 793 q.qemuConfig.LogFile = filepath.Join(vmPath, "qemu.log") 794 } 795 796 defer func() { 797 if err != nil { 798 if err := os.RemoveAll(vmPath); err != nil { 799 q.Logger().WithError(err).Error("Fail to clean up vm directory") 800 } 801 } 802 }() 803 804 // This needs to be done as late as possible, just before launching 805 // virtiofsd are executed by kata-runtime after this call, run with 806 // the SELinux label. If these processes require privileged, we do 807 // notwant to run them under confinement. 
808 if err := label.SetProcessLabel(q.config.SELinuxProcessLabel); err != nil { 809 return err 810 } 811 defer label.SetProcessLabel("") 812 813 if q.config.SharedFS == config.VirtioFS { 814 err = q.setupVirtiofsd() 815 if err != nil { 816 return err 817 } 818 } 819 820 var strErr string 821 strErr, err = govmmQemu.LaunchQemu(q.qemuConfig, newQMPLogger()) 822 if err != nil { 823 if q.config.Debug && q.qemuConfig.LogFile != "" { 824 b, err := ioutil.ReadFile(q.qemuConfig.LogFile) 825 if err == nil { 826 strErr += string(b) 827 } 828 } 829 830 q.Logger().WithError(err).Errorf("failed to launch qemu: %s", strErr) 831 return fmt.Errorf("failed to launch qemu: %s, error messages from qemu log: %s", err, strErr) 832 } 833 834 err = q.waitSandbox(timeout) 835 if err != nil { 836 return err 837 } 838 839 if q.config.BootFromTemplate { 840 if err = q.bootFromTemplate(); err != nil { 841 return err 842 } 843 } 844 845 if q.config.VirtioMem { 846 err = q.setupVirtioMem() 847 } 848 849 return err 850 } 851 852 func (q *qemu) bootFromTemplate() error { 853 err := q.qmpSetup() 854 if err != nil { 855 return err 856 } 857 defer q.qmpShutdown() 858 859 err = q.arch.setIgnoreSharedMemoryMigrationCaps(q.qmpMonitorCh.ctx, q.qmpMonitorCh.qmp) 860 if err != nil { 861 q.Logger().WithError(err).Error("set migration ignore shared memory") 862 return err 863 } 864 uri := fmt.Sprintf("exec:cat %s", q.config.DevicesStatePath) 865 err = q.qmpMonitorCh.qmp.ExecuteMigrationIncoming(q.qmpMonitorCh.ctx, uri) 866 if err != nil { 867 return err 868 } 869 return q.waitMigration() 870 } 871 872 // waitSandbox will wait for the Sandbox's VM to be up and running. 
873 func (q *qemu) waitSandbox(timeout int) error { 874 span, _ := q.trace("waitSandbox") 875 defer span.Finish() 876 877 if timeout < 0 { 878 return fmt.Errorf("Invalid timeout %ds", timeout) 879 } 880 881 cfg := govmmQemu.QMPConfig{Logger: newQMPLogger()} 882 883 var qmp *govmmQemu.QMP 884 var disconnectCh chan struct{} 885 var ver *govmmQemu.QMPVersion 886 var err error 887 888 // clear any possible old state before trying to connect again. 889 q.qmpShutdown() 890 timeStart := time.Now() 891 for { 892 disconnectCh = make(chan struct{}) 893 qmp, ver, err = govmmQemu.QMPStart(q.qmpMonitorCh.ctx, q.qmpMonitorCh.path, cfg, disconnectCh) 894 if err == nil { 895 break 896 } 897 898 if int(time.Since(timeStart).Seconds()) > timeout { 899 return fmt.Errorf("Failed to connect to QEMU instance (timeout %ds): %v", timeout, err) 900 } 901 902 time.Sleep(time.Duration(50) * time.Millisecond) 903 } 904 q.qmpMonitorCh.qmp = qmp 905 q.qmpMonitorCh.disconn = disconnectCh 906 defer q.qmpShutdown() 907 908 qemuMajorVersion = ver.Major 909 qemuMinorVersion = ver.Minor 910 911 q.Logger().WithFields(logrus.Fields{ 912 "qmp-major-version": ver.Major, 913 "qmp-minor-version": ver.Minor, 914 "qmp-micro-version": ver.Micro, 915 "qmp-capabilities": strings.Join(ver.Capabilities, ","), 916 }).Infof("QMP details") 917 918 if err = q.qmpMonitorCh.qmp.ExecuteQMPCapabilities(q.qmpMonitorCh.ctx); err != nil { 919 q.Logger().WithError(err).Error(qmpCapErrMsg) 920 return err 921 } 922 923 return nil 924 } 925 926 // stopSandbox will stop the Sandbox's VM. 
927 func (q *qemu) stopSandbox() error { 928 span, _ := q.trace("stopSandbox") 929 defer span.Finish() 930 931 q.Logger().Info("Stopping Sandbox") 932 if q.stopped { 933 q.Logger().Info("Already stopped") 934 return nil 935 } 936 937 defer func() { 938 q.cleanupVM() 939 q.stopped = true 940 }() 941 942 if q.config.Debug && q.qemuConfig.LogFile != "" { 943 f, err := os.OpenFile(q.qemuConfig.LogFile, os.O_RDONLY, 0) 944 if err == nil { 945 scanner := bufio.NewScanner(f) 946 for scanner.Scan() { 947 q.Logger().Debug(scanner.Text()) 948 } 949 if err := scanner.Err(); err != nil { 950 q.Logger().WithError(err).Debug("read qemu log failed") 951 } 952 } 953 } 954 955 err := q.qmpSetup() 956 if err != nil { 957 return err 958 } 959 960 err = q.qmpMonitorCh.qmp.ExecuteQuit(q.qmpMonitorCh.ctx) 961 if err != nil { 962 q.Logger().WithError(err).Error("Fail to execute qmp QUIT") 963 return err 964 } 965 966 return nil 967 } 968 969 func (q *qemu) cleanupVM() error { 970 971 // cleanup vm path 972 dir := filepath.Join(q.store.RunVMStoragePath(), q.id) 973 974 // If it's a symlink, remove both dir and the target. 975 // This can happen when vm template links a sandbox to a vm. 976 link, err := filepath.EvalSymlinks(dir) 977 if err != nil { 978 // Well, it's just cleanup failure. Let's ignore it. 
979 q.Logger().WithError(err).WithField("dir", dir).Warn("failed to resolve vm path") 980 } 981 q.Logger().WithField("link", link).WithField("dir", dir).Infof("cleanup vm path") 982 983 if err := os.RemoveAll(dir); err != nil { 984 q.Logger().WithError(err).Warnf("failed to remove vm path %s", dir) 985 } 986 if link != dir && link != "" { 987 if err := os.RemoveAll(link); err != nil { 988 q.Logger().WithError(err).WithField("link", link).Warn("failed to remove resolved vm path") 989 } 990 } 991 992 if q.config.VMid != "" { 993 dir = filepath.Join(q.store.RunStoragePath(), q.config.VMid) 994 if err := os.RemoveAll(dir); err != nil { 995 q.Logger().WithError(err).WithField("path", dir).Warnf("failed to remove vm path") 996 } 997 } 998 999 return nil 1000 } 1001 1002 func (q *qemu) togglePauseSandbox(pause bool) error { 1003 span, _ := q.trace("togglePauseSandbox") 1004 defer span.Finish() 1005 1006 err := q.qmpSetup() 1007 if err != nil { 1008 return err 1009 } 1010 1011 if pause { 1012 err = q.qmpMonitorCh.qmp.ExecuteStop(q.qmpMonitorCh.ctx) 1013 } else { 1014 err = q.qmpMonitorCh.qmp.ExecuteCont(q.qmpMonitorCh.ctx) 1015 } 1016 1017 if err != nil { 1018 return err 1019 } 1020 1021 return nil 1022 } 1023 1024 func (q *qemu) qmpSetup() error { 1025 q.qmpMonitorCh.Lock() 1026 defer q.qmpMonitorCh.Unlock() 1027 1028 if q.qmpMonitorCh.qmp != nil { 1029 return nil 1030 } 1031 1032 cfg := govmmQemu.QMPConfig{Logger: newQMPLogger()} 1033 1034 // Auto-closed by QMPStart(). 
1035 disconnectCh := make(chan struct{}) 1036 1037 qmp, _, err := govmmQemu.QMPStart(q.qmpMonitorCh.ctx, q.qmpMonitorCh.path, cfg, disconnectCh) 1038 if err != nil { 1039 q.Logger().WithError(err).Error("Failed to connect to QEMU instance") 1040 return err 1041 } 1042 1043 err = qmp.ExecuteQMPCapabilities(q.qmpMonitorCh.ctx) 1044 if err != nil { 1045 qmp.Shutdown() 1046 q.Logger().WithError(err).Error(qmpCapErrMsg) 1047 return err 1048 } 1049 q.qmpMonitorCh.qmp = qmp 1050 q.qmpMonitorCh.disconn = disconnectCh 1051 1052 return nil 1053 } 1054 1055 func (q *qemu) qmpShutdown() { 1056 q.qmpMonitorCh.Lock() 1057 defer q.qmpMonitorCh.Unlock() 1058 1059 if q.qmpMonitorCh.qmp != nil { 1060 q.qmpMonitorCh.qmp.Shutdown() 1061 // wait on disconnected channel to be sure that the qmp channel has 1062 // been closed cleanly. 1063 <-q.qmpMonitorCh.disconn 1064 q.qmpMonitorCh.qmp = nil 1065 q.qmpMonitorCh.disconn = nil 1066 } 1067 } 1068 1069 func (q *qemu) hotplugAddBlockDevice(drive *config.BlockDrive, op operation, devID string) (err error) { 1070 // drive can be a pmem device, in which case it's used as backing file for a nvdimm device 1071 if q.config.BlockDeviceDriver == config.Nvdimm || drive.Pmem { 1072 var blocksize int64 1073 file, err := os.Open(drive.File) 1074 if err != nil { 1075 return err 1076 } 1077 defer file.Close() 1078 1079 st, err := file.Stat() 1080 if err != nil { 1081 return fmt.Errorf("failed to get information from nvdimm device %v: %v", drive.File, err) 1082 } 1083 1084 // regular files do not support syscall BLKGETSIZE64 1085 if st.Mode().IsRegular() { 1086 blocksize = st.Size() 1087 } else if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, file.Fd(), unix.BLKGETSIZE64, uintptr(unsafe.Pointer(&blocksize))); err != 0 { 1088 return err 1089 } 1090 1091 if err = q.qmpMonitorCh.qmp.ExecuteNVDIMMDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, drive.File, blocksize, &drive.Pmem); err != nil { 1092 q.Logger().WithError(err).Errorf("Failed to add NVDIMM device %s", 
drive.File) 1093 return err 1094 } 1095 drive.NvdimmID = strconv.Itoa(q.nvdimmCount) 1096 q.nvdimmCount++ 1097 return nil 1098 } 1099 1100 if q.config.BlockDeviceCacheSet { 1101 err = q.qmpMonitorCh.qmp.ExecuteBlockdevAddWithCache(q.qmpMonitorCh.ctx, drive.File, drive.ID, q.config.BlockDeviceCacheDirect, q.config.BlockDeviceCacheNoflush) 1102 } else { 1103 err = q.qmpMonitorCh.qmp.ExecuteBlockdevAdd(q.qmpMonitorCh.ctx, drive.File, drive.ID) 1104 } 1105 if err != nil { 1106 return err 1107 } 1108 1109 defer func() { 1110 if err != nil { 1111 q.qmpMonitorCh.qmp.ExecuteBlockdevDel(q.qmpMonitorCh.ctx, drive.ID) 1112 } 1113 }() 1114 1115 switch { 1116 case q.config.BlockDeviceDriver == config.VirtioBlockCCW: 1117 driver := "virtio-blk-ccw" 1118 1119 addr, bridge, err := q.arch.addDeviceToBridge(drive.ID, types.CCW) 1120 if err != nil { 1121 return err 1122 } 1123 var devNoHotplug string 1124 devNoHotplug, err = bridge.AddressFormatCCW(addr) 1125 if err != nil { 1126 return err 1127 } 1128 drive.DevNo, err = bridge.AddressFormatCCWForVirtServer(addr) 1129 if err != nil { 1130 return err 1131 } 1132 if err = q.qmpMonitorCh.qmp.ExecuteDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, devNoHotplug, "", true, false); err != nil { 1133 return err 1134 } 1135 case q.config.BlockDeviceDriver == config.VirtioBlock: 1136 driver := "virtio-blk-pci" 1137 addr, bridge, err := q.arch.addDeviceToBridge(drive.ID, types.PCI) 1138 if err != nil { 1139 return err 1140 } 1141 1142 defer func() { 1143 if err != nil { 1144 q.arch.removeDeviceFromBridge(drive.ID) 1145 } 1146 }() 1147 1148 // PCI address is in the format bridge-addr/device-addr eg. 
"03/02" 1149 drive.PCIAddr = fmt.Sprintf("%02x", bridge.Addr) + "/" + addr 1150 1151 if err = q.qmpMonitorCh.qmp.ExecutePCIDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, addr, bridge.ID, romFile, 0, true, defaultDisableModern); err != nil { 1152 return err 1153 } 1154 case q.config.BlockDeviceDriver == config.VirtioSCSI: 1155 driver := "scsi-hd" 1156 1157 // Bus exposed by the SCSI Controller 1158 bus := scsiControllerID + ".0" 1159 1160 // Get SCSI-id and LUN based on the order of attaching drives. 1161 scsiID, lun, err := utils.GetSCSIIdLun(drive.Index) 1162 if err != nil { 1163 return err 1164 } 1165 1166 if err = q.qmpMonitorCh.qmp.ExecuteSCSIDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, bus, romFile, scsiID, lun, true, defaultDisableModern); err != nil { 1167 return err 1168 } 1169 default: 1170 return fmt.Errorf("Block device %s not recognized", q.config.BlockDeviceDriver) 1171 } 1172 1173 return nil 1174 } 1175 1176 func (q *qemu) hotplugAddVhostUserBlkDevice(vAttr *config.VhostUserDeviceAttrs, op operation, devID string) (err error) { 1177 err = q.qmpMonitorCh.qmp.ExecuteCharDevUnixSocketAdd(q.qmpMonitorCh.ctx, vAttr.DevID, vAttr.SocketPath, false, false) 1178 if err != nil { 1179 return err 1180 } 1181 1182 defer func() { 1183 if err != nil { 1184 q.qmpMonitorCh.qmp.ExecuteChardevDel(q.qmpMonitorCh.ctx, vAttr.DevID) 1185 } 1186 }() 1187 1188 driver := "vhost-user-blk-pci" 1189 addr, bridge, err := q.arch.addDeviceToBridge(vAttr.DevID, types.PCI) 1190 if err != nil { 1191 return err 1192 } 1193 1194 defer func() { 1195 if err != nil { 1196 q.arch.removeDeviceFromBridge(vAttr.DevID) 1197 } 1198 }() 1199 1200 // PCI address is in the format bridge-addr/device-addr eg. 
"03/02" 1201 vAttr.PCIAddr = fmt.Sprintf("%02x", bridge.Addr) + "/" + addr 1202 1203 if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridge.ID); err != nil { 1204 return err 1205 } 1206 1207 return nil 1208 } 1209 1210 func (q *qemu) hotplugBlockDevice(drive *config.BlockDrive, op operation) error { 1211 err := q.qmpSetup() 1212 if err != nil { 1213 return err 1214 } 1215 1216 devID := "virtio-" + drive.ID 1217 1218 if op == addDevice { 1219 err = q.hotplugAddBlockDevice(drive, op, devID) 1220 } else { 1221 if q.config.BlockDeviceDriver == config.VirtioBlock { 1222 if err := q.arch.removeDeviceFromBridge(drive.ID); err != nil { 1223 return err 1224 } 1225 } 1226 1227 if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID); err != nil { 1228 return err 1229 } 1230 1231 if err := q.qmpMonitorCh.qmp.ExecuteBlockdevDel(q.qmpMonitorCh.ctx, drive.ID); err != nil { 1232 return err 1233 } 1234 } 1235 1236 return err 1237 } 1238 1239 func (q *qemu) hotplugVhostUserDevice(vAttr *config.VhostUserDeviceAttrs, op operation) error { 1240 err := q.qmpSetup() 1241 if err != nil { 1242 return err 1243 } 1244 1245 devID := "virtio-" + vAttr.DevID 1246 1247 if op == addDevice { 1248 switch vAttr.Type { 1249 case config.VhostUserBlk: 1250 return q.hotplugAddVhostUserBlkDevice(vAttr, op, devID) 1251 default: 1252 return fmt.Errorf("Incorrect vhost-user device type found") 1253 } 1254 } else { 1255 if err := q.arch.removeDeviceFromBridge(vAttr.DevID); err != nil { 1256 return err 1257 } 1258 1259 if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID); err != nil { 1260 return err 1261 } 1262 1263 if err := q.qmpMonitorCh.qmp.ExecuteChardevDel(q.qmpMonitorCh.ctx, vAttr.DevID); err != nil { 1264 return err 1265 } 1266 } 1267 1268 return nil 1269 } 1270 1271 func (q *qemu) hotplugVFIODevice(device *config.VFIODev, op operation) (err error) { 1272 err = q.qmpSetup() 1273 if err != nil { 1274 
return err 1275 } 1276 1277 devID := device.ID 1278 machinneType := q.hypervisorConfig().HypervisorMachineType 1279 1280 if op == addDevice { 1281 1282 buf, _ := json.Marshal(device) 1283 q.Logger().WithFields(logrus.Fields{ 1284 "machine-type": machinneType, 1285 "hotplug-vfio-on-root-bus": q.state.HotplugVFIOOnRootBus, 1286 "pcie-root-port": q.state.PCIeRootPort, 1287 "device-info": string(buf), 1288 }).Info("Start hot-plug VFIO device") 1289 1290 // In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus 1291 // for pc machine type instead of bridge. This is useful for devices that require 1292 // a large PCI BAR which is a currently a limitation with PCI bridges. 1293 if q.state.HotplugVFIOOnRootBus { 1294 1295 // In case MachineType is q35, a PCIe device is hotplugged on a PCIe Root Port. 1296 switch machinneType { 1297 case QemuQ35: 1298 if device.IsPCIe && q.state.PCIeRootPort <= 0 { 1299 q.Logger().WithField("dev-id", device.ID).Warn("VFIO device is a PCIe device. 
It's recommended to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for q35") 1300 device.Bus = "" 1301 } 1302 default: 1303 device.Bus = "" 1304 } 1305 1306 switch device.Type { 1307 case config.VFIODeviceNormalType: 1308 return q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, device.BDF, device.Bus, romFile) 1309 case config.VFIODeviceMediatedType: 1310 return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, device.SysfsDev, "", device.Bus, romFile) 1311 default: 1312 return fmt.Errorf("Incorrect VFIO device type found") 1313 } 1314 } 1315 1316 addr, bridge, err := q.arch.addDeviceToBridge(devID, types.PCI) 1317 if err != nil { 1318 return err 1319 } 1320 1321 defer func() { 1322 if err != nil { 1323 q.arch.removeDeviceFromBridge(devID) 1324 } 1325 }() 1326 1327 switch device.Type { 1328 case config.VFIODeviceNormalType: 1329 return q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, device.BDF, addr, bridge.ID, romFile) 1330 case config.VFIODeviceMediatedType: 1331 return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, device.SysfsDev, addr, bridge.ID, romFile) 1332 default: 1333 return fmt.Errorf("Incorrect VFIO device type found") 1334 } 1335 } else { 1336 q.Logger().WithField("dev-id", devID).Info("Start hot-unplug VFIO device") 1337 1338 if !q.state.HotplugVFIOOnRootBus { 1339 if err := q.arch.removeDeviceFromBridge(devID); err != nil { 1340 return err 1341 } 1342 } 1343 1344 if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID); err != nil { 1345 return err 1346 } 1347 } 1348 1349 return nil 1350 } 1351 1352 func (q *qemu) hotAddNetDevice(name, hardAddr string, VMFds, VhostFds []*os.File) error { 1353 var ( 1354 VMFdNames []string 1355 VhostFdNames []string 1356 ) 1357 for i, VMFd := range VMFds { 1358 fdName := fmt.Sprintf("fd%d", i) 1359 if err := q.qmpMonitorCh.qmp.ExecuteGetFD(q.qmpMonitorCh.ctx, fdName, 
VMFd); err != nil { 1360 return err 1361 } 1362 VMFdNames = append(VMFdNames, fdName) 1363 } 1364 for i, VhostFd := range VhostFds { 1365 fdName := fmt.Sprintf("vhostfd%d", i) 1366 if err := q.qmpMonitorCh.qmp.ExecuteGetFD(q.qmpMonitorCh.ctx, fdName, VhostFd); err != nil { 1367 return err 1368 } 1369 VhostFd.Close() 1370 VhostFdNames = append(VhostFdNames, fdName) 1371 } 1372 return q.qmpMonitorCh.qmp.ExecuteNetdevAddByFds(q.qmpMonitorCh.ctx, "tap", name, VMFdNames, VhostFdNames) 1373 } 1374 1375 func (q *qemu) hotplugNetDevice(endpoint Endpoint, op operation) (err error) { 1376 err = q.qmpSetup() 1377 if err != nil { 1378 return err 1379 } 1380 var tap TapInterface 1381 1382 switch endpoint.Type() { 1383 case VethEndpointType: 1384 drive := endpoint.(*VethEndpoint) 1385 tap = drive.NetPair.TapInterface 1386 case TapEndpointType: 1387 drive := endpoint.(*TapEndpoint) 1388 tap = drive.TapInterface 1389 default: 1390 return fmt.Errorf("this endpoint is not supported") 1391 } 1392 1393 devID := "virtio-" + tap.ID 1394 if op == addDevice { 1395 if err = q.hotAddNetDevice(tap.Name, endpoint.HardwareAddr(), tap.VMFds, tap.VhostFds); err != nil { 1396 return err 1397 } 1398 1399 defer func() { 1400 if err != nil { 1401 q.qmpMonitorCh.qmp.ExecuteNetdevDel(q.qmpMonitorCh.ctx, tap.Name) 1402 } 1403 }() 1404 1405 addr, bridge, err := q.arch.addDeviceToBridge(tap.ID, types.PCI) 1406 if err != nil { 1407 return err 1408 } 1409 1410 defer func() { 1411 if err != nil { 1412 q.arch.removeDeviceFromBridge(tap.ID) 1413 } 1414 }() 1415 1416 pciAddr := fmt.Sprintf("%02x/%s", bridge.Addr, addr) 1417 endpoint.SetPciAddr(pciAddr) 1418 1419 var machine govmmQemu.Machine 1420 machine, err = q.getQemuMachine() 1421 if err != nil { 1422 return err 1423 } 1424 if machine.Type == QemuCCWVirtio { 1425 devNoHotplug := fmt.Sprintf("fe.%x.%x", bridge.Addr, addr) 1426 return q.qmpMonitorCh.qmp.ExecuteNetCCWDeviceAdd(q.qmpMonitorCh.ctx, tap.Name, devID, endpoint.HardwareAddr(), devNoHotplug, 
int(q.config.NumVCPUs)) 1427 } 1428 return q.qmpMonitorCh.qmp.ExecuteNetPCIDeviceAdd(q.qmpMonitorCh.ctx, tap.Name, devID, endpoint.HardwareAddr(), addr, bridge.ID, romFile, int(q.config.NumVCPUs), defaultDisableModern) 1429 1430 } 1431 1432 if err := q.arch.removeDeviceFromBridge(tap.ID); err != nil { 1433 return err 1434 } 1435 1436 if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID); err != nil { 1437 return err 1438 } 1439 if err := q.qmpMonitorCh.qmp.ExecuteNetdevDel(q.qmpMonitorCh.ctx, tap.Name); err != nil { 1440 return err 1441 } 1442 1443 return nil 1444 } 1445 1446 func (q *qemu) hotplugDevice(devInfo interface{}, devType deviceType, op operation) (interface{}, error) { 1447 switch devType { 1448 case blockDev: 1449 drive := devInfo.(*config.BlockDrive) 1450 return nil, q.hotplugBlockDevice(drive, op) 1451 case cpuDev: 1452 vcpus := devInfo.(uint32) 1453 return q.hotplugCPUs(vcpus, op) 1454 case vfioDev: 1455 device := devInfo.(*config.VFIODev) 1456 return nil, q.hotplugVFIODevice(device, op) 1457 case memoryDev: 1458 memdev := devInfo.(*memoryDevice) 1459 return q.hotplugMemory(memdev, op) 1460 case netDev: 1461 device := devInfo.(Endpoint) 1462 return nil, q.hotplugNetDevice(device, op) 1463 case vhostuserDev: 1464 vAttr := devInfo.(*config.VhostUserDeviceAttrs) 1465 return nil, q.hotplugVhostUserDevice(vAttr, op) 1466 default: 1467 return nil, fmt.Errorf("cannot hotplug device: unsupported device type '%v'", devType) 1468 } 1469 } 1470 1471 func (q *qemu) hotplugAddDevice(devInfo interface{}, devType deviceType) (interface{}, error) { 1472 span, _ := q.trace("hotplugAddDevice") 1473 defer span.Finish() 1474 1475 data, err := q.hotplugDevice(devInfo, devType, addDevice) 1476 if err != nil { 1477 return data, err 1478 } 1479 1480 return data, nil 1481 } 1482 1483 func (q *qemu) hotplugRemoveDevice(devInfo interface{}, devType deviceType) (interface{}, error) { 1484 span, _ := q.trace("hotplugRemoveDevice") 1485 defer span.Finish() 
1486 1487 data, err := q.hotplugDevice(devInfo, devType, removeDevice) 1488 if err != nil { 1489 return data, err 1490 } 1491 1492 return data, nil 1493 } 1494 1495 func (q *qemu) hotplugCPUs(vcpus uint32, op operation) (uint32, error) { 1496 if vcpus == 0 { 1497 q.Logger().Warnf("cannot hotplug 0 vCPUs") 1498 return 0, nil 1499 } 1500 1501 err := q.qmpSetup() 1502 if err != nil { 1503 return 0, err 1504 } 1505 1506 if op == addDevice { 1507 return q.hotplugAddCPUs(vcpus) 1508 } 1509 1510 return q.hotplugRemoveCPUs(vcpus) 1511 } 1512 1513 // try to hot add an amount of vCPUs, returns the number of vCPUs added 1514 func (q *qemu) hotplugAddCPUs(amount uint32) (uint32, error) { 1515 currentVCPUs := q.qemuConfig.SMP.CPUs + uint32(len(q.state.HotpluggedVCPUs)) 1516 1517 // Don't fail if the number of max vCPUs is exceeded, log a warning and hot add the vCPUs needed 1518 // to reach out max vCPUs 1519 if currentVCPUs+amount > q.config.DefaultMaxVCPUs { 1520 q.Logger().Warnf("Cannot hotplug %d CPUs, currently this SB has %d CPUs and the maximum amount of CPUs is %d", 1521 amount, currentVCPUs, q.config.DefaultMaxVCPUs) 1522 amount = q.config.DefaultMaxVCPUs - currentVCPUs 1523 } 1524 1525 if amount == 0 { 1526 // Don't fail if no more vCPUs can be added, since cgroups still can be updated 1527 q.Logger().Warnf("maximum number of vCPUs '%d' has been reached", q.config.DefaultMaxVCPUs) 1528 return 0, nil 1529 } 1530 1531 // get the list of hotpluggable CPUs 1532 hotpluggableVCPUs, err := q.qmpMonitorCh.qmp.ExecuteQueryHotpluggableCPUs(q.qmpMonitorCh.ctx) 1533 if err != nil { 1534 return 0, fmt.Errorf("failed to query hotpluggable CPUs: %v", err) 1535 } 1536 1537 machine, err := q.arch.machine() 1538 if err != nil { 1539 return 0, fmt.Errorf("failed to query machine type: %v", err) 1540 } 1541 1542 var hotpluggedVCPUs uint32 1543 for _, hc := range hotpluggableVCPUs { 1544 // qom-path is the path to the CPU, non-empty means that this CPU is already in use 1545 if hc.QOMPath 
!= "" { 1546 continue 1547 } 1548 1549 // CPU type, i.e host-x86_64-cpu 1550 driver := hc.Type 1551 cpuID := fmt.Sprintf("cpu-%d", len(q.state.HotpluggedVCPUs)) 1552 socketID := fmt.Sprintf("%d", hc.Properties.Socket) 1553 dieID := fmt.Sprintf("%d", hc.Properties.Die) 1554 coreID := fmt.Sprintf("%d", hc.Properties.Core) 1555 threadID := fmt.Sprintf("%d", hc.Properties.Thread) 1556 1557 // If CPU type is IBM pSeries or Z, we do not set socketID and threadID 1558 if machine.Type == "pseries" || machine.Type == "s390-ccw-virtio" { 1559 socketID = "" 1560 threadID = "" 1561 dieID = "" 1562 } 1563 1564 if err := q.qmpMonitorCh.qmp.ExecuteCPUDeviceAdd(q.qmpMonitorCh.ctx, driver, cpuID, socketID, dieID, coreID, threadID, romFile); err != nil { 1565 // don't fail, let's try with other CPU 1566 continue 1567 } 1568 1569 // a new vCPU was added, update list of hotplugged vCPUs and check if all vCPUs were added 1570 q.state.HotpluggedVCPUs = append(q.state.HotpluggedVCPUs, CPUDevice{cpuID}) 1571 hotpluggedVCPUs++ 1572 if hotpluggedVCPUs == amount { 1573 // All vCPUs were hotplugged 1574 return amount, nil 1575 } 1576 } 1577 1578 return hotpluggedVCPUs, fmt.Errorf("failed to hot add vCPUs: only %d vCPUs of %d were added", hotpluggedVCPUs, amount) 1579 } 1580 1581 // try to hot remove an amount of vCPUs, returns the number of vCPUs removed 1582 func (q *qemu) hotplugRemoveCPUs(amount uint32) (uint32, error) { 1583 hotpluggedVCPUs := uint32(len(q.state.HotpluggedVCPUs)) 1584 1585 // we can only remove hotplugged vCPUs 1586 if amount > hotpluggedVCPUs { 1587 return 0, fmt.Errorf("Unable to remove %d CPUs, currently there are only %d hotplugged CPUs", amount, hotpluggedVCPUs) 1588 } 1589 1590 for i := uint32(0); i < amount; i++ { 1591 // get the last vCPUs and try to remove it 1592 cpu := q.state.HotpluggedVCPUs[len(q.state.HotpluggedVCPUs)-1] 1593 if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, cpu.ID); err != nil { 1594 return i, fmt.Errorf("failed to hotunplug 
CPUs, only %d CPUs were hotunplugged: %v", i, err) 1595 } 1596 1597 // remove from the list the vCPU hotunplugged 1598 q.state.HotpluggedVCPUs = q.state.HotpluggedVCPUs[:len(q.state.HotpluggedVCPUs)-1] 1599 } 1600 1601 return amount, nil 1602 } 1603 1604 func (q *qemu) hotplugMemory(memDev *memoryDevice, op operation) (int, error) { 1605 1606 if !q.arch.supportGuestMemoryHotplug() { 1607 return 0, fmt.Errorf("guest memory hotplug not supported") 1608 } 1609 if memDev.sizeMB < 0 { 1610 return 0, fmt.Errorf("cannot hotplug negative size (%d) memory", memDev.sizeMB) 1611 } 1612 memLog := q.Logger().WithField("hotplug", "memory") 1613 1614 memLog.WithField("hotplug-memory-mb", memDev.sizeMB).Debug("requested memory hotplug") 1615 err := q.qmpSetup() 1616 if err != nil { 1617 return 0, err 1618 } 1619 1620 currentMemory := int(q.config.MemorySize) + q.state.HotpluggedMemory 1621 1622 if memDev.sizeMB == 0 { 1623 memLog.Debug("hotplug is not required") 1624 return 0, nil 1625 } 1626 1627 switch op { 1628 case removeDevice: 1629 memLog.WithField("operation", "remove").Debugf("Requested to remove memory: %d MB", memDev.sizeMB) 1630 // Dont fail but warn that this is not supported. 
1631 memLog.Warn("hot-remove VM memory not supported") 1632 return 0, nil 1633 case addDevice: 1634 memLog.WithField("operation", "add").Debugf("Requested to add memory: %d MB", memDev.sizeMB) 1635 maxMem, err := q.hostMemMB() 1636 if err != nil { 1637 return 0, err 1638 } 1639 1640 // Don't exceed the maximum amount of memory 1641 if currentMemory+memDev.sizeMB > int(maxMem) { 1642 // Fixme: return a typed error 1643 return 0, fmt.Errorf("Unable to hotplug %d MiB memory, the SB has %d MiB and the maximum amount is %d MiB", 1644 memDev.sizeMB, currentMemory, maxMem) 1645 } 1646 memoryAdded, err := q.hotplugAddMemory(memDev) 1647 if err != nil { 1648 return memoryAdded, err 1649 } 1650 return memoryAdded, nil 1651 default: 1652 return 0, fmt.Errorf("invalid operation %v", op) 1653 } 1654 1655 } 1656 1657 func (q *qemu) hotplugAddMemory(memDev *memoryDevice) (int, error) { 1658 memoryDevices, err := q.qmpMonitorCh.qmp.ExecQueryMemoryDevices(q.qmpMonitorCh.ctx) 1659 if err != nil { 1660 return 0, fmt.Errorf("failed to query memory devices: %v", err) 1661 } 1662 1663 if len(memoryDevices) != 0 { 1664 maxSlot := -1 1665 for _, device := range memoryDevices { 1666 if maxSlot < device.Data.Slot { 1667 maxSlot = device.Data.Slot 1668 } 1669 } 1670 memDev.slot = maxSlot + 1 1671 } 1672 1673 share, target, memoryBack, err := q.getMemArgs() 1674 if err != nil { 1675 return 0, err 1676 } 1677 1678 err = q.qmpMonitorCh.qmp.ExecHotplugMemory(q.qmpMonitorCh.ctx, memoryBack, "mem"+strconv.Itoa(memDev.slot), target, memDev.sizeMB, share) 1679 if err != nil { 1680 q.Logger().WithError(err).Error("hotplug memory") 1681 return 0, err 1682 } 1683 // if guest kernel only supports memory hotplug via probe interface, we need to get address of hot-add memory device 1684 if memDev.probe { 1685 memoryDevices, err := q.qmpMonitorCh.qmp.ExecQueryMemoryDevices(q.qmpMonitorCh.ctx) 1686 if err != nil { 1687 return 0, fmt.Errorf("failed to query memory devices: %v", err) 1688 } 1689 if 
len(memoryDevices) != 0 { 1690 q.Logger().WithField("addr", fmt.Sprintf("0x%x", memoryDevices[len(memoryDevices)-1].Data.Addr)).Debug("recently hot-add memory device") 1691 memDev.addr = memoryDevices[len(memoryDevices)-1].Data.Addr 1692 } else { 1693 return 0, fmt.Errorf("failed to probe address of recently hot-add memory device, no device exists") 1694 } 1695 } 1696 q.state.HotpluggedMemory += memDev.sizeMB 1697 return memDev.sizeMB, nil 1698 } 1699 1700 func (q *qemu) pauseSandbox() error { 1701 span, _ := q.trace("pauseSandbox") 1702 defer span.Finish() 1703 1704 return q.togglePauseSandbox(true) 1705 } 1706 1707 func (q *qemu) resumeSandbox() error { 1708 span, _ := q.trace("resumeSandbox") 1709 defer span.Finish() 1710 1711 return q.togglePauseSandbox(false) 1712 } 1713 1714 // addDevice will add extra devices to Qemu command line. 1715 func (q *qemu) addDevice(devInfo interface{}, devType deviceType) error { 1716 var err error 1717 span, _ := q.trace("addDevice") 1718 defer span.Finish() 1719 1720 switch v := devInfo.(type) { 1721 case types.Volume: 1722 if q.config.SharedFS == config.VirtioFS { 1723 q.Logger().WithField("volume-type", "virtio-fs").Info("adding volume") 1724 1725 var randBytes []byte 1726 randBytes, err = utils.GenerateRandomBytes(8) 1727 if err != nil { 1728 return err 1729 } 1730 id := hex.EncodeToString(randBytes) 1731 1732 var sockPath string 1733 sockPath, err = q.vhostFSSocketPath(q.id) 1734 if err != nil { 1735 return err 1736 } 1737 1738 vhostDev := config.VhostUserDeviceAttrs{ 1739 Tag: v.MountTag, 1740 Type: config.VhostUserFS, 1741 CacheSize: q.config.VirtioFSCacheSize, 1742 Cache: q.config.VirtioFSCache, 1743 } 1744 vhostDev.SocketPath = sockPath 1745 vhostDev.DevID = id 1746 1747 q.qemuConfig.Devices, err = q.arch.appendVhostUserDevice(q.qemuConfig.Devices, vhostDev) 1748 } else { 1749 q.Logger().WithField("volume-type", "virtio-9p").Info("adding volume") 1750 q.qemuConfig.Devices, err = 
q.arch.append9PVolume(q.qemuConfig.Devices, v) 1751 } 1752 case types.Socket: 1753 q.qemuConfig.Devices = q.arch.appendSocket(q.qemuConfig.Devices, v) 1754 case types.VSock: 1755 q.fds = append(q.fds, v.VhostFd) 1756 q.qemuConfig.Devices, err = q.arch.appendVSock(q.qemuConfig.Devices, v) 1757 case Endpoint: 1758 q.qemuConfig.Devices, err = q.arch.appendNetwork(q.qemuConfig.Devices, v) 1759 case config.BlockDrive: 1760 q.qemuConfig.Devices, err = q.arch.appendBlockDevice(q.qemuConfig.Devices, v) 1761 case config.VhostUserDeviceAttrs: 1762 q.qemuConfig.Devices, err = q.arch.appendVhostUserDevice(q.qemuConfig.Devices, v) 1763 case config.VFIODev: 1764 q.qemuConfig.Devices = q.arch.appendVFIODevice(q.qemuConfig.Devices, v) 1765 default: 1766 break 1767 } 1768 1769 return err 1770 } 1771 1772 // getSandboxConsole builds the path of the console where we can read 1773 // logs coming from the sandbox. 1774 func (q *qemu) getSandboxConsole(id string) (string, error) { 1775 span, _ := q.trace("getSandboxConsole") 1776 defer span.Finish() 1777 1778 return utils.BuildSocketPath(q.store.RunVMStoragePath(), id, consoleSocket) 1779 } 1780 1781 func (q *qemu) saveSandbox() error { 1782 q.Logger().Info("save sandbox") 1783 1784 err := q.qmpSetup() 1785 if err != nil { 1786 return err 1787 } 1788 1789 // BootToBeTemplate sets the VM to be a template that other VMs can clone from. We would want to 1790 // bypass shared memory when saving the VM to a local file through migration exec. 
1791 if q.config.BootToBeTemplate { 1792 err := q.arch.setIgnoreSharedMemoryMigrationCaps(q.qmpMonitorCh.ctx, q.qmpMonitorCh.qmp) 1793 if err != nil { 1794 q.Logger().WithError(err).Error("set migration ignore shared memory") 1795 return err 1796 } 1797 } 1798 1799 err = q.qmpMonitorCh.qmp.ExecSetMigrateArguments(q.qmpMonitorCh.ctx, fmt.Sprintf("%s>%s", qmpExecCatCmd, q.config.DevicesStatePath)) 1800 if err != nil { 1801 q.Logger().WithError(err).Error("exec migration") 1802 return err 1803 } 1804 1805 return q.waitMigration() 1806 } 1807 1808 func (q *qemu) waitMigration() error { 1809 t := time.NewTimer(qmpMigrationWaitTimeout) 1810 defer t.Stop() 1811 for { 1812 status, err := q.qmpMonitorCh.qmp.ExecuteQueryMigration(q.qmpMonitorCh.ctx) 1813 if err != nil { 1814 q.Logger().WithError(err).Error("failed to query migration status") 1815 return err 1816 } 1817 if status.Status == "completed" { 1818 break 1819 } 1820 1821 select { 1822 case <-t.C: 1823 q.Logger().WithField("migration-status", status).Error("timeout waiting for qemu migration") 1824 return fmt.Errorf("timed out after %d seconds waiting for qemu migration", qmpMigrationWaitTimeout) 1825 default: 1826 // migration in progress 1827 q.Logger().WithField("migration-status", status).Debug("migration in progress") 1828 time.Sleep(100 * time.Millisecond) 1829 } 1830 } 1831 1832 return nil 1833 } 1834 1835 func (q *qemu) disconnect() { 1836 span, _ := q.trace("disconnect") 1837 defer span.Finish() 1838 1839 q.qmpShutdown() 1840 } 1841 1842 // resizeMemory get a request to update the VM memory to reqMemMB 1843 // Memory update is managed with two approaches 1844 // Add memory to VM: 1845 // When memory is required to be added we hotplug memory 1846 // Remove Memory from VM/ Return memory to host. 1847 // 1848 // Memory unplug can be slow and it cannot be guaranteed. 1849 // Additionally, the unplug has not small granularly it has to be 1850 // the memory to remove has to be at least the size of one slot. 
1851 // To return memory back we are resizing the VM memory balloon. 1852 // A longer term solution is evaluate solutions like virtio-mem 1853 func (q *qemu) resizeMemory(reqMemMB uint32, memoryBlockSizeMB uint32, probe bool) (uint32, memoryDevice, error) { 1854 1855 currentMemory := q.config.MemorySize + uint32(q.state.HotpluggedMemory) 1856 err := q.qmpSetup() 1857 if err != nil { 1858 return 0, memoryDevice{}, err 1859 } 1860 var addMemDevice memoryDevice 1861 if q.config.VirtioMem && currentMemory != reqMemMB { 1862 q.Logger().WithField("hotplug", "memory").Debugf("resize memory from %dMB to %dMB", currentMemory, reqMemMB) 1863 sizeByte := (reqMemMB - q.config.MemorySize) * 1024 * 1024 1864 err = q.qmpMonitorCh.qmp.ExecQomSet(q.qmpMonitorCh.ctx, "virtiomem0", "requested-size", uint64(sizeByte)) 1865 if err != nil { 1866 return 0, memoryDevice{}, err 1867 } 1868 q.state.HotpluggedMemory = int(sizeByte / 1024 / 1024) 1869 return reqMemMB, memoryDevice{}, nil 1870 } 1871 1872 switch { 1873 case currentMemory < reqMemMB: 1874 //hotplug 1875 addMemMB := reqMemMB - currentMemory 1876 memHotplugMB, err := calcHotplugMemMiBSize(addMemMB, memoryBlockSizeMB) 1877 if err != nil { 1878 return currentMemory, memoryDevice{}, err 1879 } 1880 1881 addMemDevice.sizeMB = int(memHotplugMB) 1882 addMemDevice.probe = probe 1883 1884 data, err := q.hotplugAddDevice(&addMemDevice, memoryDev) 1885 if err != nil { 1886 return currentMemory, addMemDevice, err 1887 } 1888 memoryAdded, ok := data.(int) 1889 if !ok { 1890 return currentMemory, addMemDevice, fmt.Errorf("Could not get the memory added, got %+v", data) 1891 } 1892 currentMemory += uint32(memoryAdded) 1893 case currentMemory > reqMemMB: 1894 //hotunplug 1895 addMemMB := currentMemory - reqMemMB 1896 memHotunplugMB, err := calcHotplugMemMiBSize(addMemMB, memoryBlockSizeMB) 1897 if err != nil { 1898 return currentMemory, memoryDevice{}, err 1899 } 1900 1901 addMemDevice.sizeMB = int(memHotunplugMB) 1902 addMemDevice.probe = 
probe 1903 1904 data, err := q.hotplugRemoveDevice(&addMemDevice, memoryDev) 1905 if err != nil { 1906 return currentMemory, addMemDevice, err 1907 } 1908 memoryRemoved, ok := data.(int) 1909 if !ok { 1910 return currentMemory, addMemDevice, fmt.Errorf("Could not get the memory removed, got %+v", data) 1911 } 1912 //FIXME: This is to check memory hotplugRemoveDevice reported 0, as this is not supported. 1913 // In the future if this is implemented this validation should be removed. 1914 if memoryRemoved != 0 { 1915 return currentMemory, addMemDevice, fmt.Errorf("memory hot unplug is not supported, something went wrong") 1916 } 1917 currentMemory -= uint32(memoryRemoved) 1918 } 1919 1920 // currentMemory is the current memory (updated) of the VM, return to caller to allow verify 1921 // the current VM memory state. 1922 return currentMemory, addMemDevice, nil 1923 } 1924 1925 // genericAppendBridges appends to devices the given bridges 1926 // nolint: unused, deadcode 1927 func genericAppendBridges(devices []govmmQemu.Device, bridges []types.Bridge, machineType string) []govmmQemu.Device { 1928 bus := defaultPCBridgeBus 1929 switch machineType { 1930 case QemuQ35, QemuVirt: 1931 bus = defaultBridgeBus 1932 } 1933 1934 for idx, b := range bridges { 1935 t := govmmQemu.PCIBridge 1936 if b.Type == types.PCIE { 1937 t = govmmQemu.PCIEBridge 1938 } 1939 if b.Type == types.CCW { 1940 continue 1941 } 1942 1943 bridges[idx].Addr = bridgePCIStartAddr + idx 1944 1945 devices = append(devices, 1946 govmmQemu.BridgeDevice{ 1947 Type: t, 1948 Bus: bus, 1949 ID: b.ID, 1950 // Each bridge is required to be assigned a unique chassis id > 0 1951 Chassis: idx + 1, 1952 SHPC: true, 1953 Addr: strconv.FormatInt(int64(bridges[idx].Addr), 10), 1954 }, 1955 ) 1956 } 1957 1958 return devices 1959 } 1960 1961 func genericBridges(number uint32, machineType string) []types.Bridge { 1962 var bridges []types.Bridge 1963 var bt types.Type 1964 1965 switch machineType { 1966 case QemuQ35: 1967 // 
currently only pci bridges are supported 1968 // qemu-2.10 will introduce pcie bridges 1969 fallthrough 1970 case QemuPC: 1971 bt = types.PCI 1972 case QemuVirt: 1973 bt = types.PCIE 1974 case QemuPseries: 1975 bt = types.PCI 1976 case QemuCCWVirtio: 1977 bt = types.CCW 1978 default: 1979 return nil 1980 } 1981 1982 for i := uint32(0); i < number; i++ { 1983 bridges = append(bridges, types.NewBridge(bt, fmt.Sprintf("%s-bridge-%d", bt, i), make(map[uint32]string), 0)) 1984 } 1985 1986 return bridges 1987 } 1988 1989 // nolint: unused, deadcode 1990 func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8, memoryOffset uint32) govmmQemu.Memory { 1991 // image NVDIMM device needs memory space 1024MB 1992 // See https://github.com/clearcontainers/runtime/issues/380 1993 memoryOffset += 1024 1994 1995 memMax := fmt.Sprintf("%dM", hostMemoryMb+uint64(memoryOffset)) 1996 1997 mem := fmt.Sprintf("%dM", memoryMb) 1998 1999 memory := govmmQemu.Memory{ 2000 Size: mem, 2001 Slots: slots, 2002 MaxMem: memMax, 2003 } 2004 2005 return memory 2006 } 2007 2008 // genericAppendPCIeRootPort appends to devices the given pcie-root-port 2009 func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machineType string) []govmmQemu.Device { 2010 var ( 2011 bus string 2012 chassis string 2013 multiFunction bool 2014 addr string 2015 ) 2016 switch machineType { 2017 case QemuQ35: 2018 bus = defaultBridgeBus 2019 chassis = "0" 2020 multiFunction = false 2021 addr = "0" 2022 default: 2023 return devices 2024 } 2025 2026 for i := uint32(0); i < number; i++ { 2027 devices = append(devices, 2028 govmmQemu.PCIeRootPortDevice{ 2029 ID: fmt.Sprintf("%s%d", pcieRootPortPrefix, i), 2030 Bus: bus, 2031 Chassis: chassis, 2032 Slot: strconv.FormatUint(uint64(i), 10), 2033 Multifunction: multiFunction, 2034 Addr: addr, 2035 }, 2036 ) 2037 } 2038 return devices 2039 } 2040 2041 func (q *qemu) getThreadIDs() (vcpuThreadIDs, error) { 2042 span, _ := q.trace("getThreadIDs") 2043 
// calcHotplugMemMiBSize rounds mem up to the next multiple of
// memorySectionSizeMB. A section size of 0 means "no alignment" and mem is
// returned as is. The returned error is always nil.
func calcHotplugMemMiBSize(mem uint32, memorySectionSizeMB uint32) (uint32, error) {
	aligned := mem

	if memorySectionSizeMB != 0 {
		// TODO: hot add memory aligned to memory section should be more properly. See https://github.com/kata-containers/runtime/pull/624#issuecomment-419656853
		sections := math.Ceil(float64(mem) / float64(memorySectionSizeMB))
		aligned = uint32(sections) * memorySectionSizeMB
	}

	return aligned, nil
}
currentVCPUs, newVCPUs, nil 2106 } 2107 2108 func (q *qemu) cleanup() error { 2109 span, _ := q.trace("cleanup") 2110 defer span.Finish() 2111 2112 for _, fd := range q.fds { 2113 if err := fd.Close(); err != nil { 2114 q.Logger().WithError(err).Warn("failed closing fd") 2115 } 2116 } 2117 q.fds = []*os.File{} 2118 2119 return nil 2120 } 2121 2122 func (q *qemu) getPids() []int { 2123 data, err := ioutil.ReadFile(q.qemuConfig.PidFile) 2124 if err != nil { 2125 q.Logger().WithError(err).Error("Could not read qemu pid file") 2126 return []int{0} 2127 } 2128 2129 pid, err := strconv.Atoi(strings.Trim(string(data), "\n\t ")) 2130 if err != nil { 2131 q.Logger().WithError(err).Error("Could not convert string to int") 2132 return []int{0} 2133 } 2134 2135 var pids []int 2136 pids = append(pids, pid) 2137 if q.state.VirtiofsdPid != 0 { 2138 pids = append(pids, q.state.VirtiofsdPid) 2139 } 2140 2141 return pids 2142 } 2143 2144 type qemuGrpc struct { 2145 ID string 2146 QmpChannelpath string 2147 State QemuState 2148 NvdimmCount int 2149 2150 // Most members of q.qemuConfig are just to generate 2151 // q.qemuConfig.qemuParams that is used by LaunchQemu except 2152 // q.qemuConfig.SMP. 2153 // So just transport q.qemuConfig.SMP from VM Cache server to runtime. 
2154 QemuSMP govmmQemu.SMP 2155 } 2156 2157 func (q *qemu) fromGrpc(ctx context.Context, hypervisorConfig *HypervisorConfig, j []byte) error { 2158 var qp qemuGrpc 2159 err := json.Unmarshal(j, &qp) 2160 if err != nil { 2161 return err 2162 } 2163 2164 q.id = qp.ID 2165 q.config = *hypervisorConfig 2166 q.qmpMonitorCh.ctx = ctx 2167 q.qmpMonitorCh.path = qp.QmpChannelpath 2168 q.qemuConfig.Ctx = ctx 2169 q.state = qp.State 2170 q.arch = newQemuArch(q.config) 2171 q.ctx = ctx 2172 q.nvdimmCount = qp.NvdimmCount 2173 2174 q.qemuConfig.SMP = qp.QemuSMP 2175 2176 q.arch.setBridges(q.state.Bridges) 2177 return nil 2178 } 2179 2180 func (q *qemu) toGrpc() ([]byte, error) { 2181 q.qmpShutdown() 2182 2183 q.cleanup() 2184 qp := qemuGrpc{ 2185 ID: q.id, 2186 QmpChannelpath: q.qmpMonitorCh.path, 2187 State: q.state, 2188 NvdimmCount: q.nvdimmCount, 2189 2190 QemuSMP: q.qemuConfig.SMP, 2191 } 2192 2193 return json.Marshal(&qp) 2194 } 2195 2196 func (q *qemu) save() (s persistapi.HypervisorState) { 2197 pids := q.getPids() 2198 if len(pids) != 0 { 2199 s.Pid = pids[0] 2200 } 2201 s.VirtiofsdPid = q.state.VirtiofsdPid 2202 s.Type = string(QemuHypervisor) 2203 s.UUID = q.state.UUID 2204 s.HotpluggedMemory = q.state.HotpluggedMemory 2205 s.HotplugVFIOOnRootBus = q.state.HotplugVFIOOnRootBus 2206 s.PCIeRootPort = q.state.PCIeRootPort 2207 2208 for _, bridge := range q.arch.getBridges() { 2209 s.Bridges = append(s.Bridges, persistapi.Bridge{ 2210 DeviceAddr: bridge.Devices, 2211 Type: string(bridge.Type), 2212 ID: bridge.ID, 2213 Addr: bridge.Addr, 2214 }) 2215 } 2216 2217 for _, cpu := range q.state.HotpluggedVCPUs { 2218 s.HotpluggedVCPUs = append(s.HotpluggedVCPUs, persistapi.CPUDevice{ 2219 ID: cpu.ID, 2220 }) 2221 } 2222 return 2223 } 2224 2225 func (q *qemu) load(s persistapi.HypervisorState) { 2226 q.state.UUID = s.UUID 2227 q.state.HotpluggedMemory = s.HotpluggedMemory 2228 q.state.HotplugVFIOOnRootBus = s.HotplugVFIOOnRootBus 2229 q.state.VirtiofsdPid = s.VirtiofsdPid 2230 
q.state.PCIeRootPort = s.PCIeRootPort 2231 2232 for _, bridge := range s.Bridges { 2233 q.state.Bridges = append(q.state.Bridges, types.NewBridge(types.Type(bridge.Type), bridge.ID, bridge.DeviceAddr, bridge.Addr)) 2234 } 2235 2236 for _, cpu := range s.HotpluggedVCPUs { 2237 q.state.HotpluggedVCPUs = append(q.state.HotpluggedVCPUs, CPUDevice{ 2238 ID: cpu.ID, 2239 }) 2240 } 2241 } 2242 2243 func (q *qemu) check() error { 2244 err := q.qmpSetup() 2245 if err != nil { 2246 return err 2247 } 2248 2249 status, err := q.qmpMonitorCh.qmp.ExecuteQueryStatus(q.qmpMonitorCh.ctx) 2250 if err != nil { 2251 return err 2252 } 2253 2254 if status.Status == "internal-error" || status.Status == "guest-panicked" { 2255 return errors.Errorf("guest failure: %s", status.Status) 2256 } 2257 2258 return nil 2259 } 2260 2261 func (q *qemu) generateSocket(id string, useVsock bool) (interface{}, error) { 2262 return generateVMSocket(id, useVsock, q.store.RunVMStoragePath()) 2263 }