gitee.com/leisunstar/runtime@v0.0.0-20200521203717-5cef3e7b53f9/virtcontainers/qemu.go (about)

     1  // Copyright (c) 2016 Intel Corporation
     2  //
     3  // SPDX-License-Identifier: Apache-2.0
     4  //
     5  
     6  package virtcontainers
     7  
     8  import (
     9  	"bufio"
    10  	"context"
    11  	"encoding/hex"
    12  	"encoding/json"
    13  	"fmt"
    14  	"io/ioutil"
    15  	"math"
    16  	"net"
    17  	"os"
    18  	"os/exec"
    19  	"path/filepath"
    20  	"strconv"
    21  	"strings"
    22  	"sync"
    23  	"syscall"
    24  	"time"
    25  	"unsafe"
    26  
    27  	govmmQemu "github.com/intel/govmm/qemu"
    28  	"github.com/opencontainers/selinux/go-selinux/label"
    29  	"github.com/opentracing/opentracing-go"
    30  	"github.com/pkg/errors"
    31  	"github.com/sirupsen/logrus"
    32  	"golang.org/x/sys/unix"
    33  
    34  	"github.com/kata-containers/runtime/virtcontainers/device/config"
    35  	persistapi "github.com/kata-containers/runtime/virtcontainers/persist/api"
    36  	"github.com/kata-containers/runtime/virtcontainers/pkg/uuid"
    37  	"github.com/kata-containers/runtime/virtcontainers/types"
    38  	"github.com/kata-containers/runtime/virtcontainers/utils"
    39  )
    40  
// romFile is the file name of the ROM that can be used for virtio-pci devices.
// If this file name is empty, this means we expect the firmware used by Qemu,
// such as SeaBIOS or OVMF for instance, to handle this directly.
const romFile = ""

// disable-modern is an option to QEMU that will fall back to using the 0.9
// version of virtio. Since moving to QEMU 4.0, we can start using the virtio
// 1.0 version. The default value is false.
const defaultDisableModern = false
    50  
// qmpChannel groups the state of one QMP monitor connection: the context
// passed to QMP calls, the path of the monitor socket, the govmm QMP handle
// and the channel handed to QMPStart when the connection is opened.
// The embedded mutex is taken by qmpSetup while it checks for, or creates,
// the connection.
type qmpChannel struct {
	sync.Mutex
	ctx     context.Context
	path    string
	qmp     *govmmQemu.QMP
	disconn chan struct{}
}
    58  
// CPUDevice represents a CPU device which was hot-added in a running VM.
// Values of this type are recorded in QemuState.HotpluggedVCPUs.
type CPUDevice struct {
	// ID is used to identify this CPU in the hypervisor options.
	ID string
}
    64  
// QemuState keeps Qemu's state.
//
// Within this file, setup generates UUID and copies HotplugVFIOOnRootBus and
// PCIeRootPort from the hypervisor configuration when no UUID exists yet,
// hands Bridges to the architecture layer, and setupVirtiofsd records the
// pid of the virtiofsd process it starts in VirtiofsdPid.
// NOTE(review): HotpluggedMemory is not written in the visible part of the
// file; its unit is presumably MB — confirm against the hotplug code.
type QemuState struct {
	Bridges []types.Bridge
	// HotpluggedVCPUs is the list of CPUs that were hot-added
	HotpluggedVCPUs      []CPUDevice
	HotpluggedMemory     int
	UUID                 string
	HotplugVFIOOnRootBus bool
	VirtiofsdPid         int
	PCIeRootPort         int
}
    76  
// qemu is the Hypervisor interface implementation for the Linux qemu
// hypervisor. It carries the hypervisor configuration it was set up with, the
// govmm configuration assembled by createSandbox, the QMP monitor channel and
// the QemuState.
type qemu struct {
	// id is the sandbox identifier assigned in setup.
	id string

	// config is a copy of the HypervisorConfig passed to setup.
	config HypervisorConfig

	qmpMonitorCh qmpChannel

	// qemuConfig is the govmm configuration built by createSandbox and
	// consumed by startSandbox when launching QEMU.
	qemuConfig govmmQemu.Config

	state QemuState

	arch qemuArch

	// fds is a list of file descriptors inherited by QEMU process
	// they'll be closed once QEMU process is running
	fds []*os.File

	// ctx is the tracing context saved by createSandbox.
	ctx context.Context

	// nvdimmCount is set by setup: 1 when booting from an image (no initrd)
	// with image nvdimm not disabled, 0 otherwise.
	nvdimmCount int

	// stopped is set once stopSandbox has run its cleanup.
	stopped bool

	// store provides the run-time storage paths used for sockets, the pid
	// file and the VM directory.
	store persistapi.PersistDriver
}
   103  
const (
	// File names of the per-VM unix sockets. qmpSocket and vhostFSSocket are
	// turned into full paths by qmpSocketPath and vhostFSSocketPath.
	consoleSocket = "console.sock"
	qmpSocket     = "qmp.sock"
	vhostFSSocket = "vhost-fs.sock"

	// qmpCapErrMsg is logged by waitSandbox when QMP capability negotiation
	// fails.
	qmpCapErrMsg  = "Failed to negoatiate QMP capabilities"
	qmpExecCatCmd = "exec:cat"

	// rngID is the ID given to the RNG device added in createSandbox;
	// vsockKernelOption is the kernel parameter telling the agent whether
	// vsock is used; fallbackFileBackedMemDir is used by setupFileBackedMem
	// when no FileBackedMemRootDir is configured.
	scsiControllerID         = "scsi0"
	rngID                    = "rng0"
	vsockKernelOption        = "agent.use_vsock"
	fallbackFileBackedMemDir = "/dev/shm"
)

// qemuMajorVersion and qemuMinorVersion hold the QEMU version reported over
// QMP; they are assigned by waitSandbox once the monitor connection is up.
var qemuMajorVersion int
var qemuMinorVersion int

// agnostic list of kernel parameters
var defaultKernelParameters = []Param{
	{"panic", "1"},
}
   125  
   126  type qmpLogger struct {
   127  	logger *logrus.Entry
   128  }
   129  
   130  func newQMPLogger() qmpLogger {
   131  	return qmpLogger{
   132  		logger: virtLog.WithField("subsystem", "qmp"),
   133  	}
   134  }
   135  
   136  func (l qmpLogger) V(level int32) bool {
   137  	return level != 0
   138  }
   139  
   140  func (l qmpLogger) Infof(format string, v ...interface{}) {
   141  	l.logger.Infof(format, v...)
   142  }
   143  
   144  func (l qmpLogger) Warningf(format string, v ...interface{}) {
   145  	l.logger.Warnf(format, v...)
   146  }
   147  
   148  func (l qmpLogger) Errorf(format string, v ...interface{}) {
   149  	l.logger.Errorf(format, v...)
   150  }
   151  
   152  // Logger returns a logrus logger appropriate for logging qemu messages
   153  func (q *qemu) Logger() *logrus.Entry {
   154  	return virtLog.WithField("subsystem", "qemu")
   155  }
   156  
   157  func (q *qemu) kernelParameters() string {
   158  	// get a list of arch kernel parameters
   159  	params := q.arch.kernelParameters(q.config.Debug)
   160  
   161  	// use default parameters
   162  	params = append(params, defaultKernelParameters...)
   163  
   164  	// set the maximum number of vCPUs
   165  	params = append(params, Param{"nr_cpus", fmt.Sprintf("%d", q.config.DefaultMaxVCPUs)})
   166  
   167  	// Add a kernel param to indicate if vsock is being used.
   168  	// This will be consumed by the agent to determine if it needs to listen on
   169  	// a serial or vsock channel
   170  	params = append(params, Param{vsockKernelOption, strconv.FormatBool(q.config.UseVSock)})
   171  
   172  	// add the params specified by the provided config. As the kernel
   173  	// honours the last parameter value set and since the config-provided
   174  	// params are added here, they will take priority over the defaults.
   175  	params = append(params, q.config.KernelParams...)
   176  
   177  	paramsStr := SerializeParams(params, "=")
   178  
   179  	return strings.Join(paramsStr, " ")
   180  }
   181  
   182  // Adds all capabilities supported by qemu implementation of hypervisor interface
   183  func (q *qemu) capabilities() types.Capabilities {
   184  	span, _ := q.trace("capabilities")
   185  	defer span.Finish()
   186  
   187  	return q.arch.capabilities()
   188  }
   189  
   190  func (q *qemu) hypervisorConfig() HypervisorConfig {
   191  	return q.config
   192  }
   193  
   194  // get the QEMU binary path
   195  func (q *qemu) qemuPath() (string, error) {
   196  	p, err := q.config.HypervisorAssetPath()
   197  	if err != nil {
   198  		return "", err
   199  	}
   200  
   201  	if p == "" {
   202  		p, err = q.arch.qemuPath()
   203  		if err != nil {
   204  			return "", err
   205  		}
   206  	}
   207  
   208  	if _, err = os.Stat(p); os.IsNotExist(err) {
   209  		return "", fmt.Errorf("QEMU path (%s) does not exist", p)
   210  	}
   211  
   212  	return p, nil
   213  }
   214  
   215  func (q *qemu) trace(name string) (opentracing.Span, context.Context) {
   216  	if q.ctx == nil {
   217  		q.Logger().WithField("type", "bug").Error("trace called before context set")
   218  		q.ctx = context.Background()
   219  	}
   220  
   221  	span, ctx := opentracing.StartSpanFromContext(q.ctx, name)
   222  
   223  	span.SetTag("subsystem", "hypervisor")
   224  	span.SetTag("type", "qemu")
   225  
   226  	return span, ctx
   227  }
   228  
   229  // setup sets the Qemu structure up.
   230  func (q *qemu) setup(id string, hypervisorConfig *HypervisorConfig) error {
   231  	span, _ := q.trace("setup")
   232  	defer span.Finish()
   233  
   234  	err := hypervisorConfig.valid()
   235  	if err != nil {
   236  		return err
   237  	}
   238  
   239  	q.id = id
   240  	q.config = *hypervisorConfig
   241  	q.arch = newQemuArch(q.config)
   242  
   243  	initrdPath, err := q.config.InitrdAssetPath()
   244  	if err != nil {
   245  		return err
   246  	}
   247  	imagePath, err := q.config.ImageAssetPath()
   248  	if err != nil {
   249  		return err
   250  	}
   251  	if initrdPath == "" && imagePath != "" && !q.config.DisableImageNvdimm {
   252  		q.nvdimmCount = 1
   253  	} else {
   254  		q.nvdimmCount = 0
   255  	}
   256  
   257  	var create bool
   258  	if q.state.UUID == "" {
   259  		create = true
   260  	}
   261  
   262  	q.arch.setBridges(q.state.Bridges)
   263  
   264  	if create {
   265  		q.Logger().Debug("Creating bridges")
   266  		q.arch.bridges(q.config.DefaultBridges)
   267  
   268  		q.Logger().Debug("Creating UUID")
   269  		q.state.UUID = uuid.Generate().String()
   270  
   271  		q.state.HotplugVFIOOnRootBus = q.config.HotplugVFIOOnRootBus
   272  		q.state.PCIeRootPort = int(q.config.PCIeRootPort)
   273  
   274  		// The path might already exist, but in case of VM templating,
   275  		// we have to create it since the sandbox has not created it yet.
   276  		if err = os.MkdirAll(filepath.Join(q.store.RunStoragePath(), id), DirMode); err != nil {
   277  			return err
   278  		}
   279  	}
   280  
   281  	nested, err := RunningOnVMM(procCPUInfo)
   282  	if err != nil {
   283  		return err
   284  	}
   285  
   286  	if !q.config.DisableNestingChecks && nested {
   287  		q.arch.enableNestingChecks()
   288  	} else {
   289  		q.Logger().WithField("inside-vm", fmt.Sprintf("%t", nested)).Debug("Disable nesting environment checks")
   290  		q.arch.disableNestingChecks()
   291  	}
   292  
   293  	if !q.config.DisableVhostNet {
   294  		q.arch.enableVhostNet()
   295  	} else {
   296  		q.Logger().Debug("Disable vhost_net")
   297  		q.arch.disableVhostNet()
   298  	}
   299  
   300  	return nil
   301  }
   302  
   303  func (q *qemu) cpuTopology() govmmQemu.SMP {
   304  	return q.arch.cpuTopology(q.config.NumVCPUs, q.config.DefaultMaxVCPUs)
   305  }
   306  
   307  func (q *qemu) hostMemMB() (uint64, error) {
   308  	hostMemKb, err := getHostMemorySizeKb(procMemInfo)
   309  	if err != nil {
   310  		return 0, fmt.Errorf("Unable to read memory info: %s", err)
   311  	}
   312  	if hostMemKb == 0 {
   313  		return 0, fmt.Errorf("Error host memory size 0")
   314  	}
   315  
   316  	return hostMemKb / 1024, nil
   317  }
   318  
   319  func (q *qemu) memoryTopology() (govmmQemu.Memory, error) {
   320  	hostMemMb, err := q.hostMemMB()
   321  	if err != nil {
   322  		return govmmQemu.Memory{}, err
   323  	}
   324  
   325  	memMb := uint64(q.config.MemorySize)
   326  
   327  	return q.arch.memoryTopology(memMb, hostMemMb, uint8(q.config.MemSlots)), nil
   328  }
   329  
   330  func (q *qemu) qmpSocketPath(id string) (string, error) {
   331  	return utils.BuildSocketPath(q.store.RunVMStoragePath(), id, qmpSocket)
   332  }
   333  
   334  func (q *qemu) getQemuMachine() (govmmQemu.Machine, error) {
   335  	machine, err := q.arch.machine()
   336  	if err != nil {
   337  		return govmmQemu.Machine{}, err
   338  	}
   339  
   340  	accelerators := q.config.MachineAccelerators
   341  	if accelerators != "" {
   342  		if !strings.HasPrefix(accelerators, ",") {
   343  			accelerators = fmt.Sprintf(",%s", accelerators)
   344  		}
   345  		machine.Options += accelerators
   346  	}
   347  
   348  	return machine, nil
   349  }
   350  
   351  func (q *qemu) appendImage(devices []govmmQemu.Device) ([]govmmQemu.Device, error) {
   352  	imagePath, err := q.config.ImageAssetPath()
   353  	if err != nil {
   354  		return nil, err
   355  	}
   356  
   357  	if imagePath != "" {
   358  		devices, err = q.arch.appendImage(devices, imagePath)
   359  		if err != nil {
   360  			return nil, err
   361  		}
   362  	}
   363  
   364  	return devices, nil
   365  }
   366  
   367  func (q *qemu) createQmpSocket() ([]govmmQemu.QMPSocket, error) {
   368  	monitorSockPath, err := q.qmpSocketPath(q.id)
   369  	if err != nil {
   370  		return nil, err
   371  	}
   372  
   373  	q.qmpMonitorCh = qmpChannel{
   374  		ctx:  q.ctx,
   375  		path: monitorSockPath,
   376  	}
   377  
   378  	return []govmmQemu.QMPSocket{
   379  		{
   380  			Type:   "unix",
   381  			Name:   q.qmpMonitorCh.path,
   382  			Server: true,
   383  			NoWait: true,
   384  		},
   385  	}, nil
   386  }
   387  
   388  func (q *qemu) buildDevices(initrdPath string) ([]govmmQemu.Device, *govmmQemu.IOThread, error) {
   389  	var devices []govmmQemu.Device
   390  
   391  	console, err := q.getSandboxConsole(q.id)
   392  	if err != nil {
   393  		return nil, nil, err
   394  	}
   395  
   396  	// Add bridges before any other devices. This way we make sure that
   397  	// bridge gets the first available PCI address i.e bridgePCIStartAddr
   398  	devices = q.arch.appendBridges(devices)
   399  
   400  	devices, err = q.arch.appendConsole(devices, console)
   401  	if err != nil {
   402  		return nil, nil, err
   403  	}
   404  
   405  	if initrdPath == "" {
   406  		devices, err = q.appendImage(devices)
   407  		if err != nil {
   408  			return nil, nil, err
   409  		}
   410  	}
   411  
   412  	var ioThread *govmmQemu.IOThread
   413  	if q.config.BlockDeviceDriver == config.VirtioSCSI {
   414  		return q.arch.appendSCSIController(devices, q.config.EnableIOThreads)
   415  	}
   416  
   417  	return devices, ioThread, nil
   418  
   419  }
   420  
   421  func (q *qemu) setupTemplate(knobs *govmmQemu.Knobs, memory *govmmQemu.Memory) govmmQemu.Incoming {
   422  	incoming := govmmQemu.Incoming{}
   423  
   424  	if q.config.BootToBeTemplate || q.config.BootFromTemplate {
   425  		knobs.FileBackedMem = true
   426  		memory.Path = q.config.MemoryPath
   427  
   428  		if q.config.BootToBeTemplate {
   429  			knobs.MemShared = true
   430  		}
   431  
   432  		if q.config.BootFromTemplate {
   433  			incoming.MigrationType = govmmQemu.MigrationDefer
   434  		}
   435  	}
   436  
   437  	return incoming
   438  }
   439  
   440  func (q *qemu) setupFileBackedMem(knobs *govmmQemu.Knobs, memory *govmmQemu.Memory) {
   441  	var target string
   442  	if q.config.FileBackedMemRootDir != "" {
   443  		target = q.config.FileBackedMemRootDir
   444  	} else {
   445  		target = fallbackFileBackedMemDir
   446  	}
   447  	if _, err := os.Stat(target); err != nil {
   448  		q.Logger().WithError(err).Error("File backed memory location does not exist")
   449  		return
   450  	}
   451  
   452  	knobs.FileBackedMem = true
   453  	knobs.MemShared = true
   454  	memory.Path = target
   455  }
   456  
   457  // createSandbox is the Hypervisor sandbox creation implementation for govmmQemu.
   458  func (q *qemu) createSandbox(ctx context.Context, id string, networkNS NetworkNamespace, hypervisorConfig *HypervisorConfig, stateful bool) error {
   459  	// Save the tracing context
   460  	q.ctx = ctx
   461  
   462  	span, _ := q.trace("createSandbox")
   463  	defer span.Finish()
   464  
   465  	if err := q.setup(id, hypervisorConfig); err != nil {
   466  		return err
   467  	}
   468  
   469  	machine, err := q.getQemuMachine()
   470  	if err != nil {
   471  		return err
   472  	}
   473  
   474  	smp := q.cpuTopology()
   475  
   476  	memory, err := q.memoryTopology()
   477  	if err != nil {
   478  		return err
   479  	}
   480  
   481  	knobs := govmmQemu.Knobs{
   482  		NoUserConfig: true,
   483  		NoDefaults:   true,
   484  		NoGraphic:    true,
   485  		Daemonize:    true,
   486  		MemPrealloc:  q.config.MemPrealloc,
   487  		HugePages:    q.config.HugePages,
   488  		Realtime:     q.config.Realtime,
   489  		Mlock:        q.config.Mlock,
   490  	}
   491  
   492  	kernelPath, err := q.config.KernelAssetPath()
   493  	if err != nil {
   494  		return err
   495  	}
   496  
   497  	initrdPath, err := q.config.InitrdAssetPath()
   498  	if err != nil {
   499  		return err
   500  	}
   501  
   502  	kernel := govmmQemu.Kernel{
   503  		Path:       kernelPath,
   504  		InitrdPath: initrdPath,
   505  		Params:     q.kernelParameters(),
   506  	}
   507  
   508  	incoming := q.setupTemplate(&knobs, &memory)
   509  
   510  	// With the current implementations, VM templating will not work with file
   511  	// based memory (stand-alone) or virtiofs. This is because VM templating
   512  	// builds the first VM with file-backed memory and shared=on and the
   513  	// subsequent ones with shared=off. virtio-fs always requires shared=on for
   514  	// memory.
   515  	if q.config.SharedFS == config.VirtioFS || q.config.FileBackedMemRootDir != "" {
   516  		if !(q.config.BootToBeTemplate || q.config.BootFromTemplate) {
   517  			q.setupFileBackedMem(&knobs, &memory)
   518  		} else {
   519  			return errors.New("VM templating has been enabled with either virtio-fs or file backed memory and this configuration will not work")
   520  		}
   521  		if q.config.HugePages {
   522  			knobs.MemPrealloc = true
   523  		}
   524  	}
   525  
   526  	// Vhost-user-blk/scsi process which can improve performance, like SPDK,
   527  	// requires shared-on hugepage to work with Qemu.
   528  	if q.config.EnableVhostUserStore {
   529  		if !q.config.HugePages {
   530  			return errors.New("Vhost-user-blk/scsi is enabled without HugePages. This configuration will not work")
   531  		}
   532  		knobs.MemShared = true
   533  	}
   534  
   535  	rtc := govmmQemu.RTC{
   536  		Base:     "utc",
   537  		DriftFix: "slew",
   538  	}
   539  
   540  	if q.state.UUID == "" {
   541  		return fmt.Errorf("UUID should not be empty")
   542  	}
   543  
   544  	qmpSockets, err := q.createQmpSocket()
   545  	if err != nil {
   546  		return err
   547  	}
   548  
   549  	devices, ioThread, err := q.buildDevices(initrdPath)
   550  	if err != nil {
   551  		return err
   552  	}
   553  
   554  	cpuModel := q.arch.cpuModel()
   555  	cpuModel += "," + q.config.CPUFeatures
   556  
   557  	firmwarePath, err := q.config.FirmwareAssetPath()
   558  	if err != nil {
   559  		return err
   560  	}
   561  
   562  	qemuPath, err := q.qemuPath()
   563  	if err != nil {
   564  		return err
   565  	}
   566  
   567  	qemuConfig := govmmQemu.Config{
   568  		Name:        fmt.Sprintf("sandbox-%s", q.id),
   569  		UUID:        q.state.UUID,
   570  		Path:        qemuPath,
   571  		Ctx:         q.qmpMonitorCh.ctx,
   572  		Machine:     machine,
   573  		SMP:         smp,
   574  		Memory:      memory,
   575  		Devices:     devices,
   576  		CPUModel:    cpuModel,
   577  		Kernel:      kernel,
   578  		RTC:         rtc,
   579  		QMPSockets:  qmpSockets,
   580  		Knobs:       knobs,
   581  		Incoming:    incoming,
   582  		VGA:         "none",
   583  		GlobalParam: "kvm-pit.lost_tick_policy=discard",
   584  		Bios:        firmwarePath,
   585  		PidFile:     filepath.Join(q.store.RunVMStoragePath(), q.id, "pid"),
   586  	}
   587  
   588  	if ioThread != nil {
   589  		qemuConfig.IOThreads = []govmmQemu.IOThread{*ioThread}
   590  	}
   591  	// Add RNG device to hypervisor
   592  	rngDev := config.RNGDev{
   593  		ID:       rngID,
   594  		Filename: q.config.EntropySource,
   595  	}
   596  	qemuConfig.Devices, err = q.arch.appendRNGDevice(qemuConfig.Devices, rngDev)
   597  	if err != nil {
   598  		return err
   599  	}
   600  
   601  	// Add PCIe Root Port devices to hypervisor
   602  	// The pcie.0 bus do not support hot-plug, but PCIe device can be hot-plugged into PCIe Root Port.
   603  	// For more details, please see https://github.com/qemu/qemu/blob/master/docs/pcie.txt
   604  	if hypervisorConfig.PCIeRootPort > 0 {
   605  		qemuConfig.Devices = q.arch.appendPCIeRootPortDevice(qemuConfig.Devices, hypervisorConfig.PCIeRootPort)
   606  	}
   607  
   608  	q.qemuConfig = qemuConfig
   609  
   610  	return nil
   611  }
   612  
   613  func (q *qemu) vhostFSSocketPath(id string) (string, error) {
   614  	return utils.BuildSocketPath(q.store.RunVMStoragePath(), id, vhostFSSocket)
   615  }
   616  
   617  func (q *qemu) virtiofsdArgs(fd uintptr) []string {
   618  	// The daemon will terminate when the vhost-user socket
   619  	// connection with QEMU closes.  Therefore we do not keep track
   620  	// of this child process after returning from this function.
   621  	sourcePath := filepath.Join(kataHostSharedDir(), q.id)
   622  	args := []string{
   623  		fmt.Sprintf("--fd=%v", fd),
   624  		"-o", "source=" + sourcePath,
   625  		"-o", "cache=" + q.config.VirtioFSCache,
   626  		"--syslog", "-o", "no_posix_lock"}
   627  	if q.config.Debug {
   628  		args = append(args, "-d")
   629  	} else {
   630  		args = append(args, "-f")
   631  	}
   632  
   633  	if len(q.config.VirtioFSExtraArgs) != 0 {
   634  		args = append(args, q.config.VirtioFSExtraArgs...)
   635  	}
   636  	return args
   637  }
   638  
   639  func (q *qemu) setupVirtiofsd() (err error) {
   640  	var listener *net.UnixListener
   641  	var fd *os.File
   642  
   643  	if _, err = os.Stat(q.config.VirtioFSDaemon); os.IsNotExist(err) {
   644  		return fmt.Errorf("virtiofsd path (%s) does not exist", q.config.VirtioFSDaemon)
   645  	}
   646  
   647  	sockPath, err := q.vhostFSSocketPath(q.id)
   648  	if err != nil {
   649  		return err
   650  	}
   651  
   652  	listener, err = net.ListenUnix("unix", &net.UnixAddr{
   653  		Name: sockPath,
   654  		Net:  "unix",
   655  	})
   656  	if err != nil {
   657  		return err
   658  	}
   659  	listener.SetUnlinkOnClose(false)
   660  
   661  	fd, err = listener.File()
   662  	listener.Close() // no longer needed since fd is a dup
   663  	listener = nil
   664  	if err != nil {
   665  		return err
   666  	}
   667  	defer fd.Close()
   668  
   669  	const sockFd = 3 // Cmd.ExtraFiles[] fds are numbered starting from 3
   670  	cmd := exec.Command(q.config.VirtioFSDaemon, q.virtiofsdArgs(sockFd)...)
   671  	cmd.ExtraFiles = append(cmd.ExtraFiles, fd)
   672  	stderr, err := cmd.StderrPipe()
   673  	if err != nil {
   674  		return err
   675  	}
   676  
   677  	err = cmd.Start()
   678  	if err == nil {
   679  		q.state.VirtiofsdPid = cmd.Process.Pid
   680  	}
   681  	fd.Close()
   682  
   683  	// Monitor virtiofsd's stderr and stop sandbox if virtiofsd quits
   684  	go func() {
   685  		scanner := bufio.NewScanner(stderr)
   686  		for scanner.Scan() {
   687  			q.Logger().WithField("source", "virtiofsd").Info(scanner.Text())
   688  		}
   689  		q.Logger().Info("virtiofsd quits")
   690  		// Wait to release resources of virtiofsd process
   691  		cmd.Process.Wait()
   692  		q.stopSandbox()
   693  	}()
   694  	return err
   695  }
   696  
   697  func (q *qemu) getMemArgs() (bool, string, string, error) {
   698  	share := false
   699  	target := ""
   700  	memoryBack := "memory-backend-ram"
   701  
   702  	if q.qemuConfig.Knobs.HugePages {
   703  		// we are setting all the bits that govmm sets when hugepages are enabled.
   704  		// https://github.com/intel/govmm/blob/master/qemu/qemu.go#L1677
   705  		target = "/dev/hugepages"
   706  		memoryBack = "memory-backend-file"
   707  		share = true
   708  	} else {
   709  		if q.config.EnableVhostUserStore {
   710  			// Vhost-user-blk/scsi process which can improve performance, like SPDK,
   711  			// requires shared-on hugepage to work with Qemu.
   712  			return share, target, "", fmt.Errorf("Vhost-user-blk/scsi requires hugepage memory")
   713  		}
   714  
   715  		if q.config.SharedFS == config.VirtioFS || q.config.FileBackedMemRootDir != "" {
   716  			target = q.qemuConfig.Memory.Path
   717  			memoryBack = "memory-backend-file"
   718  		}
   719  	}
   720  
   721  	if q.qemuConfig.Knobs.MemShared {
   722  		share = true
   723  	}
   724  
   725  	return share, target, memoryBack, nil
   726  }
   727  
   728  func (q *qemu) setupVirtioMem() error {
   729  	maxMem, err := q.hostMemMB()
   730  	if err != nil {
   731  		return err
   732  	}
   733  	// 1024 is size for nvdimm
   734  	sizeMB := int(maxMem) - int(q.config.MemorySize)
   735  
   736  	share, target, memoryBack, err := q.getMemArgs()
   737  	if err != nil {
   738  		return err
   739  	}
   740  
   741  	err = q.qmpSetup()
   742  	if err != nil {
   743  		return err
   744  	}
   745  	err = q.qmpMonitorCh.qmp.ExecMemdevAdd(q.qmpMonitorCh.ctx, memoryBack, "virtiomem", target, sizeMB, share, "virtio-mem-pci", "virtiomem0")
   746  	if err == nil {
   747  		q.config.VirtioMem = true
   748  		q.Logger().Infof("Setup %dMB virtio-mem-pci success", sizeMB)
   749  	} else {
   750  		help := ""
   751  		if strings.Contains(err.Error(), "Cannot allocate memory") {
   752  			help = ".  Please use command \"echo 1 > /proc/sys/vm/overcommit_memory\" handle it."
   753  		}
   754  		err = fmt.Errorf("Add %dMB virtio-mem-pci fail %s%s", sizeMB, err.Error(), help)
   755  	}
   756  
   757  	return err
   758  }
   759  
   760  // startSandbox will start the Sandbox's VM.
   761  func (q *qemu) startSandbox(timeout int) error {
   762  	span, _ := q.trace("startSandbox")
   763  	defer span.Finish()
   764  
   765  	if q.config.Debug {
   766  		params := q.arch.kernelParameters(q.config.Debug)
   767  		strParams := SerializeParams(params, "=")
   768  		formatted := strings.Join(strParams, " ")
   769  
   770  		// The name of this field matches a similar one generated by
   771  		// the runtime and allows users to identify which parameters
   772  		// are set here, which come from the runtime and which are set
   773  		// by the user.
   774  		q.Logger().WithField("default-kernel-parameters", formatted).Debug()
   775  	}
   776  
   777  	defer func() {
   778  		for _, fd := range q.fds {
   779  			if err := fd.Close(); err != nil {
   780  				q.Logger().WithError(err).Error("After launching Qemu")
   781  			}
   782  		}
   783  		q.fds = []*os.File{}
   784  	}()
   785  
   786  	vmPath := filepath.Join(q.store.RunVMStoragePath(), q.id)
   787  	err := os.MkdirAll(vmPath, DirMode)
   788  	if err != nil {
   789  		return err
   790  	}
   791  	// append logfile only on debug
   792  	if q.config.Debug {
   793  		q.qemuConfig.LogFile = filepath.Join(vmPath, "qemu.log")
   794  	}
   795  
   796  	defer func() {
   797  		if err != nil {
   798  			if err := os.RemoveAll(vmPath); err != nil {
   799  				q.Logger().WithError(err).Error("Fail to clean up vm directory")
   800  			}
   801  		}
   802  	}()
   803  
   804  	// This needs to be done as late as possible, just before launching
   805  	// virtiofsd are executed by kata-runtime after this call, run with
   806  	// the SELinux label. If these processes require privileged, we do
   807  	// notwant to run them under confinement.
   808  	if err := label.SetProcessLabel(q.config.SELinuxProcessLabel); err != nil {
   809  		return err
   810  	}
   811  	defer label.SetProcessLabel("")
   812  
   813  	if q.config.SharedFS == config.VirtioFS {
   814  		err = q.setupVirtiofsd()
   815  		if err != nil {
   816  			return err
   817  		}
   818  	}
   819  
   820  	var strErr string
   821  	strErr, err = govmmQemu.LaunchQemu(q.qemuConfig, newQMPLogger())
   822  	if err != nil {
   823  		if q.config.Debug && q.qemuConfig.LogFile != "" {
   824  			b, err := ioutil.ReadFile(q.qemuConfig.LogFile)
   825  			if err == nil {
   826  				strErr += string(b)
   827  			}
   828  		}
   829  
   830  		q.Logger().WithError(err).Errorf("failed to launch qemu: %s", strErr)
   831  		return fmt.Errorf("failed to launch qemu: %s, error messages from qemu log: %s", err, strErr)
   832  	}
   833  
   834  	err = q.waitSandbox(timeout)
   835  	if err != nil {
   836  		return err
   837  	}
   838  
   839  	if q.config.BootFromTemplate {
   840  		if err = q.bootFromTemplate(); err != nil {
   841  			return err
   842  		}
   843  	}
   844  
   845  	if q.config.VirtioMem {
   846  		err = q.setupVirtioMem()
   847  	}
   848  
   849  	return err
   850  }
   851  
   852  func (q *qemu) bootFromTemplate() error {
   853  	err := q.qmpSetup()
   854  	if err != nil {
   855  		return err
   856  	}
   857  	defer q.qmpShutdown()
   858  
   859  	err = q.arch.setIgnoreSharedMemoryMigrationCaps(q.qmpMonitorCh.ctx, q.qmpMonitorCh.qmp)
   860  	if err != nil {
   861  		q.Logger().WithError(err).Error("set migration ignore shared memory")
   862  		return err
   863  	}
   864  	uri := fmt.Sprintf("exec:cat %s", q.config.DevicesStatePath)
   865  	err = q.qmpMonitorCh.qmp.ExecuteMigrationIncoming(q.qmpMonitorCh.ctx, uri)
   866  	if err != nil {
   867  		return err
   868  	}
   869  	return q.waitMigration()
   870  }
   871  
   872  // waitSandbox will wait for the Sandbox's VM to be up and running.
   873  func (q *qemu) waitSandbox(timeout int) error {
   874  	span, _ := q.trace("waitSandbox")
   875  	defer span.Finish()
   876  
   877  	if timeout < 0 {
   878  		return fmt.Errorf("Invalid timeout %ds", timeout)
   879  	}
   880  
   881  	cfg := govmmQemu.QMPConfig{Logger: newQMPLogger()}
   882  
   883  	var qmp *govmmQemu.QMP
   884  	var disconnectCh chan struct{}
   885  	var ver *govmmQemu.QMPVersion
   886  	var err error
   887  
   888  	// clear any possible old state before trying to connect again.
   889  	q.qmpShutdown()
   890  	timeStart := time.Now()
   891  	for {
   892  		disconnectCh = make(chan struct{})
   893  		qmp, ver, err = govmmQemu.QMPStart(q.qmpMonitorCh.ctx, q.qmpMonitorCh.path, cfg, disconnectCh)
   894  		if err == nil {
   895  			break
   896  		}
   897  
   898  		if int(time.Since(timeStart).Seconds()) > timeout {
   899  			return fmt.Errorf("Failed to connect to QEMU instance (timeout %ds): %v", timeout, err)
   900  		}
   901  
   902  		time.Sleep(time.Duration(50) * time.Millisecond)
   903  	}
   904  	q.qmpMonitorCh.qmp = qmp
   905  	q.qmpMonitorCh.disconn = disconnectCh
   906  	defer q.qmpShutdown()
   907  
   908  	qemuMajorVersion = ver.Major
   909  	qemuMinorVersion = ver.Minor
   910  
   911  	q.Logger().WithFields(logrus.Fields{
   912  		"qmp-major-version": ver.Major,
   913  		"qmp-minor-version": ver.Minor,
   914  		"qmp-micro-version": ver.Micro,
   915  		"qmp-capabilities":  strings.Join(ver.Capabilities, ","),
   916  	}).Infof("QMP details")
   917  
   918  	if err = q.qmpMonitorCh.qmp.ExecuteQMPCapabilities(q.qmpMonitorCh.ctx); err != nil {
   919  		q.Logger().WithError(err).Error(qmpCapErrMsg)
   920  		return err
   921  	}
   922  
   923  	return nil
   924  }
   925  
   926  // stopSandbox will stop the Sandbox's VM.
   927  func (q *qemu) stopSandbox() error {
   928  	span, _ := q.trace("stopSandbox")
   929  	defer span.Finish()
   930  
   931  	q.Logger().Info("Stopping Sandbox")
   932  	if q.stopped {
   933  		q.Logger().Info("Already stopped")
   934  		return nil
   935  	}
   936  
   937  	defer func() {
   938  		q.cleanupVM()
   939  		q.stopped = true
   940  	}()
   941  
   942  	if q.config.Debug && q.qemuConfig.LogFile != "" {
   943  		f, err := os.OpenFile(q.qemuConfig.LogFile, os.O_RDONLY, 0)
   944  		if err == nil {
   945  			scanner := bufio.NewScanner(f)
   946  			for scanner.Scan() {
   947  				q.Logger().Debug(scanner.Text())
   948  			}
   949  			if err := scanner.Err(); err != nil {
   950  				q.Logger().WithError(err).Debug("read qemu log failed")
   951  			}
   952  		}
   953  	}
   954  
   955  	err := q.qmpSetup()
   956  	if err != nil {
   957  		return err
   958  	}
   959  
   960  	err = q.qmpMonitorCh.qmp.ExecuteQuit(q.qmpMonitorCh.ctx)
   961  	if err != nil {
   962  		q.Logger().WithError(err).Error("Fail to execute qmp QUIT")
   963  		return err
   964  	}
   965  
   966  	return nil
   967  }
   968  
   969  func (q *qemu) cleanupVM() error {
   970  
   971  	// cleanup vm path
   972  	dir := filepath.Join(q.store.RunVMStoragePath(), q.id)
   973  
   974  	// If it's a symlink, remove both dir and the target.
   975  	// This can happen when vm template links a sandbox to a vm.
   976  	link, err := filepath.EvalSymlinks(dir)
   977  	if err != nil {
   978  		// Well, it's just cleanup failure. Let's ignore it.
   979  		q.Logger().WithError(err).WithField("dir", dir).Warn("failed to resolve vm path")
   980  	}
   981  	q.Logger().WithField("link", link).WithField("dir", dir).Infof("cleanup vm path")
   982  
   983  	if err := os.RemoveAll(dir); err != nil {
   984  		q.Logger().WithError(err).Warnf("failed to remove vm path %s", dir)
   985  	}
   986  	if link != dir && link != "" {
   987  		if err := os.RemoveAll(link); err != nil {
   988  			q.Logger().WithError(err).WithField("link", link).Warn("failed to remove resolved vm path")
   989  		}
   990  	}
   991  
   992  	if q.config.VMid != "" {
   993  		dir = filepath.Join(q.store.RunStoragePath(), q.config.VMid)
   994  		if err := os.RemoveAll(dir); err != nil {
   995  			q.Logger().WithError(err).WithField("path", dir).Warnf("failed to remove vm path")
   996  		}
   997  	}
   998  
   999  	return nil
  1000  }
  1001  
  1002  func (q *qemu) togglePauseSandbox(pause bool) error {
  1003  	span, _ := q.trace("togglePauseSandbox")
  1004  	defer span.Finish()
  1005  
  1006  	err := q.qmpSetup()
  1007  	if err != nil {
  1008  		return err
  1009  	}
  1010  
  1011  	if pause {
  1012  		err = q.qmpMonitorCh.qmp.ExecuteStop(q.qmpMonitorCh.ctx)
  1013  	} else {
  1014  		err = q.qmpMonitorCh.qmp.ExecuteCont(q.qmpMonitorCh.ctx)
  1015  	}
  1016  
  1017  	if err != nil {
  1018  		return err
  1019  	}
  1020  
  1021  	return nil
  1022  }
  1023  
  1024  func (q *qemu) qmpSetup() error {
  1025  	q.qmpMonitorCh.Lock()
  1026  	defer q.qmpMonitorCh.Unlock()
  1027  
  1028  	if q.qmpMonitorCh.qmp != nil {
  1029  		return nil
  1030  	}
  1031  
  1032  	cfg := govmmQemu.QMPConfig{Logger: newQMPLogger()}
  1033  
  1034  	// Auto-closed by QMPStart().
  1035  	disconnectCh := make(chan struct{})
  1036  
  1037  	qmp, _, err := govmmQemu.QMPStart(q.qmpMonitorCh.ctx, q.qmpMonitorCh.path, cfg, disconnectCh)
  1038  	if err != nil {
  1039  		q.Logger().WithError(err).Error("Failed to connect to QEMU instance")
  1040  		return err
  1041  	}
  1042  
  1043  	err = qmp.ExecuteQMPCapabilities(q.qmpMonitorCh.ctx)
  1044  	if err != nil {
  1045  		qmp.Shutdown()
  1046  		q.Logger().WithError(err).Error(qmpCapErrMsg)
  1047  		return err
  1048  	}
  1049  	q.qmpMonitorCh.qmp = qmp
  1050  	q.qmpMonitorCh.disconn = disconnectCh
  1051  
  1052  	return nil
  1053  }
  1054  
  1055  func (q *qemu) qmpShutdown() {
  1056  	q.qmpMonitorCh.Lock()
  1057  	defer q.qmpMonitorCh.Unlock()
  1058  
  1059  	if q.qmpMonitorCh.qmp != nil {
  1060  		q.qmpMonitorCh.qmp.Shutdown()
  1061  		// wait on disconnected channel to be sure that the qmp channel has
  1062  		// been closed cleanly.
  1063  		<-q.qmpMonitorCh.disconn
  1064  		q.qmpMonitorCh.qmp = nil
  1065  		q.qmpMonitorCh.disconn = nil
  1066  	}
  1067  }
  1068  
  1069  func (q *qemu) hotplugAddBlockDevice(drive *config.BlockDrive, op operation, devID string) (err error) {
  1070  	// drive can be a pmem device, in which case it's used as backing file for a nvdimm device
  1071  	if q.config.BlockDeviceDriver == config.Nvdimm || drive.Pmem {
  1072  		var blocksize int64
  1073  		file, err := os.Open(drive.File)
  1074  		if err != nil {
  1075  			return err
  1076  		}
  1077  		defer file.Close()
  1078  
  1079  		st, err := file.Stat()
  1080  		if err != nil {
  1081  			return fmt.Errorf("failed to get information from nvdimm device %v: %v", drive.File, err)
  1082  		}
  1083  
  1084  		// regular files do not support syscall BLKGETSIZE64
  1085  		if st.Mode().IsRegular() {
  1086  			blocksize = st.Size()
  1087  		} else if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, file.Fd(), unix.BLKGETSIZE64, uintptr(unsafe.Pointer(&blocksize))); err != 0 {
  1088  			return err
  1089  		}
  1090  
  1091  		if err = q.qmpMonitorCh.qmp.ExecuteNVDIMMDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, drive.File, blocksize, &drive.Pmem); err != nil {
  1092  			q.Logger().WithError(err).Errorf("Failed to add NVDIMM device %s", drive.File)
  1093  			return err
  1094  		}
  1095  		drive.NvdimmID = strconv.Itoa(q.nvdimmCount)
  1096  		q.nvdimmCount++
  1097  		return nil
  1098  	}
  1099  
  1100  	if q.config.BlockDeviceCacheSet {
  1101  		err = q.qmpMonitorCh.qmp.ExecuteBlockdevAddWithCache(q.qmpMonitorCh.ctx, drive.File, drive.ID, q.config.BlockDeviceCacheDirect, q.config.BlockDeviceCacheNoflush)
  1102  	} else {
  1103  		err = q.qmpMonitorCh.qmp.ExecuteBlockdevAdd(q.qmpMonitorCh.ctx, drive.File, drive.ID)
  1104  	}
  1105  	if err != nil {
  1106  		return err
  1107  	}
  1108  
  1109  	defer func() {
  1110  		if err != nil {
  1111  			q.qmpMonitorCh.qmp.ExecuteBlockdevDel(q.qmpMonitorCh.ctx, drive.ID)
  1112  		}
  1113  	}()
  1114  
  1115  	switch {
  1116  	case q.config.BlockDeviceDriver == config.VirtioBlockCCW:
  1117  		driver := "virtio-blk-ccw"
  1118  
  1119  		addr, bridge, err := q.arch.addDeviceToBridge(drive.ID, types.CCW)
  1120  		if err != nil {
  1121  			return err
  1122  		}
  1123  		var devNoHotplug string
  1124  		devNoHotplug, err = bridge.AddressFormatCCW(addr)
  1125  		if err != nil {
  1126  			return err
  1127  		}
  1128  		drive.DevNo, err = bridge.AddressFormatCCWForVirtServer(addr)
  1129  		if err != nil {
  1130  			return err
  1131  		}
  1132  		if err = q.qmpMonitorCh.qmp.ExecuteDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, devNoHotplug, "", true, false); err != nil {
  1133  			return err
  1134  		}
  1135  	case q.config.BlockDeviceDriver == config.VirtioBlock:
  1136  		driver := "virtio-blk-pci"
  1137  		addr, bridge, err := q.arch.addDeviceToBridge(drive.ID, types.PCI)
  1138  		if err != nil {
  1139  			return err
  1140  		}
  1141  
  1142  		defer func() {
  1143  			if err != nil {
  1144  				q.arch.removeDeviceFromBridge(drive.ID)
  1145  			}
  1146  		}()
  1147  
  1148  		// PCI address is in the format bridge-addr/device-addr eg. "03/02"
  1149  		drive.PCIAddr = fmt.Sprintf("%02x", bridge.Addr) + "/" + addr
  1150  
  1151  		if err = q.qmpMonitorCh.qmp.ExecutePCIDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, addr, bridge.ID, romFile, 0, true, defaultDisableModern); err != nil {
  1152  			return err
  1153  		}
  1154  	case q.config.BlockDeviceDriver == config.VirtioSCSI:
  1155  		driver := "scsi-hd"
  1156  
  1157  		// Bus exposed by the SCSI Controller
  1158  		bus := scsiControllerID + ".0"
  1159  
  1160  		// Get SCSI-id and LUN based on the order of attaching drives.
  1161  		scsiID, lun, err := utils.GetSCSIIdLun(drive.Index)
  1162  		if err != nil {
  1163  			return err
  1164  		}
  1165  
  1166  		if err = q.qmpMonitorCh.qmp.ExecuteSCSIDeviceAdd(q.qmpMonitorCh.ctx, drive.ID, devID, driver, bus, romFile, scsiID, lun, true, defaultDisableModern); err != nil {
  1167  			return err
  1168  		}
  1169  	default:
  1170  		return fmt.Errorf("Block device %s not recognized", q.config.BlockDeviceDriver)
  1171  	}
  1172  
  1173  	return nil
  1174  }
  1175  
  1176  func (q *qemu) hotplugAddVhostUserBlkDevice(vAttr *config.VhostUserDeviceAttrs, op operation, devID string) (err error) {
  1177  	err = q.qmpMonitorCh.qmp.ExecuteCharDevUnixSocketAdd(q.qmpMonitorCh.ctx, vAttr.DevID, vAttr.SocketPath, false, false)
  1178  	if err != nil {
  1179  		return err
  1180  	}
  1181  
  1182  	defer func() {
  1183  		if err != nil {
  1184  			q.qmpMonitorCh.qmp.ExecuteChardevDel(q.qmpMonitorCh.ctx, vAttr.DevID)
  1185  		}
  1186  	}()
  1187  
  1188  	driver := "vhost-user-blk-pci"
  1189  	addr, bridge, err := q.arch.addDeviceToBridge(vAttr.DevID, types.PCI)
  1190  	if err != nil {
  1191  		return err
  1192  	}
  1193  
  1194  	defer func() {
  1195  		if err != nil {
  1196  			q.arch.removeDeviceFromBridge(vAttr.DevID)
  1197  		}
  1198  	}()
  1199  
  1200  	// PCI address is in the format bridge-addr/device-addr eg. "03/02"
  1201  	vAttr.PCIAddr = fmt.Sprintf("%02x", bridge.Addr) + "/" + addr
  1202  
  1203  	if err = q.qmpMonitorCh.qmp.ExecutePCIVhostUserDevAdd(q.qmpMonitorCh.ctx, driver, devID, vAttr.DevID, addr, bridge.ID); err != nil {
  1204  		return err
  1205  	}
  1206  
  1207  	return nil
  1208  }
  1209  
  1210  func (q *qemu) hotplugBlockDevice(drive *config.BlockDrive, op operation) error {
  1211  	err := q.qmpSetup()
  1212  	if err != nil {
  1213  		return err
  1214  	}
  1215  
  1216  	devID := "virtio-" + drive.ID
  1217  
  1218  	if op == addDevice {
  1219  		err = q.hotplugAddBlockDevice(drive, op, devID)
  1220  	} else {
  1221  		if q.config.BlockDeviceDriver == config.VirtioBlock {
  1222  			if err := q.arch.removeDeviceFromBridge(drive.ID); err != nil {
  1223  				return err
  1224  			}
  1225  		}
  1226  
  1227  		if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID); err != nil {
  1228  			return err
  1229  		}
  1230  
  1231  		if err := q.qmpMonitorCh.qmp.ExecuteBlockdevDel(q.qmpMonitorCh.ctx, drive.ID); err != nil {
  1232  			return err
  1233  		}
  1234  	}
  1235  
  1236  	return err
  1237  }
  1238  
  1239  func (q *qemu) hotplugVhostUserDevice(vAttr *config.VhostUserDeviceAttrs, op operation) error {
  1240  	err := q.qmpSetup()
  1241  	if err != nil {
  1242  		return err
  1243  	}
  1244  
  1245  	devID := "virtio-" + vAttr.DevID
  1246  
  1247  	if op == addDevice {
  1248  		switch vAttr.Type {
  1249  		case config.VhostUserBlk:
  1250  			return q.hotplugAddVhostUserBlkDevice(vAttr, op, devID)
  1251  		default:
  1252  			return fmt.Errorf("Incorrect vhost-user device type found")
  1253  		}
  1254  	} else {
  1255  		if err := q.arch.removeDeviceFromBridge(vAttr.DevID); err != nil {
  1256  			return err
  1257  		}
  1258  
  1259  		if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID); err != nil {
  1260  			return err
  1261  		}
  1262  
  1263  		if err := q.qmpMonitorCh.qmp.ExecuteChardevDel(q.qmpMonitorCh.ctx, vAttr.DevID); err != nil {
  1264  			return err
  1265  		}
  1266  	}
  1267  
  1268  	return nil
  1269  }
  1270  
  1271  func (q *qemu) hotplugVFIODevice(device *config.VFIODev, op operation) (err error) {
  1272  	err = q.qmpSetup()
  1273  	if err != nil {
  1274  		return err
  1275  	}
  1276  
  1277  	devID := device.ID
  1278  	machinneType := q.hypervisorConfig().HypervisorMachineType
  1279  
  1280  	if op == addDevice {
  1281  
  1282  		buf, _ := json.Marshal(device)
  1283  		q.Logger().WithFields(logrus.Fields{
  1284  			"machine-type":             machinneType,
  1285  			"hotplug-vfio-on-root-bus": q.state.HotplugVFIOOnRootBus,
  1286  			"pcie-root-port":           q.state.PCIeRootPort,
  1287  			"device-info":              string(buf),
  1288  		}).Info("Start hot-plug VFIO device")
  1289  
  1290  		// In case HotplugVFIOOnRootBus is true, devices are hotplugged on the root bus
  1291  		// for pc machine type instead of bridge. This is useful for devices that require
  1292  		// a large PCI BAR which is a currently a limitation with PCI bridges.
  1293  		if q.state.HotplugVFIOOnRootBus {
  1294  
  1295  			// In case MachineType is q35, a PCIe device is hotplugged on a PCIe Root Port.
  1296  			switch machinneType {
  1297  			case QemuQ35:
  1298  				if device.IsPCIe && q.state.PCIeRootPort <= 0 {
  1299  					q.Logger().WithField("dev-id", device.ID).Warn("VFIO device is a PCIe device. It's recommended to add the PCIe Root Port by setting the pcie_root_port parameter in the configuration for q35")
  1300  					device.Bus = ""
  1301  				}
  1302  			default:
  1303  				device.Bus = ""
  1304  			}
  1305  
  1306  			switch device.Type {
  1307  			case config.VFIODeviceNormalType:
  1308  				return q.qmpMonitorCh.qmp.ExecuteVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, device.BDF, device.Bus, romFile)
  1309  			case config.VFIODeviceMediatedType:
  1310  				return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, device.SysfsDev, "", device.Bus, romFile)
  1311  			default:
  1312  				return fmt.Errorf("Incorrect VFIO device type found")
  1313  			}
  1314  		}
  1315  
  1316  		addr, bridge, err := q.arch.addDeviceToBridge(devID, types.PCI)
  1317  		if err != nil {
  1318  			return err
  1319  		}
  1320  
  1321  		defer func() {
  1322  			if err != nil {
  1323  				q.arch.removeDeviceFromBridge(devID)
  1324  			}
  1325  		}()
  1326  
  1327  		switch device.Type {
  1328  		case config.VFIODeviceNormalType:
  1329  			return q.qmpMonitorCh.qmp.ExecutePCIVFIODeviceAdd(q.qmpMonitorCh.ctx, devID, device.BDF, addr, bridge.ID, romFile)
  1330  		case config.VFIODeviceMediatedType:
  1331  			return q.qmpMonitorCh.qmp.ExecutePCIVFIOMediatedDeviceAdd(q.qmpMonitorCh.ctx, devID, device.SysfsDev, addr, bridge.ID, romFile)
  1332  		default:
  1333  			return fmt.Errorf("Incorrect VFIO device type found")
  1334  		}
  1335  	} else {
  1336  		q.Logger().WithField("dev-id", devID).Info("Start hot-unplug VFIO device")
  1337  
  1338  		if !q.state.HotplugVFIOOnRootBus {
  1339  			if err := q.arch.removeDeviceFromBridge(devID); err != nil {
  1340  				return err
  1341  			}
  1342  		}
  1343  
  1344  		if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID); err != nil {
  1345  			return err
  1346  		}
  1347  	}
  1348  
  1349  	return nil
  1350  }
  1351  
  1352  func (q *qemu) hotAddNetDevice(name, hardAddr string, VMFds, VhostFds []*os.File) error {
  1353  	var (
  1354  		VMFdNames    []string
  1355  		VhostFdNames []string
  1356  	)
  1357  	for i, VMFd := range VMFds {
  1358  		fdName := fmt.Sprintf("fd%d", i)
  1359  		if err := q.qmpMonitorCh.qmp.ExecuteGetFD(q.qmpMonitorCh.ctx, fdName, VMFd); err != nil {
  1360  			return err
  1361  		}
  1362  		VMFdNames = append(VMFdNames, fdName)
  1363  	}
  1364  	for i, VhostFd := range VhostFds {
  1365  		fdName := fmt.Sprintf("vhostfd%d", i)
  1366  		if err := q.qmpMonitorCh.qmp.ExecuteGetFD(q.qmpMonitorCh.ctx, fdName, VhostFd); err != nil {
  1367  			return err
  1368  		}
  1369  		VhostFd.Close()
  1370  		VhostFdNames = append(VhostFdNames, fdName)
  1371  	}
  1372  	return q.qmpMonitorCh.qmp.ExecuteNetdevAddByFds(q.qmpMonitorCh.ctx, "tap", name, VMFdNames, VhostFdNames)
  1373  }
  1374  
  1375  func (q *qemu) hotplugNetDevice(endpoint Endpoint, op operation) (err error) {
  1376  	err = q.qmpSetup()
  1377  	if err != nil {
  1378  		return err
  1379  	}
  1380  	var tap TapInterface
  1381  
  1382  	switch endpoint.Type() {
  1383  	case VethEndpointType:
  1384  		drive := endpoint.(*VethEndpoint)
  1385  		tap = drive.NetPair.TapInterface
  1386  	case TapEndpointType:
  1387  		drive := endpoint.(*TapEndpoint)
  1388  		tap = drive.TapInterface
  1389  	default:
  1390  		return fmt.Errorf("this endpoint is not supported")
  1391  	}
  1392  
  1393  	devID := "virtio-" + tap.ID
  1394  	if op == addDevice {
  1395  		if err = q.hotAddNetDevice(tap.Name, endpoint.HardwareAddr(), tap.VMFds, tap.VhostFds); err != nil {
  1396  			return err
  1397  		}
  1398  
  1399  		defer func() {
  1400  			if err != nil {
  1401  				q.qmpMonitorCh.qmp.ExecuteNetdevDel(q.qmpMonitorCh.ctx, tap.Name)
  1402  			}
  1403  		}()
  1404  
  1405  		addr, bridge, err := q.arch.addDeviceToBridge(tap.ID, types.PCI)
  1406  		if err != nil {
  1407  			return err
  1408  		}
  1409  
  1410  		defer func() {
  1411  			if err != nil {
  1412  				q.arch.removeDeviceFromBridge(tap.ID)
  1413  			}
  1414  		}()
  1415  
  1416  		pciAddr := fmt.Sprintf("%02x/%s", bridge.Addr, addr)
  1417  		endpoint.SetPciAddr(pciAddr)
  1418  
  1419  		var machine govmmQemu.Machine
  1420  		machine, err = q.getQemuMachine()
  1421  		if err != nil {
  1422  			return err
  1423  		}
  1424  		if machine.Type == QemuCCWVirtio {
  1425  			devNoHotplug := fmt.Sprintf("fe.%x.%x", bridge.Addr, addr)
  1426  			return q.qmpMonitorCh.qmp.ExecuteNetCCWDeviceAdd(q.qmpMonitorCh.ctx, tap.Name, devID, endpoint.HardwareAddr(), devNoHotplug, int(q.config.NumVCPUs))
  1427  		}
  1428  		return q.qmpMonitorCh.qmp.ExecuteNetPCIDeviceAdd(q.qmpMonitorCh.ctx, tap.Name, devID, endpoint.HardwareAddr(), addr, bridge.ID, romFile, int(q.config.NumVCPUs), defaultDisableModern)
  1429  
  1430  	}
  1431  
  1432  	if err := q.arch.removeDeviceFromBridge(tap.ID); err != nil {
  1433  		return err
  1434  	}
  1435  
  1436  	if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, devID); err != nil {
  1437  		return err
  1438  	}
  1439  	if err := q.qmpMonitorCh.qmp.ExecuteNetdevDel(q.qmpMonitorCh.ctx, tap.Name); err != nil {
  1440  		return err
  1441  	}
  1442  
  1443  	return nil
  1444  }
  1445  
  1446  func (q *qemu) hotplugDevice(devInfo interface{}, devType deviceType, op operation) (interface{}, error) {
  1447  	switch devType {
  1448  	case blockDev:
  1449  		drive := devInfo.(*config.BlockDrive)
  1450  		return nil, q.hotplugBlockDevice(drive, op)
  1451  	case cpuDev:
  1452  		vcpus := devInfo.(uint32)
  1453  		return q.hotplugCPUs(vcpus, op)
  1454  	case vfioDev:
  1455  		device := devInfo.(*config.VFIODev)
  1456  		return nil, q.hotplugVFIODevice(device, op)
  1457  	case memoryDev:
  1458  		memdev := devInfo.(*memoryDevice)
  1459  		return q.hotplugMemory(memdev, op)
  1460  	case netDev:
  1461  		device := devInfo.(Endpoint)
  1462  		return nil, q.hotplugNetDevice(device, op)
  1463  	case vhostuserDev:
  1464  		vAttr := devInfo.(*config.VhostUserDeviceAttrs)
  1465  		return nil, q.hotplugVhostUserDevice(vAttr, op)
  1466  	default:
  1467  		return nil, fmt.Errorf("cannot hotplug device: unsupported device type '%v'", devType)
  1468  	}
  1469  }
  1470  
  1471  func (q *qemu) hotplugAddDevice(devInfo interface{}, devType deviceType) (interface{}, error) {
  1472  	span, _ := q.trace("hotplugAddDevice")
  1473  	defer span.Finish()
  1474  
  1475  	data, err := q.hotplugDevice(devInfo, devType, addDevice)
  1476  	if err != nil {
  1477  		return data, err
  1478  	}
  1479  
  1480  	return data, nil
  1481  }
  1482  
  1483  func (q *qemu) hotplugRemoveDevice(devInfo interface{}, devType deviceType) (interface{}, error) {
  1484  	span, _ := q.trace("hotplugRemoveDevice")
  1485  	defer span.Finish()
  1486  
  1487  	data, err := q.hotplugDevice(devInfo, devType, removeDevice)
  1488  	if err != nil {
  1489  		return data, err
  1490  	}
  1491  
  1492  	return data, nil
  1493  }
  1494  
  1495  func (q *qemu) hotplugCPUs(vcpus uint32, op operation) (uint32, error) {
  1496  	if vcpus == 0 {
  1497  		q.Logger().Warnf("cannot hotplug 0 vCPUs")
  1498  		return 0, nil
  1499  	}
  1500  
  1501  	err := q.qmpSetup()
  1502  	if err != nil {
  1503  		return 0, err
  1504  	}
  1505  
  1506  	if op == addDevice {
  1507  		return q.hotplugAddCPUs(vcpus)
  1508  	}
  1509  
  1510  	return q.hotplugRemoveCPUs(vcpus)
  1511  }
  1512  
  1513  // try to hot add an amount of vCPUs, returns the number of vCPUs added
  1514  func (q *qemu) hotplugAddCPUs(amount uint32) (uint32, error) {
  1515  	currentVCPUs := q.qemuConfig.SMP.CPUs + uint32(len(q.state.HotpluggedVCPUs))
  1516  
  1517  	// Don't fail if the number of max vCPUs is exceeded, log a warning and hot add the vCPUs needed
  1518  	// to reach out max vCPUs
  1519  	if currentVCPUs+amount > q.config.DefaultMaxVCPUs {
  1520  		q.Logger().Warnf("Cannot hotplug %d CPUs, currently this SB has %d CPUs and the maximum amount of CPUs is %d",
  1521  			amount, currentVCPUs, q.config.DefaultMaxVCPUs)
  1522  		amount = q.config.DefaultMaxVCPUs - currentVCPUs
  1523  	}
  1524  
  1525  	if amount == 0 {
  1526  		// Don't fail if no more vCPUs can be added, since cgroups still can be updated
  1527  		q.Logger().Warnf("maximum number of vCPUs '%d' has been reached", q.config.DefaultMaxVCPUs)
  1528  		return 0, nil
  1529  	}
  1530  
  1531  	// get the list of hotpluggable CPUs
  1532  	hotpluggableVCPUs, err := q.qmpMonitorCh.qmp.ExecuteQueryHotpluggableCPUs(q.qmpMonitorCh.ctx)
  1533  	if err != nil {
  1534  		return 0, fmt.Errorf("failed to query hotpluggable CPUs: %v", err)
  1535  	}
  1536  
  1537  	machine, err := q.arch.machine()
  1538  	if err != nil {
  1539  		return 0, fmt.Errorf("failed to query machine type: %v", err)
  1540  	}
  1541  
  1542  	var hotpluggedVCPUs uint32
  1543  	for _, hc := range hotpluggableVCPUs {
  1544  		// qom-path is the path to the CPU, non-empty means that this CPU is already in use
  1545  		if hc.QOMPath != "" {
  1546  			continue
  1547  		}
  1548  
  1549  		// CPU type, i.e host-x86_64-cpu
  1550  		driver := hc.Type
  1551  		cpuID := fmt.Sprintf("cpu-%d", len(q.state.HotpluggedVCPUs))
  1552  		socketID := fmt.Sprintf("%d", hc.Properties.Socket)
  1553  		dieID := fmt.Sprintf("%d", hc.Properties.Die)
  1554  		coreID := fmt.Sprintf("%d", hc.Properties.Core)
  1555  		threadID := fmt.Sprintf("%d", hc.Properties.Thread)
  1556  
  1557  		// If CPU type is IBM pSeries or Z, we do not set socketID and threadID
  1558  		if machine.Type == "pseries" || machine.Type == "s390-ccw-virtio" {
  1559  			socketID = ""
  1560  			threadID = ""
  1561  			dieID = ""
  1562  		}
  1563  
  1564  		if err := q.qmpMonitorCh.qmp.ExecuteCPUDeviceAdd(q.qmpMonitorCh.ctx, driver, cpuID, socketID, dieID, coreID, threadID, romFile); err != nil {
  1565  			// don't fail, let's try with other CPU
  1566  			continue
  1567  		}
  1568  
  1569  		// a new vCPU was added, update list of hotplugged vCPUs and check if all vCPUs were added
  1570  		q.state.HotpluggedVCPUs = append(q.state.HotpluggedVCPUs, CPUDevice{cpuID})
  1571  		hotpluggedVCPUs++
  1572  		if hotpluggedVCPUs == amount {
  1573  			// All vCPUs were hotplugged
  1574  			return amount, nil
  1575  		}
  1576  	}
  1577  
  1578  	return hotpluggedVCPUs, fmt.Errorf("failed to hot add vCPUs: only %d vCPUs of %d were added", hotpluggedVCPUs, amount)
  1579  }
  1580  
  1581  // try to  hot remove an amount of vCPUs, returns the number of vCPUs removed
  1582  func (q *qemu) hotplugRemoveCPUs(amount uint32) (uint32, error) {
  1583  	hotpluggedVCPUs := uint32(len(q.state.HotpluggedVCPUs))
  1584  
  1585  	// we can only remove hotplugged vCPUs
  1586  	if amount > hotpluggedVCPUs {
  1587  		return 0, fmt.Errorf("Unable to remove %d CPUs, currently there are only %d hotplugged CPUs", amount, hotpluggedVCPUs)
  1588  	}
  1589  
  1590  	for i := uint32(0); i < amount; i++ {
  1591  		// get the last vCPUs and try to remove it
  1592  		cpu := q.state.HotpluggedVCPUs[len(q.state.HotpluggedVCPUs)-1]
  1593  		if err := q.qmpMonitorCh.qmp.ExecuteDeviceDel(q.qmpMonitorCh.ctx, cpu.ID); err != nil {
  1594  			return i, fmt.Errorf("failed to hotunplug CPUs, only %d CPUs were hotunplugged: %v", i, err)
  1595  		}
  1596  
  1597  		// remove from the list the vCPU hotunplugged
  1598  		q.state.HotpluggedVCPUs = q.state.HotpluggedVCPUs[:len(q.state.HotpluggedVCPUs)-1]
  1599  	}
  1600  
  1601  	return amount, nil
  1602  }
  1603  
  1604  func (q *qemu) hotplugMemory(memDev *memoryDevice, op operation) (int, error) {
  1605  
  1606  	if !q.arch.supportGuestMemoryHotplug() {
  1607  		return 0, fmt.Errorf("guest memory hotplug not supported")
  1608  	}
  1609  	if memDev.sizeMB < 0 {
  1610  		return 0, fmt.Errorf("cannot hotplug negative size (%d) memory", memDev.sizeMB)
  1611  	}
  1612  	memLog := q.Logger().WithField("hotplug", "memory")
  1613  
  1614  	memLog.WithField("hotplug-memory-mb", memDev.sizeMB).Debug("requested memory hotplug")
  1615  	err := q.qmpSetup()
  1616  	if err != nil {
  1617  		return 0, err
  1618  	}
  1619  
  1620  	currentMemory := int(q.config.MemorySize) + q.state.HotpluggedMemory
  1621  
  1622  	if memDev.sizeMB == 0 {
  1623  		memLog.Debug("hotplug is not required")
  1624  		return 0, nil
  1625  	}
  1626  
  1627  	switch op {
  1628  	case removeDevice:
  1629  		memLog.WithField("operation", "remove").Debugf("Requested to remove memory: %d MB", memDev.sizeMB)
  1630  		// Dont fail but warn that this is not supported.
  1631  		memLog.Warn("hot-remove VM memory not supported")
  1632  		return 0, nil
  1633  	case addDevice:
  1634  		memLog.WithField("operation", "add").Debugf("Requested to add memory: %d MB", memDev.sizeMB)
  1635  		maxMem, err := q.hostMemMB()
  1636  		if err != nil {
  1637  			return 0, err
  1638  		}
  1639  
  1640  		// Don't exceed the maximum amount of memory
  1641  		if currentMemory+memDev.sizeMB > int(maxMem) {
  1642  			// Fixme: return a typed error
  1643  			return 0, fmt.Errorf("Unable to hotplug %d MiB memory, the SB has %d MiB and the maximum amount is %d MiB",
  1644  				memDev.sizeMB, currentMemory, maxMem)
  1645  		}
  1646  		memoryAdded, err := q.hotplugAddMemory(memDev)
  1647  		if err != nil {
  1648  			return memoryAdded, err
  1649  		}
  1650  		return memoryAdded, nil
  1651  	default:
  1652  		return 0, fmt.Errorf("invalid operation %v", op)
  1653  	}
  1654  
  1655  }
  1656  
  1657  func (q *qemu) hotplugAddMemory(memDev *memoryDevice) (int, error) {
  1658  	memoryDevices, err := q.qmpMonitorCh.qmp.ExecQueryMemoryDevices(q.qmpMonitorCh.ctx)
  1659  	if err != nil {
  1660  		return 0, fmt.Errorf("failed to query memory devices: %v", err)
  1661  	}
  1662  
  1663  	if len(memoryDevices) != 0 {
  1664  		maxSlot := -1
  1665  		for _, device := range memoryDevices {
  1666  			if maxSlot < device.Data.Slot {
  1667  				maxSlot = device.Data.Slot
  1668  			}
  1669  		}
  1670  		memDev.slot = maxSlot + 1
  1671  	}
  1672  
  1673  	share, target, memoryBack, err := q.getMemArgs()
  1674  	if err != nil {
  1675  		return 0, err
  1676  	}
  1677  
  1678  	err = q.qmpMonitorCh.qmp.ExecHotplugMemory(q.qmpMonitorCh.ctx, memoryBack, "mem"+strconv.Itoa(memDev.slot), target, memDev.sizeMB, share)
  1679  	if err != nil {
  1680  		q.Logger().WithError(err).Error("hotplug memory")
  1681  		return 0, err
  1682  	}
  1683  	// if guest kernel only supports memory hotplug via probe interface, we need to get address of hot-add memory device
  1684  	if memDev.probe {
  1685  		memoryDevices, err := q.qmpMonitorCh.qmp.ExecQueryMemoryDevices(q.qmpMonitorCh.ctx)
  1686  		if err != nil {
  1687  			return 0, fmt.Errorf("failed to query memory devices: %v", err)
  1688  		}
  1689  		if len(memoryDevices) != 0 {
  1690  			q.Logger().WithField("addr", fmt.Sprintf("0x%x", memoryDevices[len(memoryDevices)-1].Data.Addr)).Debug("recently hot-add memory device")
  1691  			memDev.addr = memoryDevices[len(memoryDevices)-1].Data.Addr
  1692  		} else {
  1693  			return 0, fmt.Errorf("failed to probe address of recently hot-add memory device, no device exists")
  1694  		}
  1695  	}
  1696  	q.state.HotpluggedMemory += memDev.sizeMB
  1697  	return memDev.sizeMB, nil
  1698  }
  1699  
  1700  func (q *qemu) pauseSandbox() error {
  1701  	span, _ := q.trace("pauseSandbox")
  1702  	defer span.Finish()
  1703  
  1704  	return q.togglePauseSandbox(true)
  1705  }
  1706  
  1707  func (q *qemu) resumeSandbox() error {
  1708  	span, _ := q.trace("resumeSandbox")
  1709  	defer span.Finish()
  1710  
  1711  	return q.togglePauseSandbox(false)
  1712  }
  1713  
  1714  // addDevice will add extra devices to Qemu command line.
  1715  func (q *qemu) addDevice(devInfo interface{}, devType deviceType) error {
  1716  	var err error
  1717  	span, _ := q.trace("addDevice")
  1718  	defer span.Finish()
  1719  
  1720  	switch v := devInfo.(type) {
  1721  	case types.Volume:
  1722  		if q.config.SharedFS == config.VirtioFS {
  1723  			q.Logger().WithField("volume-type", "virtio-fs").Info("adding volume")
  1724  
  1725  			var randBytes []byte
  1726  			randBytes, err = utils.GenerateRandomBytes(8)
  1727  			if err != nil {
  1728  				return err
  1729  			}
  1730  			id := hex.EncodeToString(randBytes)
  1731  
  1732  			var sockPath string
  1733  			sockPath, err = q.vhostFSSocketPath(q.id)
  1734  			if err != nil {
  1735  				return err
  1736  			}
  1737  
  1738  			vhostDev := config.VhostUserDeviceAttrs{
  1739  				Tag:       v.MountTag,
  1740  				Type:      config.VhostUserFS,
  1741  				CacheSize: q.config.VirtioFSCacheSize,
  1742  				Cache:     q.config.VirtioFSCache,
  1743  			}
  1744  			vhostDev.SocketPath = sockPath
  1745  			vhostDev.DevID = id
  1746  
  1747  			q.qemuConfig.Devices, err = q.arch.appendVhostUserDevice(q.qemuConfig.Devices, vhostDev)
  1748  		} else {
  1749  			q.Logger().WithField("volume-type", "virtio-9p").Info("adding volume")
  1750  			q.qemuConfig.Devices, err = q.arch.append9PVolume(q.qemuConfig.Devices, v)
  1751  		}
  1752  	case types.Socket:
  1753  		q.qemuConfig.Devices = q.arch.appendSocket(q.qemuConfig.Devices, v)
  1754  	case types.VSock:
  1755  		q.fds = append(q.fds, v.VhostFd)
  1756  		q.qemuConfig.Devices, err = q.arch.appendVSock(q.qemuConfig.Devices, v)
  1757  	case Endpoint:
  1758  		q.qemuConfig.Devices, err = q.arch.appendNetwork(q.qemuConfig.Devices, v)
  1759  	case config.BlockDrive:
  1760  		q.qemuConfig.Devices, err = q.arch.appendBlockDevice(q.qemuConfig.Devices, v)
  1761  	case config.VhostUserDeviceAttrs:
  1762  		q.qemuConfig.Devices, err = q.arch.appendVhostUserDevice(q.qemuConfig.Devices, v)
  1763  	case config.VFIODev:
  1764  		q.qemuConfig.Devices = q.arch.appendVFIODevice(q.qemuConfig.Devices, v)
  1765  	default:
  1766  		break
  1767  	}
  1768  
  1769  	return err
  1770  }
  1771  
  1772  // getSandboxConsole builds the path of the console where we can read
  1773  // logs coming from the sandbox.
  1774  func (q *qemu) getSandboxConsole(id string) (string, error) {
  1775  	span, _ := q.trace("getSandboxConsole")
  1776  	defer span.Finish()
  1777  
  1778  	return utils.BuildSocketPath(q.store.RunVMStoragePath(), id, consoleSocket)
  1779  }
  1780  
  1781  func (q *qemu) saveSandbox() error {
  1782  	q.Logger().Info("save sandbox")
  1783  
  1784  	err := q.qmpSetup()
  1785  	if err != nil {
  1786  		return err
  1787  	}
  1788  
  1789  	// BootToBeTemplate sets the VM to be a template that other VMs can clone from. We would want to
  1790  	// bypass shared memory when saving the VM to a local file through migration exec.
  1791  	if q.config.BootToBeTemplate {
  1792  		err := q.arch.setIgnoreSharedMemoryMigrationCaps(q.qmpMonitorCh.ctx, q.qmpMonitorCh.qmp)
  1793  		if err != nil {
  1794  			q.Logger().WithError(err).Error("set migration ignore shared memory")
  1795  			return err
  1796  		}
  1797  	}
  1798  
  1799  	err = q.qmpMonitorCh.qmp.ExecSetMigrateArguments(q.qmpMonitorCh.ctx, fmt.Sprintf("%s>%s", qmpExecCatCmd, q.config.DevicesStatePath))
  1800  	if err != nil {
  1801  		q.Logger().WithError(err).Error("exec migration")
  1802  		return err
  1803  	}
  1804  
  1805  	return q.waitMigration()
  1806  }
  1807  
  1808  func (q *qemu) waitMigration() error {
  1809  	t := time.NewTimer(qmpMigrationWaitTimeout)
  1810  	defer t.Stop()
  1811  	for {
  1812  		status, err := q.qmpMonitorCh.qmp.ExecuteQueryMigration(q.qmpMonitorCh.ctx)
  1813  		if err != nil {
  1814  			q.Logger().WithError(err).Error("failed to query migration status")
  1815  			return err
  1816  		}
  1817  		if status.Status == "completed" {
  1818  			break
  1819  		}
  1820  
  1821  		select {
  1822  		case <-t.C:
  1823  			q.Logger().WithField("migration-status", status).Error("timeout waiting for qemu migration")
  1824  			return fmt.Errorf("timed out after %d seconds waiting for qemu migration", qmpMigrationWaitTimeout)
  1825  		default:
  1826  			// migration in progress
  1827  			q.Logger().WithField("migration-status", status).Debug("migration in progress")
  1828  			time.Sleep(100 * time.Millisecond)
  1829  		}
  1830  	}
  1831  
  1832  	return nil
  1833  }
  1834  
  1835  func (q *qemu) disconnect() {
  1836  	span, _ := q.trace("disconnect")
  1837  	defer span.Finish()
  1838  
  1839  	q.qmpShutdown()
  1840  }
  1841  
  1842  // resizeMemory get a request to update the VM memory to reqMemMB
  1843  // Memory update is managed with two approaches
  1844  // Add memory to VM:
  1845  // When memory is required to be added we hotplug memory
  1846  // Remove Memory from VM/ Return memory to host.
  1847  //
  1848  // Memory unplug can be slow and it cannot be guaranteed.
  1849  // Additionally, the unplug has not small granularly it has to be
  1850  // the memory to remove has to be at least the size of one slot.
  1851  // To return memory back we are resizing the VM memory balloon.
  1852  // A longer term solution is evaluate solutions like virtio-mem
  1853  func (q *qemu) resizeMemory(reqMemMB uint32, memoryBlockSizeMB uint32, probe bool) (uint32, memoryDevice, error) {
  1854  
  1855  	currentMemory := q.config.MemorySize + uint32(q.state.HotpluggedMemory)
  1856  	err := q.qmpSetup()
  1857  	if err != nil {
  1858  		return 0, memoryDevice{}, err
  1859  	}
  1860  	var addMemDevice memoryDevice
  1861  	if q.config.VirtioMem && currentMemory != reqMemMB {
  1862  		q.Logger().WithField("hotplug", "memory").Debugf("resize memory from %dMB to %dMB", currentMemory, reqMemMB)
  1863  		sizeByte := (reqMemMB - q.config.MemorySize) * 1024 * 1024
  1864  		err = q.qmpMonitorCh.qmp.ExecQomSet(q.qmpMonitorCh.ctx, "virtiomem0", "requested-size", uint64(sizeByte))
  1865  		if err != nil {
  1866  			return 0, memoryDevice{}, err
  1867  		}
  1868  		q.state.HotpluggedMemory = int(sizeByte / 1024 / 1024)
  1869  		return reqMemMB, memoryDevice{}, nil
  1870  	}
  1871  
  1872  	switch {
  1873  	case currentMemory < reqMemMB:
  1874  		//hotplug
  1875  		addMemMB := reqMemMB - currentMemory
  1876  		memHotplugMB, err := calcHotplugMemMiBSize(addMemMB, memoryBlockSizeMB)
  1877  		if err != nil {
  1878  			return currentMemory, memoryDevice{}, err
  1879  		}
  1880  
  1881  		addMemDevice.sizeMB = int(memHotplugMB)
  1882  		addMemDevice.probe = probe
  1883  
  1884  		data, err := q.hotplugAddDevice(&addMemDevice, memoryDev)
  1885  		if err != nil {
  1886  			return currentMemory, addMemDevice, err
  1887  		}
  1888  		memoryAdded, ok := data.(int)
  1889  		if !ok {
  1890  			return currentMemory, addMemDevice, fmt.Errorf("Could not get the memory added, got %+v", data)
  1891  		}
  1892  		currentMemory += uint32(memoryAdded)
  1893  	case currentMemory > reqMemMB:
  1894  		//hotunplug
  1895  		addMemMB := currentMemory - reqMemMB
  1896  		memHotunplugMB, err := calcHotplugMemMiBSize(addMemMB, memoryBlockSizeMB)
  1897  		if err != nil {
  1898  			return currentMemory, memoryDevice{}, err
  1899  		}
  1900  
  1901  		addMemDevice.sizeMB = int(memHotunplugMB)
  1902  		addMemDevice.probe = probe
  1903  
  1904  		data, err := q.hotplugRemoveDevice(&addMemDevice, memoryDev)
  1905  		if err != nil {
  1906  			return currentMemory, addMemDevice, err
  1907  		}
  1908  		memoryRemoved, ok := data.(int)
  1909  		if !ok {
  1910  			return currentMemory, addMemDevice, fmt.Errorf("Could not get the memory removed, got %+v", data)
  1911  		}
  1912  		//FIXME: This is to check memory hotplugRemoveDevice reported 0, as this is not supported.
  1913  		// In the future if this is implemented this validation should be removed.
  1914  		if memoryRemoved != 0 {
  1915  			return currentMemory, addMemDevice, fmt.Errorf("memory hot unplug is not supported, something went wrong")
  1916  		}
  1917  		currentMemory -= uint32(memoryRemoved)
  1918  	}
  1919  
  1920  	// currentMemory is the current memory (updated) of the VM, return to caller to allow verify
  1921  	// the current VM memory state.
  1922  	return currentMemory, addMemDevice, nil
  1923  }
  1924  
  1925  // genericAppendBridges appends to devices the given bridges
  1926  // nolint: unused, deadcode
  1927  func genericAppendBridges(devices []govmmQemu.Device, bridges []types.Bridge, machineType string) []govmmQemu.Device {
  1928  	bus := defaultPCBridgeBus
  1929  	switch machineType {
  1930  	case QemuQ35, QemuVirt:
  1931  		bus = defaultBridgeBus
  1932  	}
  1933  
  1934  	for idx, b := range bridges {
  1935  		t := govmmQemu.PCIBridge
  1936  		if b.Type == types.PCIE {
  1937  			t = govmmQemu.PCIEBridge
  1938  		}
  1939  		if b.Type == types.CCW {
  1940  			continue
  1941  		}
  1942  
  1943  		bridges[idx].Addr = bridgePCIStartAddr + idx
  1944  
  1945  		devices = append(devices,
  1946  			govmmQemu.BridgeDevice{
  1947  				Type: t,
  1948  				Bus:  bus,
  1949  				ID:   b.ID,
  1950  				// Each bridge is required to be assigned a unique chassis id > 0
  1951  				Chassis: idx + 1,
  1952  				SHPC:    true,
  1953  				Addr:    strconv.FormatInt(int64(bridges[idx].Addr), 10),
  1954  			},
  1955  		)
  1956  	}
  1957  
  1958  	return devices
  1959  }
  1960  
  1961  func genericBridges(number uint32, machineType string) []types.Bridge {
  1962  	var bridges []types.Bridge
  1963  	var bt types.Type
  1964  
  1965  	switch machineType {
  1966  	case QemuQ35:
  1967  		// currently only pci bridges are supported
  1968  		// qemu-2.10 will introduce pcie bridges
  1969  		fallthrough
  1970  	case QemuPC:
  1971  		bt = types.PCI
  1972  	case QemuVirt:
  1973  		bt = types.PCIE
  1974  	case QemuPseries:
  1975  		bt = types.PCI
  1976  	case QemuCCWVirtio:
  1977  		bt = types.CCW
  1978  	default:
  1979  		return nil
  1980  	}
  1981  
  1982  	for i := uint32(0); i < number; i++ {
  1983  		bridges = append(bridges, types.NewBridge(bt, fmt.Sprintf("%s-bridge-%d", bt, i), make(map[uint32]string), 0))
  1984  	}
  1985  
  1986  	return bridges
  1987  }
  1988  
  1989  // nolint: unused, deadcode
  1990  func genericMemoryTopology(memoryMb, hostMemoryMb uint64, slots uint8, memoryOffset uint32) govmmQemu.Memory {
  1991  	// image NVDIMM device needs memory space 1024MB
  1992  	// See https://github.com/clearcontainers/runtime/issues/380
  1993  	memoryOffset += 1024
  1994  
  1995  	memMax := fmt.Sprintf("%dM", hostMemoryMb+uint64(memoryOffset))
  1996  
  1997  	mem := fmt.Sprintf("%dM", memoryMb)
  1998  
  1999  	memory := govmmQemu.Memory{
  2000  		Size:   mem,
  2001  		Slots:  slots,
  2002  		MaxMem: memMax,
  2003  	}
  2004  
  2005  	return memory
  2006  }
  2007  
  2008  // genericAppendPCIeRootPort appends to devices the given pcie-root-port
  2009  func genericAppendPCIeRootPort(devices []govmmQemu.Device, number uint32, machineType string) []govmmQemu.Device {
  2010  	var (
  2011  		bus           string
  2012  		chassis       string
  2013  		multiFunction bool
  2014  		addr          string
  2015  	)
  2016  	switch machineType {
  2017  	case QemuQ35:
  2018  		bus = defaultBridgeBus
  2019  		chassis = "0"
  2020  		multiFunction = false
  2021  		addr = "0"
  2022  	default:
  2023  		return devices
  2024  	}
  2025  
  2026  	for i := uint32(0); i < number; i++ {
  2027  		devices = append(devices,
  2028  			govmmQemu.PCIeRootPortDevice{
  2029  				ID:            fmt.Sprintf("%s%d", pcieRootPortPrefix, i),
  2030  				Bus:           bus,
  2031  				Chassis:       chassis,
  2032  				Slot:          strconv.FormatUint(uint64(i), 10),
  2033  				Multifunction: multiFunction,
  2034  				Addr:          addr,
  2035  			},
  2036  		)
  2037  	}
  2038  	return devices
  2039  }
  2040  
  2041  func (q *qemu) getThreadIDs() (vcpuThreadIDs, error) {
  2042  	span, _ := q.trace("getThreadIDs")
  2043  	defer span.Finish()
  2044  
  2045  	tid := vcpuThreadIDs{}
  2046  	err := q.qmpSetup()
  2047  	if err != nil {
  2048  		return tid, err
  2049  	}
  2050  
  2051  	cpuInfos, err := q.qmpMonitorCh.qmp.ExecQueryCpus(q.qmpMonitorCh.ctx)
  2052  	if err != nil {
  2053  		q.Logger().WithError(err).Error("failed to query cpu infos")
  2054  		return tid, err
  2055  	}
  2056  
  2057  	tid.vcpus = make(map[int]int, len(cpuInfos))
  2058  	for _, i := range cpuInfos {
  2059  		if i.ThreadID > 0 {
  2060  			tid.vcpus[i.CPU] = i.ThreadID
  2061  		}
  2062  	}
  2063  	return tid, nil
  2064  }
  2065  
  2066  func calcHotplugMemMiBSize(mem uint32, memorySectionSizeMB uint32) (uint32, error) {
  2067  	if memorySectionSizeMB == 0 {
  2068  		return mem, nil
  2069  	}
  2070  
  2071  	// TODO: hot add memory aligned to memory section should be more properly. See https://github.com/kata-containers/runtime/pull/624#issuecomment-419656853
  2072  	return uint32(math.Ceil(float64(mem)/float64(memorySectionSizeMB))) * memorySectionSizeMB, nil
  2073  }
  2074  
  2075  func (q *qemu) resizeVCPUs(reqVCPUs uint32) (currentVCPUs uint32, newVCPUs uint32, err error) {
  2076  
  2077  	currentVCPUs = q.config.NumVCPUs + uint32(len(q.state.HotpluggedVCPUs))
  2078  	newVCPUs = currentVCPUs
  2079  	switch {
  2080  	case currentVCPUs < reqVCPUs:
  2081  		//hotplug
  2082  		addCPUs := reqVCPUs - currentVCPUs
  2083  		data, err := q.hotplugAddDevice(addCPUs, cpuDev)
  2084  		if err != nil {
  2085  			return currentVCPUs, newVCPUs, err
  2086  		}
  2087  		vCPUsAdded, ok := data.(uint32)
  2088  		if !ok {
  2089  			return currentVCPUs, newVCPUs, fmt.Errorf("Could not get the vCPUs added, got %+v", data)
  2090  		}
  2091  		newVCPUs += vCPUsAdded
  2092  	case currentVCPUs > reqVCPUs:
  2093  		//hotunplug
  2094  		removeCPUs := currentVCPUs - reqVCPUs
  2095  		data, err := q.hotplugRemoveDevice(removeCPUs, cpuDev)
  2096  		if err != nil {
  2097  			return currentVCPUs, newVCPUs, err
  2098  		}
  2099  		vCPUsRemoved, ok := data.(uint32)
  2100  		if !ok {
  2101  			return currentVCPUs, newVCPUs, fmt.Errorf("Could not get the vCPUs removed, got %+v", data)
  2102  		}
  2103  		newVCPUs -= vCPUsRemoved
  2104  	}
  2105  	return currentVCPUs, newVCPUs, nil
  2106  }
  2107  
  2108  func (q *qemu) cleanup() error {
  2109  	span, _ := q.trace("cleanup")
  2110  	defer span.Finish()
  2111  
  2112  	for _, fd := range q.fds {
  2113  		if err := fd.Close(); err != nil {
  2114  			q.Logger().WithError(err).Warn("failed closing fd")
  2115  		}
  2116  	}
  2117  	q.fds = []*os.File{}
  2118  
  2119  	return nil
  2120  }
  2121  
  2122  func (q *qemu) getPids() []int {
  2123  	data, err := ioutil.ReadFile(q.qemuConfig.PidFile)
  2124  	if err != nil {
  2125  		q.Logger().WithError(err).Error("Could not read qemu pid file")
  2126  		return []int{0}
  2127  	}
  2128  
  2129  	pid, err := strconv.Atoi(strings.Trim(string(data), "\n\t "))
  2130  	if err != nil {
  2131  		q.Logger().WithError(err).Error("Could not convert string to int")
  2132  		return []int{0}
  2133  	}
  2134  
  2135  	var pids []int
  2136  	pids = append(pids, pid)
  2137  	if q.state.VirtiofsdPid != 0 {
  2138  		pids = append(pids, q.state.VirtiofsdPid)
  2139  	}
  2140  
  2141  	return pids
  2142  }
  2143  
// qemuGrpc is the subset of the qemu hypervisor state that toGrpc marshals to
// JSON and that fromGrpc uses to restore a qemu instance.
type qemuGrpc struct {
	// ID is the hypervisor id (q.id).
	ID             string
	// QmpChannelpath is the path of the QMP monitor channel (q.qmpMonitorCh.path).
	QmpChannelpath string
	// State is the qemu state (q.state).
	State          QemuState
	// NvdimmCount mirrors q.nvdimmCount.
	NvdimmCount    int

	// Most members of q.qemuConfig are just to generate
	// q.qemuConfig.qemuParams that is used by LaunchQemu except
	// q.qemuConfig.SMP.
	// So just transport q.qemuConfig.SMP from VM Cache server to runtime.
	QemuSMP govmmQemu.SMP
}
  2156  
  2157  func (q *qemu) fromGrpc(ctx context.Context, hypervisorConfig *HypervisorConfig, j []byte) error {
  2158  	var qp qemuGrpc
  2159  	err := json.Unmarshal(j, &qp)
  2160  	if err != nil {
  2161  		return err
  2162  	}
  2163  
  2164  	q.id = qp.ID
  2165  	q.config = *hypervisorConfig
  2166  	q.qmpMonitorCh.ctx = ctx
  2167  	q.qmpMonitorCh.path = qp.QmpChannelpath
  2168  	q.qemuConfig.Ctx = ctx
  2169  	q.state = qp.State
  2170  	q.arch = newQemuArch(q.config)
  2171  	q.ctx = ctx
  2172  	q.nvdimmCount = qp.NvdimmCount
  2173  
  2174  	q.qemuConfig.SMP = qp.QemuSMP
  2175  
  2176  	q.arch.setBridges(q.state.Bridges)
  2177  	return nil
  2178  }
  2179  
  2180  func (q *qemu) toGrpc() ([]byte, error) {
  2181  	q.qmpShutdown()
  2182  
  2183  	q.cleanup()
  2184  	qp := qemuGrpc{
  2185  		ID:             q.id,
  2186  		QmpChannelpath: q.qmpMonitorCh.path,
  2187  		State:          q.state,
  2188  		NvdimmCount:    q.nvdimmCount,
  2189  
  2190  		QemuSMP: q.qemuConfig.SMP,
  2191  	}
  2192  
  2193  	return json.Marshal(&qp)
  2194  }
  2195  
  2196  func (q *qemu) save() (s persistapi.HypervisorState) {
  2197  	pids := q.getPids()
  2198  	if len(pids) != 0 {
  2199  		s.Pid = pids[0]
  2200  	}
  2201  	s.VirtiofsdPid = q.state.VirtiofsdPid
  2202  	s.Type = string(QemuHypervisor)
  2203  	s.UUID = q.state.UUID
  2204  	s.HotpluggedMemory = q.state.HotpluggedMemory
  2205  	s.HotplugVFIOOnRootBus = q.state.HotplugVFIOOnRootBus
  2206  	s.PCIeRootPort = q.state.PCIeRootPort
  2207  
  2208  	for _, bridge := range q.arch.getBridges() {
  2209  		s.Bridges = append(s.Bridges, persistapi.Bridge{
  2210  			DeviceAddr: bridge.Devices,
  2211  			Type:       string(bridge.Type),
  2212  			ID:         bridge.ID,
  2213  			Addr:       bridge.Addr,
  2214  		})
  2215  	}
  2216  
  2217  	for _, cpu := range q.state.HotpluggedVCPUs {
  2218  		s.HotpluggedVCPUs = append(s.HotpluggedVCPUs, persistapi.CPUDevice{
  2219  			ID: cpu.ID,
  2220  		})
  2221  	}
  2222  	return
  2223  }
  2224  
  2225  func (q *qemu) load(s persistapi.HypervisorState) {
  2226  	q.state.UUID = s.UUID
  2227  	q.state.HotpluggedMemory = s.HotpluggedMemory
  2228  	q.state.HotplugVFIOOnRootBus = s.HotplugVFIOOnRootBus
  2229  	q.state.VirtiofsdPid = s.VirtiofsdPid
  2230  	q.state.PCIeRootPort = s.PCIeRootPort
  2231  
  2232  	for _, bridge := range s.Bridges {
  2233  		q.state.Bridges = append(q.state.Bridges, types.NewBridge(types.Type(bridge.Type), bridge.ID, bridge.DeviceAddr, bridge.Addr))
  2234  	}
  2235  
  2236  	for _, cpu := range s.HotpluggedVCPUs {
  2237  		q.state.HotpluggedVCPUs = append(q.state.HotpluggedVCPUs, CPUDevice{
  2238  			ID: cpu.ID,
  2239  		})
  2240  	}
  2241  }
  2242  
  2243  func (q *qemu) check() error {
  2244  	err := q.qmpSetup()
  2245  	if err != nil {
  2246  		return err
  2247  	}
  2248  
  2249  	status, err := q.qmpMonitorCh.qmp.ExecuteQueryStatus(q.qmpMonitorCh.ctx)
  2250  	if err != nil {
  2251  		return err
  2252  	}
  2253  
  2254  	if status.Status == "internal-error" || status.Status == "guest-panicked" {
  2255  		return errors.Errorf("guest failure: %s", status.Status)
  2256  	}
  2257  
  2258  	return nil
  2259  }
  2260  
  2261  func (q *qemu) generateSocket(id string, useVsock bool) (interface{}, error) {
  2262  	return generateVMSocket(id, useVsock, q.store.RunVMStoragePath())
  2263  }