github.com/mirantis/virtlet@v1.5.2-0.20191204181327-1659b8a48e9b/pkg/libvirttools/virtualization.go (about)

     1  /*
     2  Copyright 2016-2017 Mirantis
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package libvirttools
    18  
    19  import (
    20  	"fmt"
    21  	"path/filepath"
    22  	"strings"
    23  	"time"
    24  
    25  	"github.com/golang/glog"
    26  	"github.com/jonboulle/clockwork"
    27  	libvirtxml "github.com/libvirt/libvirt-go-xml"
    28  	uuid "github.com/nu7hatch/gouuid"
    29  	"k8s.io/apimachinery/pkg/fields"
    30  	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
    31  
    32  	vconfig "github.com/Mirantis/virtlet/pkg/config"
    33  	"github.com/Mirantis/virtlet/pkg/fs"
    34  	"github.com/Mirantis/virtlet/pkg/metadata"
    35  	"github.com/Mirantis/virtlet/pkg/metadata/types"
    36  	"github.com/Mirantis/virtlet/pkg/network"
    37  	"github.com/Mirantis/virtlet/pkg/utils"
    38  	"github.com/Mirantis/virtlet/pkg/virt"
    39  )
    40  
    41  const (
    42  	defaultMemory     = 1024
    43  	defaultMemoryUnit = "MiB"
    44  	defaultDomainType = "kvm"
    45  	defaultEmulator   = "/usr/bin/kvm"
    46  	noKvmDomainType   = "qemu"
    47  	noKvmEmulator     = "/usr/bin/qemu-system-x86_64"
    48  
    49  	domainStartCheckInterval      = 250 * time.Millisecond
    50  	domainStartTimeout            = 10 * time.Second
    51  	domainShutdownRetryInterval   = 5 * time.Second
    52  	domainShutdownOnRemoveTimeout = 60 * time.Second
    53  	domainDestroyCheckInterval    = 500 * time.Millisecond
    54  	domainDestroyTimeout          = 5 * time.Second
    55  
    56  	// ContainerNsUUID template for container ns uuid generation
    57  	ContainerNsUUID = "67b7fb47-7735-4b64-86d2-6d062d121966"
    58  
    59  	// KubernetesPodNameLabel is pod name container label (copied from kubetypes).
    60  	KubernetesPodNameLabel = "io.kubernetes.pod.name"
    61  	// KubernetesPodNamespaceLabel is pod namespace container label (copied from kubetypes),
    62  	KubernetesPodNamespaceLabel = "io.kubernetes.pod.namespace"
    63  	// KubernetesPodUIDLabel is uid container label (copied from kubetypes).
    64  	KubernetesPodUIDLabel = "io.kubernetes.pod.uid"
    65  	// KubernetesContainerNameLabel is container name label (copied from kubetypes)
    66  	KubernetesContainerNameLabel = "io.kubernetes.container.name"
    67  )
    68  
    69  type domainSettings struct {
    70  	useKvm           bool
    71  	domainName       string
    72  	domainUUID       string
    73  	memory           int
    74  	memoryUnit       string
    75  	vcpuNum          int
    76  	cpuShares        uint
    77  	cpuPeriod        uint64
    78  	cpuQuota         int64
    79  	rootDiskFilepath string
    80  	netFdKey         string
    81  	enableSriov      bool
    82  	cpuModel         string
    83  	systemUUID       *uuid.UUID
    84  }
    85  
    86  func (ds *domainSettings) createDomain(config *types.VMConfig) *libvirtxml.Domain {
    87  	domainType := defaultDomainType
    88  	emulator := defaultEmulator
    89  	if !ds.useKvm {
    90  		domainType = noKvmDomainType
    91  		emulator = noKvmEmulator
    92  	}
    93  
    94  	scsiControllerIndex := uint(0)
    95  	domain := &libvirtxml.Domain{
    96  		Devices: &libvirtxml.DomainDeviceList{
    97  			Emulator: "/vmwrapper",
    98  			Inputs: []libvirtxml.DomainInput{
    99  				{Type: "tablet", Bus: "usb"},
   100  			},
   101  			Graphics: []libvirtxml.DomainGraphic{
   102  				{VNC: &libvirtxml.DomainGraphicVNC{Port: -1}},
   103  			},
   104  			Videos: []libvirtxml.DomainVideo{
   105  				{Model: libvirtxml.DomainVideoModel{Type: "cirrus"}},
   106  			},
   107  			Controllers: []libvirtxml.DomainController{
   108  				{Type: "scsi", Index: &scsiControllerIndex, Model: "virtio-scsi"},
   109  			},
   110  		},
   111  
   112  		OS: &libvirtxml.DomainOS{
   113  			Type: &libvirtxml.DomainOSType{Type: "hvm"},
   114  			BootDevices: []libvirtxml.DomainBootDevice{
   115  				{Dev: "hd"},
   116  			},
   117  		},
   118  
   119  		Features: &libvirtxml.DomainFeatureList{ACPI: &libvirtxml.DomainFeature{}},
   120  
   121  		OnPoweroff: "destroy",
   122  		OnReboot:   "restart",
   123  		OnCrash:    "restart",
   124  
   125  		Type: domainType,
   126  
   127  		Name:   ds.domainName,
   128  		UUID:   ds.domainUUID,
   129  		Memory: &libvirtxml.DomainMemory{Value: uint(ds.memory), Unit: ds.memoryUnit},
   130  		VCPU:   &libvirtxml.DomainVCPU{Value: ds.vcpuNum},
   131  		CPUTune: &libvirtxml.DomainCPUTune{
   132  			Shares: &libvirtxml.DomainCPUTuneShares{Value: ds.cpuShares},
   133  			Period: &libvirtxml.DomainCPUTunePeriod{Value: ds.cpuPeriod},
   134  			Quota:  &libvirtxml.DomainCPUTuneQuota{Value: ds.cpuQuota},
   135  		},
   136  		// This causes '"qemu: qemu_thread_create: Resource temporarily unavailable"' QEMU errors
   137  		// when Virtlet is run as a non-privileged user.
   138  		// Under strace, it looks like a bunch of mmap()s failing with EAGAIN
   139  		// which happens due to mlockall() call somewhere above that.
   140  		// This could be worked around using setrlimit() but really
   141  		// swap handling is not needed here because it's incorrect
   142  		// to have swap enabled on the nodes of a real Kubernetes cluster.
   143  
   144  		// MemoryBacking: &libvirtxml.DomainMemoryBacking{Locked: &libvirtxml.DomainMemoryBackingLocked{}},
   145  
   146  		QEMUCommandline: &libvirtxml.DomainQEMUCommandline{
   147  			Envs: []libvirtxml.DomainQEMUCommandlineEnv{
   148  				{Name: vconfig.EmulatorEnvVarName, Value: emulator},
   149  				{Name: vconfig.NetKeyEnvVarName, Value: ds.netFdKey},
   150  				{Name: vconfig.ContainerIDEnvVarName, Value: config.DomainUUID},
   151  				{Name: vconfig.LogPathEnvVarName,
   152  					Value: filepath.Join(config.LogDirectory, config.LogPath)},
   153  			},
   154  		},
   155  	}
   156  
   157  	// Set cpu model.
   158  	// If user understand the cpu definition of libvirt,
   159  	// the user is very professional, we prior to use it.
   160  	if config.ParsedAnnotations.CPUSetting != nil {
   161  		domain.CPU = config.ParsedAnnotations.CPUSetting
   162  	} else {
   163  		switch ds.cpuModel {
   164  		case types.CPUModelHostModel:
   165  			// The following enables nested virtualization.
   166  			// In case of intel processors it requires nested=1 option
   167  			// for kvm_intel module. That can be passed like this:
   168  			// modprobe kvm_intel nested=1
   169  			domain.CPU = &libvirtxml.DomainCPU{
   170  				Mode: types.CPUModelHostModel,
   171  				Model: &libvirtxml.DomainCPUModel{
   172  					Fallback: "forbid",
   173  				},
   174  				Features: []libvirtxml.DomainCPUFeature{
   175  					{
   176  						Policy: "require",
   177  						Name:   "vmx",
   178  					},
   179  				},
   180  			}
   181  		case "":
   182  			// leave it empty
   183  		default:
   184  			glog.Warningf("Unknown value set in VIRTLET_CPU_MODEL: %q", ds.cpuModel)
   185  		}
   186  	}
   187  
   188  	if ds.systemUUID != nil {
   189  		domain.SysInfo = &libvirtxml.DomainSysInfo{
   190  			Type: "smbios",
   191  			System: &libvirtxml.DomainSysInfoSystem{
   192  				Entry: []libvirtxml.DomainSysInfoEntry{
   193  					{
   194  						Name:  "uuid",
   195  						Value: ds.systemUUID.String(),
   196  					},
   197  				},
   198  			},
   199  		}
   200  	}
   201  
   202  	if ds.enableSriov {
   203  		domain.QEMUCommandline.Envs = append(domain.QEMUCommandline.Envs,
   204  			libvirtxml.DomainQEMUCommandlineEnv{Name: "VMWRAPPER_KEEP_PRIVS", Value: "1"})
   205  	}
   206  
   207  	return domain
   208  }
   209  
   210  // VirtualizationConfig specifies configuration options for VirtualizationTool.
   211  type VirtualizationConfig struct {
   212  	// DisableKVM is true if KVM should be disabled.
   213  	DisableKVM bool
   214  	// EnableSriov is true if SR-IOV support needs to be enabled.
   215  	EnableSriov bool
   216  	// RawDevices is the list of raw devices that can be accessed by the VM.
   217  	RawDevices []string
   218  	// KubeletRootDir is kubelet's root dir.
   219  	// FIXME: kubelet's --root-dir may be something other than /var/lib/kubelet.
   220  	// Need to remove it from daemonset mounts (both dev and non-dev).
   221  	// Use 'nsenter -t 1 -m -- tar ...' or something to grab the path
   222  	// from root namespace.
   223  	KubeletRootDir string
   224  	// StreamerSocketPath is the path of the streamer socket used for
   225  	// logging. By default, the path is empty. When the path is empty,
   226  	// logging is disabled for the VMs.
   227  	StreamerSocketPath string
   228  	// VolumePoolName is the name of the libvirt volume pool to use for the VMs.
   229  	VolumePoolName string
   230  	// CPUModel is the type of CPU model to be passed in the libvirt domain
   231  	// definition (can be overridden by a pod annotation).
   232  	// Empty value denotes libvirt defaults usage.
   233  	CPUModel string
   234  	// SharedFilesystemPath is the path to the directory used for shared filesystems.
   235  	SharedFilesystemPath string
   236  }
   237  
   238  // VirtualizationTool provides methods to operate on libvirt.
   239  type VirtualizationTool struct {
   240  	domainConn    virt.DomainConnection
   241  	storageConn   virt.StorageConnection
   242  	imageManager  ImageManager
   243  	metadataStore metadata.Store
   244  	clock         clockwork.Clock
   245  	volumeSource  VMVolumeSource
   246  	config        VirtualizationConfig
   247  	fsys          fs.FileSystem
   248  	commander     utils.Commander
   249  }
   250  
   251  var _ volumeOwner = &VirtualizationTool{}
   252  
   253  // NewVirtualizationTool verifies existence of volumes pool in libvirt store
   254  // and returns initialized VirtualizationTool.
   255  func NewVirtualizationTool(domainConn virt.DomainConnection,
   256  	storageConn virt.StorageConnection, imageManager ImageManager,
   257  	metadataStore metadata.Store, volumeSource VMVolumeSource,
   258  	config VirtualizationConfig, fsys fs.FileSystem,
   259  	commander utils.Commander) *VirtualizationTool {
   260  	return &VirtualizationTool{
   261  		domainConn:    domainConn,
   262  		storageConn:   storageConn,
   263  		imageManager:  imageManager,
   264  		metadataStore: metadataStore,
   265  		clock:         clockwork.NewRealClock(),
   266  		volumeSource:  volumeSource,
   267  		config:        config,
   268  		fsys:          fsys,
   269  		commander:     commander,
   270  	}
   271  }
   272  
   273  // SetClock replaces the clock used for timestamps and wait loops (used in tests).
   274  func (v *VirtualizationTool) SetClock(clock clockwork.Clock) {
   275  	v.clock = clock
   276  }
   277  
   278  func (v *VirtualizationTool) addSerialDevicesToDomain(domain *libvirtxml.Domain) error {
   279  	port := uint(0)
   280  	timeout := uint(1)
   281  	if v.config.StreamerSocketPath != "" {
   282  		domain.Devices.Serials = []libvirtxml.DomainSerial{
   283  			{
   284  				Source: &libvirtxml.DomainChardevSource{
   285  					UNIX: &libvirtxml.DomainChardevSourceUNIX{
   286  						Mode: "connect",
   287  						Path: v.config.StreamerSocketPath,
   288  						Reconnect: &libvirtxml.DomainChardevSourceReconnect{
   289  							Enabled: "yes",
   290  							Timeout: &timeout,
   291  						},
   292  					},
   293  				},
   294  				Target: &libvirtxml.DomainSerialTarget{Port: &port},
   295  			},
   296  		}
   297  	} else {
   298  		domain.Devices.Serials = []libvirtxml.DomainSerial{
   299  			{
   300  				Target: &libvirtxml.DomainSerialTarget{Port: &port},
   301  			},
   302  		}
   303  		domain.Devices.Consoles = []libvirtxml.DomainConsole{
   304  			{
   305  				Target: &libvirtxml.DomainConsoleTarget{Type: "serial", Port: &port},
   306  			},
   307  		}
   308  	}
   309  	return nil
   310  }
   311  
   312  // CreateContainer defines libvirt domain for VM, prepares it's disks and stores
   313  // all info in metadata store.  It returns domain uuid generated basing on pod
   314  // sandbox id.
   315  func (v *VirtualizationTool) CreateContainer(config *types.VMConfig, netFdKey string) (string, error) {
   316  	if err := config.LoadAnnotations(); err != nil {
   317  		return "", err
   318  	}
   319  
   320  	var domainUUID string
   321  	if config.ParsedAnnotations.SystemUUID != nil {
   322  		domainUUID = config.ParsedAnnotations.SystemUUID.String()
   323  	} else {
   324  		domainUUID = utils.NewUUID5(ContainerNsUUID, config.PodSandboxID)
   325  	}
   326  	// FIXME: this field should be moved to VMStatus struct (to be added)
   327  	config.DomainUUID = domainUUID
   328  	cpuModel := v.config.CPUModel
   329  	if config.ParsedAnnotations.CPUModel != "" {
   330  		cpuModel = string(config.ParsedAnnotations.CPUModel)
   331  	}
   332  	settings := domainSettings{
   333  		domainUUID: domainUUID,
   334  		// Note: using only first 13 characters because libvirt has an issue with handling
   335  		// long path names for qemu monitor socket
   336  		domainName:  "virtlet-" + domainUUID[:13] + "-" + config.Name,
   337  		netFdKey:    netFdKey,
   338  		vcpuNum:     config.ParsedAnnotations.VCPUCount,
   339  		memory:      int(config.MemoryLimitInBytes),
   340  		cpuShares:   uint(config.CPUShares),
   341  		cpuPeriod:   uint64(config.CPUPeriod),
   342  		enableSriov: v.config.EnableSriov,
   343  		// CPU bandwidth limits for domains are actually set equal per
   344  		// each vCPU by libvirt. Thus, to limit overall VM's CPU
   345  		// threads consumption by the value from the pod definition
   346  		// we need to perform this division
   347  		cpuQuota:   config.CPUQuota / int64(config.ParsedAnnotations.VCPUCount),
   348  		memoryUnit: "b",
   349  		useKvm:     !v.config.DisableKVM,
   350  		cpuModel:   cpuModel,
   351  		systemUUID: config.ParsedAnnotations.SystemUUID,
   352  	}
   353  	if settings.memory == 0 {
   354  		settings.memory = defaultMemory
   355  		settings.memoryUnit = defaultMemoryUnit
   356  	}
   357  
   358  	domainDef := settings.createDomain(config)
   359  	diskList, err := newDiskList(config, v.volumeSource, v)
   360  	if err != nil {
   361  		return "", err
   362  	}
   363  	domainDef.Devices.Disks, domainDef.Devices.Filesystems, err = diskList.setup()
   364  	if err != nil {
   365  		return "", err
   366  	}
   367  
   368  	ok := false
   369  	defer func() {
   370  		if ok {
   371  			return
   372  		}
   373  		if err := v.removeDomain(settings.domainUUID, config, types.ContainerState_CONTAINER_UNKNOWN, true); err != nil {
   374  			glog.Warningf("Failed to remove domain %q: %v", settings.domainUUID, err)
   375  		}
   376  		if err := diskList.teardown(); err != nil {
   377  			glog.Warningf("error tearing down volumes after an error: %v", err)
   378  		}
   379  	}()
   380  
   381  	if err := v.addSerialDevicesToDomain(domainDef); err != nil {
   382  		return "", err
   383  	}
   384  
   385  	if config.ContainerLabels == nil {
   386  		config.ContainerLabels = map[string]string{}
   387  	}
   388  	config.ContainerLabels[kubetypes.KubernetesPodNameLabel] = config.PodName
   389  	config.ContainerLabels[kubetypes.KubernetesPodNamespaceLabel] = config.PodNamespace
   390  	config.ContainerLabels[kubetypes.KubernetesPodUIDLabel] = config.PodSandboxID
   391  	config.ContainerLabels[kubetypes.KubernetesContainerNameLabel] = config.Name
   392  
   393  	domain, err := v.domainConn.DefineDomain(domainDef)
   394  	if err == nil {
   395  		err = diskList.writeImages(domain)
   396  	}
   397  	if err == nil {
   398  		err = v.metadataStore.Container(settings.domainUUID).Save(
   399  			func(_ *types.ContainerInfo) (*types.ContainerInfo, error) {
   400  				return &types.ContainerInfo{
   401  					Name:      config.Name,
   402  					CreatedAt: v.clock.Now().UnixNano(),
   403  					Config:    *config,
   404  					State:     types.ContainerState_CONTAINER_CREATED,
   405  				}, nil
   406  			})
   407  	}
   408  	if err != nil {
   409  		return "", err
   410  	}
   411  
   412  	ok = true
   413  	return settings.domainUUID, nil
   414  }
   415  
   416  func (v *VirtualizationTool) updateDiskImages(containerID string) error {
   417  	domain, err := v.domainConn.LookupDomainByUUIDString(containerID)
   418  	if err != nil {
   419  		return fmt.Errorf("failed to look up domain %q: %v", containerID, err)
   420  	}
   421  
   422  	config, _, err := v.getVMConfigFromMetadata(containerID)
   423  	if err != nil {
   424  		return err
   425  	}
   426  
   427  	if config == nil {
   428  		glog.Warningf("No info found for domain %q in the metadata store. Not updating disk images", containerID)
   429  		return nil
   430  	}
   431  
   432  	diskList, err := newDiskList(config, v.volumeSource, v)
   433  	if err != nil {
   434  		return err
   435  	}
   436  
   437  	return diskList.writeImages(domain)
   438  }
   439  
   440  // UpdateContainerNetwork updates network info for the container
   441  func (v *VirtualizationTool) UpdateContainerNetwork(containerID string, csn *network.ContainerSideNetwork) error {
   442  	if err := v.metadataStore.Container(containerID).Save(
   443  		func(c *types.ContainerInfo) (*types.ContainerInfo, error) {
   444  			// make sure the container is not removed during the call
   445  			if c != nil {
   446  				c.Config.ContainerSideNetwork = csn
   447  			}
   448  			return c, nil
   449  		}); err != nil {
   450  		return fmt.Errorf("error updating container info: %v", err)
   451  	}
   452  
   453  	// propagate network config to cloud-init
   454  	if err := v.updateDiskImages(containerID); err != nil {
   455  		return fmt.Errorf("domain %q: error updating disk images: %v", containerID, err)
   456  	}
   457  
   458  	return nil
   459  }
   460  
   461  func (v *VirtualizationTool) startContainer(containerID string) error {
   462  	domain, err := v.domainConn.LookupDomainByUUIDString(containerID)
   463  	if err != nil {
   464  		return fmt.Errorf("failed to look up domain %q: %v", containerID, err)
   465  	}
   466  
   467  	state, err := domain.State()
   468  	if err != nil {
   469  		return fmt.Errorf("failed to get state of the domain %q: %v", containerID, err)
   470  	}
   471  	if state != virt.DomainStateShutoff {
   472  		return fmt.Errorf("domain %q: bad state %v upon StartContainer()", containerID, state)
   473  	}
   474  
   475  	if err = domain.Create(); err != nil {
   476  		return fmt.Errorf("failed to create domain %q: %v", containerID, err)
   477  	}
   478  
   479  	// XXX: maybe we don't really have to wait here but I couldn't
   480  	// find it in libvirt docs.
   481  	if err = utils.WaitLoop(func() (bool, error) {
   482  		state, err := domain.State()
   483  		if err != nil {
   484  			return false, fmt.Errorf("failed to get state of the domain %q: %v", containerID, err)
   485  		}
   486  		switch state {
   487  		case virt.DomainStateRunning:
   488  			return true, nil
   489  		case virt.DomainStateShutdown:
   490  			return false, fmt.Errorf("unexpected shutdown for new domain %q", containerID)
   491  		case virt.DomainStateCrashed:
   492  			return false, fmt.Errorf("domain %q crashed on start", containerID)
   493  		default:
   494  			return false, nil
   495  		}
   496  	}, domainStartCheckInterval, domainStartTimeout, v.clock); err != nil {
   497  		return err
   498  	}
   499  
   500  	return v.metadataStore.Container(containerID).Save(
   501  		func(c *types.ContainerInfo) (*types.ContainerInfo, error) {
   502  			// make sure the container is not removed during the call
   503  			if c != nil {
   504  				c.State = types.ContainerState_CONTAINER_RUNNING
   505  				c.StartedAt = v.clock.Now().UnixNano()
   506  			}
   507  			return c, nil
   508  		})
   509  }
   510  
   511  // StartContainer starts the libvirt domain of the given container by
   512  // delegating to startContainer and returns its error unchanged.
   513  // No domain removal or other cleanup is attempted by this method when
   514  // starting the domain fails.
   515  func (v *VirtualizationTool) StartContainer(containerID string) error {
   516  	return v.startContainer(containerID)
   517  }
   518  
   519  // StopContainer calls graceful shutdown of domain and if it was non successful
   520  // it calls libvirt to destroy that domain.
   521  // Successful shutdown or destroy of domain is followed by removal of
   522  // VM info from metadata store.
   523  // Succeeded removal of metadata is followed by volumes cleanup.
   524  func (v *VirtualizationTool) StopContainer(containerID string, timeout time.Duration) error {
   525  	domain, err := v.domainConn.LookupDomainByUUIDString(containerID)
   526  	if err != nil {
   527  		return err
   528  	}
   529  
   530  	// We try to shut down the VM gracefully first. This may take several attempts
   531  	// because shutdown requests may be ignored e.g. when the VM boots.
   532  	// If this fails, we just destroy the domain (i.e. power off the VM).
   533  	err = utils.WaitLoop(func() (bool, error) {
   534  		_, err := v.domainConn.LookupDomainByUUIDString(containerID)
   535  		if err == virt.ErrDomainNotFound {
   536  			return true, nil
   537  		}
   538  		if err != nil {
   539  			return false, fmt.Errorf("failed to look up the domain %q: %v", containerID, err)
   540  		}
   541  
   542  		// domain.Shutdown() may return 'invalid operation' error if domain is already
   543  		// shut down. But checking the state beforehand will not make the situation
   544  		// any simpler because we'll still have a race, thus we need multiple attempts
   545  		domainShutdownErr := domain.Shutdown()
   546  
   547  		state, err := domain.State()
   548  		if err != nil {
   549  			return false, fmt.Errorf("failed to get state of the domain %q: %v", containerID, err)
   550  		}
   551  
   552  		if state == virt.DomainStateShutoff {
   553  			return true, nil
   554  		}
   555  
   556  		if domainShutdownErr != nil {
   557  			// The domain is not in 'DOMAIN_SHUTOFF' state and domain.Shutdown() failed,
   558  			// so we need to return the error that happened during Shutdown()
   559  			return false, fmt.Errorf("failed to shut down domain %q: %v", containerID, err)
   560  		}
   561  
   562  		return false, nil
   563  	}, domainShutdownRetryInterval, timeout, v.clock)
   564  
   565  	if err != nil {
   566  		glog.Warningf("Failed to shut down VM %q: %v -- trying to destroy the domain", containerID, err)
   567  		// if the domain is destroyed successfully we return no error
   568  		if err = domain.Destroy(); err != nil {
   569  			return fmt.Errorf("failed to destroy the domain: %v", err)
   570  		}
   571  	}
   572  
   573  	if err == nil {
   574  		err = v.metadataStore.Container(containerID).Save(
   575  			func(c *types.ContainerInfo) (*types.ContainerInfo, error) {
   576  				// make sure the container is not removed during the call
   577  				if c != nil {
   578  					c.State = types.ContainerState_CONTAINER_EXITED
   579  				}
   580  				return c, nil
   581  			})
   582  	}
   583  
   584  	if err == nil {
   585  		// Note: volume cleanup is done right after domain has been stopped
   586  		// due to by the time the ContainerRemove request all flexvolume
   587  		// data is already removed by kubelet's VolumeManager
   588  		return v.cleanupVolumes(containerID)
   589  	}
   590  
   591  	return err
   592  }
   593  
   594  func (v *VirtualizationTool) getVMConfigFromMetadata(containerID string) (*types.VMConfig, types.ContainerState, error) {
   595  	containerInfo, err := v.metadataStore.Container(containerID).Retrieve()
   596  	if err != nil {
   597  		glog.Errorf("Error when retrieving domain %q info from metadata store: %v", containerID, err)
   598  		return nil, types.ContainerState_CONTAINER_UNKNOWN, err
   599  	}
   600  	if containerInfo == nil {
   601  		// the vm is already removed
   602  		return nil, types.ContainerState_CONTAINER_UNKNOWN, nil
   603  	}
   604  
   605  	return &containerInfo.Config, containerInfo.State, nil
   606  }
   607  
   608  func (v *VirtualizationTool) cleanupVolumes(containerID string) error {
   609  	config, _, err := v.getVMConfigFromMetadata(containerID)
   610  	if err != nil {
   611  		return err
   612  	}
   613  
   614  	if config == nil {
   615  		glog.Warningf("No info found for domain %q in metadata store. Volume cleanup skipped.", containerID)
   616  		return nil
   617  	}
   618  
   619  	diskList, err := newDiskList(config, v.volumeSource, v)
   620  	if err == nil {
   621  		err = diskList.teardown()
   622  	}
   623  
   624  	var errs []string
   625  	if err != nil {
   626  		glog.Errorf("Volume teardown failed for domain %q: %v", containerID, err)
   627  		errs = append(errs, err.Error())
   628  	}
   629  
   630  	return nil
   631  }
   632  
   633  func (v *VirtualizationTool) removeDomain(containerID string, config *types.VMConfig, state types.ContainerState, failUponVolumeTeardownFailure bool) error {
   634  	// Give a chance to gracefully stop domain
   635  	// TODO: handle errors - there could be e.g. lost connection error
   636  	domain, err := v.domainConn.LookupDomainByUUIDString(containerID)
   637  	if err != nil && err != virt.ErrDomainNotFound {
   638  		return err
   639  	}
   640  
   641  	if domain != nil {
   642  		if state == types.ContainerState_CONTAINER_RUNNING {
   643  			if err := domain.Destroy(); err != nil {
   644  				return fmt.Errorf("failed to destroy the domain: %v", err)
   645  			}
   646  		}
   647  
   648  		if err := domain.Undefine(); err != nil {
   649  			return fmt.Errorf("error undefining the domain %q: %v", containerID, err)
   650  		}
   651  
   652  		// Wait until domain is really removed or timeout after 5 sec.
   653  		if err := utils.WaitLoop(func() (bool, error) {
   654  			if _, err := v.domainConn.LookupDomainByUUIDString(containerID); err == virt.ErrDomainNotFound {
   655  				return true, nil
   656  			} else if err != nil {
   657  				// Unexpected error occurred
   658  				return false, fmt.Errorf("error looking up domain %q: %v", containerID, err)
   659  			}
   660  			return false, nil
   661  		}, domainDestroyCheckInterval, domainDestroyTimeout, v.clock); err != nil {
   662  			return err
   663  		}
   664  	}
   665  
   666  	diskList, err := newDiskList(config, v.volumeSource, v)
   667  	if err == nil {
   668  		err = diskList.teardown()
   669  	}
   670  
   671  	switch {
   672  	case err == nil:
   673  		return nil
   674  	case failUponVolumeTeardownFailure:
   675  		return err
   676  	default:
   677  		glog.Warningf("Error during volume teardown for container %s: %v", containerID, err)
   678  		return nil
   679  	}
   680  }
   681  
   682  // RemoveContainer tries to gracefully stop domain, then forcibly removes it
   683  // even if it's still running.
   684  // It waits up to 5 sec for doing the job by libvirt.
   685  func (v *VirtualizationTool) RemoveContainer(containerID string) error {
   686  	config, state, err := v.getVMConfigFromMetadata(containerID)
   687  
   688  	if err != nil {
   689  		return err
   690  	}
   691  
   692  	if config == nil {
   693  		glog.Warningf("No info found for domain %q in metadata store. Domain cleanup skipped", containerID)
   694  		return nil
   695  	}
   696  
   697  	if err := v.removeDomain(containerID, config, state, state == types.ContainerState_CONTAINER_CREATED ||
   698  		state == types.ContainerState_CONTAINER_RUNNING); err != nil {
   699  		return err
   700  	}
   701  
   702  	if v.metadataStore.Container(containerID).Save(
   703  		func(_ *types.ContainerInfo) (*types.ContainerInfo, error) {
   704  			return nil, nil // delete container
   705  		},
   706  	); err != nil {
   707  		glog.Errorf("Error when removing container '%s' from metadata store: %v", containerID, err)
   708  		return err
   709  	}
   710  
   711  	return nil
   712  }
   713  
   714  func virtToKubeState(domainState virt.DomainState, lastState types.ContainerState) types.ContainerState {
   715  	var containerState types.ContainerState
   716  
   717  	switch domainState {
   718  	case virt.DomainStateShutdown:
   719  		// the domain is being shut down, but is still running
   720  		fallthrough
   721  	case virt.DomainStateRunning:
   722  		containerState = types.ContainerState_CONTAINER_RUNNING
   723  	case virt.DomainStatePaused:
   724  		if lastState == types.ContainerState_CONTAINER_CREATED {
   725  			containerState = types.ContainerState_CONTAINER_CREATED
   726  		} else {
   727  			containerState = types.ContainerState_CONTAINER_EXITED
   728  		}
   729  	case virt.DomainStateShutoff:
   730  		if lastState == types.ContainerState_CONTAINER_CREATED {
   731  			containerState = types.ContainerState_CONTAINER_CREATED
   732  		} else {
   733  			containerState = types.ContainerState_CONTAINER_EXITED
   734  		}
   735  	case virt.DomainStateCrashed:
   736  		containerState = types.ContainerState_CONTAINER_EXITED
   737  	case virt.DomainStatePMSuspended:
   738  		containerState = types.ContainerState_CONTAINER_EXITED
   739  	default:
   740  		containerState = types.ContainerState_CONTAINER_UNKNOWN
   741  	}
   742  
   743  	return containerState
   744  }
   745  
   746  func (v *VirtualizationTool) getPodContainer(podSandboxID string) (*types.ContainerInfo, error) {
   747  	// FIXME: is it possible for multiple containers to exist?
   748  	domainContainers, err := v.metadataStore.ListPodContainers(podSandboxID)
   749  	if err != nil {
   750  		// There's no such sandbox. Looks like it's already removed, so return an empty list
   751  		return nil, nil
   752  	}
   753  	for _, containerMeta := range domainContainers {
   754  		// TODO: Distinguish lack of domain from other errors
   755  		_, err := v.domainConn.LookupDomainByUUIDString(containerMeta.GetID())
   756  		if err != nil {
   757  			// There's no such domain. Looks like it's already removed, so return an empty list
   758  			return nil, nil
   759  		}
   760  
   761  		// Verify if there is container metadata
   762  		containerInfo, err := containerMeta.Retrieve()
   763  		if err != nil {
   764  			return nil, err
   765  		}
   766  		if containerInfo == nil {
   767  			// There's no such container - looks like it's already removed, but still is mentioned in sandbox
   768  			return nil, fmt.Errorf("container metadata not found, but it's still mentioned in sandbox %s", podSandboxID)
   769  		}
   770  
   771  		return containerInfo, nil
   772  	}
   773  	return nil, nil
   774  }
   775  
   776  // ListContainers queries libvirt for domains denoted by container id or
   777  // pod standbox id or for all domains and after gathering theirs description
   778  // from metadata and conversion of status from libvirt to kubeapi compatible
   779  // returns them as a list of kubeapi Containers.
   780  func (v *VirtualizationTool) ListContainers(filter *types.ContainerFilter) ([]*types.ContainerInfo, error) {
   781  	var containers []*types.ContainerInfo
   782  	switch {
   783  	case filter != nil && filter.Id != "":
   784  		containerInfo, err := v.ContainerInfo(filter.Id)
   785  		if err != nil || containerInfo == nil {
   786  			return nil, err
   787  		}
   788  		containers = append(containers, containerInfo)
   789  	case filter != nil && filter.PodSandboxID != "":
   790  		containerInfo, err := v.getPodContainer(filter.PodSandboxID)
   791  		if err != nil || containerInfo == nil {
   792  			return nil, err
   793  		}
   794  		containers = append(containers, containerInfo)
   795  	default:
   796  		// Get list of all the defined domains from libvirt
   797  		// and check each container against the remaining
   798  		// filter settings
   799  		domains, err := v.domainConn.ListDomains()
   800  		if err != nil {
   801  			return nil, err
   802  		}
   803  		for _, domain := range domains {
   804  			containerID, err := domain.UUIDString()
   805  			if err != nil {
   806  				return nil, err
   807  			}
   808  			containerInfo, err := v.ContainerInfo(containerID)
   809  			if err != nil {
   810  				return nil, err
   811  			}
   812  
   813  			if containerInfo == nil {
   814  				glog.V(1).Infof("Failed to find info for domain with id %q in virtlet db, considering it a non-virtlet libvirt domain.", containerID)
   815  				continue
   816  			}
   817  			containers = append(containers, containerInfo)
   818  		}
   819  	}
   820  
   821  	if filter == nil {
   822  		return containers, nil
   823  	}
   824  
   825  	var r []*types.ContainerInfo
   826  	for _, c := range containers {
   827  		if filterContainer(c, *filter) {
   828  			r = append(r, c)
   829  		}
   830  	}
   831  
   832  	return r, nil
   833  }
   834  
   835  // ContainerInfo returns info for the specified container, making sure it's also
   836  // present among libvirt domains. If it isn't, the function returns nil
   837  func (v *VirtualizationTool) ContainerInfo(containerID string) (*types.ContainerInfo, error) {
   838  	domain, err := v.domainConn.LookupDomainByUUIDString(containerID)
   839  	if err != nil {
   840  		return nil, err
   841  	}
   842  
   843  	containerInfo, err := v.metadataStore.Container(containerID).Retrieve()
   844  	if err != nil {
   845  		return nil, err
   846  	}
   847  	if containerInfo == nil {
   848  		return nil, nil
   849  	}
   850  
   851  	state, err := domain.State()
   852  	if err != nil {
   853  		return nil, err
   854  	}
   855  
   856  	containerState := virtToKubeState(state, containerInfo.State)
   857  	if containerInfo.State != containerState {
   858  		if err := v.metadataStore.Container(containerID).Save(
   859  			func(c *types.ContainerInfo) (*types.ContainerInfo, error) {
   860  				// make sure the container is not removed during the call
   861  				if c != nil {
   862  					c.State = containerState
   863  				}
   864  				return c, nil
   865  			},
   866  		); err != nil {
   867  			return nil, err
   868  		}
   869  		containerInfo.State = containerState
   870  	}
   871  	return containerInfo, nil
   872  }
   873  
   874  // VMStats returns current cpu/memory/disk usage for VM
   875  func (v *VirtualizationTool) VMStats(containerID string, name string) (*types.VMStats, error) {
   876  	domain, err := v.domainConn.LookupDomainByUUIDString(containerID)
   877  	if err != nil {
   878  		return nil, err
   879  	}
   880  	vs := types.VMStats{
   881  		Timestamp:   v.clock.Now().UnixNano(),
   882  		ContainerID: containerID,
   883  		Name:        name,
   884  	}
   885  
   886  	rss, err := domain.GetRSS()
   887  	if err != nil {
   888  		return nil, err
   889  	}
   890  	vs.MemoryUsage = rss
   891  
   892  	cpuTime, err := domain.GetCPUTime()
   893  	if err != nil {
   894  		return nil, err
   895  	}
   896  	vs.CpuUsage = cpuTime
   897  
   898  	domainxml, err := domain.XML()
   899  	if err != nil {
   900  		return nil, err
   901  	}
   902  
   903  	rootDiskLocation := ""
   904  	for _, disk := range domainxml.Devices.Disks {
   905  		if disk.Source == nil || disk.Source.File == nil {
   906  			continue
   907  		}
   908  		fname := disk.Source.File.File
   909  		// TODO: split file name and use HasPrefix on last part
   910  		// instead of Contains
   911  		if strings.Contains(fname, "virtlet_root_") {
   912  			rootDiskLocation = fname
   913  		}
   914  	}
   915  	if rootDiskLocation == "" {
   916  		return nil, fmt.Errorf("cannot locate root disk in domain definition")
   917  	}
   918  
   919  	rootDiskSize, err := v.ImageManager().BytesUsedBy(rootDiskLocation)
   920  	if err != nil {
   921  		return nil, err
   922  	}
   923  	vs.FsBytes = rootDiskSize
   924  
   925  	glog.V(4).Infof("VMStats - cpu: %d, mem: %d, disk: %d, timestamp: %d", vs.CpuUsage, vs.MemoryUsage, vs.FsBytes, vs.Timestamp)
   926  
   927  	return &vs, nil
   928  }
   929  
   930  // ListVMStats returns statistics (same as VMStats) for all containers matching
   931  // provided filter (id AND podstandboxid AND labels)
   932  func (v *VirtualizationTool) ListVMStats(filter *types.VMStatsFilter) ([]types.VMStats, error) {
   933  	var containersFilter *types.ContainerFilter
   934  	if filter != nil {
   935  		containersFilter = &types.ContainerFilter{}
   936  		if filter.Id != "" {
   937  			containersFilter.Id = filter.Id
   938  		}
   939  		if filter.PodSandboxID != "" {
   940  			containersFilter.PodSandboxID = filter.PodSandboxID
   941  		}
   942  		if filter.LabelSelector != nil {
   943  			containersFilter.LabelSelector = filter.LabelSelector
   944  		}
   945  	}
   946  
   947  	infos, err := v.ListContainers(containersFilter)
   948  	if err != nil {
   949  		return nil, err
   950  	}
   951  
   952  	var statsList []types.VMStats
   953  	for _, info := range infos {
   954  		stats, err := v.VMStats(info.Id, info.Name)
   955  		if err != nil {
   956  			return nil, err
   957  		}
   958  		statsList = append(statsList, *stats)
   959  	}
   960  	return statsList, nil
   961  }
   962  
   963  // volumeOwner implementation follows
   964  
   965  // StoragePool implements volumeOwner StoragePool method
   966  func (v *VirtualizationTool) StoragePool() (virt.StoragePool, error) {
   967  	return ensureStoragePool(v.storageConn, v.config.VolumePoolName)
   968  }
   969  
   970  // DomainConnection implements volumeOwner DomainConnection method
   971  func (v *VirtualizationTool) DomainConnection() virt.DomainConnection { return v.domainConn }
   972  
   973  // StorageConnection implements volumeOwner StorageConnection method
   974  func (v *VirtualizationTool) StorageConnection() virt.StorageConnection { return v.storageConn }
   975  
   976  // ImageManager implements volumeOwner ImageManager method
   977  func (v *VirtualizationTool) ImageManager() ImageManager { return v.imageManager }
   978  
   979  // RawDevices implements volumeOwner RawDevices method
   980  func (v *VirtualizationTool) RawDevices() []string { return v.config.RawDevices }
   981  
   982  // KubeletRootDir implements volumeOwner KubeletRootDir method
   983  func (v *VirtualizationTool) KubeletRootDir() string { return v.config.KubeletRootDir }
   984  
   985  // VolumePoolName implements volumeOwner VolumePoolName method
   986  func (v *VirtualizationTool) VolumePoolName() string { return v.config.VolumePoolName }
   987  
   988  // FileSystem implements volumeOwner FileSystem method
   989  func (v *VirtualizationTool) FileSystem() fs.FileSystem { return v.fsys }
   990  
   991  // SharedFilesystemPath implements volumeOwner SharedFilesystemPath method
   992  func (v *VirtualizationTool) SharedFilesystemPath() string { return v.config.SharedFilesystemPath }
   993  
   994  // Commander implements volumeOwner Commander method
   995  func (v *VirtualizationTool) Commander() utils.Commander { return v.commander }
   996  
   997  func filterContainer(containerInfo *types.ContainerInfo, filter types.ContainerFilter) bool {
   998  	if filter.Id != "" && containerInfo.Id != filter.Id {
   999  		return false
  1000  	}
  1001  
  1002  	if filter.PodSandboxID != "" && containerInfo.Config.PodSandboxID != filter.PodSandboxID {
  1003  		return false
  1004  	}
  1005  
  1006  	if filter.State != nil && containerInfo.State != *filter.State {
  1007  		return false
  1008  	}
  1009  	if filter.LabelSelector != nil {
  1010  		sel := fields.SelectorFromSet(filter.LabelSelector)
  1011  		if !sel.Matches(fields.Set(containerInfo.Config.ContainerLabels)) {
  1012  			return false
  1013  		}
  1014  	}
  1015  
  1016  	return true
  1017  }