github.com/kata-containers/runtime@v0.0.0-20210505125100-04f29832a923/virtcontainers/container.go

// +build linux
// Copyright (c) 2016 Intel Corporation
// Copyright (c) 2014,2015,2016,2017 Docker, Inc.
// SPDX-License-Identifier: Apache-2.0
//

package virtcontainers

import (
	"context"
	"encoding/hex"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"syscall"
	"time"

	"github.com/containerd/cgroups"
	vccgroups "github.com/kata-containers/runtime/virtcontainers/pkg/cgroups"
	vcTypes "github.com/kata-containers/runtime/virtcontainers/pkg/types"
	"github.com/kata-containers/runtime/virtcontainers/types"
	"github.com/kata-containers/runtime/virtcontainers/utils"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	opentracing "github.com/opentracing/opentracing-go"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"

	"github.com/kata-containers/runtime/virtcontainers/device/config"
	"github.com/kata-containers/runtime/virtcontainers/device/manager"
	"github.com/kata-containers/runtime/virtcontainers/pkg/rootless"
	"github.com/kata-containers/runtime/virtcontainers/store"
)

// https://github.com/torvalds/linux/blob/master/include/uapi/linux/major.h
// This file has definitions for major device numbers.
var cdromMajors = map[int64]string{
	11: "SCSI_CDROM_MAJOR",
	15: "CDU31A_CDROM_MAJOR",
	16: "GOLDSTAR_CDROM_MAJOR",
	17: "OPTICS_CDROM_MAJOR",
	18: "SANYO_CDROM_MAJOR",
	20: "MITSUMI_X_CDROM_MAJOR",
	23: "MITSUMI_CDROM_MAJOR",
	24: "CDU535_CDROM_MAJOR",
	25: "MATSUSHITA_CDROM_MAJOR",
	26: "MATSUSHITA_CDROM2_MAJOR",
	27: "MATSUSHITA_CDROM3_MAJOR",
	28: "MATSUSHITA_CDROM4_MAJOR",
	29: "AZTECH_CDROM_MAJOR",
	32: "CM206_CDROM_MAJOR",
}

// https://github.com/torvalds/linux/blob/master/include/uapi/linux/major.h
// #define FLOPPY_MAJOR 2
const floppyMajor = int64(2)

// Process gathers data related to a container process.
type Process struct {
	// Token is the process execution context ID. It must be
	// unique per sandbox.
	// Token is used to manipulate processes for containers
	// that have not started yet, and later identify them
	// uniquely within a sandbox.
	Token string

	// Pid is the process ID as seen by the host software
	// stack, e.g. CRI-O, containerd. This is typically the
	// shim PID.
	Pid int

	StartTime time.Time
}

// ContainerStatus describes a container status.
type ContainerStatus struct {
	ID        string
	State     types.ContainerState
	PID       int
	StartTime time.Time
	RootFs    string
	Spec      *specs.Spec

	// Annotations allow clients to store arbitrary values,
	// for example to add additional status values required
	// to support particular specifications.
	Annotations map[string]string
}

// ThrottlingData gathers the data related to container CPU throttling.
type ThrottlingData struct {
	// Number of periods with throttling active
	Periods uint64 `json:"periods,omitempty"`
	// Number of periods when the container hit its throttling limit.
	ThrottledPeriods uint64 `json:"throttled_periods,omitempty"`
	// Aggregate time the container was throttled for in nanoseconds.
	ThrottledTime uint64 `json:"throttled_time,omitempty"`
}
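// The helper below is an illustrative sketch, not part of the original file:
// it shows one way a caller might interpret ThrottlingData, assuming the
// counters behave as documented above (ThrottledPeriods <= Periods).
func throttledPeriodRatio(t ThrottlingData) float64 {
	if t.Periods == 0 {
		return 0
	}
	// Fraction of enforcement periods in which the container hit its CPU quota.
	return float64(t.ThrottledPeriods) / float64(t.Periods)
}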
// CPUUsage denotes the usage of a CPU.
// All CPU stats are aggregate since container inception.
type CPUUsage struct {
	// Total CPU time consumed.
	// Units: nanoseconds.
	TotalUsage uint64 `json:"total_usage,omitempty"`
	// Total CPU time consumed per core.
	// Units: nanoseconds.
	PercpuUsage []uint64 `json:"percpu_usage,omitempty"`
	// Time spent by tasks of the cgroup in kernel mode.
	// Units: nanoseconds.
	UsageInKernelmode uint64 `json:"usage_in_kernelmode"`
	// Time spent by tasks of the cgroup in user mode.
	// Units: nanoseconds.
	UsageInUsermode uint64 `json:"usage_in_usermode"`
}

// CPUStats describes the CPU stats
type CPUStats struct {
	CPUUsage       CPUUsage       `json:"cpu_usage,omitempty"`
	ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
}

// MemoryData gathers the data related to memory
type MemoryData struct {
	Usage    uint64 `json:"usage,omitempty"`
	MaxUsage uint64 `json:"max_usage,omitempty"`
	Failcnt  uint64 `json:"failcnt"`
	Limit    uint64 `json:"limit"`
}

// MemoryStats describes the memory stats
type MemoryStats struct {
	// memory used for cache
	Cache uint64 `json:"cache,omitempty"`
	// usage of memory
	Usage MemoryData `json:"usage,omitempty"`
	// usage of memory swap
	SwapUsage MemoryData `json:"swap_usage,omitempty"`
	// usage of kernel memory
	KernelUsage MemoryData `json:"kernel_usage,omitempty"`
	// usage of kernel TCP memory
	KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"`
	// if true, memory usage is accounted for throughout a hierarchy of cgroups.
	UseHierarchy bool `json:"use_hierarchy"`

	Stats map[string]uint64 `json:"stats,omitempty"`
}

// PidsStats describes the pids stats
type PidsStats struct {
	// number of pids in the cgroup
	Current uint64 `json:"current,omitempty"`
	// active pids hard limit
	Limit uint64 `json:"limit,omitempty"`
}

// BlkioStatEntry gathers data related to a block device
type BlkioStatEntry struct {
	Major uint64 `json:"major,omitempty"`
	Minor uint64 `json:"minor,omitempty"`
	Op    string `json:"op,omitempty"`
	Value uint64 `json:"value,omitempty"`
}

// BlkioStats describes block io stats
type BlkioStats struct {
	// number of bytes transferred to and from the block device
	IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"`
	IoServicedRecursive     []BlkioStatEntry `json:"io_serviced_recursive,omitempty"`
	IoQueuedRecursive       []BlkioStatEntry `json:"io_queue_recursive,omitempty"`
	IoServiceTimeRecursive  []BlkioStatEntry `json:"io_service_time_recursive,omitempty"`
	IoWaitTimeRecursive     []BlkioStatEntry `json:"io_wait_time_recursive,omitempty"`
	IoMergedRecursive       []BlkioStatEntry `json:"io_merged_recursive,omitempty"`
	IoTimeRecursive         []BlkioStatEntry `json:"io_time_recursive,omitempty"`
	SectorsRecursive        []BlkioStatEntry `json:"sectors_recursive,omitempty"`
}

// HugetlbStats describes hugetlb memory stats
type HugetlbStats struct {
	// current res_counter usage for hugetlb
	Usage uint64 `json:"usage,omitempty"`
	// maximum usage ever recorded.
	MaxUsage uint64 `json:"max_usage,omitempty"`
	// number of times hugetlb usage allocation failed.
	Failcnt uint64 `json:"failcnt"`
}

// CgroupStats describes all cgroup subsystem stats
type CgroupStats struct {
	CPUStats    CPUStats    `json:"cpu_stats,omitempty"`
	MemoryStats MemoryStats `json:"memory_stats,omitempty"`
	PidsStats   PidsStats   `json:"pids_stats,omitempty"`
	BlkioStats  BlkioStats  `json:"blkio_stats,omitempty"`
	// the map is in the format "size of hugepage: stats of the hugepage"
	HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`
}

// NetworkStats describes all network stats.
type NetworkStats struct {
	// Name is the name of the network interface.
	Name string `json:"name,omitempty"`

	RxBytes   uint64 `json:"rx_bytes,omitempty"`
	RxPackets uint64 `json:"rx_packets,omitempty"`
	RxErrors  uint64 `json:"rx_errors,omitempty"`
	RxDropped uint64 `json:"rx_dropped,omitempty"`
	TxBytes   uint64 `json:"tx_bytes,omitempty"`
	TxPackets uint64 `json:"tx_packets,omitempty"`
	TxErrors  uint64 `json:"tx_errors,omitempty"`
	TxDropped uint64 `json:"tx_dropped,omitempty"`
}

// ContainerStats describes a container's stats.
type ContainerStats struct {
	CgroupStats  *CgroupStats
	NetworkStats []*NetworkStats
}

// ContainerResources describes container resources
type ContainerResources struct {
	// VCPUs are the number of vCPUs that are being used by the container
	VCPUs uint32

	// MemByte is the memory, in bytes, that is being used by the container
	MemByte int64
}

// ContainerConfig describes one container runtime configuration.
type ContainerConfig struct {
	ID string

	// RootFs is the container workload image on the host.
	RootFs RootFs

	// ReadonlyRootfs indicates if the rootfs should be mounted read-only
	ReadonlyRootfs bool

	// Cmd specifies the command to run on a container
	Cmd types.Cmd

	// Annotations allow clients to store arbitrary values,
	// for example to add additional status values required
	// to support particular specifications.
	Annotations map[string]string

	Mounts []Mount

	// Device configuration for devices that must be available within the container.
	DeviceInfos []config.DeviceInfo

	// Resources describes the container resources
	Resources specs.LinuxResources

	// CustomSpec is the raw OCI specification; it won't be saved to disk.
	CustomSpec *specs.Spec `json:"-"`
}

// valid checks that the container configuration is valid.
func (c *ContainerConfig) valid() bool {
	if c == nil {
		return false
	}

	if c.ID == "" {
		return false
	}

	return true
}
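// newExampleContainerConfig is an illustrative sketch, not part of the original
// file: it builds the smallest configuration that passes valid(). The ID and
// rootfs path are made-up example values; RootFs is defined later in this file.
func newExampleContainerConfig() *ContainerConfig {
	return &ContainerConfig{
		ID: "example-container",
		// Source is the host-side path of the container workload image.
		RootFs: RootFs{Source: "/path/to/rootfs", Mounted: true},
	}
}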
// SystemMountsInfo describes additional information for system mounts that the agent
// needs to handle
type SystemMountsInfo struct {
	// Indicates if /dev has been passed as a bind mount for the host /dev
	BindMountDev bool

	// Size of /dev/shm assigned on the host.
	DevShmSize uint
}

// ContainerDevice describes a device associated with container
type ContainerDevice struct {
	// ID is the device id referencing the device from the sandbox's device manager
	ID string

	// ContainerPath is the device path displayed in the container
	ContainerPath string

	// FileMode permission bits for the device.
	FileMode os.FileMode

	// UID is user ID in the container namespace
	UID uint32

	// GID is group ID in the container namespace
	GID uint32
}

// RootFs describes the container's rootfs.
type RootFs struct {
	// Source specifies the BlockDevice path
	Source string
	// Target specifies where the rootfs is mounted, if it has been mounted
	Target string
	// Type specifies the type of filesystem to mount.
	Type string
	// Options specifies zero or more fstab style mount options.
	Options []string
	// Mounted specifies whether the rootfs has been mounted or not
	Mounted bool
}

// Container represents a single container within a sandbox, together with its
// runtime environment.
// A Container can be created, deleted, started, stopped, listed, entered, paused and restored.
type Container struct {
	id        string
	sandboxID string

	rootFs RootFs

	config *ContainerConfig

	sandbox *Sandbox

	containerPath string
	rootfsSuffix  string

	state types.ContainerState

	process Process

	mounts []Mount

	devices []ContainerDevice

	systemMountsInfo SystemMountsInfo

	ctx context.Context

	store *store.VCStore
}

// ID returns the container identifier string.
func (c *Container) ID() string {
	return c.id
}

// Logger returns a logrus logger appropriate for logging Container messages
func (c *Container) Logger() *logrus.Entry {
	return virtLog.WithFields(logrus.Fields{
		"subsystem": "container",
		"sandbox":   c.sandboxID,
	})
}

func (c *Container) trace(name string) (opentracing.Span, context.Context) {
	if c.ctx == nil {
		c.Logger().WithField("type", "bug").Error("trace called before context set")
		c.ctx = context.Background()
	}

	span, ctx := opentracing.StartSpanFromContext(c.ctx, name)

	span.SetTag("subsystem", "container")

	return span, ctx
}

// Sandbox returns the sandbox handler related to this container.
func (c *Container) Sandbox() VCSandbox {
	return c.sandbox
}

// Process returns the container process.
func (c *Container) Process() Process {
	return c.process
}

// GetToken returns the token related to this container's process.
func (c *Container) GetToken() string {
	return c.process.Token
}

// GetPid returns the pid related to this container's process.
func (c *Container) GetPid() int {
	return c.process.Pid
}

func (c *Container) setStateFstype(fstype string) error {
	c.state.Fstype = fstype

	return nil
}

// GetAnnotations returns container's annotations
func (c *Container) GetAnnotations() map[string]string {
	return c.config.Annotations
}

// GetPatchedOCISpec returns container's OCI specification
// This OCI specification was patched when the sandbox was created
// by containerCapabilities(), SetEphemeralStorageType() and others
// in order to support:
// * capabilities
// * Ephemeral storage
// * k8s empty dir
// If you need the original (vanilla) OCI spec,
// use compatoci.GetContainerSpec() instead.
func (c *Container) GetPatchedOCISpec() *specs.Spec {
	return c.config.CustomSpec
}

// storeContainer stores a container config.
func (c *Container) storeContainer() error {
	if err := c.sandbox.Save(); err != nil {
		return err
	}
	return nil
}

// setContainerState sets both the in-memory and on-disk state of the
// container.
func (c *Container) setContainerState(state types.StateString) error {
	if state == "" {
		return vcTypes.ErrNeedState
	}

	c.Logger().Debugf("Setting container state from %v to %v", c.state.State, state)
	// update in-memory state
	c.state.State = state

	if useOldStore(c.sandbox.ctx) {
		// the experimental runtime uses "persist.json", which doesn't need "state.json" anymore
		// update on-disk state
		if err := c.store.Store(store.State, c.state); err != nil {
			return err
		}
	} else {
		// flush data to storage
		if err := c.sandbox.Save(); err != nil {
			return err
		}
	}

	return nil
}

func (c *Container) shareFiles(m Mount, idx int, hostSharedDir, hostMountDir, guestSharedDir string) (string, bool, error) {
	randBytes, err := utils.GenerateRandomBytes(8)
	if err != nil {
		return "", false, err
	}

	filename := fmt.Sprintf("%s-%s-%s", c.id, hex.EncodeToString(randBytes), filepath.Base(m.Destination))
	guestDest := filepath.Join(guestSharedDir, filename)

	// copy file to container's rootfs if filesystem sharing is not supported, otherwise
	// bind mount it in the shared directory.
	caps := c.sandbox.hypervisor.capabilities()
	if !caps.IsFsSharingSupported() {
		c.Logger().Debug("filesystem sharing is not supported, files will be copied")

		fileInfo, err := os.Stat(m.Source)
		if err != nil {
			return "", false, err
		}

		// Ignore the mount if this is not a regular file (excludes
		// directory, socket, device, ...) as it cannot be handled by
		// a simple copy. But this should not be treated as an error,
		// only as a limitation.
		if !fileInfo.Mode().IsRegular() {
			c.Logger().WithField("ignored-file", m.Source).Debug("Ignoring non-regular file as FS sharing not supported")
			return "", true, nil
		}

		if err := c.sandbox.agent.copyFile(m.Source, guestDest); err != nil {
			return "", false, err
		}
	} else {
		// These mounts are created in the shared dir
		mountDest := filepath.Join(hostMountDir, filename)
		if err := bindMount(c.ctx, m.Source, mountDest, m.ReadOnly, "private"); err != nil {
			return "", false, err
		}
		// Save HostPath mount value into the mount list of the container.
		c.mounts[idx].HostPath = mountDest
		// bindmount remount event is not propagated to mount subtrees, so we have to remount the shared dir mountpoint directly.
		if m.ReadOnly {
			mountDest = filepath.Join(hostSharedDir, filename)
			if err := remountRo(c.ctx, mountDest); err != nil {
				return "", false, err
			}
		}
	}

	return guestDest, false, nil
}
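// Note (illustrative, not part of the original file): for a container with
// id "abc" whose spec bind-mounts something at "/etc/hosts", shareFiles above
// produces a name like "abc-<16 hex chars>-hosts". On the host, that name is
// bind mounted under hostMountDir, and the returned guestDest is the matching
// path under guestSharedDir, which is what ends up in the container's mount
// list inside the VM.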
// mountSharedDirMounts handles bind-mounts by bindmounting to the host shared
// directory which is mounted through virtiofs/9pfs in the VM.
// It also updates the container mount list with the HostPath info, and stores
// container mounts to the storage. This way, we will have the HostPath info
// available when we need to unmount those mounts.
func (c *Container) mountSharedDirMounts(hostSharedDir, hostMountDir, guestSharedDir string) (sharedDirMounts map[string]Mount, ignoredMounts map[string]Mount, err error) {
	sharedDirMounts = make(map[string]Mount)
	ignoredMounts = make(map[string]Mount)
	var devicesToDetach []string
	defer func() {
		if err != nil {
			for _, id := range devicesToDetach {
				c.sandbox.devManager.DetachDevice(id, c.sandbox)
			}
		}
	}()
	for idx, m := range c.mounts {
		// Skip mounting certain system paths from the source on the host side
		// into the container as it does not make sense to do so.
		// Example sources could be /sys/fs/cgroup etc.
		if isSystemMount(m.Source) {
			continue
		}

		// Check if mount is a block device file. If it is, the block device will be attached to the host
		// instead of passing this as a shared mount:
		if len(m.BlockDeviceID) > 0 {
			// Attach this block device, all other devices passed in the config have been attached at this point
			if err = c.sandbox.devManager.AttachDevice(m.BlockDeviceID, c.sandbox); err != nil {
				return nil, nil, err
			}
			devicesToDetach = append(devicesToDetach, m.BlockDeviceID)
			continue
		}

		// For non-block based mounts, we are only interested in bind mounts
		if m.Type != "bind" {
			continue
		}

		// We need to treat /dev/shm as a special case. This is passed as a bind mount in the spec,
		// but it does not make sense to pass this as a 9p mount from the host side.
		// This needs to be handled purely in the guest, by allocating memory for this inside the VM.
		if m.Destination == "/dev/shm" {
			continue
		}

		// Ignore /dev, directories and all other device files. We handle
		// only regular files in /dev. It does not make sense to pass the host
		// device nodes to the guest.
		if isHostDevice(m.Destination) {
			continue
		}

		var ignore bool
		var guestDest string
		guestDest, ignore, err = c.shareFiles(m, idx, hostSharedDir, hostMountDir, guestSharedDir)
		if err != nil {
			return nil, nil, err
		}

		// Expand the list of mounts to ignore.
		if ignore {
			ignoredMounts[m.Source] = Mount{Source: m.Source}
			continue
		}

		sharedDirMount := Mount{
			Source:      guestDest,
			Destination: m.Destination,
			Type:        m.Type,
			Options:     m.Options,
			ReadOnly:    m.ReadOnly,
		}

		sharedDirMounts[sharedDirMount.Destination] = sharedDirMount
	}

	return sharedDirMounts, ignoredMounts, nil
}

func (c *Container) unmountHostMounts() error {
	var span opentracing.Span
	span, c.ctx = c.trace("unmountHostMounts")
	defer span.Finish()

	for _, m := range c.mounts {
		if m.HostPath != "" {
			span, _ := c.trace("unmount")
			span.SetTag("host-path", m.HostPath)

			if err := syscall.Unmount(m.HostPath, syscall.MNT_DETACH|UmountNoFollow); err != nil {
				c.Logger().WithFields(logrus.Fields{
					"host-path": m.HostPath,
					"error":     err,
				}).Warn("Could not umount")
				return err
			}

			if m.Type == "bind" {
				s, err := os.Stat(m.HostPath)
				if err != nil {
					return errors.Wrapf(err, "Could not stat host-path %v", m.HostPath)
				}
				// Remove the empty file or directory
				if s.Mode().IsRegular() && s.Size() == 0 {
					os.Remove(m.HostPath)
				}
				if s.Mode().IsDir() {
					syscall.Rmdir(m.HostPath)
				}
			}

			span.Finish()
		}
	}

	return nil
}

func filterDevices(c *Container, devices []ContainerDevice) (ret []ContainerDevice) {
	for _, dev := range devices {
		major, _ := c.sandbox.devManager.GetDeviceByID(dev.ID).GetMajorMinor()
		if _, ok := cdromMajors[major]; ok {
			c.Logger().WithFields(logrus.Fields{
				"device": dev.ContainerPath,
			}).Info("Not attach device because it is a CDROM")
			continue
		}

		if major == floppyMajor {
			c.Logger().WithFields(logrus.Fields{
				"device": dev.ContainerPath,
			}).Info("Not attaching device because it is a floppy drive")
			continue
		}

		ret = append(ret, dev)
	}
	return
}

// Add any mount based block devices to the device manager and save the
// device ID for the particular mount. This'll occur when the mountpoint source
// is a block device.
func (c *Container) createBlockDevices() error {
	if !c.checkBlockDeviceSupport() {
		c.Logger().Warn("Block device not supported")
		return nil
	}

	// iterate all mounts and create block device if it's block based.
	for i, m := range c.mounts {
		if len(m.BlockDeviceID) > 0 {
			// A non-empty m.BlockDeviceID indicates there's already one device
			// associated with the mount, so no need to create a new device for it;
			// we only create block devices for bind mounts
			continue
		}

		if m.Type != "bind" {
			// We only handle bind mounts
			continue
		}

		var stat unix.Stat_t
		if err := unix.Stat(m.Source, &stat); err != nil {
			return fmt.Errorf("stat %q failed: %v", m.Source, err)
		}

		var di *config.DeviceInfo
		var err error

		// Check if mount is a block device file. If it is, the block device will be attached to the host
		// instead of passing this as a shared mount.
		if stat.Mode&unix.S_IFBLK == unix.S_IFBLK {
			di = &config.DeviceInfo{
				HostPath:      m.Source,
				ContainerPath: m.Destination,
				DevType:       "b",
				Major:         int64(unix.Major(stat.Rdev)),
				Minor:         int64(unix.Minor(stat.Rdev)),
				ReadOnly:      m.ReadOnly,
			}
			// check whether source can be used as a pmem device
		} else if di, err = config.PmemDeviceInfo(m.Source, m.Destination); err != nil {
			c.Logger().WithError(err).
				WithField("mount-source", m.Source).
				Debug("no loop device")
		}

		if err == nil && di != nil {
			b, err := c.sandbox.devManager.NewDevice(*di)
			if err != nil {
				// Do not return an error, try to create
				// devices for other mounts
				c.Logger().WithError(err).WithField("mount-source", m.Source).
					Error("device manager failed to create new device")
				continue

			}

			c.mounts[i].BlockDeviceID = b.DeviceID()
		}
	}

	return nil
}

// newContainer creates a Container structure from a sandbox and a container configuration.
func newContainer(sandbox *Sandbox, contConfig *ContainerConfig) (*Container, error) {
	span, _ := sandbox.trace("newContainer")
	defer span.Finish()

	if !contConfig.valid() {
		return &Container{}, fmt.Errorf("Invalid container configuration")
	}

	c := &Container{
		id:            contConfig.ID,
		sandboxID:     sandbox.id,
		rootFs:        contConfig.RootFs,
		config:        contConfig,
		sandbox:       sandbox,
		containerPath: filepath.Join(sandbox.id, contConfig.ID),
		rootfsSuffix:  "rootfs",
		state:         types.ContainerState{},
		process:       Process{},
		mounts:        contConfig.Mounts,
		ctx:           sandbox.ctx,
	}

	if useOldStore(sandbox.ctx) {
		ctrStore, err := store.NewVCContainerStore(sandbox.ctx, c.sandboxID, c.id)
		if err != nil {
			return nil, err
		}
		c.store = ctrStore
		state, err := c.store.LoadContainerState()
		if err == nil {
			c.state = state
		}

		var process Process
		if err := c.store.Load(store.Process, &process); err == nil {
			c.process = process
		}
	} else {
		// the experimental runtime uses "persist.json" instead of the legacy "state.json" as storage
		err := c.Restore()
		if err == nil {
			// container restored
			return c, nil
		}

		// Unexpected error
		if !os.IsNotExist(err) && err != errContainerPersistNotExist {
			return nil, err
		}
	}

	// If mounts are block devices, add to devmanager
	if err := c.createMounts(); err != nil {
		return nil, err
	}

	// Add container's devices to sandbox's device-manager
	if err := c.createDevices(contConfig); err != nil {
		return nil, err
	}

	return c, nil
}

func (c *Container) loadMounts() ([]Mount, error) {
	var mounts []Mount
	if err := c.store.Load(store.Mounts, &mounts); err != nil {
		return []Mount{}, err
	}

	return mounts, nil
}

func (c *Container) loadDevices() ([]ContainerDevice, error) {
	var devices []ContainerDevice

	if err := c.store.Load(store.DeviceIDs, &devices); err != nil {
		return []ContainerDevice{}, err
	}

	return devices, nil
}

func (c *Container) createMounts() error {
	if useOldStore(c.sandbox.ctx) {
		mounts, err := c.loadMounts()
		if err == nil {
			// restore mounts from disk
			c.mounts = mounts
			return nil
		}
	}

	// Create block devices for newly created container
	return c.createBlockDevices()
}

func (c *Container) createDevices(contConfig *ContainerConfig) error {
	// If the sandbox supports "newstore", only newly created containers can reach this function,
	// so we don't call restore when `supportNewStore` is true
	if useOldStore(c.sandbox.ctx) {
		// Devices will be found in storage after create stage has completed.
		// We load devices from storage at all other stages.
		storedDevices, err := c.loadDevices()
		if err == nil {
			c.devices = storedDevices
			return nil
		}
	}

	// If devices were not found in storage, create Device implementations
	// from the configuration. This should happen at create.
	var storedDevices []ContainerDevice
	for _, info := range contConfig.DeviceInfos {
		dev, err := c.sandbox.devManager.NewDevice(info)
		if err != nil {
			return err
		}

		storedDevices = append(storedDevices, ContainerDevice{
			ID:            dev.DeviceID(),
			ContainerPath: info.ContainerPath,
			FileMode:      info.FileMode,
			UID:           info.UID,
			GID:           info.GID,
		})
	}
	c.devices = filterDevices(c, storedDevices)
	return nil
}

// rollbackFailingContainerCreation rolls back important steps that might have
// been performed before the container creation failed.
// - Unplug CPU and memory resources from the VM.
// - Unplug devices from the VM.
func (c *Container) rollbackFailingContainerCreation() {
	if err := c.detachDevices(); err != nil {
		c.Logger().WithError(err).Error("rollback failed detachDevices()")
	}
	if err := c.removeDrive(); err != nil {
		c.Logger().WithError(err).Error("rollback failed removeDrive()")
	}
	if err := c.unmountHostMounts(); err != nil {
		c.Logger().WithError(err).Error("rollback failed unmountHostMounts()")
	}
	if err := bindUnmountContainerRootfs(c.ctx, getMountPath(c.sandbox.id), c); err != nil {
		c.Logger().WithError(err).Error("rollback failed bindUnmountContainerRootfs()")
	}
}

func (c *Container) checkBlockDeviceSupport() bool {
	if !c.sandbox.config.HypervisorConfig.DisableBlockDeviceUse {
		agentCaps := c.sandbox.agent.capabilities()
		hypervisorCaps := c.sandbox.hypervisor.capabilities()

		if agentCaps.IsBlockDeviceSupported() && hypervisorCaps.IsBlockDeviceHotplugSupported() {
			return true
		}
	}

	return false
}

// create creates and starts a container inside a Sandbox. It has to be
// called only when a new container, not known by the sandbox, has to be created.
func (c *Container) create() (err error) {
	// In case the container creation fails, the following takes care
	// of rolling back all the actions previously performed.
	defer func() {
		if err != nil {
			c.Logger().WithError(err).Error("container create failed")
			c.rollbackFailingContainerCreation()
		}
	}()

	if c.checkBlockDeviceSupport() {
		// If the rootfs is backed by a block device, go ahead and hotplug it to the guest
		if err = c.hotplugDrive(); err != nil {
			return
		}
	}

	var (
		machineType        = c.sandbox.config.HypervisorConfig.HypervisorMachineType
		normalAttachedDevs []ContainerDevice //for q35: normally attached devices
		delayAttachedDevs  []ContainerDevice //for q35: delay attached devices, for example, large bar space device
	)
	// Fix: https://github.com/kata-containers/runtime/issues/2460
	if machineType == QemuQ35 {
		// add Large Bar space device to delayAttachedDevs
		for _, device := range c.devices {
			var isLargeBarSpace bool
			isLargeBarSpace, err = manager.IsVFIOLargeBarSpaceDevice(device.ContainerPath)
			if err != nil {
				return
			}
			if isLargeBarSpace {
				delayAttachedDevs = append(delayAttachedDevs, device)
			} else {
				normalAttachedDevs = append(normalAttachedDevs, device)
			}
		}
	} else {
		normalAttachedDevs = c.devices
	}

	c.Logger().WithFields(logrus.Fields{
		"machine_type": machineType,
		"devices":      normalAttachedDevs,
	}).Info("normal attach devices")
	if len(normalAttachedDevs) > 0 {
		if err = c.attachDevices(normalAttachedDevs); err != nil {
			return
		}
	}

	// Deduce additional system mount info that should be handled by the agent
	// inside the VM
	c.getSystemMountInfo()

	process, err := c.sandbox.agent.createContainer(c.sandbox, c)
	if err != nil {
		return err
	}
	c.process = *process

	// lazy attach device after createContainer for q35
	if machineType == QemuQ35 && len(delayAttachedDevs) > 0 {
		c.Logger().WithFields(logrus.Fields{
			"machine_type": machineType,
			"devices":      delayAttachedDevs,
		}).Info("lazy attach devices")
		if err = c.attachDevices(delayAttachedDevs); err != nil {
			return
		}
	}

	if !rootless.IsRootless() && !c.sandbox.config.SandboxCgroupOnly {
		if err = c.cgroupsCreate(); err != nil {
			return
		}
	}

	if err = c.setContainerState(types.StateReady); err != nil {
		return
	}

	return nil
}

func (c *Container) delete() error {
	if c.state.State != types.StateReady &&
		c.state.State != types.StateStopped {
		return fmt.Errorf("Container not ready or stopped, impossible to delete")
	}

	// Remove the container from sandbox structure
	if err := c.sandbox.removeContainer(c.id); err != nil {
		return err
	}

	// If running rootless, there are no cgroups to remove
	if !c.sandbox.config.SandboxCgroupOnly || !rootless.IsRootless() {
		if err := c.cgroupsDelete(); err != nil {
			return err
		}
	}

	return c.sandbox.storeSandbox()
}
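// Illustrative sketch, not part of the original file: the state checks in the
// methods above and below imply the expected lifecycle for a container inside
// a running sandbox. Assuming `c` was built with newContainer():
//
//	c.create()     // container ends up in StateReady
//	c.start()      // Ready (or Stopped) -> Running
//	c.stop(false)  // Running -> Stopped
//	c.delete()     // allowed from Ready or Stopped
//
// pause()/resume() toggle Running <-> Paused, and most operations first call
// checkSandboxRunning() to make sure the sandbox itself is still running.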
// checkSandboxRunning validates the container state.
//
// cmd specifies the operation (or verb) that the retrieval is destined
// for and is only used to make the returned error as descriptive as
// possible.
func (c *Container) checkSandboxRunning(cmd string) error {
	if cmd == "" {
		return fmt.Errorf("Cmd cannot be empty")
	}

	if c.sandbox.state.State != types.StateRunning {
		return fmt.Errorf("Sandbox not running, impossible to %s the container", cmd)
	}

	return nil
}

func (c *Container) getSystemMountInfo() {
	// check if /dev needs to be bind mounted from host /dev
	c.systemMountsInfo.BindMountDev = false

	for _, m := range c.mounts {
		if m.Source == "/dev" && m.Destination == "/dev" && m.Type == "bind" {
			c.systemMountsInfo.BindMountDev = true
		}
	}

	// TODO Deduce /dev/shm size. See https://github.com/clearcontainers/runtime/issues/138
}

func (c *Container) start() error {
	if err := c.checkSandboxRunning("start"); err != nil {
		return err
	}

	if c.state.State != types.StateReady &&
		c.state.State != types.StateStopped {
		return fmt.Errorf("Container not ready or stopped, impossible to start")
	}

	if err := c.state.ValidTransition(c.state.State, types.StateRunning); err != nil {
		return err
	}

	if err := c.sandbox.agent.startContainer(c.sandbox, c); err != nil {
		c.Logger().WithError(err).Error("Failed to start container")

		if err := c.stop(true); err != nil {
			c.Logger().WithError(err).Warn("Failed to stop container")
		}
		return err
	}

	return c.setContainerState(types.StateRunning)
}

func (c *Container) stop(force bool) error {
	span, _ := c.trace("stop")
	defer span.Finish()

	// In case the container status has been updated implicitly because
	// the container process has terminated, it might be possible that
	// someone tries to stop the container, and we don't want to issue an
	// error in that case. This should be a no-op.
	//
	// This has to be handled before the transition validation since this
	// is an exception.
	if c.state.State == types.StateStopped {
		c.Logger().Info("Container already stopped")
		return nil
	}

	if err := c.state.ValidTransition(c.state.State, types.StateStopped); err != nil {
		return err
	}

	defer func() {
		span, _ := c.trace("stopShim")
		defer span.Finish()

		// If the shim is still running, something went wrong.
		// Make sure we stop the shim process.
		if running, _ := isShimRunning(c.process.Pid); running {
			l := c.Logger()
			l.Error("Failed to stop container so stopping dangling shim")
			if err := stopShim(c.process.Pid); err != nil {
				l.WithError(err).Warn("failed to stop shim")
			}
		}

	}()

	// Here we expect that stop() has been called because the container
	// process returned or because it received a signal. In case of a
	// signal, we want to give it some time to end the container process.
	// However, if the signal didn't reach its goal, the caller still
	// expects this container to be stopped, that's why we should not
	// return an error, but instead try to kill it forcefully.
	if err := waitForShim(c.process.Pid); err != nil {
		// Force the container to be killed.
		if err := c.kill(syscall.SIGKILL, true); err != nil && !force {
			return err
		}

		// Wait for the end of container process. We expect this call
		// to succeed.
		// Indeed, we have already given a second chance
		// to the container by trying to kill it with SIGKILL, there
		// is no reason to try to go further if we got an error.
		if err := waitForShim(c.process.Pid); err != nil && !force {
			return err
		}
	}

	// Force the container to be killed. For most of the cases, this
	// should not matter and it should return an error that will be
	// ignored.
	// But for the specific case where the shim has been SIGKILL'ed,
	// the container is still running inside the VM. And this is why
	// this signal will ensure the container will get killed to match
	// the state of the shim. This will allow the following call to
	// stopContainer() to succeed in such particular case.
	c.kill(syscall.SIGKILL, true)

	// Since the agent supports MultiWaitProcess, it's better to wait for
	// the process here to make sure it has exited before issuing
	// stopContainer; otherwise the RemoveContainerRequest in it will
	// fail if the process hasn't exited.
	c.sandbox.agent.waitProcess(c, c.id)

	defer func() {
		// Save device and drive data.
		// TODO: can we merge this saving with setContainerState()?
		if err := c.sandbox.Save(); err != nil {
			c.Logger().WithError(err).Info("save container state failed")
		}
	}()

	if err := c.sandbox.agent.stopContainer(c.sandbox, *c); err != nil && !force {
		return err
	}

	if err := c.unmountHostMounts(); err != nil && !force {
		return err
	}

	if err := bindUnmountContainerRootfs(c.ctx, getMountPath(c.sandbox.id), c); err != nil && !force {
		return err
	}

	if err := c.detachDevices(); err != nil && !force {
		return err
	}

	if err := c.removeDrive(); err != nil && !force {
		return err
	}

	shareDir := filepath.Join(kataHostSharedDir(), c.sandbox.id, c.id)
	if err := syscall.Rmdir(shareDir); err != nil {
		c.Logger().WithError(err).WithField("share-dir", shareDir).Warn("Could not remove container share dir")
	}

	// If the container was killed by force, it MUST change its state as soon
	// as possible, just in case one of the below operations fails and leaves
	// the container in a bad state.
	if err := c.setContainerState(types.StateStopped); err != nil {
		return err
	}

	return nil
}

func (c *Container) enter(cmd types.Cmd) (*Process, error) {
	if err := c.checkSandboxRunning("enter"); err != nil {
		return nil, err
	}

	if c.state.State != types.StateReady &&
		c.state.State != types.StateRunning {
		return nil, fmt.Errorf("Container not ready or running, " +
			"impossible to enter")
	}

	process, err := c.sandbox.agent.exec(c.sandbox, *c, cmd)
	if err != nil {
		return nil, err
	}

	return process, nil
}

func (c *Container) wait(processID string) (int32, error) {
	if c.state.State != types.StateReady &&
		c.state.State != types.StateRunning {
		return 0, fmt.Errorf("Container not ready or running, " +
			"impossible to wait")
	}

	return c.sandbox.agent.waitProcess(c, processID)
}

func (c *Container) kill(signal syscall.Signal, all bool) error {
	return c.signalProcess(c.process.Token, signal, all)
}

func (c *Container) signalProcess(processID string, signal syscall.Signal, all bool) error {
	if c.sandbox.state.State != types.StateReady && c.sandbox.state.State != types.StateRunning {
		return fmt.Errorf("Sandbox not ready or running, impossible to signal the container")
	}

	if c.state.State != types.StateReady && c.state.State != types.StateRunning && c.state.State != types.StatePaused {
		return fmt.Errorf("Container not ready, running or paused, impossible to signal the container")
	}

	return c.sandbox.agent.signalProcess(c, processID, signal, all)
}

func (c *Container) winsizeProcess(processID string, height, width uint32) error {
	if c.state.State != types.StateReady && c.state.State != types.StateRunning {
		return fmt.Errorf("Container not ready or running, impossible to signal the container")
	}

	return c.sandbox.agent.winsizeProcess(c, processID, height, width)
}

func (c *Container) ioStream(processID string) (io.WriteCloser, io.Reader, io.Reader, error) {
	if c.state.State != types.StateReady && c.state.State != types.StateRunning {
		return nil, nil, nil, fmt.Errorf("Container not ready or running, impossible to signal the container")
	}

	stream := newIOStream(c.sandbox, c, processID)

	return stream.stdin(), stream.stdout(), stream.stderr(), nil
}

func (c *Container) processList(options ProcessListOptions) (ProcessList, error) {
	if err := c.checkSandboxRunning("ps"); err != nil {
		return nil, err
	}

	if c.state.State != types.StateRunning {
		return nil, fmt.Errorf("Container not running, impossible to list processes")
	}

	return c.sandbox.agent.processListContainer(c.sandbox, *c, options)
}

func (c *Container) stats() (*ContainerStats, error) {
	if err := c.checkSandboxRunning("stats"); err != nil {
		return nil, err
	}
	return c.sandbox.agent.statsContainer(c.sandbox, *c)
}

func (c *Container) update(resources specs.LinuxResources) error {
	if err := c.checkSandboxRunning("update"); err != nil {
		return err
	}

	if state := c.state.State; !(state == types.StateRunning || state == types.StateReady) {
		return fmt.Errorf("Container(%s) not running or ready, impossible to update", state)
	}

	if c.config.Resources.CPU == nil {
		c.config.Resources.CPU = &specs.LinuxCPU{}
	}

	if cpu := resources.CPU; cpu != nil {
		if p := cpu.Period; p != nil && *p != 0 {
			c.config.Resources.CPU.Period = p
		}
		if q := cpu.Quota; q != nil && *q != 0 {
			c.config.Resources.CPU.Quota = q
		}
		if cpu.Cpus != "" {
			c.config.Resources.CPU.Cpus = cpu.Cpus
		}
		if cpu.Mems != "" {
			c.config.Resources.CPU.Mems = cpu.Mems
		}
	}

	if c.config.Resources.Memory == nil {
		c.config.Resources.Memory = &specs.LinuxMemory{}
	}

	if mem := resources.Memory; mem != nil && mem.Limit != nil {
		c.config.Resources.Memory.Limit = mem.Limit
	}

	if err := c.sandbox.updateResources(); err != nil {
		return err
	}

	if !c.sandbox.config.SandboxCgroupOnly {
		if err := c.cgroupsUpdate(resources); err != nil {
			return err
		}
	}

	// There currently isn't a notion of cpusets.cpus or mems being tracked
	// inside of the guest. Make sure we clear these before asking agent to update
	// the container's cgroups.
	if resources.CPU != nil {
		resources.CPU.Mems = ""
		resources.CPU.Cpus = ""
	}

	return c.sandbox.agent.updateContainer(c.sandbox, *c, resources)
}

func (c *Container) pause() error {
	if err := c.checkSandboxRunning("pause"); err != nil {
		return err
	}

	if c.state.State != types.StateRunning {
		return fmt.Errorf("Container not running, impossible to pause")
	}

	if err := c.sandbox.agent.pauseContainer(c.sandbox, *c); err != nil {
		return err
	}

	return c.setContainerState(types.StatePaused)
}

func (c *Container) resume() error {
	if err := c.checkSandboxRunning("resume"); err != nil {
		return err
	}

	if c.state.State != types.StatePaused {
		return fmt.Errorf("Container not paused, impossible to resume")
	}

	if err := c.sandbox.agent.resumeContainer(c.sandbox, *c); err != nil {
		return err
	}

	return c.setContainerState(types.StateRunning)
}

// hotplugDrive will attempt to hotplug the container rootfs if it is backed by a
// block device
func (c *Container) hotplugDrive() error {
	var dev device
	var err error

	// Check to see if the rootfs is an unmounted block device (source) or if the
	// mount (target) is backed by a block device:
	if !c.rootFs.Mounted {
		dev, err = getDeviceForPath(c.rootFs.Source)
		// there is no "rootfs" dir on block device backed rootfs
		c.rootfsSuffix = ""
	} else {
		dev, err = getDeviceForPath(c.rootFs.Target)
	}

	if err == errMountPointNotFound {
		return nil
	}

	if err != nil {
		return err
	}

	c.Logger().WithFields(logrus.Fields{
		"device-major": dev.major,
		"device-minor": dev.minor,
		"mount-point":  dev.mountPoint,
	}).Info("device details")

	isDM, err := checkStorageDriver(dev.major, dev.minor)
	if err != nil {
		return err
	}

	if !isDM {
		return nil
	}

	devicePath := c.rootFs.Source
	fsType := c.rootFs.Type
	if c.rootFs.Mounted {
		if dev.mountPoint == c.rootFs.Target {
			c.rootfsSuffix = ""
		}
		// If device mapper device, then fetch the full path of the device
		devicePath, fsType, _, err = utils.GetDevicePathAndFsTypeOptions(dev.mountPoint)
		if err != nil {
			return err
		}
	}

	devicePath, err = filepath.EvalSymlinks(devicePath)
	if err != nil {
		return err
	}

	c.Logger().WithFields(logrus.Fields{
		"device-path": devicePath,
		"fs-type":     fsType,
	}).Info("Block device detected")

	if err = c.plugDevice(devicePath); err != nil {
		return err
	}

	return c.setStateFstype(fsType)
}

// plugDevice will attach the rootfs if blockdevice is supported (this is rootfs specific)
func (c *Container) plugDevice(devicePath string) error {
	var stat unix.Stat_t
	if err := unix.Stat(devicePath, &stat); err != nil {
		return fmt.Errorf("stat %q failed: %v", devicePath, err)
	}

	if c.checkBlockDeviceSupport() && stat.Mode&unix.S_IFBLK == unix.S_IFBLK {
		b, err := c.sandbox.devManager.NewDevice(config.DeviceInfo{
			HostPath:      devicePath,
			ContainerPath: filepath.Join(kataGuestSharedDir(), c.id),
			DevType:       "b",
			Major:         int64(unix.Major(stat.Rdev)),
			Minor:         int64(unix.Minor(stat.Rdev)),
		})
		if err != nil {
			return fmt.Errorf("device manager failed to create rootfs device for %q: %v", devicePath, err)
		}

		c.state.BlockDeviceID = b.DeviceID()

		// attach rootfs device
		if err := c.sandbox.devManager.AttachDevice(b.DeviceID(), c.sandbox); err != nil {
			return err
		}
	}
	return nil
}

// isDriveUsed checks if a drive has been used for the container rootfs
func (c *Container) isDriveUsed() bool {
	return !(c.state.Fstype == "")
}

func (c *Container) removeDrive() (err error) {
	if c.isDriveUsed() {
		c.Logger().Info("unplugging block device")

		devID := c.state.BlockDeviceID
		err := c.sandbox.devManager.DetachDevice(devID, c.sandbox)
		if err != nil && err != manager.ErrDeviceNotAttached {
			return err
		}

		if err = c.sandbox.devManager.RemoveDevice(devID); err != nil {
			c.Logger().WithFields(logrus.Fields{
				"container": c.id,
				"device-id": devID,
			}).WithError(err).Error("remove device failed")

			// ignore the device not exist error
			if err != manager.ErrDeviceNotExist {
				return err
			}
		}
	}

	return nil
}

func (c *Container) attachDevices(devices []ContainerDevice) error {
	// there's no need to roll back when an error happens, because if
	// attachDevices fails, container creation will fail too, and
	// rollbackFailingContainerCreation will do all the rollbacks

	// since devices with large bar space require delayed attachment,
	// the devices need to be split into two lists, normalAttachedDevs and delayAttachedDevs,
	// so c.devices is not used here. See issue https://github.com/kata-containers/runtime/issues/2460.
	for _, dev := range devices {
		if err := c.sandbox.devManager.AttachDevice(dev.ID, c.sandbox); err != nil {
			return err
		}
	}
	return nil
}

func (c *Container) detachDevices() error {
	for _, dev := range c.devices {
		err := c.sandbox.devManager.DetachDevice(dev.ID, c.sandbox)
		if err != nil && err != manager.ErrDeviceNotAttached {
			return err
		}

		if err = c.sandbox.devManager.RemoveDevice(dev.ID); err != nil {
			c.Logger().WithFields(logrus.Fields{
				"container": c.id,
				"device-id": dev.ID,
			}).WithError(err).Error("remove device failed")

			// ignore the device not exist error
			if err != manager.ErrDeviceNotExist {
				return err
			}
		}
	}
	return nil
}

// cgroupsCreate creates cgroups on the host for the associated container
func (c *Container) cgroupsCreate() (err error) {
	spec := c.GetPatchedOCISpec()
	if spec == nil {
		return errorMissingOCISpec
	}

	// https://github.com/kata-containers/runtime/issues/168
	resources := specs.LinuxResources{
		CPU: nil,
	}

	if spec.Linux != nil && spec.Linux.Resources != nil {
		resources.CPU = validCPUResources(spec.Linux.Resources.CPU)
	}

	c.state.CgroupPath, err = vccgroups.ValidCgroupPath(spec.Linux.CgroupsPath, c.sandbox.config.SystemdCgroup)
	if err != nil {
		return fmt.Errorf("Invalid cgroup path: %v", err)
	}

	cgroup, err := cgroupsNewFunc(cgroups.V1,
		cgroups.StaticPath(c.state.CgroupPath), &resources)
	if err != nil {
		return fmt.Errorf("Could not create cgroup for %v: %v", c.state.CgroupPath, err)
	}

	c.config.Resources = resources

	// Add shim into cgroup
	if c.process.Pid > 0 {
		if err := cgroup.Add(cgroups.Process{Pid: c.process.Pid}); err != nil {
			return fmt.Errorf("Could not add PID %d to cgroup %v: %v", c.process.Pid, spec.Linux.CgroupsPath, err)
		}
	}

	return nil
}

// cgroupsDelete deletes the cgroups on the host for the associated container
func (c *Container) cgroupsDelete() error {

	if c.state.CgroupPath == "" {
		c.Logger().Debug("container does not have host cgroups: nothing to update")
		return nil
	}

	cgroup, err := cgroupsLoadFunc(cgroups.V1,
		cgroups.StaticPath(c.state.CgroupPath))

	if err == cgroups.ErrCgroupDeleted {
		// cgroup already deleted
		return nil
	}

	if err != nil {
		return fmt.Errorf("Could not load container cgroup %v: %v", c.state.CgroupPath, err)
	}

	// move running processes here, so that the cgroup can be removed
	parent, err := parentCgroup(cgroups.V1, c.state.CgroupPath)
	if err != nil {
		// parent cgroup doesn't exist, that means there are no processes running
		// and the container cgroup was removed.
		c.Logger().WithError(err).Warn("Container cgroup doesn't exist")
		return nil
	}

	if err := cgroup.MoveTo(parent); err != nil {
		// Don't fail, cgroup can be deleted
		c.Logger().WithError(err).Warn("Could not move container process into parent cgroup")
	}

	if err := cgroup.Delete(); err != nil {
		return fmt.Errorf("Could not delete container cgroup path='%v': error='%v'", c.state.CgroupPath, err)
	}

	return nil
}

// cgroupsUpdate updates cgroups on the host for the associated container
func (c *Container) cgroupsUpdate(resources specs.LinuxResources) error {

	if c.state.CgroupPath == "" {
		c.Logger().Debug("container does not have host cgroups: nothing to update")
		return nil
	}
	cgroup, err := cgroupsLoadFunc(cgroups.V1,
		cgroups.StaticPath(c.state.CgroupPath))
	if err != nil {
		return fmt.Errorf("Could not load cgroup %v: %v", c.state.CgroupPath, err)
	}

	// Issue: https://github.com/kata-containers/runtime/issues/168
	r := specs.LinuxResources{
		CPU: validCPUResources(resources.CPU),
	}

	// update cgroup
	if err := cgroup.Update(&r); err != nil {
		return fmt.Errorf("Could not update container cgroup path='%v': error='%v'", c.state.CgroupPath, err)
	}

	// store new resources
	c.config.Resources = r
	if err := c.storeContainer(); err != nil {
		return err
	}

	return nil
}
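// Illustrative sketch, not part of the original file: one way a caller could
// aggregate the per-interface counters returned by stats() into totals,
// using only the NetworkStats type defined above.
func totalNetworkBytes(stats *ContainerStats) (rx, tx uint64) {
	if stats == nil {
		return 0, 0
	}
	for _, iface := range stats.NetworkStats {
		if iface == nil {
			continue
		}
		rx += iface.RxBytes
		tx += iface.TxBytes
	}
	return rx, tx
}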