github.com/kata-containers/runtime@v0.0.0-20210505125100-04f29832a923/virtcontainers/container.go

// +build linux
// Copyright (c) 2016 Intel Corporation
// Copyright (c) 2014,2015,2016,2017 Docker, Inc.
// SPDX-License-Identifier: Apache-2.0
//

package virtcontainers

import (
	"context"
	"encoding/hex"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"syscall"
	"time"

	"github.com/containerd/cgroups"
	vccgroups "github.com/kata-containers/runtime/virtcontainers/pkg/cgroups"
	vcTypes "github.com/kata-containers/runtime/virtcontainers/pkg/types"
	"github.com/kata-containers/runtime/virtcontainers/types"
	"github.com/kata-containers/runtime/virtcontainers/utils"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	opentracing "github.com/opentracing/opentracing-go"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"

	"github.com/kata-containers/runtime/virtcontainers/device/config"
	"github.com/kata-containers/runtime/virtcontainers/device/manager"
	"github.com/kata-containers/runtime/virtcontainers/pkg/rootless"
	"github.com/kata-containers/runtime/virtcontainers/store"
)

// https://github.com/torvalds/linux/blob/master/include/uapi/linux/major.h
// This file has definitions for major device numbers.
var cdromMajors = map[int64]string{
	11: "SCSI_CDROM_MAJOR",
	15: "CDU31A_CDROM_MAJOR",
	16: "GOLDSTAR_CDROM_MAJOR",
	17: "OPTICS_CDROM_MAJOR",
	18: "SANYO_CDROM_MAJOR",
	20: "MITSUMI_X_CDROM_MAJOR",
	23: "MITSUMI_CDROM_MAJOR",
	24: "CDU535_CDROM_MAJOR",
	25: "MATSUSHITA_CDROM_MAJOR",
	26: "MATSUSHITA_CDROM2_MAJOR",
	27: "MATSUSHITA_CDROM3_MAJOR",
	28: "MATSUSHITA_CDROM4_MAJOR",
	29: "AZTECH_CDROM_MAJOR",
	32: "CM206_CDROM_MAJOR",
}

// https://github.com/torvalds/linux/blob/master/include/uapi/linux/major.h
// #define FLOPPY_MAJOR 2
const floppyMajor = int64(2)

// Process gathers data related to a container process.
type Process struct {
	// Token is the process execution context ID. It must be
	// unique per sandbox.
	// Token is used to manipulate processes for containers
	// that have not started yet, and later identify them
	// uniquely within a sandbox.
	Token string

	// Pid is the process ID as seen by the host software
	// stack, e.g. CRI-O, containerd. This is typically the
	// shim PID.
	Pid int

	StartTime time.Time
}

// ContainerStatus describes a container status.
type ContainerStatus struct {
	ID        string
	State     types.ContainerState
	PID       int
	StartTime time.Time
	RootFs    string
	Spec      *specs.Spec

	// Annotations allow clients to store arbitrary values,
	// for example to add additional status values required
	// to support particular specifications.
	Annotations map[string]string
}

// ThrottlingData gathers the data related to container CPU throttling.
type ThrottlingData struct {
	// Number of periods with throttling active
	Periods uint64 `json:"periods,omitempty"`
	// Number of periods when the container hit its throttling limit.
	ThrottledPeriods uint64 `json:"throttled_periods,omitempty"`
	// Aggregate time the container was throttled for in nanoseconds.
	ThrottledTime uint64 `json:"throttled_time,omitempty"`
}
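// The helper below is an illustrative sketch, not part of the original file:
// it shows one way a caller might interpret ThrottlingData, assuming the
// counters behave as documented above (ThrottledPeriods <= Periods).
func throttledPeriodRatio(t ThrottlingData) float64 {
	if t.Periods == 0 {
		return 0
	}
	// Fraction of enforcement periods in which the container hit its CPU quota.
	return float64(t.ThrottledPeriods) / float64(t.Periods)
}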
// CPUUsage denotes the usage of a CPU.
// All CPU stats are aggregate since container inception.
type CPUUsage struct {
	// Total CPU time consumed.
	// Units: nanoseconds.
	TotalUsage uint64 `json:"total_usage,omitempty"`
	// Total CPU time consumed per core.
	// Units: nanoseconds.
	PercpuUsage []uint64 `json:"percpu_usage,omitempty"`
	// Time spent by tasks of the cgroup in kernel mode.
	// Units: nanoseconds.
	UsageInKernelmode uint64 `json:"usage_in_kernelmode"`
	// Time spent by tasks of the cgroup in user mode.
	// Units: nanoseconds.
	UsageInUsermode uint64 `json:"usage_in_usermode"`
}

// CPUStats describes the CPU stats
type CPUStats struct {
	CPUUsage       CPUUsage       `json:"cpu_usage,omitempty"`
	ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
}

// MemoryData gathers the data related to memory
type MemoryData struct {
	Usage    uint64 `json:"usage,omitempty"`
	MaxUsage uint64 `json:"max_usage,omitempty"`
	Failcnt  uint64 `json:"failcnt"`
	Limit    uint64 `json:"limit"`
}

// MemoryStats describes the memory stats
type MemoryStats struct {
	// memory used for cache
	Cache uint64 `json:"cache,omitempty"`
	// usage of memory
	Usage MemoryData `json:"usage,omitempty"`
	// usage of memory swap
	SwapUsage MemoryData `json:"swap_usage,omitempty"`
	// usage of kernel memory
	KernelUsage MemoryData `json:"kernel_usage,omitempty"`
	// usage of kernel TCP memory
	KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"`
	// if true, memory usage is accounted for throughout a hierarchy of cgroups.
	UseHierarchy bool `json:"use_hierarchy"`

	Stats map[string]uint64 `json:"stats,omitempty"`
}

// PidsStats describes the pids stats
type PidsStats struct {
	// number of pids in the cgroup
	Current uint64 `json:"current,omitempty"`
	// active pids hard limit
	Limit uint64 `json:"limit,omitempty"`
}

// BlkioStatEntry gathers data related to a block device
type BlkioStatEntry struct {
	Major uint64 `json:"major,omitempty"`
	Minor uint64 `json:"minor,omitempty"`
	Op    string `json:"op,omitempty"`
	Value uint64 `json:"value,omitempty"`
}

// BlkioStats describes block io stats
type BlkioStats struct {
	// number of bytes transferred to and from the block device
	IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"`
	IoServicedRecursive     []BlkioStatEntry `json:"io_serviced_recursive,omitempty"`
	IoQueuedRecursive       []BlkioStatEntry `json:"io_queue_recursive,omitempty"`
	IoServiceTimeRecursive  []BlkioStatEntry `json:"io_service_time_recursive,omitempty"`
	IoWaitTimeRecursive     []BlkioStatEntry `json:"io_wait_time_recursive,omitempty"`
	IoMergedRecursive       []BlkioStatEntry `json:"io_merged_recursive,omitempty"`
	IoTimeRecursive         []BlkioStatEntry `json:"io_time_recursive,omitempty"`
	SectorsRecursive        []BlkioStatEntry `json:"sectors_recursive,omitempty"`
}

// HugetlbStats describes hugetlb memory stats
type HugetlbStats struct {
	// current res_counter usage for hugetlb
	Usage uint64 `json:"usage,omitempty"`
	// maximum usage ever recorded.
	MaxUsage uint64 `json:"max_usage,omitempty"`
	// number of times hugetlb usage allocation failed.
	Failcnt uint64 `json:"failcnt"`
}

// CgroupStats describes all cgroup subsystem stats
type CgroupStats struct {
	CPUStats    CPUStats    `json:"cpu_stats,omitempty"`
	MemoryStats MemoryStats `json:"memory_stats,omitempty"`
	PidsStats   PidsStats   `json:"pids_stats,omitempty"`
	BlkioStats  BlkioStats  `json:"blkio_stats,omitempty"`
	// the map is in the format "size of hugepage: stats of the hugepage"
	HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`
}

// NetworkStats describes all network stats.
type NetworkStats struct {
	// Name is the name of the network interface.
	Name string `json:"name,omitempty"`

	RxBytes   uint64 `json:"rx_bytes,omitempty"`
	RxPackets uint64 `json:"rx_packets,omitempty"`
	RxErrors  uint64 `json:"rx_errors,omitempty"`
	RxDropped uint64 `json:"rx_dropped,omitempty"`
	TxBytes   uint64 `json:"tx_bytes,omitempty"`
	TxPackets uint64 `json:"tx_packets,omitempty"`
	TxErrors  uint64 `json:"tx_errors,omitempty"`
	TxDropped uint64 `json:"tx_dropped,omitempty"`
}

// ContainerStats describes a container's stats.
type ContainerStats struct {
	CgroupStats  *CgroupStats
	NetworkStats []*NetworkStats
}

// ContainerResources describes container resources
type ContainerResources struct {
	// VCPUs are the number of vCPUs that are being used by the container
	VCPUs uint32

	// MemByte is the memory, in bytes, that is being used by the container
	MemByte int64
}

// ContainerConfig describes one container runtime configuration.
type ContainerConfig struct {
	ID string

	// RootFs is the container workload image on the host.
	RootFs RootFs

	// ReadonlyRootfs indicates if the rootfs should be mounted read-only
	ReadonlyRootfs bool

	// Cmd specifies the command to run on a container
	Cmd types.Cmd

	// Annotations allow clients to store arbitrary values,
	// for example to add additional status values required
	// to support particular specifications.
	Annotations map[string]string

	Mounts []Mount

	// Device configuration for devices that must be available within the container.
	DeviceInfos []config.DeviceInfo

	// Resources describes the container resources
	Resources specs.LinuxResources

	// CustomSpec is the raw OCI specification; it won't be saved to disk.
	CustomSpec *specs.Spec `json:"-"`
}

// valid checks that the container configuration is valid.
func (c *ContainerConfig) valid() bool {
	if c == nil {
		return false
	}

	if c.ID == "" {
		return false
	}

	return true
}
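// newExampleContainerConfig is an illustrative sketch, not part of the original
// file: it builds the smallest configuration that passes valid(). The ID and
// rootfs path are made-up example values; RootFs is defined later in this file.
func newExampleContainerConfig() *ContainerConfig {
	return &ContainerConfig{
		ID: "example-container",
		// Source is the host-side path of the container workload image.
		RootFs: RootFs{Source: "/path/to/rootfs", Mounted: true},
	}
}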
// SystemMountsInfo describes additional information for system mounts that the agent
// needs to handle
type SystemMountsInfo struct {
	// Indicates if /dev has been passed as a bind mount for the host /dev
	BindMountDev bool

	// Size of /dev/shm assigned on the host.
	DevShmSize uint
}

// ContainerDevice describes a device associated with container
type ContainerDevice struct {
	// ID is the device id referencing the device from the sandbox's device manager
	ID string

	// ContainerPath is the device path displayed in the container
	ContainerPath string

	// FileMode permission bits for the device.
	FileMode os.FileMode

	// UID is user ID in the container namespace
	UID uint32

	// GID is group ID in the container namespace
	GID uint32
}

// RootFs describes the container's rootfs.
type RootFs struct {
	// Source specifies the BlockDevice path
	Source string
	// Target specifies where the rootfs is mounted, if it has been mounted
	Target string
	// Type specifies the type of filesystem to mount.
	Type string
	// Options specifies zero or more fstab style mount options.
	Options []string
	// Mounted specifies whether the rootfs has been mounted or not
	Mounted bool
}

// Container represents a single container within a sandbox, together with its
// runtime environment.
// A Container can be created, deleted, started, stopped, listed, entered, paused and restored.
type Container struct {
	id        string
	sandboxID string

	rootFs RootFs

	config *ContainerConfig

	sandbox *Sandbox

	containerPath string
	rootfsSuffix  string

	state types.ContainerState

	process Process

	mounts []Mount

	devices []ContainerDevice

	systemMountsInfo SystemMountsInfo

	ctx context.Context

	store *store.VCStore
}

// ID returns the container identifier string.
func (c *Container) ID() string {
	return c.id
}

// Logger returns a logrus logger appropriate for logging Container messages
func (c *Container) Logger() *logrus.Entry {
	return virtLog.WithFields(logrus.Fields{
		"subsystem": "container",
		"sandbox":   c.sandboxID,
	})
}

func (c *Container) trace(name string) (opentracing.Span, context.Context) {
	if c.ctx == nil {
		c.Logger().WithField("type", "bug").Error("trace called before context set")
		c.ctx = context.Background()
	}

	span, ctx := opentracing.StartSpanFromContext(c.ctx, name)

	span.SetTag("subsystem", "container")

	return span, ctx
}

// Sandbox returns the sandbox handler related to this container.
func (c *Container) Sandbox() VCSandbox {
	return c.sandbox
}

// Process returns the container process.
func (c *Container) Process() Process {
	return c.process
}

// GetToken returns the token related to this container's process.
func (c *Container) GetToken() string {
	return c.process.Token
}

// GetPid returns the pid related to this container's process.
func (c *Container) GetPid() int {
	return c.process.Pid
}

func (c *Container) setStateFstype(fstype string) error {
	c.state.Fstype = fstype

	return nil
}

// GetAnnotations returns container's annotations
func (c *Container) GetAnnotations() map[string]string {
	return c.config.Annotations
}

// GetPatchedOCISpec returns container's OCI specification
// This OCI specification was patched when the sandbox was created
// by containerCapabilities(), SetEphemeralStorageType() and others
// in order to support:
// * capabilities
// * Ephemeral storage
// * k8s empty dir
// If you need the original (vanilla) OCI spec,
// use compatoci.GetContainerSpec() instead.
func (c *Container) GetPatchedOCISpec() *specs.Spec {
	return c.config.CustomSpec
}

// storeContainer stores a container config.
func (c *Container) storeContainer() error {
	if err := c.sandbox.Save(); err != nil {
		return err
	}
	return nil
}

// setContainerState sets both the in-memory and on-disk state of the
// container.
func (c *Container) setContainerState(state types.StateString) error {
	if state == "" {
		return vcTypes.ErrNeedState
	}

	c.Logger().Debugf("Setting container state from %v to %v", c.state.State, state)
	// update in-memory state
	c.state.State = state

	if useOldStore(c.sandbox.ctx) {
		// the experimental runtime uses "persist.json", which doesn't need "state.json" anymore
		// update on-disk state
		if err := c.store.Store(store.State, c.state); err != nil {
			return err
		}
	} else {
		// flush data to storage
		if err := c.sandbox.Save(); err != nil {
			return err
		}
	}

	return nil
}

func (c *Container) shareFiles(m Mount, idx int, hostSharedDir, hostMountDir, guestSharedDir string) (string, bool, error) {
	randBytes, err := utils.GenerateRandomBytes(8)
	if err != nil {
		return "", false, err
	}

	filename := fmt.Sprintf("%s-%s-%s", c.id, hex.EncodeToString(randBytes), filepath.Base(m.Destination))
	guestDest := filepath.Join(guestSharedDir, filename)

	// copy file to container's rootfs if filesystem sharing is not supported, otherwise
	// bind mount it in the shared directory.
	caps := c.sandbox.hypervisor.capabilities()
	if !caps.IsFsSharingSupported() {
		c.Logger().Debug("filesystem sharing is not supported, files will be copied")

		fileInfo, err := os.Stat(m.Source)
		if err != nil {
			return "", false, err
		}

		// Ignore the mount if this is not a regular file (excludes
		// directory, socket, device, ...) as it cannot be handled by
		// a simple copy. But this should not be treated as an error,
		// only as a limitation.
		if !fileInfo.Mode().IsRegular() {
			c.Logger().WithField("ignored-file", m.Source).Debug("Ignoring non-regular file as FS sharing not supported")
			return "", true, nil
		}

		if err := c.sandbox.agent.copyFile(m.Source, guestDest); err != nil {
			return "", false, err
		}
	} else {
		// These mounts are created in the shared dir
		mountDest := filepath.Join(hostMountDir, filename)
		if err := bindMount(c.ctx, m.Source, mountDest, m.ReadOnly, "private"); err != nil {
			return "", false, err
		}
		// Save HostPath mount value into the mount list of the container.
		c.mounts[idx].HostPath = mountDest
		// bindmount remount event is not propagated to mount subtrees, so we have to remount the shared dir mountpoint directly.
		if m.ReadOnly {
			mountDest = filepath.Join(hostSharedDir, filename)
			if err := remountRo(c.ctx, mountDest); err != nil {
				return "", false, err
			}
		}
	}

	return guestDest, false, nil
}
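// Note (illustrative, not part of the original file): for a container with
// id "abc" whose spec bind-mounts something at "/etc/hosts", shareFiles above
// produces a name like "abc-<16 hex chars>-hosts". On the host, that name is
// bind mounted under hostMountDir, and the returned guestDest is the matching
// path under guestSharedDir, which is what ends up in the container's mount
// list inside the VM.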
// mountSharedDirMounts handles bind-mounts by bindmounting to the host shared
// directory which is mounted through virtiofs/9pfs in the VM.
// It also updates the container mount list with the HostPath info, and stores
// container mounts to the storage. This way, we will have the HostPath info
// available when we need to unmount those mounts.
func (c *Container) mountSharedDirMounts(hostSharedDir, hostMountDir, guestSharedDir string) (sharedDirMounts map[string]Mount, ignoredMounts map[string]Mount, err error) {
	sharedDirMounts = make(map[string]Mount)
	ignoredMounts = make(map[string]Mount)
	var devicesToDetach []string
	defer func() {
		if err != nil {
			for _, id := range devicesToDetach {
				c.sandbox.devManager.DetachDevice(id, c.sandbox)
			}
		}
	}()
	for idx, m := range c.mounts {
		// Skip mounting certain system paths from the source on the host side
		// into the container as it does not make sense to do so.
		// Example sources could be /sys/fs/cgroup etc.
		if isSystemMount(m.Source) {
			continue
		}

		// Check if mount is a block device file. If it is, the block device will be attached to the host
		// instead of passing this as a shared mount:
		if len(m.BlockDeviceID) > 0 {
			// Attach this block device, all other devices passed in the config have been attached at this point
			if err = c.sandbox.devManager.AttachDevice(m.BlockDeviceID, c.sandbox); err != nil {
				return nil, nil, err
			}
			devicesToDetach = append(devicesToDetach, m.BlockDeviceID)
			continue
		}

		// For non-block based mounts, we are only interested in bind mounts
		if m.Type != "bind" {
			continue
		}

		// We need to treat /dev/shm as a special case. This is passed as a bind mount in the spec,
		// but it does not make sense to pass this as a 9p mount from the host side.
		// This needs to be handled purely in the guest, by allocating memory for this inside the VM.
		if m.Destination == "/dev/shm" {
			continue
		}

		// Ignore /dev, directories and all other device files. We handle
		// only regular files in /dev. It does not make sense to pass the host
		// device nodes to the guest.
		if isHostDevice(m.Destination) {
			continue
		}

		var ignore bool
		var guestDest string
		guestDest, ignore, err = c.shareFiles(m, idx, hostSharedDir, hostMountDir, guestSharedDir)
		if err != nil {
			return nil, nil, err
		}

		// Expand the list of mounts to ignore.
		if ignore {
			ignoredMounts[m.Source] = Mount{Source: m.Source}
			continue
		}

		sharedDirMount := Mount{
			Source:      guestDest,
			Destination: m.Destination,
			Type:        m.Type,
			Options:     m.Options,
			ReadOnly:    m.ReadOnly,
		}

		sharedDirMounts[sharedDirMount.Destination] = sharedDirMount
	}

	return sharedDirMounts, ignoredMounts, nil
}

func (c *Container) unmountHostMounts() error {
	var span opentracing.Span
	span, c.ctx = c.trace("unmountHostMounts")
	defer span.Finish()

	for _, m := range c.mounts {
		if m.HostPath != "" {
			span, _ := c.trace("unmount")
			span.SetTag("host-path", m.HostPath)

			if err := syscall.Unmount(m.HostPath, syscall.MNT_DETACH|UmountNoFollow); err != nil {
				c.Logger().WithFields(logrus.Fields{
					"host-path": m.HostPath,
					"error":     err,
				}).Warn("Could not umount")
				return err
			}

			if m.Type == "bind" {
				s, err := os.Stat(m.HostPath)
				if err != nil {
					return errors.Wrapf(err, "Could not stat host-path %v", m.HostPath)
				}
				// Remove the empty file or directory
				if s.Mode().IsRegular() && s.Size() == 0 {
					os.Remove(m.HostPath)
				}
				if s.Mode().IsDir() {
					syscall.Rmdir(m.HostPath)
				}
			}

			span.Finish()
		}
	}

	return nil
}

func filterDevices(c *Container, devices []ContainerDevice) (ret []ContainerDevice) {
	for _, dev := range devices {
		major, _ := c.sandbox.devManager.GetDeviceByID(dev.ID).GetMajorMinor()
		if _, ok := cdromMajors[major]; ok {
			c.Logger().WithFields(logrus.Fields{
				"device": dev.ContainerPath,
			}).Info("Not attach device because it is a CDROM")
			continue
		}

		if major == floppyMajor {
			c.Logger().WithFields(logrus.Fields{
				"device": dev.ContainerPath,
			}).Info("Not attaching device because it is a floppy drive")
			continue
		}

		ret = append(ret, dev)
	}
	return
}

// Add any mount based block devices to the device manager and save the
// device ID for the particular mount. This'll occur when the mountpoint source
// is a block device.
func (c *Container) createBlockDevices() error {
	if !c.checkBlockDeviceSupport() {
		c.Logger().Warn("Block device not supported")
		return nil
	}

	// iterate all mounts and create block device if it's block based.
	for i, m := range c.mounts {
		if len(m.BlockDeviceID) > 0 {
			// A non-empty m.BlockDeviceID indicates there's already one device
			// associated with the mount, so no need to create a new device for it;
			// we only create block devices for bind mounts
			continue
		}

		if m.Type != "bind" {
			// We only handle bind mounts
			continue
		}

		var stat unix.Stat_t
		if err := unix.Stat(m.Source, &stat); err != nil {
			return fmt.Errorf("stat %q failed: %v", m.Source, err)
		}

		var di *config.DeviceInfo
		var err error

		// Check if mount is a block device file. If it is, the block device will be attached to the host
		// instead of passing this as a shared mount.
		if stat.Mode&unix.S_IFBLK == unix.S_IFBLK {
			di = &config.DeviceInfo{
				HostPath:      m.Source,
				ContainerPath: m.Destination,
				DevType:       "b",
				Major:         int64(unix.Major(stat.Rdev)),
				Minor:         int64(unix.Minor(stat.Rdev)),
				ReadOnly:      m.ReadOnly,
			}
			// check whether source can be used as a pmem device
		} else if di, err = config.PmemDeviceInfo(m.Source, m.Destination); err != nil {
			c.Logger().WithError(err).
				WithField("mount-source", m.Source).
				Debug("no loop device")
		}

		if err == nil && di != nil {
			b, err := c.sandbox.devManager.NewDevice(*di)
			if err != nil {
				// Do not return an error, try to create
				// devices for other mounts
				c.Logger().WithError(err).WithField("mount-source", m.Source).
					Error("device manager failed to create new device")
				continue

			}

			c.mounts[i].BlockDeviceID = b.DeviceID()
		}
	}

	return nil
}

// newContainer creates a Container structure from a sandbox and a container configuration.
func newContainer(sandbox *Sandbox, contConfig *ContainerConfig) (*Container, error) {
	span, _ := sandbox.trace("newContainer")
	defer span.Finish()

	if !contConfig.valid() {
		return &Container{}, fmt.Errorf("Invalid container configuration")
	}

	c := &Container{
		id:            contConfig.ID,
		sandboxID:     sandbox.id,
		rootFs:        contConfig.RootFs,
		config:        contConfig,
		sandbox:       sandbox,
		containerPath: filepath.Join(sandbox.id, contConfig.ID),
		rootfsSuffix:  "rootfs",
		state:         types.ContainerState{},
		process:       Process{},
		mounts:        contConfig.Mounts,
		ctx:           sandbox.ctx,
	}

	if useOldStore(sandbox.ctx) {
		ctrStore, err := store.NewVCContainerStore(sandbox.ctx, c.sandboxID, c.id)
		if err != nil {
			return nil, err
		}
		c.store = ctrStore
		state, err := c.store.LoadContainerState()
		if err == nil {
			c.state = state
		}

		var process Process
		if err := c.store.Load(store.Process, &process); err == nil {
			c.process = process
		}
	} else {
		// the experimental runtime uses "persist.json" instead of the legacy "state.json" as storage
		err := c.Restore()
		if err == nil {
			// container restored
			return c, nil
		}

		// Unexpected error
		if !os.IsNotExist(err) && err != errContainerPersistNotExist {
			return nil, err
		}
	}

	// If mounts are block devices, add to devmanager
	if err := c.createMounts(); err != nil {
		return nil, err
	}

	// Add container's devices to sandbox's device-manager
	if err := c.createDevices(contConfig); err != nil {
		return nil, err
	}

	return c, nil
}

func (c *Container) loadMounts() ([]Mount, error) {
	var mounts []Mount
	if err := c.store.Load(store.Mounts, &mounts); err != nil {
		return []Mount{}, err
	}

	return mounts, nil
}

func (c *Container) loadDevices() ([]ContainerDevice, error) {
	var devices []ContainerDevice

	if err := c.store.Load(store.DeviceIDs, &devices); err != nil {
		return []ContainerDevice{}, err
	}

	return devices, nil
}

func (c *Container) createMounts() error {
	if useOldStore(c.sandbox.ctx) {
		mounts, err := c.loadMounts()
		if err == nil {
			// restore mounts from disk
			c.mounts = mounts
			return nil
		}
	}

	// Create block devices for newly created container
	return c.createBlockDevices()
}

func (c *Container) createDevices(contConfig *ContainerConfig) error {
	// If the sandbox supports "newstore", only newly created containers can reach this function,
	// so we don't call restore when `supportNewStore` is true
	if useOldStore(c.sandbox.ctx) {
		// Devices will be found in storage after create stage has completed.
		// We load devices from storage at all other stages.
		storedDevices, err := c.loadDevices()
		if err == nil {
			c.devices = storedDevices
			return nil
		}
	}

	// If devices were not found in storage, create Device implementations
	// from the configuration. This should happen at create.
	var storedDevices []ContainerDevice
	for _, info := range contConfig.DeviceInfos {
		dev, err := c.sandbox.devManager.NewDevice(info)
		if err != nil {
			return err
		}

		storedDevices = append(storedDevices, ContainerDevice{
			ID:            dev.DeviceID(),
			ContainerPath: info.ContainerPath,
			FileMode:      info.FileMode,
			UID:           info.UID,
			GID:           info.GID,
		})
	}
	c.devices = filterDevices(c, storedDevices)
	return nil
}

// rollbackFailingContainerCreation rolls back important steps that might have
// been performed before the container creation failed.
// - Unplug CPU and memory resources from the VM.
// - Unplug devices from the VM.
func (c *Container) rollbackFailingContainerCreation() {
	if err := c.detachDevices(); err != nil {
		c.Logger().WithError(err).Error("rollback failed detachDevices()")
	}
	if err := c.removeDrive(); err != nil {
		c.Logger().WithError(err).Error("rollback failed removeDrive()")
	}
	if err := c.unmountHostMounts(); err != nil {
		c.Logger().WithError(err).Error("rollback failed unmountHostMounts()")
	}
	if err := bindUnmountContainerRootfs(c.ctx, getMountPath(c.sandbox.id), c); err != nil {
		c.Logger().WithError(err).Error("rollback failed bindUnmountContainerRootfs()")
	}
}

func (c *Container) checkBlockDeviceSupport() bool {
	if !c.sandbox.config.HypervisorConfig.DisableBlockDeviceUse {
		agentCaps := c.sandbox.agent.capabilities()
		hypervisorCaps := c.sandbox.hypervisor.capabilities()

		if agentCaps.IsBlockDeviceSupported() && hypervisorCaps.IsBlockDeviceHotplugSupported() {
			return true
		}
	}

	return false
}

// create creates and starts a container inside a Sandbox. It has to be
// called only when a new container, not known by the sandbox, has to be created.
func (c *Container) create() (err error) {
	// In case the container creation fails, the following takes care
	// of rolling back all the actions previously performed.
	defer func() {
		if err != nil {
			c.Logger().WithError(err).Error("container create failed")
			c.rollbackFailingContainerCreation()
		}
	}()

	if c.checkBlockDeviceSupport() {
		// If the rootfs is backed by a block device, go ahead and hotplug it to the guest
		if err = c.hotplugDrive(); err != nil {
			return
		}
	}

	var (
		machineType        = c.sandbox.config.HypervisorConfig.HypervisorMachineType
		normalAttachedDevs []ContainerDevice //for q35: normally attached devices
		delayAttachedDevs  []ContainerDevice //for q35: delay attached devices, for example, large bar space device
	)
	// Fix: https://github.com/kata-containers/runtime/issues/2460
	if machineType == QemuQ35 {
		// add Large Bar space device to delayAttachedDevs
		for _, device := range c.devices {
			var isLargeBarSpace bool
			isLargeBarSpace, err = manager.IsVFIOLargeBarSpaceDevice(device.ContainerPath)
			if err != nil {
				return
			}
			if isLargeBarSpace {
				delayAttachedDevs = append(delayAttachedDevs, device)
			} else {
				normalAttachedDevs = append(normalAttachedDevs, device)
			}
		}
	} else {
		normalAttachedDevs = c.devices
	}

	c.Logger().WithFields(logrus.Fields{
		"machine_type": machineType,
		"devices":      normalAttachedDevs,
	}).Info("normal attach devices")
	if len(normalAttachedDevs) > 0 {
		if err = c.attachDevices(normalAttachedDevs); err != nil {
			return
		}
	}

	// Deduce additional system mount info that should be handled by the agent
	// inside the VM
	c.getSystemMountInfo()

	process, err := c.sandbox.agent.createContainer(c.sandbox, c)
	if err != nil {
		return err
	}
	c.process = *process

	// lazy attach device after createContainer for q35
	if machineType == QemuQ35 && len(delayAttachedDevs) > 0 {
		c.Logger().WithFields(logrus.Fields{
			"machine_type": machineType,
			"devices":      delayAttachedDevs,
		}).Info("lazy attach devices")
		if err = c.attachDevices(delayAttachedDevs); err != nil {
			return
		}
	}

	if !rootless.IsRootless() && !c.sandbox.config.SandboxCgroupOnly {
		if err = c.cgroupsCreate(); err != nil {
			return
		}
	}

	if err = c.setContainerState(types.StateReady); err != nil {
		return
	}

	return nil
}

func (c *Container) delete() error {
	if c.state.State != types.StateReady &&
		c.state.State != types.StateStopped {
		return fmt.Errorf("Container not ready or stopped, impossible to delete")
	}

	// Remove the container from sandbox structure
	if err := c.sandbox.removeContainer(c.id); err != nil {
		return err
	}

	// If running rootless, there are no cgroups to remove
	if !c.sandbox.config.SandboxCgroupOnly || !rootless.IsRootless() {
		if err := c.cgroupsDelete(); err != nil {
			return err
		}
	}

	return c.sandbox.storeSandbox()
}
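// Illustrative sketch, not part of the original file: the state checks in the
// methods above and below imply the expected lifecycle for a container inside
// a running sandbox. Assuming `c` was built with newContainer():
//
//	c.create()     // container ends up in StateReady
//	c.start()      // Ready (or Stopped) -> Running
//	c.stop(false)  // Running -> Stopped
//	c.delete()     // allowed from Ready or Stopped
//
// pause()/resume() toggle Running <-> Paused, and most operations first call
// checkSandboxRunning() to make sure the sandbox itself is still running.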
// checkSandboxRunning validates the container state.
//
// cmd specifies the operation (or verb) that the retrieval is destined
// for and is only used to make the returned error as descriptive as
// possible.
func (c *Container) checkSandboxRunning(cmd string) error {
	if cmd == "" {
		return fmt.Errorf("Cmd cannot be empty")
	}

	if c.sandbox.state.State != types.StateRunning {
		return fmt.Errorf("Sandbox not running, impossible to %s the container", cmd)
	}

	return nil
}

func (c *Container) getSystemMountInfo() {
	// check if /dev needs to be bind mounted from host /dev
	c.systemMountsInfo.BindMountDev = false

	for _, m := range c.mounts {
		if m.Source == "/dev" && m.Destination == "/dev" && m.Type == "bind" {
			c.systemMountsInfo.BindMountDev = true
		}
	}

	// TODO Deduce /dev/shm size. See https://github.com/clearcontainers/runtime/issues/138
}

func (c *Container) start() error {
	if err := c.checkSandboxRunning("start"); err != nil {
		return err
	}

	if c.state.State != types.StateReady &&
		c.state.State != types.StateStopped {
		return fmt.Errorf("Container not ready or stopped, impossible to start")
	}

	if err := c.state.ValidTransition(c.state.State, types.StateRunning); err != nil {
		return err
	}

	if err := c.sandbox.agent.startContainer(c.sandbox, c); err != nil {
		c.Logger().WithError(err).Error("Failed to start container")

		if err := c.stop(true); err != nil {
			c.Logger().WithError(err).Warn("Failed to stop container")
		}
		return err
	}

	return c.setContainerState(types.StateRunning)
}

func (c *Container) stop(force bool) error {
	span, _ := c.trace("stop")
	defer span.Finish()

	// In case the container status has been updated implicitly because
	// the container process has terminated, it might be possible that
	// someone tries to stop the container, and we don't want to issue an
	// error in that case. This should be a no-op.
	//
	// This has to be handled before the transition validation since this
	// is an exception.
	if c.state.State == types.StateStopped {
		c.Logger().Info("Container already stopped")
		return nil
	}

	if err := c.state.ValidTransition(c.state.State, types.StateStopped); err != nil {
		return err
	}

	defer func() {
		span, _ := c.trace("stopShim")
		defer span.Finish()

		// If the shim is still running, something went wrong.
		// Make sure we stop the shim process.
		if running, _ := isShimRunning(c.process.Pid); running {
			l := c.Logger()
			l.Error("Failed to stop container so stopping dangling shim")
			if err := stopShim(c.process.Pid); err != nil {
				l.WithError(err).Warn("failed to stop shim")
			}
		}

	}()

	// Here we expect that stop() has been called because the container
	// process returned or because it received a signal. In case of a
	// signal, we want to give it some time to end the container process.
	// However, if the signal didn't reach its goal, the caller still
	// expects this container to be stopped, that's why we should not
	// return an error, but instead try to kill it forcefully.
	if err := waitForShim(c.process.Pid); err != nil {
		// Force the container to be killed.
		if err := c.kill(syscall.SIGKILL, true); err != nil && !force {
			return err
		}

		// Wait for the end of container process. We expect this call
		// to succeed.
		// Indeed, we have already given a second chance
		// to the container by trying to kill it with SIGKILL, there
		// is no reason to try to go further if we got an error.
		if err := waitForShim(c.process.Pid); err != nil && !force {
			return err
		}
	}

	// Force the container to be killed. For most of the cases, this
	// should not matter and it should return an error that will be
	// ignored.
	// But for the specific case where the shim has been SIGKILL'ed,
	// the container is still running inside the VM. And this is why
	// this signal will ensure the container will get killed to match
	// the state of the shim. This will allow the following call to
	// stopContainer() to succeed in such particular case.
	c.kill(syscall.SIGKILL, true)

	// Since the agent supports MultiWaitProcess, it's better to wait for
	// the process here to make sure it has exited before issuing
	// stopContainer; otherwise the RemoveContainerRequest in it will
	// fail if the process hasn't exited.
	c.sandbox.agent.waitProcess(c, c.id)

	defer func() {
		// Save device and drive data.
		// TODO: can we merge this saving with setContainerState()?
		if err := c.sandbox.Save(); err != nil {
			c.Logger().WithError(err).Info("save container state failed")
		}
	}()

	if err := c.sandbox.agent.stopContainer(c.sandbox, *c); err != nil && !force {
		return err
	}

	if err := c.unmountHostMounts(); err != nil && !force {
		return err
	}

	if err := bindUnmountContainerRootfs(c.ctx, getMountPath(c.sandbox.id), c); err != nil && !force {
		return err
	}

	if err := c.detachDevices(); err != nil && !force {
		return err
	}

	if err := c.removeDrive(); err != nil && !force {
		return err
	}

	shareDir := filepath.Join(kataHostSharedDir(), c.sandbox.id, c.id)
	if err := syscall.Rmdir(shareDir); err != nil {
		c.Logger().WithError(err).WithField("share-dir", shareDir).Warn("Could not remove container share dir")
	}

	// If the container was killed by force, it MUST change its state as soon
	// as possible, just in case one of the below operations fails and leaves
	// the container in a bad state.
	if err := c.setContainerState(types.StateStopped); err != nil {
		return err
	}

	return nil
}

func (c *Container) enter(cmd types.Cmd) (*Process, error) {
	if err := c.checkSandboxRunning("enter"); err != nil {
		return nil, err
	}

	if c.state.State != types.StateReady &&
		c.state.State != types.StateRunning {
		return nil, fmt.Errorf("Container not ready or running, " +
			"impossible to enter")
	}

	process, err := c.sandbox.agent.exec(c.sandbox, *c, cmd)
	if err != nil {
		return nil, err
	}

	return process, nil
}

func (c *Container) wait(processID string) (int32, error) {
	if c.state.State != types.StateReady &&
		c.state.State != types.StateRunning {
		return 0, fmt.Errorf("Container not ready or running, " +
			"impossible to wait")
	}

	return c.sandbox.agent.waitProcess(c, processID)
}

func (c *Container) kill(signal syscall.Signal, all bool) error {
	return c.signalProcess(c.process.Token, signal, all)
}

func (c *Container) signalProcess(processID string, signal syscall.Signal, all bool) error {
	if c.sandbox.state.State != types.StateReady && c.sandbox.state.State != types.StateRunning {
		return fmt.Errorf("Sandbox not ready or running, impossible to signal the container")
	}

	if c.state.State != types.StateReady && c.state.State != types.StateRunning && c.state.State != types.StatePaused {
		return fmt.Errorf("Container not ready, running or paused, impossible to signal the container")
	}

	return c.sandbox.agent.signalProcess(c, processID, signal, all)
}

func (c *Container) winsizeProcess(processID string, height, width uint32) error {
	if c.state.State != types.StateReady && c.state.State != types.StateRunning {
		return fmt.Errorf("Container not ready or running, impossible to signal the container")
	}

	return c.sandbox.agent.winsizeProcess(c, processID, height, width)
}

func (c *Container) ioStream(processID string) (io.WriteCloser, io.Reader, io.Reader, error) {
	if c.state.State != types.StateReady && c.state.State != types.StateRunning {
		return nil, nil, nil, fmt.Errorf("Container not ready or running, impossible to signal the container")
	}

	stream := newIOStream(c.sandbox, c, processID)

	return stream.stdin(), stream.stdout(), stream.stderr(), nil
}

func (c *Container) processList(options ProcessListOptions) (ProcessList, error) {
	if err := c.checkSandboxRunning("ps"); err != nil {
		return nil, err
	}

	if c.state.State != types.StateRunning {
		return nil, fmt.Errorf("Container not running, impossible to list processes")
	}

	return c.sandbox.agent.processListContainer(c.sandbox, *c, options)
}

func (c *Container) stats() (*ContainerStats, error) {
	if err := c.checkSandboxRunning("stats"); err != nil {
		return nil, err
	}
	return c.sandbox.agent.statsContainer(c.sandbox, *c)
}

func (c *Container) update(resources specs.LinuxResources) error {
	if err := c.checkSandboxRunning("update"); err != nil {
		return err
	}

	if state := c.state.State; !(state == types.StateRunning || state == types.StateReady) {
		return fmt.Errorf("Container(%s) not running or ready, impossible to update", state)
	}

	if c.config.Resources.CPU == nil {
		c.config.Resources.CPU = &specs.LinuxCPU{}
	}

	if cpu := resources.CPU; cpu != nil {
		if p := cpu.Period; p != nil && *p != 0 {
			c.config.Resources.CPU.Period = p
		}
		if q := cpu.Quota; q != nil && *q != 0 {
			c.config.Resources.CPU.Quota = q
		}
		if cpu.Cpus != "" {
			c.config.Resources.CPU.Cpus = cpu.Cpus
		}
		if cpu.Mems != "" {
			c.config.Resources.CPU.Mems = cpu.Mems
		}
	}

	if c.config.Resources.Memory == nil {
		c.config.Resources.Memory = &specs.LinuxMemory{}
	}

	if mem := resources.Memory; mem != nil && mem.Limit != nil {
		c.config.Resources.Memory.Limit = mem.Limit
	}

	if err := c.sandbox.updateResources(); err != nil {
		return err
	}

	if !c.sandbox.config.SandboxCgroupOnly {
		if err := c.cgroupsUpdate(resources); err != nil {
			return err
		}
	}

	// There currently isn't a notion of cpusets.cpus or mems being tracked
	// inside of the guest. Make sure we clear these before asking agent to update
	// the container's cgroups.
	if resources.CPU != nil {
		resources.CPU.Mems = ""
		resources.CPU.Cpus = ""
	}

	return c.sandbox.agent.updateContainer(c.sandbox, *c, resources)
}

func (c *Container) pause() error {
	if err := c.checkSandboxRunning("pause"); err != nil {
		return err
	}

	if c.state.State != types.StateRunning {
		return fmt.Errorf("Container not running, impossible to pause")
	}

	if err := c.sandbox.agent.pauseContainer(c.sandbox, *c); err != nil {
		return err
	}

	return c.setContainerState(types.StatePaused)
}

func (c *Container) resume() error {
	if err := c.checkSandboxRunning("resume"); err != nil {
		return err
	}

	if c.state.State != types.StatePaused {
		return fmt.Errorf("Container not paused, impossible to resume")
	}

	if err := c.sandbox.agent.resumeContainer(c.sandbox, *c); err != nil {
		return err
	}

	return c.setContainerState(types.StateRunning)
}

// hotplugDrive will attempt to hotplug the container rootfs if it is backed by a
// block device
func (c *Container) hotplugDrive() error {
	var dev device
	var err error

	// Check to see if the rootfs is an unmounted block device (source) or if the
	// mount (target) is backed by a block device:
	if !c.rootFs.Mounted {
		dev, err = getDeviceForPath(c.rootFs.Source)
		// there is no "rootfs" dir on block device backed rootfs
		c.rootfsSuffix = ""
	} else {
		dev, err = getDeviceForPath(c.rootFs.Target)
	}

	if err == errMountPointNotFound {
		return nil
	}

	if err != nil {
		return err
	}

	c.Logger().WithFields(logrus.Fields{
		"device-major": dev.major,
		"device-minor": dev.minor,
		"mount-point":  dev.mountPoint,
	}).Info("device details")

	isDM, err := checkStorageDriver(dev.major, dev.minor)
	if err != nil {
		return err
	}

	if !isDM {
		return nil
	}

	devicePath := c.rootFs.Source
	fsType := c.rootFs.Type
	if c.rootFs.Mounted {
		if dev.mountPoint == c.rootFs.Target {
			c.rootfsSuffix = ""
		}
		// If device mapper device, then fetch the full path of the device
		devicePath, fsType, _, err = utils.GetDevicePathAndFsTypeOptions(dev.mountPoint)
		if err != nil {
			return err
		}
	}

	devicePath, err = filepath.EvalSymlinks(devicePath)
	if err != nil {
		return err
	}

	c.Logger().WithFields(logrus.Fields{
		"device-path": devicePath,
		"fs-type":     fsType,
	}).Info("Block device detected")

	if err = c.plugDevice(devicePath); err != nil {
		return err
	}

	return c.setStateFstype(fsType)
}

// plugDevice will attach the rootfs if blockdevice is supported (this is rootfs specific)
func (c *Container) plugDevice(devicePath string) error {
	var stat unix.Stat_t
	if err := unix.Stat(devicePath, &stat); err != nil {
		return fmt.Errorf("stat %q failed: %v", devicePath, err)
	}

	if c.checkBlockDeviceSupport() && stat.Mode&unix.S_IFBLK == unix.S_IFBLK {
		b, err := c.sandbox.devManager.NewDevice(config.DeviceInfo{
			HostPath:      devicePath,
			ContainerPath: filepath.Join(kataGuestSharedDir(), c.id),
			DevType:       "b",
			Major:         int64(unix.Major(stat.Rdev)),
			Minor:         int64(unix.Minor(stat.Rdev)),
		})
		if err != nil {
			return fmt.Errorf("device manager failed to create rootfs device for %q: %v", devicePath, err)
		}

		c.state.BlockDeviceID = b.DeviceID()

		// attach rootfs device
		if err := c.sandbox.devManager.AttachDevice(b.DeviceID(), c.sandbox); err != nil {
			return err
		}
	}
	return nil
}

// isDriveUsed checks if a drive has been used for the container rootfs
func (c *Container) isDriveUsed() bool {
	return !(c.state.Fstype == "")
}

func (c *Container) removeDrive() (err error) {
	if c.isDriveUsed() {
		c.Logger().Info("unplugging block device")

		devID := c.state.BlockDeviceID
		err := c.sandbox.devManager.DetachDevice(devID, c.sandbox)
		if err != nil && err != manager.ErrDeviceNotAttached {
			return err
		}

		if err = c.sandbox.devManager.RemoveDevice(devID); err != nil {
			c.Logger().WithFields(logrus.Fields{
				"container": c.id,
				"device-id": devID,
			}).WithError(err).Error("remove device failed")

			// ignore the device not exist error
			if err != manager.ErrDeviceNotExist {
				return err
			}
		}
	}

	return nil
}

func (c *Container) attachDevices(devices []ContainerDevice) error {
	// there's no need to roll back when an error happens, because if
	// attachDevices fails, container creation will fail too, and
	// rollbackFailingContainerCreation will do all the rollbacks

	// since devices with large bar space require delayed attachment,
	// the devices need to be split into two lists, normalAttachedDevs and delayAttachedDevs,
	// so c.devices is not used here. See issue https://github.com/kata-containers/runtime/issues/2460.
	for _, dev := range devices {
		if err := c.sandbox.devManager.AttachDevice(dev.ID, c.sandbox); err != nil {
			return err
		}
	}
	return nil
}

func (c *Container) detachDevices() error {
	for _, dev := range c.devices {
		err := c.sandbox.devManager.DetachDevice(dev.ID, c.sandbox)
		if err != nil && err != manager.ErrDeviceNotAttached {
			return err
		}

		if err = c.sandbox.devManager.RemoveDevice(dev.ID); err != nil {
			c.Logger().WithFields(logrus.Fields{
				"container": c.id,
				"device-id": dev.ID,
			}).WithError(err).Error("remove device failed")

			// ignore the device not exist error
			if err != manager.ErrDeviceNotExist {
				return err
			}
		}
	}
	return nil
}

// cgroupsCreate creates cgroups on the host for the associated container
func (c *Container) cgroupsCreate() (err error) {
	spec := c.GetPatchedOCISpec()
	if spec == nil {
		return errorMissingOCISpec
	}

	// https://github.com/kata-containers/runtime/issues/168
	resources := specs.LinuxResources{
		CPU: nil,
	}

	if spec.Linux != nil && spec.Linux.Resources != nil {
		resources.CPU = validCPUResources(spec.Linux.Resources.CPU)
	}

	c.state.CgroupPath, err = vccgroups.ValidCgroupPath(spec.Linux.CgroupsPath, c.sandbox.config.SystemdCgroup)
	if err != nil {
		return fmt.Errorf("Invalid cgroup path: %v", err)
	}

	cgroup, err := cgroupsNewFunc(cgroups.V1,
		cgroups.StaticPath(c.state.CgroupPath), &resources)
	if err != nil {
		return fmt.Errorf("Could not create cgroup for %v: %v", c.state.CgroupPath, err)
	}

	c.config.Resources = resources

	// Add shim into cgroup
	if c.process.Pid > 0 {
		if err := cgroup.Add(cgroups.Process{Pid: c.process.Pid}); err != nil {
			return fmt.Errorf("Could not add PID %d to cgroup %v: %v", c.process.Pid, spec.Linux.CgroupsPath, err)
		}
	}

	return nil
}

// cgroupsDelete deletes the cgroups on the host for the associated container
func (c *Container) cgroupsDelete() error {

	if c.state.CgroupPath == "" {
		c.Logger().Debug("container does not have host cgroups: nothing to update")
		return nil
	}

	cgroup, err := cgroupsLoadFunc(cgroups.V1,
		cgroups.StaticPath(c.state.CgroupPath))

	if err == cgroups.ErrCgroupDeleted {
		// cgroup already deleted
		return nil
	}

	if err != nil {
		return fmt.Errorf("Could not load container cgroup %v: %v", c.state.CgroupPath, err)
	}

	// move running processes here, so that the cgroup can be removed
	parent, err := parentCgroup(cgroups.V1, c.state.CgroupPath)
	if err != nil {
		// parent cgroup doesn't exist, that means there are no processes running
		// and the container cgroup was removed.
		c.Logger().WithError(err).Warn("Container cgroup doesn't exist")
		return nil
	}

	if err := cgroup.MoveTo(parent); err != nil {
		// Don't fail, cgroup can be deleted
		c.Logger().WithError(err).Warn("Could not move container process into parent cgroup")
	}

	if err := cgroup.Delete(); err != nil {
		return fmt.Errorf("Could not delete container cgroup path='%v': error='%v'", c.state.CgroupPath, err)
	}

	return nil
}

// cgroupsUpdate updates cgroups on the host for the associated container
func (c *Container) cgroupsUpdate(resources specs.LinuxResources) error {

	if c.state.CgroupPath == "" {
		c.Logger().Debug("container does not have host cgroups: nothing to update")
		return nil
	}
	cgroup, err := cgroupsLoadFunc(cgroups.V1,
		cgroups.StaticPath(c.state.CgroupPath))
	if err != nil {
		return fmt.Errorf("Could not load cgroup %v: %v", c.state.CgroupPath, err)
	}

	// Issue: https://github.com/kata-containers/runtime/issues/168
	r := specs.LinuxResources{
		CPU: validCPUResources(resources.CPU),
	}

	// update cgroup
	if err := cgroup.Update(&r); err != nil {
		return fmt.Errorf("Could not update container cgroup path='%v': error='%v'", c.state.CgroupPath, err)
	}

	// store new resources
	c.config.Resources = r
	if err := c.storeContainer(); err != nil {
		return err
	}

	return nil
}
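// Illustrative sketch, not part of the original file: one way a caller could
// aggregate the per-interface counters returned by stats() into totals,
// using only the NetworkStats type defined above.
func totalNetworkBytes(stats *ContainerStats) (rx, tx uint64) {
	if stats == nil {
		return 0, 0
	}
	for _, iface := range stats.NetworkStats {
		if iface == nil {
			continue
		}
		rx += iface.RxBytes
		tx += iface.TxBytes
	}
	return rx, tx
}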