github.com/zhuohuang-hust/src-cbuild@v0.0.0-20230105071821-c7aab3e7c840/mergeCode/runc/libcontainer/container_linux.go (about) 1 // +build linux 2 3 package libcontainer 4 5 import ( 6 "bytes" 7 "encoding/json" 8 "fmt" 9 "io" 10 "io/ioutil" 11 "os" 12 "os/exec" 13 "path/filepath" 14 "reflect" 15 "strings" 16 "sync" 17 "syscall" 18 "time" 19 20 "github.com/Sirupsen/logrus" 21 "github.com/golang/protobuf/proto" 22 "github.com/opencontainers/runc/libcontainer/cgroups" 23 "github.com/opencontainers/runc/libcontainer/configs" 24 "github.com/opencontainers/runc/libcontainer/criurpc" 25 "github.com/opencontainers/runc/libcontainer/system" 26 "github.com/opencontainers/runc/libcontainer/utils" 27 "github.com/syndtr/gocapability/capability" 28 "github.com/vishvananda/netlink/nl" 29 ) 30 31 const stdioFdCount = 3 32 33 type linuxContainer struct { 34 id string 35 root string 36 config *configs.Config 37 cgroupManager cgroups.Manager 38 initArgs []string 39 initProcess parentProcess 40 initProcessStartTime string 41 criuPath string 42 m sync.Mutex 43 criuVersion int 44 state containerState 45 created time.Time 46 } 47 48 // State represents a running container's state 49 type State struct { 50 BaseState 51 52 // Platform specific fields below here 53 54 // Path to all the cgroups setup for a container. Key is cgroup subsystem name 55 // with the value as the path. 56 CgroupPaths map[string]string `json:"cgroup_paths"` 57 58 // NamespacePaths are filepaths to the container's namespaces. Key is the namespace type 59 // with the value as the path. 60 NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"` 61 62 // Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore 63 ExternalDescriptors []string `json:"external_descriptors,omitempty"` 64 } 65 66 // Container is a libcontainer container object. 67 // 68 // Each container is thread-safe within the same process. Since a container can 69 // be destroyed by a separate process, any function may return that the container 70 // was not found. 71 type Container interface { 72 BaseContainer 73 74 // Methods below here are platform specific 75 76 // Checkpoint checkpoints the running container's state to disk using the criu(8) utility. 77 // 78 // errors: 79 // Systemerror - System error. 80 Checkpoint(criuOpts *CriuOpts) error 81 82 // Restore restores the checkpointed container to a running state using the criu(8) utility. 83 // 84 // errors: 85 // Systemerror - System error. 86 Restore(process *Process, criuOpts *CriuOpts) error 87 88 // If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses 89 // the execution of any user processes. Asynchronously, when the container finished being paused the 90 // state is changed to PAUSED. 91 // If the Container state is PAUSED, do nothing. 92 // 93 // errors: 94 // ContainerNotExists - Container no longer exists, 95 // ContainerNotRunning - Container not running or created, 96 // Systemerror - System error. 97 Pause() error 98 99 // If the Container state is PAUSED, resumes the execution of any user processes in the 100 // Container before setting the Container state to RUNNING. 101 // If the Container state is RUNNING, do nothing. 102 // 103 // errors: 104 // ContainerNotExists - Container no longer exists, 105 // ContainerNotPaused - Container is not paused, 106 // Systemerror - System error. 107 Resume() error 108 109 // NotifyOOM returns a read-only channel signaling when the container receives an OOM notification. 110 // 111 // errors: 112 // Systemerror - System error. 113 NotifyOOM() (<-chan struct{}, error) 114 115 // NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level 116 // 117 // errors: 118 // Systemerror - System error. 119 NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) 120 } 121 122 // ID returns the container's unique ID 123 func (c *linuxContainer) ID() string { 124 return c.id 125 } 126 127 // Config returns the container's configuration 128 func (c *linuxContainer) Config() configs.Config { 129 return *c.config 130 } 131 132 func (c *linuxContainer) Status() (Status, error) { 133 c.m.Lock() 134 defer c.m.Unlock() 135 return c.currentStatus() 136 } 137 138 func (c *linuxContainer) State() (*State, error) { 139 c.m.Lock() 140 defer c.m.Unlock() 141 return c.currentState() 142 } 143 144 func (c *linuxContainer) Processes() ([]int, error) { 145 pids, err := c.cgroupManager.GetAllPids() 146 if err != nil { 147 return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups") 148 } 149 return pids, nil 150 } 151 152 func (c *linuxContainer) Stats() (*Stats, error) { 153 var ( 154 err error 155 stats = &Stats{} 156 ) 157 if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil { 158 return stats, newSystemErrorWithCause(err, "getting container stats from cgroups") 159 } 160 for _, iface := range c.config.Networks { 161 switch iface.Type { 162 case "veth": 163 istats, err := getNetworkInterfaceStats(iface.HostInterfaceName) 164 if err != nil { 165 return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName) 166 } 167 stats.Interfaces = append(stats.Interfaces, istats) 168 } 169 } 170 return stats, nil 171 } 172 173 func (c *linuxContainer) Set(config configs.Config) error { 174 c.m.Lock() 175 defer c.m.Unlock() 176 status, err := c.currentStatus() 177 if err != nil { 178 return err 179 } 180 if status == Stopped { 181 return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning) 182 } 183 c.config = &config 184 return c.cgroupManager.Set(c.config) 185 } 186 187 func (c *linuxContainer) Start(process *Process) error { 188 c.m.Lock() 189 defer c.m.Unlock() 190 status, err := c.currentStatus() 191 if err != nil { 192 return err 193 } 194 return c.start(process, status == Stopped) 195 } 196 197 func (c *linuxContainer) Run(process *Process) error { 198 c.m.Lock() 199 defer c.m.Unlock() 200 status, err := c.currentStatus() 201 if err != nil { 202 return err 203 } 204 if err := c.start(process, status == Stopped); err != nil { 205 return err 206 } 207 if status == Stopped { 208 return c.exec() 209 } 210 return nil 211 } 212 213 func (c *linuxContainer) Exec() error { 214 c.m.Lock() 215 defer c.m.Unlock() 216 return c.exec() 217 } 218 219 func (c *linuxContainer) exec() error { 220 path := filepath.Join(c.root, execFifoFilename) 221 f, err := os.OpenFile(path, os.O_RDONLY, 0) 222 if err != nil { 223 return newSystemErrorWithCause(err, "open exec fifo for reading") 224 } 225 defer f.Close() 226 data, err := ioutil.ReadAll(f) 227 if err != nil { 228 return err 229 } 230 if len(data) > 0 { 231 os.Remove(path) 232 return nil 233 } 234 return fmt.Errorf("cannot start an already running container") 235 } 236 237 func (c *linuxContainer) start(process *Process, isInit bool) error { 238 parent, err := c.newParentProcess(process, isInit) 239 if err != nil { 240 return newSystemErrorWithCause(err, "creating new parent process") 241 } 242 if err := parent.start(); err != nil { 243 // terminate the process to ensure that it properly is reaped. 244 if err := parent.terminate(); err != nil { 245 logrus.Warn(err) 246 } 247 return newSystemErrorWithCause(err, "starting container process") 248 } 249 // generate a timestamp indicating when the container was started 250 c.created = time.Now().UTC() 251 c.state = &runningState{ 252 c: c, 253 } 254 if isInit { 255 c.state = &createdState{ 256 c: c, 257 } 258 state, err := c.updateState(parent) 259 if err != nil { 260 return err 261 } 262 c.initProcessStartTime = state.InitProcessStartTime 263 264 if c.config.Hooks != nil { 265 s := configs.HookState{ 266 Version: c.config.Version, 267 ID: c.id, 268 Pid: parent.pid(), 269 Root: c.config.Rootfs, 270 BundlePath: utils.SearchLabels(c.config.Labels, "bundle"), 271 } 272 for i, hook := range c.config.Hooks.Poststart { 273 if err := hook.Run(s); err != nil { 274 if err := parent.terminate(); err != nil { 275 logrus.Warn(err) 276 } 277 return newSystemErrorWithCausef(err, "running poststart hook %d", i) 278 } 279 } 280 } 281 } 282 return nil 283 } 284 285 func (c *linuxContainer) Signal(s os.Signal, all bool) error { 286 if all { 287 return signalAllProcesses(c.cgroupManager, s) 288 } 289 if err := c.initProcess.signal(s); err != nil { 290 return newSystemErrorWithCause(err, "signaling init process") 291 } 292 return nil 293 } 294 295 func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) { 296 parentPipe, childPipe, err := newPipe() 297 if err != nil { 298 return nil, newSystemErrorWithCause(err, "creating new init pipe") 299 } 300 rootDir, err := os.Open(c.root) 301 if err != nil { 302 return nil, err 303 } 304 cmd, err := c.commandTemplate(p, childPipe, rootDir) 305 if err != nil { 306 return nil, newSystemErrorWithCause(err, "creating new command template") 307 } 308 if !doInit { 309 return c.newSetnsProcess(p, cmd, parentPipe, childPipe, rootDir) 310 } 311 return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir) 312 } 313 314 func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File) (*exec.Cmd, error) { 315 cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...) 316 cmd.Stdin = p.Stdin 317 cmd.Stdout = p.Stdout 318 cmd.Stderr = p.Stderr 319 cmd.Dir = c.config.Rootfs 320 if cmd.SysProcAttr == nil { 321 cmd.SysProcAttr = &syscall.SysProcAttr{} 322 } 323 cmd.ExtraFiles = append(p.ExtraFiles, childPipe, rootDir) 324 cmd.Env = append(cmd.Env, 325 fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-2), 326 fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) 327 // NOTE: when running a container with no PID namespace and the parent process spawning the container is 328 // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason 329 // even with the parent still running. 330 if c.config.ParentDeathSignal > 0 { 331 cmd.SysProcAttr.Pdeathsig = syscall.Signal(c.config.ParentDeathSignal) 332 } 333 return cmd, nil 334 } 335 336 func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) { 337 cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) 338 nsMaps := make(map[configs.NamespaceType]string) 339 for _, ns := range c.config.Namespaces { 340 if ns.Path != "" { 341 nsMaps[ns.Type] = ns.Path 342 } 343 } 344 _, sharePidns := nsMaps[configs.NEWPID] 345 data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, "") 346 if err != nil { 347 return nil, err 348 } 349 return &initProcess{ 350 cmd: cmd, 351 childPipe: childPipe, 352 parentPipe: parentPipe, 353 manager: c.cgroupManager, 354 config: c.newInitConfig(p), 355 container: c, 356 process: p, 357 bootstrapData: data, 358 sharePidns: sharePidns, 359 rootDir: rootDir, 360 }, nil 361 } 362 363 func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*setnsProcess, error) { 364 cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) 365 state, err := c.currentState() 366 if err != nil { 367 return nil, newSystemErrorWithCause(err, "getting container's current state") 368 } 369 // for setns process, we dont have to set cloneflags as the process namespaces 370 // will only be set via setns syscall 371 data, err := c.bootstrapData(0, state.NamespacePaths, p.consolePath) 372 if err != nil { 373 return nil, err 374 } 375 // TODO: set on container for process management 376 return &setnsProcess{ 377 cmd: cmd, 378 cgroupPaths: c.cgroupManager.GetPaths(), 379 childPipe: childPipe, 380 parentPipe: parentPipe, 381 config: c.newInitConfig(p), 382 process: p, 383 bootstrapData: data, 384 rootDir: rootDir, 385 }, nil 386 } 387 388 func (c *linuxContainer) newInitConfig(process *Process) *initConfig { 389 cfg := &initConfig{ 390 Config: c.config, 391 Args: process.Args, 392 Env: process.Env, 393 User: process.User, 394 AdditionalGroups: process.AdditionalGroups, 395 Cwd: process.Cwd, 396 Console: process.consolePath, 397 Capabilities: process.Capabilities, 398 PassedFilesCount: len(process.ExtraFiles), 399 ContainerId: c.ID(), 400 NoNewPrivileges: c.config.NoNewPrivileges, 401 AppArmorProfile: c.config.AppArmorProfile, 402 ProcessLabel: c.config.ProcessLabel, 403 Rlimits: c.config.Rlimits, 404 ExecFifoPath: filepath.Join(c.root, execFifoFilename), 405 } 406 if process.NoNewPrivileges != nil { 407 cfg.NoNewPrivileges = *process.NoNewPrivileges 408 } 409 if process.AppArmorProfile != "" { 410 cfg.AppArmorProfile = process.AppArmorProfile 411 } 412 if process.Label != "" { 413 cfg.ProcessLabel = process.Label 414 } 415 if len(process.Rlimits) > 0 { 416 cfg.Rlimits = process.Rlimits 417 } 418 return cfg 419 } 420 421 func newPipe() (parent *os.File, child *os.File, err error) { 422 fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0) 423 if err != nil { 424 return nil, nil, err 425 } 426 return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil 427 } 428 429 func (c *linuxContainer) Destroy() error { 430 c.m.Lock() 431 defer c.m.Unlock() 432 return c.state.destroy() 433 } 434 435 func (c *linuxContainer) Pause() error { 436 c.m.Lock() 437 defer c.m.Unlock() 438 status, err := c.currentStatus() 439 if err != nil { 440 return err 441 } 442 switch status { 443 case Running, Created: 444 if err := c.cgroupManager.Freeze(configs.Frozen); err != nil { 445 return err 446 } 447 return c.state.transition(&pausedState{ 448 c: c, 449 }) 450 } 451 return newGenericError(fmt.Errorf("container not running or created: %s", status), ContainerNotRunning) 452 } 453 454 func (c *linuxContainer) Resume() error { 455 c.m.Lock() 456 defer c.m.Unlock() 457 status, err := c.currentStatus() 458 if err != nil { 459 return err 460 } 461 if status != Paused { 462 return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused) 463 } 464 if err := c.cgroupManager.Freeze(configs.Thawed); err != nil { 465 return err 466 } 467 return c.state.transition(&runningState{ 468 c: c, 469 }) 470 } 471 472 func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) { 473 return notifyOnOOM(c.cgroupManager.GetPaths()) 474 } 475 476 func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) { 477 return notifyMemoryPressure(c.cgroupManager.GetPaths(), level) 478 } 479 480 // checkCriuVersion checks Criu version greater than or equal to minVersion 481 func (c *linuxContainer) checkCriuVersion(minVersion string) error { 482 var x, y, z, versionReq int 483 484 _, err := fmt.Sscanf(minVersion, "%d.%d.%d\n", &x, &y, &z) // 1.5.2 485 if err != nil { 486 _, err = fmt.Sscanf(minVersion, "Version: %d.%d\n", &x, &y) // 1.6 487 } 488 versionReq = x*10000 + y*100 + z 489 490 out, err := exec.Command(c.criuPath, "-V").Output() 491 if err != nil { 492 return fmt.Errorf("Unable to execute CRIU command: %s", c.criuPath) 493 } 494 495 x = 0 496 y = 0 497 z = 0 498 if ep := strings.Index(string(out), "-"); ep >= 0 { 499 // criu Git version format 500 var version string 501 if sp := strings.Index(string(out), "GitID"); sp > 0 { 502 version = string(out)[sp:ep] 503 } else { 504 return fmt.Errorf("Unable to parse the CRIU version: %s", c.criuPath) 505 } 506 507 n, err := fmt.Sscanf(string(version), "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2 508 if err != nil { 509 n, err = fmt.Sscanf(string(version), "GitID: v%d.%d", &x, &y) // 1.6 510 y++ 511 } else { 512 z++ 513 } 514 if n < 2 || err != nil { 515 return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", version, n, err) 516 } 517 } else { 518 // criu release version format 519 n, err := fmt.Sscanf(string(out), "Version: %d.%d.%d\n", &x, &y, &z) // 1.5.2 520 if err != nil { 521 n, err = fmt.Sscanf(string(out), "Version: %d.%d\n", &x, &y) // 1.6 522 } 523 if n < 2 || err != nil { 524 return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", out, n, err) 525 } 526 } 527 528 c.criuVersion = x*10000 + y*100 + z 529 530 if c.criuVersion < versionReq { 531 return fmt.Errorf("CRIU version must be %s or higher", minVersion) 532 } 533 534 return nil 535 } 536 537 const descriptorsFilename = "descriptors.json" 538 539 func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) { 540 mountDest := m.Destination 541 if strings.HasPrefix(mountDest, c.config.Rootfs) { 542 mountDest = mountDest[len(c.config.Rootfs):] 543 } 544 545 extMnt := &criurpc.ExtMountMap{ 546 Key: proto.String(mountDest), 547 Val: proto.String(mountDest), 548 } 549 req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) 550 } 551 552 func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { 553 c.m.Lock() 554 defer c.m.Unlock() 555 556 if err := c.checkCriuVersion("1.5.2"); err != nil { 557 return err 558 } 559 560 if criuOpts.ImagesDirectory == "" { 561 return fmt.Errorf("invalid directory to save checkpoint") 562 } 563 564 // Since a container can be C/R'ed multiple times, 565 // the checkpoint directory may already exist. 566 if err := os.Mkdir(criuOpts.ImagesDirectory, 0755); err != nil && !os.IsExist(err) { 567 return err 568 } 569 570 if criuOpts.WorkDirectory == "" { 571 criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work") 572 } 573 574 if err := os.Mkdir(criuOpts.WorkDirectory, 0755); err != nil && !os.IsExist(err) { 575 return err 576 } 577 578 workDir, err := os.Open(criuOpts.WorkDirectory) 579 if err != nil { 580 return err 581 } 582 defer workDir.Close() 583 584 imageDir, err := os.Open(criuOpts.ImagesDirectory) 585 if err != nil { 586 return err 587 } 588 defer imageDir.Close() 589 590 rpcOpts := criurpc.CriuOpts{ 591 ImagesDirFd: proto.Int32(int32(imageDir.Fd())), 592 WorkDirFd: proto.Int32(int32(workDir.Fd())), 593 LogLevel: proto.Int32(4), 594 LogFile: proto.String("dump.log"), 595 Root: proto.String(c.config.Rootfs), 596 ManageCgroups: proto.Bool(true), 597 NotifyScripts: proto.Bool(true), 598 Pid: proto.Int32(int32(c.initProcess.pid())), 599 ShellJob: proto.Bool(criuOpts.ShellJob), 600 LeaveRunning: proto.Bool(criuOpts.LeaveRunning), 601 TcpEstablished: proto.Bool(criuOpts.TcpEstablished), 602 ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), 603 FileLocks: proto.Bool(criuOpts.FileLocks), 604 EmptyNs: proto.Uint32(criuOpts.EmptyNs), 605 } 606 607 // append optional criu opts, e.g., page-server and port 608 if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 { 609 rpcOpts.Ps = &criurpc.CriuPageServerInfo{ 610 Address: proto.String(criuOpts.PageServer.Address), 611 Port: proto.Int32(criuOpts.PageServer.Port), 612 } 613 } 614 615 // append optional manage cgroups mode 616 if criuOpts.ManageCgroupsMode != 0 { 617 if err := c.checkCriuVersion("1.7"); err != nil { 618 return err 619 } 620 mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode) 621 rpcOpts.ManageCgroupsMode = &mode 622 } 623 624 t := criurpc.CriuReqType_DUMP 625 req := &criurpc.CriuReq{ 626 Type: &t, 627 Opts: &rpcOpts, 628 } 629 630 for _, m := range c.config.Mounts { 631 switch m.Device { 632 case "bind": 633 c.addCriuDumpMount(req, m) 634 break 635 case "cgroup": 636 binds, err := getCgroupMounts(m) 637 if err != nil { 638 return err 639 } 640 for _, b := range binds { 641 c.addCriuDumpMount(req, b) 642 } 643 break 644 } 645 } 646 647 // Write the FD info to a file in the image directory 648 649 fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors()) 650 if err != nil { 651 return err 652 } 653 654 err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0655) 655 if err != nil { 656 return err 657 } 658 659 err = c.criuSwrk(nil, req, criuOpts, false) 660 if err != nil { 661 return err 662 } 663 return nil 664 } 665 666 func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) { 667 mountDest := m.Destination 668 if strings.HasPrefix(mountDest, c.config.Rootfs) { 669 mountDest = mountDest[len(c.config.Rootfs):] 670 } 671 672 extMnt := &criurpc.ExtMountMap{ 673 Key: proto.String(mountDest), 674 Val: proto.String(m.Source), 675 } 676 req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) 677 } 678 679 func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) { 680 for _, iface := range c.config.Networks { 681 switch iface.Type { 682 case "veth": 683 veth := new(criurpc.CriuVethPair) 684 veth.IfOut = proto.String(iface.HostInterfaceName) 685 veth.IfIn = proto.String(iface.Name) 686 req.Opts.Veths = append(req.Opts.Veths, veth) 687 break 688 case "loopback": 689 break 690 } 691 } 692 for _, i := range criuOpts.VethPairs { 693 veth := new(criurpc.CriuVethPair) 694 veth.IfOut = proto.String(i.HostInterfaceName) 695 veth.IfIn = proto.String(i.ContainerInterfaceName) 696 req.Opts.Veths = append(req.Opts.Veths, veth) 697 } 698 } 699 700 func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { 701 c.m.Lock() 702 defer c.m.Unlock() 703 if err := c.checkCriuVersion("1.5.2"); err != nil { 704 return err 705 } 706 if criuOpts.WorkDirectory == "" { 707 criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work") 708 } 709 // Since a container can be C/R'ed multiple times, 710 // the work directory may already exist. 711 if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) { 712 return err 713 } 714 workDir, err := os.Open(criuOpts.WorkDirectory) 715 if err != nil { 716 return err 717 } 718 defer workDir.Close() 719 if criuOpts.ImagesDirectory == "" { 720 return fmt.Errorf("invalid directory to restore checkpoint") 721 } 722 imageDir, err := os.Open(criuOpts.ImagesDirectory) 723 if err != nil { 724 return err 725 } 726 defer imageDir.Close() 727 // CRIU has a few requirements for a root directory: 728 // * it must be a mount point 729 // * its parent must not be overmounted 730 // c.config.Rootfs is bind-mounted to a temporary directory 731 // to satisfy these requirements. 732 root := filepath.Join(c.root, "criu-root") 733 if err := os.Mkdir(root, 0755); err != nil { 734 return err 735 } 736 defer os.Remove(root) 737 root, err = filepath.EvalSymlinks(root) 738 if err != nil { 739 return err 740 } 741 err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "") 742 if err != nil { 743 return err 744 } 745 defer syscall.Unmount(root, syscall.MNT_DETACH) 746 t := criurpc.CriuReqType_RESTORE 747 req := &criurpc.CriuReq{ 748 Type: &t, 749 Opts: &criurpc.CriuOpts{ 750 ImagesDirFd: proto.Int32(int32(imageDir.Fd())), 751 WorkDirFd: proto.Int32(int32(workDir.Fd())), 752 EvasiveDevices: proto.Bool(true), 753 LogLevel: proto.Int32(4), 754 LogFile: proto.String("restore.log"), 755 RstSibling: proto.Bool(true), 756 Root: proto.String(root), 757 ManageCgroups: proto.Bool(true), 758 NotifyScripts: proto.Bool(true), 759 ShellJob: proto.Bool(criuOpts.ShellJob), 760 ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), 761 TcpEstablished: proto.Bool(criuOpts.TcpEstablished), 762 FileLocks: proto.Bool(criuOpts.FileLocks), 763 EmptyNs: proto.Uint32(criuOpts.EmptyNs), 764 }, 765 } 766 767 for _, m := range c.config.Mounts { 768 switch m.Device { 769 case "bind": 770 c.addCriuRestoreMount(req, m) 771 break 772 case "cgroup": 773 binds, err := getCgroupMounts(m) 774 if err != nil { 775 return err 776 } 777 for _, b := range binds { 778 c.addCriuRestoreMount(req, b) 779 } 780 break 781 } 782 } 783 784 if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 { 785 c.restoreNetwork(req, criuOpts) 786 } 787 788 // append optional manage cgroups mode 789 if criuOpts.ManageCgroupsMode != 0 { 790 if err := c.checkCriuVersion("1.7"); err != nil { 791 return err 792 } 793 mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode) 794 req.Opts.ManageCgroupsMode = &mode 795 } 796 797 var ( 798 fds []string 799 fdJSON []byte 800 ) 801 if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil { 802 return err 803 } 804 805 if err := json.Unmarshal(fdJSON, &fds); err != nil { 806 return err 807 } 808 for i := range fds { 809 if s := fds[i]; strings.Contains(s, "pipe:") { 810 inheritFd := new(criurpc.InheritFd) 811 inheritFd.Key = proto.String(s) 812 inheritFd.Fd = proto.Int32(int32(i)) 813 req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) 814 } 815 } 816 return c.criuSwrk(process, req, criuOpts, true) 817 } 818 819 func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { 820 if err := c.cgroupManager.Apply(pid); err != nil { 821 return err 822 } 823 824 path := fmt.Sprintf("/proc/%d/cgroup", pid) 825 cgroupsPaths, err := cgroups.ParseCgroupFile(path) 826 if err != nil { 827 return err 828 } 829 830 for c, p := range cgroupsPaths { 831 cgroupRoot := &criurpc.CgroupRoot{ 832 Ctrl: proto.String(c), 833 Path: proto.String(p), 834 } 835 req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot) 836 } 837 838 return nil 839 } 840 841 func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool) error { 842 fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_SEQPACKET|syscall.SOCK_CLOEXEC, 0) 843 if err != nil { 844 return err 845 } 846 847 logPath := filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile()) 848 criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client") 849 criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server") 850 defer criuClient.Close() 851 defer criuServer.Close() 852 853 args := []string{"swrk", "3"} 854 logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath) 855 logrus.Debugf("Using CRIU with following args: %s", args) 856 cmd := exec.Command(c.criuPath, args...) 857 if process != nil { 858 cmd.Stdin = process.Stdin 859 cmd.Stdout = process.Stdout 860 cmd.Stderr = process.Stderr 861 } 862 cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer) 863 864 if err := cmd.Start(); err != nil { 865 return err 866 } 867 criuServer.Close() 868 869 defer func() { 870 criuClient.Close() 871 _, err := cmd.Process.Wait() 872 if err != nil { 873 return 874 } 875 }() 876 877 if applyCgroups { 878 err := c.criuApplyCgroups(cmd.Process.Pid, req) 879 if err != nil { 880 return err 881 } 882 } 883 884 var extFds []string 885 if process != nil { 886 extFds, err = getPipeFds(cmd.Process.Pid) 887 if err != nil { 888 return err 889 } 890 } 891 892 logrus.Debugf("Using CRIU in %s mode", req.GetType().String()) 893 val := reflect.ValueOf(req.GetOpts()) 894 v := reflect.Indirect(val) 895 for i := 0; i < v.NumField(); i++ { 896 st := v.Type() 897 name := st.Field(i).Name 898 if strings.HasPrefix(name, "XXX_") { 899 continue 900 } 901 value := val.MethodByName("Get" + name).Call([]reflect.Value{}) 902 logrus.Debugf("CRIU option %s with value %v", name, value[0]) 903 } 904 data, err := proto.Marshal(req) 905 if err != nil { 906 return err 907 } 908 _, err = criuClient.Write(data) 909 if err != nil { 910 return err 911 } 912 913 buf := make([]byte, 10*4096) 914 for true { 915 n, err := criuClient.Read(buf) 916 if err != nil { 917 return err 918 } 919 if n == 0 { 920 return fmt.Errorf("unexpected EOF") 921 } 922 if n == len(buf) { 923 return fmt.Errorf("buffer is too small") 924 } 925 926 resp := new(criurpc.CriuResp) 927 err = proto.Unmarshal(buf[:n], resp) 928 if err != nil { 929 return err 930 } 931 if !resp.GetSuccess() { 932 typeString := req.GetType().String() 933 return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath) 934 } 935 936 t := resp.GetType() 937 switch { 938 case t == criurpc.CriuReqType_NOTIFY: 939 if err := c.criuNotifications(resp, process, opts, extFds); err != nil { 940 return err 941 } 942 t = criurpc.CriuReqType_NOTIFY 943 req = &criurpc.CriuReq{ 944 Type: &t, 945 NotifySuccess: proto.Bool(true), 946 } 947 data, err = proto.Marshal(req) 948 if err != nil { 949 return err 950 } 951 _, err = criuClient.Write(data) 952 if err != nil { 953 return err 954 } 955 continue 956 case t == criurpc.CriuReqType_RESTORE: 957 case t == criurpc.CriuReqType_DUMP: 958 break 959 default: 960 return fmt.Errorf("unable to parse the response %s", resp.String()) 961 } 962 963 break 964 } 965 966 // cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors. 967 // Here we want to wait only the CRIU process. 968 st, err := cmd.Process.Wait() 969 if err != nil { 970 return err 971 } 972 if !st.Success() { 973 return fmt.Errorf("criu failed: %s\nlog file: %s", st.String(), logPath) 974 } 975 return nil 976 } 977 978 // block any external network activity 979 func lockNetwork(config *configs.Config) error { 980 for _, config := range config.Networks { 981 strategy, err := getStrategy(config.Type) 982 if err != nil { 983 return err 984 } 985 986 if err := strategy.detach(config); err != nil { 987 return err 988 } 989 } 990 return nil 991 } 992 993 func unlockNetwork(config *configs.Config) error { 994 for _, config := range config.Networks { 995 strategy, err := getStrategy(config.Type) 996 if err != nil { 997 return err 998 } 999 if err = strategy.attach(config); err != nil { 1000 return err 1001 } 1002 } 1003 return nil 1004 } 1005 1006 func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, opts *CriuOpts, fds []string) error { 1007 notify := resp.GetNotify() 1008 if notify == nil { 1009 return fmt.Errorf("invalid response: %s", resp.String()) 1010 } 1011 switch { 1012 case notify.GetScript() == "post-dump": 1013 f, err := os.Create(filepath.Join(c.root, "checkpoint")) 1014 if err != nil { 1015 return err 1016 } 1017 f.Close() 1018 case notify.GetScript() == "network-unlock": 1019 if err := unlockNetwork(c.config); err != nil { 1020 return err 1021 } 1022 case notify.GetScript() == "network-lock": 1023 if err := lockNetwork(c.config); err != nil { 1024 return err 1025 } 1026 case notify.GetScript() == "setup-namespaces": 1027 if c.config.Hooks != nil { 1028 s := configs.HookState{ 1029 Version: c.config.Version, 1030 ID: c.id, 1031 Pid: int(notify.GetPid()), 1032 Root: c.config.Rootfs, 1033 } 1034 for i, hook := range c.config.Hooks.Prestart { 1035 if err := hook.Run(s); err != nil { 1036 return newSystemErrorWithCausef(err, "running prestart hook %d", i) 1037 } 1038 } 1039 } 1040 case notify.GetScript() == "post-restore": 1041 pid := notify.GetPid() 1042 r, err := newRestoredProcess(int(pid), fds) 1043 if err != nil { 1044 return err 1045 } 1046 process.ops = r 1047 if err := c.state.transition(&restoredState{ 1048 imageDir: opts.ImagesDirectory, 1049 c: c, 1050 }); err != nil { 1051 return err 1052 } 1053 // create a timestamp indicating when the restored checkpoint was started 1054 c.created = time.Now().UTC() 1055 if _, err := c.updateState(r); err != nil { 1056 return err 1057 } 1058 if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil { 1059 if !os.IsNotExist(err) { 1060 logrus.Error(err) 1061 } 1062 } 1063 } 1064 return nil 1065 } 1066 1067 func (c *linuxContainer) updateState(process parentProcess) (*State, error) { 1068 c.initProcess = process 1069 state, err := c.currentState() 1070 if err != nil { 1071 return nil, err 1072 } 1073 err = c.saveState(state) 1074 if err != nil { 1075 return nil, err 1076 } 1077 return state, nil 1078 } 1079 1080 func (c *linuxContainer) saveState(s *State) error { 1081 f, err := os.Create(filepath.Join(c.root, stateFilename)) 1082 if err != nil { 1083 return err 1084 } 1085 defer f.Close() 1086 return utils.WriteJSON(f, s) 1087 } 1088 1089 func (c *linuxContainer) deleteState() error { 1090 return os.Remove(filepath.Join(c.root, stateFilename)) 1091 } 1092 1093 func (c *linuxContainer) currentStatus() (Status, error) { 1094 if err := c.refreshState(); err != nil { 1095 return -1, err 1096 } 1097 return c.state.status(), nil 1098 } 1099 1100 // refreshState needs to be called to verify that the current state on the 1101 // container is what is true. Because consumers of libcontainer can use it 1102 // out of process we need to verify the container's status based on runtime 1103 // information and not rely on our in process info. 1104 func (c *linuxContainer) refreshState() error { 1105 paused, err := c.isPaused() 1106 if err != nil { 1107 return err 1108 } 1109 if paused { 1110 return c.state.transition(&pausedState{c: c}) 1111 } 1112 t, err := c.runType() 1113 if err != nil { 1114 return err 1115 } 1116 switch t { 1117 case Created: 1118 return c.state.transition(&createdState{c: c}) 1119 case Running: 1120 return c.state.transition(&runningState{c: c}) 1121 } 1122 return c.state.transition(&stoppedState{c: c}) 1123 } 1124 1125 // doesInitProcessExist checks if the init process is still the same process 1126 // as the initial one, it could happen that the original process has exited 1127 // and a new process has been created with the same pid, in this case, the 1128 // container would already be stopped. 1129 func (c *linuxContainer) doesInitProcessExist(initPid int) (bool, error) { 1130 startTime, err := system.GetProcessStartTime(initPid) 1131 if err != nil { 1132 return false, newSystemErrorWithCausef(err, "getting init process %d start time", initPid) 1133 } 1134 if c.initProcessStartTime != startTime { 1135 return false, nil 1136 } 1137 return true, nil 1138 } 1139 1140 func (c *linuxContainer) runType() (Status, error) { 1141 if c.initProcess == nil { 1142 return Stopped, nil 1143 } 1144 pid := c.initProcess.pid() 1145 // return Running if the init process is alive 1146 if err := syscall.Kill(pid, 0); err != nil { 1147 if err == syscall.ESRCH { 1148 // It means the process does not exist anymore, could happen when the 1149 // process exited just when we call the function, we should not return 1150 // error in this case. 1151 return Stopped, nil 1152 } 1153 return Stopped, newSystemErrorWithCausef(err, "sending signal 0 to pid %d", pid) 1154 } 1155 // check if the process is still the original init process. 1156 exist, err := c.doesInitProcessExist(pid) 1157 if !exist || err != nil { 1158 return Stopped, err 1159 } 1160 // check if the process that is running is the init process or the user's process. 1161 // this is the difference between the container Running and Created. 1162 environ, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/environ", pid)) 1163 if err != nil { 1164 return Stopped, newSystemErrorWithCausef(err, "reading /proc/%d/environ", pid) 1165 } 1166 check := []byte("_LIBCONTAINER") 1167 if bytes.Contains(environ, check) { 1168 return Created, nil 1169 } 1170 return Running, nil 1171 } 1172 1173 func (c *linuxContainer) isPaused() (bool, error) { 1174 data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state")) 1175 if err != nil { 1176 // If freezer cgroup is not mounted, the container would just be not paused. 1177 if os.IsNotExist(err) { 1178 return false, nil 1179 } 1180 return false, newSystemErrorWithCause(err, "checking if container is paused") 1181 } 1182 return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil 1183 } 1184 1185 func (c *linuxContainer) currentState() (*State, error) { 1186 var ( 1187 startTime string 1188 externalDescriptors []string 1189 pid = -1 1190 ) 1191 if c.initProcess != nil { 1192 pid = c.initProcess.pid() 1193 startTime, _ = c.initProcess.startTime() 1194 externalDescriptors = c.initProcess.externalDescriptors() 1195 } 1196 state := &State{ 1197 BaseState: BaseState{ 1198 ID: c.ID(), 1199 Config: *c.config, 1200 InitProcessPid: pid, 1201 InitProcessStartTime: startTime, 1202 Created: c.created, 1203 }, 1204 CgroupPaths: c.cgroupManager.GetPaths(), 1205 NamespacePaths: make(map[configs.NamespaceType]string), 1206 ExternalDescriptors: externalDescriptors, 1207 } 1208 if pid > 0 { 1209 for _, ns := range c.config.Namespaces { 1210 state.NamespacePaths[ns.Type] = ns.GetPath(pid) 1211 } 1212 for _, nsType := range configs.NamespaceTypes() { 1213 if !configs.IsNamespaceSupported(nsType) { 1214 continue 1215 } 1216 if _, ok := state.NamespacePaths[nsType]; !ok { 1217 ns := configs.Namespace{Type: nsType} 1218 state.NamespacePaths[ns.Type] = ns.GetPath(pid) 1219 } 1220 } 1221 } 1222 return state, nil 1223 } 1224 1225 // orderNamespacePaths sorts namespace paths into a list of paths that we 1226 // can setns in order. 1227 func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) { 1228 paths := []string{} 1229 order := []configs.NamespaceType{ 1230 // The user namespace *must* be done first. 1231 configs.NEWUSER, 1232 configs.NEWIPC, 1233 configs.NEWUTS, 1234 configs.NEWNET, 1235 configs.NEWPID, 1236 configs.NEWNS, 1237 } 1238 1239 // Remove namespaces that we don't need to join. 1240 var nsTypes []configs.NamespaceType 1241 for _, ns := range order { 1242 if c.config.Namespaces.Contains(ns) { 1243 nsTypes = append(nsTypes, ns) 1244 } 1245 } 1246 for _, nsType := range nsTypes { 1247 if p, ok := namespaces[nsType]; ok && p != "" { 1248 // check if the requested namespace is supported 1249 if !configs.IsNamespaceSupported(nsType) { 1250 return nil, newSystemError(fmt.Errorf("namespace %s is not supported", nsType)) 1251 } 1252 // only set to join this namespace if it exists 1253 if _, err := os.Lstat(p); err != nil { 1254 return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p) 1255 } 1256 // do not allow namespace path with comma as we use it to separate 1257 // the namespace paths 1258 if strings.ContainsRune(p, ',') { 1259 return nil, newSystemError(fmt.Errorf("invalid path %s", p)) 1260 } 1261 paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(nsType), p)) 1262 } 1263 } 1264 return paths, nil 1265 } 1266 1267 func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) { 1268 data := bytes.NewBuffer(nil) 1269 for _, im := range idMap { 1270 line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size) 1271 if _, err := data.WriteString(line); err != nil { 1272 return nil, err 1273 } 1274 } 1275 return data.Bytes(), nil 1276 } 1277 1278 // bootstrapData encodes the necessary data in netlink binary format 1279 // as a io.Reader. 1280 // Consumer can write the data to a bootstrap program 1281 // such as one that uses nsenter package to bootstrap the container's 1282 // init process correctly, i.e. with correct namespaces, uid/gid 1283 // mapping etc. 1284 func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, consolePath string) (io.Reader, error) { 1285 // create the netlink message 1286 r := nl.NewNetlinkRequest(int(InitMsg), 0) 1287 1288 // write cloneFlags 1289 r.AddData(&Int32msg{ 1290 Type: CloneFlagsAttr, 1291 Value: uint32(cloneFlags), 1292 }) 1293 1294 // write console path 1295 if consolePath != "" { 1296 r.AddData(&Bytemsg{ 1297 Type: ConsolePathAttr, 1298 Value: []byte(consolePath), 1299 }) 1300 } 1301 1302 // write custom namespace paths 1303 if len(nsMaps) > 0 { 1304 nsPaths, err := c.orderNamespacePaths(nsMaps) 1305 if err != nil { 1306 return nil, err 1307 } 1308 r.AddData(&Bytemsg{ 1309 Type: NsPathsAttr, 1310 Value: []byte(strings.Join(nsPaths, ",")), 1311 }) 1312 } 1313 1314 // write namespace paths only when we are not joining an existing user ns 1315 _, joinExistingUser := nsMaps[configs.NEWUSER] 1316 if !joinExistingUser { 1317 // write uid mappings 1318 if len(c.config.UidMappings) > 0 { 1319 b, err := encodeIDMapping(c.config.UidMappings) 1320 if err != nil { 1321 return nil, err 1322 } 1323 r.AddData(&Bytemsg{ 1324 Type: UidmapAttr, 1325 Value: b, 1326 }) 1327 } 1328 1329 // write gid mappings 1330 if len(c.config.GidMappings) > 0 { 1331 b, err := encodeIDMapping(c.config.GidMappings) 1332 if err != nil { 1333 return nil, err 1334 } 1335 r.AddData(&Bytemsg{ 1336 Type: GidmapAttr, 1337 Value: b, 1338 }) 1339 // check if we have CAP_SETGID to setgroup properly 1340 pid, err := capability.NewPid(os.Getpid()) 1341 if err != nil { 1342 return nil, err 1343 } 1344 if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) { 1345 r.AddData(&Boolmsg{ 1346 Type: SetgroupAttr, 1347 Value: true, 1348 }) 1349 } 1350 } 1351 } 1352 1353 return bytes.NewReader(r.Serialize()), nil 1354 }