github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/criu_linux.go (about) 1 package libcontainer 2 3 import ( 4 "bufio" 5 "bytes" 6 "encoding/json" 7 "errors" 8 "fmt" 9 "net" 10 "os" 11 "os/exec" 12 "path/filepath" 13 "reflect" 14 "strings" 15 "time" 16 17 "github.com/checkpoint-restore/go-criu/v6" 18 criurpc "github.com/checkpoint-restore/go-criu/v6/rpc" 19 securejoin "github.com/cyphar/filepath-securejoin" 20 "github.com/sirupsen/logrus" 21 "golang.org/x/sys/unix" 22 "google.golang.org/protobuf/proto" 23 24 "github.com/opencontainers/runc/libcontainer/cgroups" 25 "github.com/opencontainers/runc/libcontainer/configs" 26 "github.com/opencontainers/runc/libcontainer/utils" 27 ) 28 29 var criuFeatures *criurpc.CriuFeatures 30 31 var ErrCriuMissingFeatures = errors.New("criu is missing features") 32 33 func (c *Container) checkCriuFeatures(criuOpts *CriuOpts, criuFeat *criurpc.CriuFeatures) error { 34 t := criurpc.CriuReqType_FEATURE_CHECK 35 36 // make sure the features we are looking for are really not from 37 // some previous check 38 criuFeatures = nil 39 40 req := &criurpc.CriuReq{ 41 Type: &t, 42 Features: criuFeat, 43 } 44 45 err := c.criuSwrk(nil, req, criuOpts, nil) 46 if err != nil { 47 return fmt.Errorf("CRIU feature check failed: %w", err) 48 } 49 50 var missingFeatures []string 51 52 // The outer if checks if the fields actually exist 53 if (criuFeat.MemTrack != nil) && 54 (criuFeatures.MemTrack != nil) { 55 // The inner if checks if they are set to true 56 if *criuFeat.MemTrack && !*criuFeatures.MemTrack { 57 missingFeatures = append(missingFeatures, "MemTrack") 58 logrus.Debugf("CRIU does not support MemTrack") 59 } 60 } 61 62 // This needs to be repeated for every new feature check. 63 // Is there a way to put this in a function. Reflection? 
64 if (criuFeat.LazyPages != nil) && 65 (criuFeatures.LazyPages != nil) { 66 if *criuFeat.LazyPages && !*criuFeatures.LazyPages { 67 missingFeatures = append(missingFeatures, "LazyPages") 68 logrus.Debugf("CRIU does not support LazyPages") 69 } 70 } 71 72 if len(missingFeatures) != 0 { 73 return fmt.Errorf("%w: %v", ErrCriuMissingFeatures, missingFeatures) 74 } 75 76 return nil 77 } 78 79 func compareCriuVersion(criuVersion int, minVersion int) error { 80 // simple function to perform the actual version compare 81 if criuVersion < minVersion { 82 return fmt.Errorf("CRIU version %d must be %d or higher", criuVersion, minVersion) 83 } 84 85 return nil 86 } 87 88 // checkCriuVersion checks CRIU version greater than or equal to minVersion. 89 func (c *Container) checkCriuVersion(minVersion int) error { 90 // If the version of criu has already been determined there is no need 91 // to ask criu for the version again. Use the value from c.criuVersion. 92 if c.criuVersion != 0 { 93 return compareCriuVersion(c.criuVersion, minVersion) 94 } 95 96 criu := criu.MakeCriu() 97 var err error 98 c.criuVersion, err = criu.GetCriuVersion() 99 if err != nil { 100 return fmt.Errorf("CRIU version check failed: %w", err) 101 } 102 103 return compareCriuVersion(c.criuVersion, minVersion) 104 } 105 106 const descriptorsFilename = "descriptors.json" 107 108 func (c *Container) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) { 109 mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs) 110 if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil { 111 mountDest = dest[len(c.config.Rootfs):] 112 } 113 extMnt := &criurpc.ExtMountMap{ 114 Key: proto.String(mountDest), 115 Val: proto.String(mountDest), 116 } 117 req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) 118 } 119 120 func (c *Container) addMaskPaths(req *criurpc.CriuReq) error { 121 for _, path := range c.config.MaskPaths { 122 fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), 
path)) 123 if err != nil { 124 if os.IsNotExist(err) { 125 continue 126 } 127 return err 128 } 129 if fi.IsDir() { 130 continue 131 } 132 133 extMnt := &criurpc.ExtMountMap{ 134 Key: proto.String(path), 135 Val: proto.String("/dev/null"), 136 } 137 req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) 138 } 139 return nil 140 } 141 142 func (c *Container) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) { 143 // CRIU will evaluate a configuration starting with release 3.11. 144 // Settings in the configuration file will overwrite RPC settings. 145 // Look for annotations. The annotation 'org.criu.config' 146 // specifies if CRIU should use a different, container specific 147 // configuration file. 148 configFile, exists := utils.SearchLabels(c.config.Labels, "org.criu.config") 149 if exists { 150 // If the annotation 'org.criu.config' exists and is set 151 // to a non-empty string, tell CRIU to use that as a 152 // configuration file. If the file does not exist, CRIU 153 // will just ignore it. 154 if configFile != "" { 155 rpcOpts.ConfigFile = proto.String(configFile) 156 } 157 // If 'org.criu.config' exists and is set to an empty 158 // string, a runc specific CRIU configuration file will 159 // be not set at all. 160 } else { 161 // If the mentioned annotation has not been found, specify 162 // a default CRIU configuration file. 163 rpcOpts.ConfigFile = proto.String("/etc/criu/runc.conf") 164 } 165 } 166 167 func (c *Container) criuSupportsExtNS(t configs.NamespaceType) bool { 168 var minVersion int 169 switch t { 170 case configs.NEWNET: 171 // CRIU supports different external namespace with different released CRIU versions. 172 // For network namespaces to work we need at least criu 3.11.0 => 31100. 173 minVersion = 31100 174 case configs.NEWPID: 175 // For PID namespaces criu 31500 is needed. 
176 minVersion = 31500 177 default: 178 return false 179 } 180 return c.checkCriuVersion(minVersion) == nil 181 } 182 183 func criuNsToKey(t configs.NamespaceType) string { 184 return "extRoot" + strings.Title(configs.NsName(t)) + "NS" //nolint:staticcheck // SA1019: strings.Title is deprecated 185 } 186 187 func (c *Container) handleCheckpointingExternalNamespaces(rpcOpts *criurpc.CriuOpts, t configs.NamespaceType) error { 188 if !c.criuSupportsExtNS(t) { 189 return nil 190 } 191 192 nsPath := c.config.Namespaces.PathOf(t) 193 if nsPath == "" { 194 return nil 195 } 196 // CRIU expects the information about an external namespace 197 // like this: --external <TYPE>[<inode>]:<key> 198 // This <key> is always 'extRoot<TYPE>NS'. 199 var ns unix.Stat_t 200 if err := unix.Stat(nsPath, &ns); err != nil { 201 return err 202 } 203 criuExternal := fmt.Sprintf("%s[%d]:%s", configs.NsName(t), ns.Ino, criuNsToKey(t)) 204 rpcOpts.External = append(rpcOpts.External, criuExternal) 205 206 return nil 207 } 208 209 func (c *Container) handleRestoringNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File) error { 210 for _, ns := range c.config.Namespaces { 211 switch ns.Type { 212 case configs.NEWNET, configs.NEWPID: 213 // If the container is running in a network or PID namespace and has 214 // a path to the network or PID namespace configured, we will dump 215 // that network or PID namespace as an external namespace and we 216 // will expect that the namespace exists during restore. 217 // This basically means that CRIU will ignore the namespace 218 // and expect it to be setup correctly. 
219 if err := c.handleRestoringExternalNamespaces(rpcOpts, extraFiles, ns.Type); err != nil { 220 return err 221 } 222 default: 223 // For all other namespaces except NET and PID CRIU has 224 // a simpler way of joining the existing namespace if set 225 nsPath := c.config.Namespaces.PathOf(ns.Type) 226 if nsPath == "" { 227 continue 228 } 229 if ns.Type == configs.NEWCGROUP { 230 // CRIU has no code to handle NEWCGROUP 231 return fmt.Errorf("Do not know how to handle namespace %v", ns.Type) 232 } 233 // CRIU has code to handle NEWTIME, but it does not seem to be defined in runc 234 235 // CRIU will issue a warning for NEWUSER: 236 // criu/namespaces.c: 'join-ns with user-namespace is not fully tested and dangerous' 237 rpcOpts.JoinNs = append(rpcOpts.JoinNs, &criurpc.JoinNamespace{ 238 Ns: proto.String(configs.NsName(ns.Type)), 239 NsFile: proto.String(nsPath), 240 }) 241 } 242 } 243 244 return nil 245 } 246 247 func (c *Container) handleRestoringExternalNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File, t configs.NamespaceType) error { 248 if !c.criuSupportsExtNS(t) { 249 return nil 250 } 251 252 nsPath := c.config.Namespaces.PathOf(t) 253 if nsPath == "" { 254 return nil 255 } 256 // CRIU wants the information about an existing namespace 257 // like this: --inherit-fd fd[<fd>]:<key> 258 // The <key> needs to be the same as during checkpointing. 259 // We are always using 'extRoot<TYPE>NS' as the key in this. 260 nsFd, err := os.Open(nsPath) 261 if err != nil { 262 logrus.Errorf("If a specific network namespace is defined it must exist: %s", err) 263 return fmt.Errorf("Requested network namespace %v does not exist", nsPath) 264 } 265 inheritFd := &criurpc.InheritFd{ 266 Key: proto.String(criuNsToKey(t)), 267 // The offset of four is necessary because 0, 1, 2 and 3 are 268 // already used by stdin, stdout, stderr, 'criu swrk' socket. 
269 Fd: proto.Int32(int32(4 + len(*extraFiles))), 270 } 271 rpcOpts.InheritFd = append(rpcOpts.InheritFd, inheritFd) 272 // All open FDs need to be transferred to CRIU via extraFiles 273 *extraFiles = append(*extraFiles, nsFd) 274 275 return nil 276 } 277 278 func (c *Container) Checkpoint(criuOpts *CriuOpts) error { 279 const logFile = "dump.log" 280 c.m.Lock() 281 defer c.m.Unlock() 282 283 // Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS(). 284 // (CLI prints a warning) 285 // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has 286 // support for doing unprivileged dumps, but the setup of 287 // rootless containers might make this complicated. 288 289 // We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0 290 if err := c.checkCriuVersion(30000); err != nil { 291 return err 292 } 293 294 if criuOpts.ImagesDirectory == "" { 295 return errors.New("invalid directory to save checkpoint") 296 } 297 298 // Since a container can be C/R'ed multiple times, 299 // the checkpoint directory may already exist. 
300 if err := os.Mkdir(criuOpts.ImagesDirectory, 0o700); err != nil && !os.IsExist(err) { 301 return err 302 } 303 304 logDir := criuOpts.ImagesDirectory 305 imageDir, err := os.Open(criuOpts.ImagesDirectory) 306 if err != nil { 307 return err 308 } 309 defer imageDir.Close() 310 311 rpcOpts := criurpc.CriuOpts{ 312 ImagesDirFd: proto.Int32(int32(imageDir.Fd())), 313 LogLevel: proto.Int32(4), 314 LogFile: proto.String(logFile), 315 Root: proto.String(c.config.Rootfs), 316 ManageCgroups: proto.Bool(true), 317 NotifyScripts: proto.Bool(true), 318 Pid: proto.Int32(int32(c.initProcess.pid())), 319 ShellJob: proto.Bool(criuOpts.ShellJob), 320 LeaveRunning: proto.Bool(criuOpts.LeaveRunning), 321 TcpEstablished: proto.Bool(criuOpts.TcpEstablished), 322 ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), 323 FileLocks: proto.Bool(criuOpts.FileLocks), 324 EmptyNs: proto.Uint32(criuOpts.EmptyNs), 325 OrphanPtsMaster: proto.Bool(true), 326 AutoDedup: proto.Bool(criuOpts.AutoDedup), 327 LazyPages: proto.Bool(criuOpts.LazyPages), 328 } 329 330 // if criuOpts.WorkDirectory is not set, criu default is used. 331 if criuOpts.WorkDirectory != "" { 332 if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) { 333 return err 334 } 335 workDir, err := os.Open(criuOpts.WorkDirectory) 336 if err != nil { 337 return err 338 } 339 defer workDir.Close() 340 rpcOpts.WorkDirFd = proto.Int32(int32(workDir.Fd())) 341 logDir = criuOpts.WorkDirectory 342 } 343 344 c.handleCriuConfigurationFile(&rpcOpts) 345 346 // If the container is running in a network namespace and has 347 // a path to the network namespace configured, we will dump 348 // that network namespace as an external namespace and we 349 // will expect that the namespace exists during restore. 350 // This basically means that CRIU will ignore the namespace 351 // and expect to be setup correctly. 
352 if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWNET); err != nil { 353 return err 354 } 355 356 // Same for possible external PID namespaces 357 if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWPID); err != nil { 358 return err 359 } 360 361 // CRIU can use cgroup freezer; when rpcOpts.FreezeCgroup 362 // is not set, CRIU uses ptrace() to pause the processes. 363 // Note cgroup v2 freezer is only supported since CRIU release 3.14. 364 if !cgroups.IsCgroup2UnifiedMode() || c.checkCriuVersion(31400) == nil { 365 if fcg := c.cgroupManager.Path("freezer"); fcg != "" { 366 rpcOpts.FreezeCgroup = proto.String(fcg) 367 } 368 } 369 370 // append optional criu opts, e.g., page-server and port 371 if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 { 372 rpcOpts.Ps = &criurpc.CriuPageServerInfo{ 373 Address: proto.String(criuOpts.PageServer.Address), 374 Port: proto.Int32(criuOpts.PageServer.Port), 375 } 376 } 377 378 // pre-dump may need parentImage param to complete iterative migration 379 if criuOpts.ParentImage != "" { 380 rpcOpts.ParentImg = proto.String(criuOpts.ParentImage) 381 rpcOpts.TrackMem = proto.Bool(true) 382 } 383 384 // append optional manage cgroups mode 385 if criuOpts.ManageCgroupsMode != 0 { 386 mode := criuOpts.ManageCgroupsMode 387 rpcOpts.ManageCgroupsMode = &mode 388 } 389 390 var t criurpc.CriuReqType 391 if criuOpts.PreDump { 392 feat := criurpc.CriuFeatures{ 393 MemTrack: proto.Bool(true), 394 } 395 396 if err := c.checkCriuFeatures(criuOpts, &feat); err != nil { 397 return err 398 } 399 400 t = criurpc.CriuReqType_PRE_DUMP 401 } else { 402 t = criurpc.CriuReqType_DUMP 403 } 404 405 if criuOpts.LazyPages { 406 // lazy migration requested; check if criu supports it 407 feat := criurpc.CriuFeatures{ 408 LazyPages: proto.Bool(true), 409 } 410 if err := c.checkCriuFeatures(criuOpts, &feat); err != nil { 411 return err 412 } 413 414 if fd := criuOpts.StatusFd; fd != -1 { 415 // check 
that the FD is valid 416 flags, err := unix.FcntlInt(uintptr(fd), unix.F_GETFL, 0) 417 if err != nil { 418 return fmt.Errorf("invalid --status-fd argument %d: %w", fd, err) 419 } 420 // and writable 421 if flags&unix.O_WRONLY == 0 { 422 return fmt.Errorf("invalid --status-fd argument %d: not writable", fd) 423 } 424 425 if c.checkCriuVersion(31500) != nil { 426 // For criu 3.15+, use notifications (see case "status-ready" 427 // in criuNotifications). Otherwise, rely on criu status fd. 428 rpcOpts.StatusFd = proto.Int32(int32(fd)) 429 } 430 } 431 } 432 433 req := &criurpc.CriuReq{ 434 Type: &t, 435 Opts: &rpcOpts, 436 } 437 438 // no need to dump all this in pre-dump 439 if !criuOpts.PreDump { 440 hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP) 441 for _, m := range c.config.Mounts { 442 switch m.Device { 443 case "bind": 444 c.addCriuDumpMount(req, m) 445 case "cgroup": 446 if cgroups.IsCgroup2UnifiedMode() || hasCgroupns { 447 // real mount(s) 448 continue 449 } 450 // a set of "external" bind mounts 451 binds, err := getCgroupMounts(m) 452 if err != nil { 453 return err 454 } 455 for _, b := range binds { 456 c.addCriuDumpMount(req, b) 457 } 458 } 459 } 460 461 if err := c.addMaskPaths(req); err != nil { 462 return err 463 } 464 465 for _, node := range c.config.Devices { 466 m := &configs.Mount{Destination: node.Path, Source: node.Path} 467 c.addCriuDumpMount(req, m) 468 } 469 470 // Write the FD info to a file in the image directory 471 fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors()) 472 if err != nil { 473 return err 474 } 475 476 err = os.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0o600) 477 if err != nil { 478 return err 479 } 480 } 481 482 err = c.criuSwrk(nil, req, criuOpts, nil) 483 if err != nil { 484 logCriuErrors(logDir, logFile) 485 return err 486 } 487 return nil 488 } 489 490 func (c *Container) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) { 491 mountDest := 
strings.TrimPrefix(m.Destination, c.config.Rootfs) 492 if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil { 493 mountDest = dest[len(c.config.Rootfs):] 494 } 495 extMnt := &criurpc.ExtMountMap{ 496 Key: proto.String(mountDest), 497 Val: proto.String(m.Source), 498 } 499 req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt) 500 } 501 502 func (c *Container) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) { 503 for _, iface := range c.config.Networks { 504 switch iface.Type { 505 case "veth": 506 veth := new(criurpc.CriuVethPair) 507 veth.IfOut = proto.String(iface.HostInterfaceName) 508 veth.IfIn = proto.String(iface.Name) 509 req.Opts.Veths = append(req.Opts.Veths, veth) 510 case "loopback": 511 // Do nothing 512 } 513 } 514 for _, i := range criuOpts.VethPairs { 515 veth := new(criurpc.CriuVethPair) 516 veth.IfOut = proto.String(i.HostInterfaceName) 517 veth.IfIn = proto.String(i.ContainerInterfaceName) 518 req.Opts.Veths = append(req.Opts.Veths, veth) 519 } 520 } 521 522 // makeCriuRestoreMountpoints makes the actual mountpoints for the 523 // restore using CRIU. This function is inspired from the code in 524 // rootfs_linux.go. 525 func (c *Container) makeCriuRestoreMountpoints(m *configs.Mount) error { 526 me := mountEntry{Mount: m} 527 dest, err := securejoin.SecureJoin(c.config.Rootfs, m.Destination) 528 if err != nil { 529 return err 530 } 531 // TODO: pass srcFD? Not sure if criu is impacted by issue #2484. 
532 if err := checkProcMount(c.config.Rootfs, dest, me); err != nil { 533 return err 534 } 535 switch m.Device { 536 case "cgroup": 537 // No mount point(s) need to be created: 538 // 539 // * for v1, mount points are saved by CRIU because 540 // /sys/fs/cgroup is a tmpfs mount 541 // 542 // * for v2, /sys/fs/cgroup is a real mount, but 543 // the mountpoint appears as soon as /sys is mounted 544 return nil 545 case "bind": 546 // For bind-mounts (unlike other filesystem types), we need to check if 547 // the source exists. 548 fi, _, err := me.srcStat() 549 if err != nil { 550 // error out if the source of a bind mount does not exist as we 551 // will be unable to bind anything to it. 552 return err 553 } 554 if err := createIfNotExists(dest, fi.IsDir()); err != nil { 555 return err 556 } 557 default: 558 // for all other filesystems just create the mountpoints 559 if err := os.MkdirAll(dest, 0o755); err != nil { 560 return err 561 } 562 } 563 return nil 564 } 565 566 // isPathInPrefixList is a small function for CRIU restore to make sure 567 // mountpoints, which are on a tmpfs, are not created in the roofs. 568 func isPathInPrefixList(path string, prefix []string) bool { 569 for _, p := range prefix { 570 if strings.HasPrefix(path, p+"/") { 571 return true 572 } 573 } 574 return false 575 } 576 577 // prepareCriuRestoreMounts tries to set up the rootfs of the 578 // container to be restored in the same way runc does it for 579 // initial container creation. Even for a read-only rootfs container 580 // runc modifies the rootfs to add mountpoints which do not exist. 581 // This function also creates missing mountpoints as long as they 582 // are not on top of a tmpfs, as CRIU will restore tmpfs content anyway. 
583 func (c *Container) prepareCriuRestoreMounts(mounts []*configs.Mount) error { 584 // First get a list of a all tmpfs mounts 585 tmpfs := []string{} 586 for _, m := range mounts { 587 switch m.Device { 588 case "tmpfs": 589 tmpfs = append(tmpfs, m.Destination) 590 } 591 } 592 // Now go through all mounts and create the mountpoints 593 // if the mountpoints are not on a tmpfs, as CRIU will 594 // restore the complete tmpfs content from its checkpoint. 595 umounts := []string{} 596 defer func() { 597 for _, u := range umounts { 598 _ = utils.WithProcfd(c.config.Rootfs, u, func(procfd string) error { 599 if e := unix.Unmount(procfd, unix.MNT_DETACH); e != nil { 600 if e != unix.EINVAL { 601 // Ignore EINVAL as it means 'target is not a mount point.' 602 // It probably has already been unmounted. 603 logrus.Warnf("Error during cleanup unmounting of %s (%s): %v", procfd, u, e) 604 } 605 } 606 return nil 607 }) 608 } 609 }() 610 for _, m := range mounts { 611 if !isPathInPrefixList(m.Destination, tmpfs) { 612 if err := c.makeCriuRestoreMountpoints(m); err != nil { 613 return err 614 } 615 // If the mount point is a bind mount, we need to mount 616 // it now so that runc can create the necessary mount 617 // points for mounts in bind mounts. 618 // This also happens during initial container creation. 619 // Without this CRIU restore will fail 620 // See: https://github.com/opencontainers/runc/issues/2748 621 // It is also not necessary to order the mount points 622 // because during initial container creation mounts are 623 // set up in the order they are configured. 
624 if m.Device == "bind" { 625 if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFd string) error { 626 return mountViaFds(m.Source, nil, m.Destination, dstFd, "", unix.MS_BIND|unix.MS_REC, "") 627 }); err != nil { 628 return err 629 } 630 umounts = append(umounts, m.Destination) 631 } 632 } 633 } 634 return nil 635 } 636 637 // Restore restores the checkpointed container to a running state using the 638 // criu(8) utility. 639 func (c *Container) Restore(process *Process, criuOpts *CriuOpts) error { 640 const logFile = "restore.log" 641 c.m.Lock() 642 defer c.m.Unlock() 643 644 var extraFiles []*os.File 645 646 // Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS(). 647 // (CLI prints a warning) 648 // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have 649 // support for unprivileged restore at the moment. 650 651 // We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0 652 if err := c.checkCriuVersion(30000); err != nil { 653 return err 654 } 655 if criuOpts.ImagesDirectory == "" { 656 return errors.New("invalid directory to restore checkpoint") 657 } 658 logDir := criuOpts.ImagesDirectory 659 imageDir, err := os.Open(criuOpts.ImagesDirectory) 660 if err != nil { 661 return err 662 } 663 defer imageDir.Close() 664 // CRIU has a few requirements for a root directory: 665 // * it must be a mount point 666 // * its parent must not be overmounted 667 // c.config.Rootfs is bind-mounted to a temporary directory 668 // to satisfy these requirements. 
669 root := filepath.Join(c.stateDir, "criu-root") 670 if err := os.Mkdir(root, 0o755); err != nil { 671 return err 672 } 673 defer os.Remove(root) 674 root, err = filepath.EvalSymlinks(root) 675 if err != nil { 676 return err 677 } 678 err = mount(c.config.Rootfs, root, "", unix.MS_BIND|unix.MS_REC, "") 679 if err != nil { 680 return err 681 } 682 defer unix.Unmount(root, unix.MNT_DETACH) //nolint: errcheck 683 t := criurpc.CriuReqType_RESTORE 684 req := &criurpc.CriuReq{ 685 Type: &t, 686 Opts: &criurpc.CriuOpts{ 687 ImagesDirFd: proto.Int32(int32(imageDir.Fd())), 688 EvasiveDevices: proto.Bool(true), 689 LogLevel: proto.Int32(4), 690 LogFile: proto.String(logFile), 691 RstSibling: proto.Bool(true), 692 Root: proto.String(root), 693 ManageCgroups: proto.Bool(true), 694 NotifyScripts: proto.Bool(true), 695 ShellJob: proto.Bool(criuOpts.ShellJob), 696 ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), 697 TcpEstablished: proto.Bool(criuOpts.TcpEstablished), 698 FileLocks: proto.Bool(criuOpts.FileLocks), 699 EmptyNs: proto.Uint32(criuOpts.EmptyNs), 700 OrphanPtsMaster: proto.Bool(true), 701 AutoDedup: proto.Bool(criuOpts.AutoDedup), 702 LazyPages: proto.Bool(criuOpts.LazyPages), 703 }, 704 } 705 706 if criuOpts.LsmProfile != "" { 707 // CRIU older than 3.16 has a bug which breaks the possibility 708 // to set a different LSM profile. 709 if err := c.checkCriuVersion(31600); err != nil { 710 return errors.New("--lsm-profile requires at least CRIU 3.16") 711 } 712 req.Opts.LsmProfile = proto.String(criuOpts.LsmProfile) 713 } 714 if criuOpts.LsmMountContext != "" { 715 if err := c.checkCriuVersion(31600); err != nil { 716 return errors.New("--lsm-mount-context requires at least CRIU 3.16") 717 } 718 req.Opts.LsmMountContext = proto.String(criuOpts.LsmMountContext) 719 } 720 721 if criuOpts.WorkDirectory != "" { 722 // Since a container can be C/R'ed multiple times, 723 // the work directory may already exist. 
724 if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) { 725 return err 726 } 727 workDir, err := os.Open(criuOpts.WorkDirectory) 728 if err != nil { 729 return err 730 } 731 defer workDir.Close() 732 req.Opts.WorkDirFd = proto.Int32(int32(workDir.Fd())) 733 logDir = criuOpts.WorkDirectory 734 } 735 c.handleCriuConfigurationFile(req.Opts) 736 737 if err := c.handleRestoringNamespaces(req.Opts, &extraFiles); err != nil { 738 return err 739 } 740 741 // This will modify the rootfs of the container in the same way runc 742 // modifies the container during initial creation. 743 if err := c.prepareCriuRestoreMounts(c.config.Mounts); err != nil { 744 return err 745 } 746 747 hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP) 748 for _, m := range c.config.Mounts { 749 switch m.Device { 750 case "bind": 751 c.addCriuRestoreMount(req, m) 752 case "cgroup": 753 if cgroups.IsCgroup2UnifiedMode() || hasCgroupns { 754 continue 755 } 756 // cgroup v1 is a set of bind mounts, unless cgroupns is used 757 binds, err := getCgroupMounts(m) 758 if err != nil { 759 return err 760 } 761 for _, b := range binds { 762 c.addCriuRestoreMount(req, b) 763 } 764 } 765 } 766 767 if len(c.config.MaskPaths) > 0 { 768 m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"} 769 c.addCriuRestoreMount(req, m) 770 } 771 772 for _, node := range c.config.Devices { 773 m := &configs.Mount{Destination: node.Path, Source: node.Path} 774 c.addCriuRestoreMount(req, m) 775 } 776 777 if criuOpts.EmptyNs&unix.CLONE_NEWNET == 0 { 778 c.restoreNetwork(req, criuOpts) 779 } 780 781 // append optional manage cgroups mode 782 if criuOpts.ManageCgroupsMode != 0 { 783 mode := criuOpts.ManageCgroupsMode 784 req.Opts.ManageCgroupsMode = &mode 785 } 786 787 var ( 788 fds []string 789 fdJSON []byte 790 ) 791 if fdJSON, err = os.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil { 792 return err 793 } 794 795 if err := 
json.Unmarshal(fdJSON, &fds); err != nil { 796 return err 797 } 798 for i := range fds { 799 if s := fds[i]; strings.Contains(s, "pipe:") { 800 inheritFd := new(criurpc.InheritFd) 801 inheritFd.Key = proto.String(s) 802 inheritFd.Fd = proto.Int32(int32(i)) 803 req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) 804 } 805 } 806 err = c.criuSwrk(process, req, criuOpts, extraFiles) 807 if err != nil { 808 logCriuErrors(logDir, logFile) 809 } 810 811 // Now that CRIU is done let's close all opened FDs CRIU needed. 812 for _, fd := range extraFiles { 813 fd.Close() 814 } 815 816 return err 817 } 818 819 // logCriuErrors tries to find and log errors from a criu log file. 820 // The output is similar to what "grep -n -B5 Error" does. 821 func logCriuErrors(dir, file string) { 822 lookFor := []byte("Error") // Print the line that contains this... 823 const max = 5 + 1 // ... and a few preceding lines. 824 825 logFile := filepath.Join(dir, file) 826 f, err := os.Open(logFile) 827 if err != nil { 828 logrus.Warn(err) 829 return 830 } 831 defer f.Close() 832 833 var lines [max][]byte 834 var idx, lineNo, printedLineNo int 835 s := bufio.NewScanner(f) 836 for s.Scan() { 837 lineNo++ 838 lines[idx] = s.Bytes() 839 idx = (idx + 1) % max 840 if !bytes.Contains(s.Bytes(), lookFor) { 841 continue 842 } 843 // Found an error. 844 if printedLineNo == 0 { 845 logrus.Warnf("--- Quoting %q", logFile) 846 } else if lineNo-max > printedLineNo { 847 // Mark the gap. 848 logrus.Warn("...") 849 } 850 // Print the last lines. 851 for add := 0; add < max; add++ { 852 i := (idx + add) % max 853 s := lines[i] 854 actLineNo := lineNo + add - max + 1 855 if len(s) > 0 && actLineNo > printedLineNo { 856 logrus.Warnf("%d:%s", actLineNo, s) 857 printedLineNo = actLineNo 858 } 859 } 860 } 861 if printedLineNo != 0 { 862 logrus.Warn("---") // End of "Quoting ...". 
863 } 864 if err := s.Err(); err != nil { 865 logrus.Warnf("read %q: %v", logFile, err) 866 } 867 } 868 869 func (c *Container) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { 870 // need to apply cgroups only on restore 871 if req.GetType() != criurpc.CriuReqType_RESTORE { 872 return nil 873 } 874 875 // XXX: Do we need to deal with this case? AFAIK criu still requires root. 876 if err := c.cgroupManager.Apply(pid); err != nil { 877 return err 878 } 879 880 if err := c.cgroupManager.Set(c.config.Cgroups.Resources); err != nil { 881 return err 882 } 883 884 // TODO(@kolyshkin): should we use c.cgroupManager.GetPaths() 885 // instead of reading /proc/pid/cgroup? 886 path := fmt.Sprintf("/proc/%d/cgroup", pid) 887 cgroupsPaths, err := cgroups.ParseCgroupFile(path) 888 if err != nil { 889 return err 890 } 891 892 for c, p := range cgroupsPaths { 893 cgroupRoot := &criurpc.CgroupRoot{ 894 Ctrl: proto.String(c), 895 Path: proto.String(p), 896 } 897 req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot) 898 } 899 900 return nil 901 } 902 903 func (c *Container) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, extraFiles []*os.File) error { 904 fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) 905 if err != nil { 906 return err 907 } 908 909 criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client") 910 criuClientFileCon, err := net.FileConn(criuClient) 911 criuClient.Close() 912 if err != nil { 913 return err 914 } 915 916 criuClientCon := criuClientFileCon.(*net.UnixConn) 917 defer criuClientCon.Close() 918 919 criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server") 920 defer criuServer.Close() 921 922 if c.criuVersion != 0 { 923 // If the CRIU Version is still '0' then this is probably 924 // the initial CRIU run to detect the version. Skip it. 
925 logrus.Debugf("Using CRIU %d", c.criuVersion) 926 } 927 cmd := exec.Command("criu", "swrk", "3") 928 if process != nil { 929 cmd.Stdin = process.Stdin 930 cmd.Stdout = process.Stdout 931 cmd.Stderr = process.Stderr 932 } 933 cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer) 934 if extraFiles != nil { 935 cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...) 936 } 937 938 if err := cmd.Start(); err != nil { 939 return err 940 } 941 // we close criuServer so that even if CRIU crashes or unexpectedly exits, runc will not hang. 942 criuServer.Close() 943 // cmd.Process will be replaced by a restored init. 944 criuProcess := cmd.Process 945 946 var criuProcessState *os.ProcessState 947 defer func() { 948 if criuProcessState == nil { 949 criuClientCon.Close() 950 _, err := criuProcess.Wait() 951 if err != nil { 952 logrus.Warnf("wait on criuProcess returned %v", err) 953 } 954 } 955 }() 956 957 if err := c.criuApplyCgroups(criuProcess.Pid, req); err != nil { 958 return err 959 } 960 961 var extFds []string 962 if process != nil { 963 extFds, err = getPipeFds(criuProcess.Pid) 964 if err != nil { 965 return err 966 } 967 } 968 969 logrus.Debugf("Using CRIU in %s mode", req.GetType().String()) 970 // In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts() 971 // should be empty. For older CRIU versions it still will be 972 // available but empty. criurpc.CriuReqType_VERSION actually 973 // has no req.GetOpts(). 
974 if logrus.GetLevel() >= logrus.DebugLevel && 975 !(req.GetType() == criurpc.CriuReqType_FEATURE_CHECK || 976 req.GetType() == criurpc.CriuReqType_VERSION) { 977 978 val := reflect.ValueOf(req.GetOpts()) 979 v := reflect.Indirect(val) 980 for i := 0; i < v.NumField(); i++ { 981 st := v.Type() 982 name := st.Field(i).Name 983 if 'A' <= name[0] && name[0] <= 'Z' { 984 value := val.MethodByName("Get" + name).Call([]reflect.Value{}) 985 logrus.Debugf("CRIU option %s with value %v", name, value[0]) 986 } 987 } 988 } 989 data, err := proto.Marshal(req) 990 if err != nil { 991 return err 992 } 993 _, err = criuClientCon.Write(data) 994 if err != nil { 995 return err 996 } 997 998 buf := make([]byte, 10*4096) 999 oob := make([]byte, 4096) 1000 for { 1001 n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob) 1002 if req.Opts != nil && req.Opts.StatusFd != nil { 1003 // Close status_fd as soon as we got something back from criu, 1004 // assuming it has consumed (reopened) it by this time. 1005 // Otherwise it will might be left open forever and whoever 1006 // is waiting on it will wait forever. 
1007 fd := int(*req.Opts.StatusFd) 1008 _ = unix.Close(fd) 1009 req.Opts.StatusFd = nil 1010 } 1011 if err != nil { 1012 return err 1013 } 1014 if n == 0 { 1015 return errors.New("unexpected EOF") 1016 } 1017 if n == len(buf) { 1018 return errors.New("buffer is too small") 1019 } 1020 1021 resp := new(criurpc.CriuResp) 1022 err = proto.Unmarshal(buf[:n], resp) 1023 if err != nil { 1024 return err 1025 } 1026 t := resp.GetType() 1027 if !resp.GetSuccess() { 1028 return fmt.Errorf("criu failed: type %s errno %d", t, resp.GetCrErrno()) 1029 } 1030 1031 switch t { 1032 case criurpc.CriuReqType_FEATURE_CHECK: 1033 logrus.Debugf("Feature check says: %s", resp) 1034 criuFeatures = resp.GetFeatures() 1035 case criurpc.CriuReqType_NOTIFY: 1036 if err := c.criuNotifications(resp, process, cmd, opts, extFds, oob[:oobn]); err != nil { 1037 return err 1038 } 1039 req = &criurpc.CriuReq{ 1040 Type: &t, 1041 NotifySuccess: proto.Bool(true), 1042 } 1043 data, err = proto.Marshal(req) 1044 if err != nil { 1045 return err 1046 } 1047 _, err = criuClientCon.Write(data) 1048 if err != nil { 1049 return err 1050 } 1051 continue 1052 case criurpc.CriuReqType_RESTORE: 1053 case criurpc.CriuReqType_DUMP: 1054 case criurpc.CriuReqType_PRE_DUMP: 1055 default: 1056 return fmt.Errorf("unable to parse the response %s", resp.String()) 1057 } 1058 1059 break 1060 } 1061 1062 _ = criuClientCon.CloseWrite() 1063 // cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors. 1064 // Here we want to wait only the CRIU process. 1065 criuProcessState, err = criuProcess.Wait() 1066 if err != nil { 1067 return err 1068 } 1069 1070 // In pre-dump mode CRIU is in a loop and waits for 1071 // the final DUMP command. 
1072 // The current runc pre-dump approach, however, is 1073 // start criu in PRE_DUMP once for a single pre-dump 1074 // and not the whole series of pre-dump, pre-dump, ...m, dump 1075 // If we got the message CriuReqType_PRE_DUMP it means 1076 // CRIU was successful and we need to forcefully stop CRIU 1077 if !criuProcessState.Success() && *req.Type != criurpc.CriuReqType_PRE_DUMP { 1078 return fmt.Errorf("criu failed: %s", criuProcessState) 1079 } 1080 return nil 1081 }

// lockNetwork blocks any external network activity.
//
// For every entry in config.Networks it looks up the strategy matching the
// network's Type and calls its detach method. Iteration stops at the first
// error, which is returned unchanged.
func lockNetwork(config *configs.Config) error {
	for _, network := range config.Networks {
		strategy, err := getStrategy(network.Type)
		if err != nil {
			return err
		}
		if err := strategy.detach(network); err != nil {
			return err
		}
	}
	return nil
}

// unlockNetwork is the counterpart of lockNetwork: for every entry in
// config.Networks it looks up the strategy matching the network's Type and
// calls its attach method. Iteration stops at the first error, which is
// returned unchanged.
func unlockNetwork(config *configs.Config) error {
	for _, network := range config.Networks {
		strategy, err := getStrategy(network.Type)
		if err != nil {
			return err
		}
		if err := strategy.attach(network); err != nil {
			return err
		}
	}
	return nil
}

// criuNotifications handles one NOTIFY response received from CRIU and
// dispatches on the name of the notify script it carries.
//
// Parameters:
//   - resp: the CRIU response; it must carry a notify payload, otherwise an
//     "invalid response" error is returned.
//   - process: on "post-restore" its ops field is set to the restored process;
//     on "orphan-pts-master" its ConsoleSocket receives the pts master.
//   - cmd: on "post-restore" its Process field is replaced with the process
//     identified by the pid reported in the notification.
//   - opts: supplies ImagesDirectory ("post-restore") and StatusFd
//     ("status-ready"); StatusFd is closed and reset to -1 once used.
//   - fds: passed through to newRestoredProcess on "post-restore".
//   - oob: out-of-band data read together with the response; parsed for a
//     file descriptor on "orphan-pts-master".
//
// Script names that are not listed in the switch are ignored and nil is
// returned.
func (c *Container) criuNotifications(resp *criurpc.CriuResp, process *Process, cmd *exec.Cmd, opts *CriuOpts, fds []string, oob []byte) error {
	notify := resp.GetNotify()
	if notify == nil {
		return fmt.Errorf("invalid response: %s", resp.String())
	}
	script := notify.GetScript()
	logrus.Debugf("notify: %s\n", script)
	switch script {
	case "post-dump":
		// Leave an empty "checkpoint" marker file in the state directory.
		f, err := os.Create(filepath.Join(c.stateDir, "checkpoint"))
		if err != nil {
			return err
		}
		f.Close()
	case "network-unlock":
		if err := unlockNetwork(c.config); err != nil {
			return err
		}
	case "network-lock":
		if err := lockNetwork(c.config); err != nil {
			return err
		}
	case "setup-namespaces":
		if c.config.Hooks != nil {
			s, err := c.currentOCIState()
			if err != nil {
				// Propagate the failure instead of returning nil, which
				// would silently skip the hooks below.
				return err
			}
			// The hooks are run against the pid reported by CRIU.
			s.Pid = int(notify.GetPid())

			if err := c.config.Hooks.Run(configs.Prestart, s); err != nil {
				return err
			}
			if err := c.config.Hooks.Run(configs.CreateRuntime, s); err != nil {
				return err
			}
		}
	case "post-restore":
		pid := notify.GetPid()

		p, err := os.FindProcess(int(pid))
		if err != nil {
			return err
		}
		// From here on cmd refers to the process CRIU reported, not to
		// the process cmd originally started.
		cmd.Process = p

		r, err := newRestoredProcess(cmd, fds)
		if err != nil {
			return err
		}
		process.ops = r
		if err := c.state.transition(&restoredState{
			imageDir: opts.ImagesDirectory,
			c:        c,
		}); err != nil {
			return err
		}
		// create a timestamp indicating when the restored checkpoint was started
		c.created = time.Now().UTC()
		if _, err := c.updateState(r); err != nil {
			return err
		}
		// Removing the "checkpoint" marker is best effort: a missing file
		// is fine, any other failure is only logged.
		if err := os.Remove(filepath.Join(c.stateDir, "checkpoint")); err != nil {
			if !os.IsNotExist(err) {
				logrus.Error(err)
			}
		}
	case "orphan-pts-master":
		scm, err := unix.ParseSocketControlMessage(oob)
		if err != nil {
			return err
		}
		// Guard against empty out-of-band data: indexing scm[0] would
		// otherwise panic.
		if len(scm) == 0 {
			return errors.New("orphan-pts-master notification carries no control message")
		}
		ptsFds, err := unix.ParseUnixRights(&scm[0])
		if err != nil {
			return err
		}
		// Same for a control message without any file descriptor in it.
		if len(ptsFds) == 0 {
			return errors.New("orphan-pts-master notification carries no file descriptor")
		}

		master := os.NewFile(uintptr(ptsFds[0]), "orphan-pts-master")
		defer master.Close()

		// While we can access console.master, using the API is a good idea.
		if err := utils.SendFile(process.ConsoleSocket, master); err != nil {
			return err
		}
	case "status-ready":
		if opts.StatusFd != -1 {
			// write \0 to status fd to notify that lazy page server is ready
			_, err := unix.Write(opts.StatusFd, []byte{0})
			if err != nil {
				logrus.Warnf("can't write \\0 to status fd: %v", err)
			}
			// Close the descriptor and mark it consumed so that it is
			// not written to or closed a second time.
			_ = unix.Close(opts.StatusFd)
			opts.StatusFd = -1
		}
	}
	return nil
}