github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/runsc/container/container.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package container creates and manipulates containers.
package container

import (
	"context"
	"errors"
	"fmt"
	"io/ioutil"
	"os"
	"os/exec"
	"regexp"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/cenkalti/backoff"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"golang.org/x/sys/unix"
	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/cleanup"
	"github.com/SagerNet/gvisor/pkg/log"
	"github.com/SagerNet/gvisor/pkg/sentry/control"
	"github.com/SagerNet/gvisor/pkg/sentry/sighandling"
	"github.com/SagerNet/gvisor/runsc/boot"
	"github.com/SagerNet/gvisor/runsc/cgroup"
	"github.com/SagerNet/gvisor/runsc/config"
	"github.com/SagerNet/gvisor/runsc/console"
	"github.com/SagerNet/gvisor/runsc/sandbox"
	"github.com/SagerNet/gvisor/runsc/specutils"
)

// validateID validates the container id.
func validateID(id string) error {
	// See libcontainer/factory_linux.go.
	idRegex := regexp.MustCompile(`^[\w+-\.]+$`)
	if !idRegex.MatchString(id) {
		return fmt.Errorf("invalid container id: %v", id)
	}
	return nil
}

// Container represents a containerized application. When running, the
// container is associated with a single Sandbox.
//
// Container metadata can be saved and loaded to disk. Within a root directory,
// we maintain subdirectories for each container named with the container id.
// The container metadata is stored as a json within the container directory
// in a file named "meta.json". This metadata format is defined by us and is
// not part of the OCI spec.
//
// Containers must write their metadata files after any change to their internal
// states. The entire container directory is deleted when the container is
// destroyed.
//
// When the container is stopped, all processes that belong to the container
// must be stopped before Destroy() returns. containerd makes roughly the
// following calls to stop a container:
//   - First it attempts to kill the container process with
//     'runsc kill SIGTERM'. After some time, it escalates to SIGKILL. In a
//     separate thread, it's waiting on the container. As soon as the wait
//     returns, it moves on to the next step:
//   - It calls 'runsc kill --all SIGKILL' to stop every process that belongs to
//     the container. 'kill --all SIGKILL' waits for all processes before
//     returning.
//   - Containerd waits for stdin, stdout and stderr to drain and be closed.
//   - It calls 'runsc delete'. The runc implementation kills --all SIGKILL once
//     again just to be sure, waits, and then proceeds with the remaining teardown.
//
// Container is thread-unsafe.
type Container struct {
	// ID is the container ID.
	ID string `json:"id"`

	// Spec is the OCI runtime spec that configures this container.
	Spec *specs.Spec `json:"spec"`

	// BundleDir is the directory containing the container bundle.
	BundleDir string `json:"bundleDir"`

	// CreatedAt is the time the container was created.
	CreatedAt time.Time `json:"createdAt"`

	// Owner is the container owner.
	Owner string `json:"owner"`

	// ConsoleSocket is the path to a unix domain socket that will receive
	// the console FD.
	ConsoleSocket string `json:"consoleSocket"`

	// Status is the current container Status.
	Status Status `json:"status"`

	// GoferPid is the PID of the gofer running alongside the sandbox. May
	// be 0 if the gofer has been killed.
	GoferPid int `json:"goferPid"`

	// Sandbox is the sandbox this container is running in. It's set when the
	// container is created and reset when the sandbox is destroyed.
	Sandbox *sandbox.Sandbox `json:"sandbox"`

	// Saver handles load from/save to the state file safely from multiple
	// processes.
	Saver StateFile `json:"saver"`

	//
	// Fields below this line are not saved in the state file and will not
	// be preserved across commands.
	//

	// goferIsChild is set if a gofer process is a child of the current process.
	//
	// This field isn't saved to json, because only the creator of a gofer
	// process will have it as a child process.
	goferIsChild bool
}

// Args is used to configure a new container.
type Args struct {
	// ID is the container unique identifier.
	ID string

	// Spec is the OCI spec that describes the container.
	Spec *specs.Spec

	// BundleDir is the directory containing the container bundle.
	BundleDir string

	// ConsoleSocket is the path to a unix domain socket that will receive
	// the console FD. It may be empty.
	ConsoleSocket string

	// PIDFile is the filename where the container's root process PID will be
	// written to. It may be empty.
	PIDFile string

	// UserLog is the filename to send user-visible logs to. It may be empty.
	//
	// It only applies for the init container.
	UserLog string

	// Attached indicates that the sandbox lifecycle is attached to the caller.
	// If the caller exits, the sandbox should exit too.
	//
	// It only applies for the init container.
	Attached bool
}

// New creates the container in a new Sandbox process, unless the metadata
// indicates that an existing Sandbox should be used. The caller must call
// Destroy() on the container.
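//
// A minimal caller sketch (illustrative only):
//
//	c, err := New(conf, args)
//	if err != nil {
//		return err
//	}
//	defer func() { _ = c.Destroy() }()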
func New(conf *config.Config, args Args) (*Container, error) {
	log.Debugf("Create container, cid: %s, rootDir: %q", args.ID, conf.RootDir)
	if err := validateID(args.ID); err != nil {
		return nil, err
	}

	if err := os.MkdirAll(conf.RootDir, 0711); err != nil {
		return nil, fmt.Errorf("creating container root directory %q: %v", conf.RootDir, err)
	}

	sandboxID := args.ID
	if !isRoot(args.Spec) {
		var ok bool
		sandboxID, ok = specutils.SandboxID(args.Spec)
		if !ok {
			return nil, fmt.Errorf("no sandbox ID found when creating container")
		}
	}

	c := &Container{
		ID:            args.ID,
		Spec:          args.Spec,
		ConsoleSocket: args.ConsoleSocket,
		BundleDir:     args.BundleDir,
		Status:        Creating,
		CreatedAt:     time.Now(),
		Owner:         os.Getenv("USER"),
		Saver: StateFile{
			RootDir: conf.RootDir,
			ID: FullID{
				SandboxID:   sandboxID,
				ContainerID: args.ID,
			},
		},
	}
	// The Cleanup object cleans up partially created containers when an error
	// occurs. Any errors occurring during cleanup itself are ignored.
	cu := cleanup.Make(func() { _ = c.Destroy() })
	defer cu.Clean()

	// Lock the container metadata file to prevent concurrent creations of
	// containers with the same id.
	if err := c.Saver.lockForNew(); err != nil {
		return nil, err
	}
	defer c.Saver.unlock()

	// If the metadata annotations indicate that this container should be started
	// in an existing sandbox, we must do so. These are the possible metadata
	// annotation states:
	//   1. No annotations: it means that there is a single container and this
	//      container is obviously the root. Both container and sandbox share the
	//      ID.
	//   2. Container type == sandbox: it means this is the root container
	//      starting the sandbox. Both container and sandbox share the same ID.
	//   3. Container type == container: it means this is a subcontainer of an
	//      already started sandbox. In this case, the container ID is different
	//      than the sandbox ID.
	if isRoot(args.Spec) {
		log.Debugf("Creating new sandbox for container, cid: %s", args.ID)

		if args.Spec.Linux == nil {
			args.Spec.Linux = &specs.Linux{}
		}
		// Don't force the use of cgroups in tests because they lack permission to do so.
		if args.Spec.Linux.CgroupsPath == "" && !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
			args.Spec.Linux.CgroupsPath = "/" + args.ID
		}
		// Create and join the cgroup before processes are created to ensure they
		// are part of the cgroup from the start (and all their child processes).
		cg, err := cgroup.NewFromSpec(args.Spec)
		if err != nil {
			return nil, err
		}
		if cg != nil {
			// TODO(github.com/SagerNet/issue/3481): Remove when cgroups v2 is supported.
			if !conf.Rootless && cgroup.IsOnlyV2() {
				return nil, fmt.Errorf("cgroups V2 is not yet supported. Enable cgroups V1 and retry")
			}
			// If there is cgroup config, install it before creating the sandbox process.
			if err := cg.Install(args.Spec.Linux.Resources); err != nil {
				switch {
				case errors.Is(err, unix.EACCES) && conf.Rootless:
					log.Warningf("Skipping cgroup configuration in rootless mode: %v", err)
					cg = nil
				default:
					return nil, fmt.Errorf("configuring cgroup: %v", err)
				}
			}
		}
		if err := runInCgroup(cg, func() error {
			ioFiles, specFile, err := c.createGoferProcess(args.Spec, conf, args.BundleDir, args.Attached)
			if err != nil {
				return err
			}

			// Start a new sandbox for this container. Any errors after this point
			// must destroy the container.
			sandArgs := &sandbox.Args{
				ID:            sandboxID,
				Spec:          args.Spec,
				BundleDir:     args.BundleDir,
				ConsoleSocket: args.ConsoleSocket,
				UserLog:       args.UserLog,
				IOFiles:       ioFiles,
				MountsFile:    specFile,
				Cgroup:        cg,
				Attached:      args.Attached,
			}
			sand, err := sandbox.New(conf, sandArgs)
			if err != nil {
				return err
			}
			c.Sandbox = sand
			return nil

		}); err != nil {
			return nil, err
		}
	} else {
		log.Debugf("Creating new container, cid: %s, sandbox: %s", c.ID, sandboxID)

		// Find the sandbox associated with this ID.
		fullID := FullID{
			SandboxID:   sandboxID,
			ContainerID: sandboxID,
		}
		sb, err := Load(conf.RootDir, fullID, LoadOpts{Exact: true})
		if err != nil {
			return nil, err
		}
		c.Sandbox = sb.Sandbox

		// If the console control socket file is provided, then create a new
		// pty master/slave pair and send the TTY to the sandbox process.
		var tty *os.File
		if c.ConsoleSocket != "" {
			// Create a new TTY pair and send the master on the provided socket.
			var err error
			tty, err = console.NewWithSocket(c.ConsoleSocket)
			if err != nil {
				return nil, fmt.Errorf("setting up console with socket %q: %w", c.ConsoleSocket, err)
			}
			// The tty file is transferred to the sandbox, so it can be closed here.
			defer tty.Close()
		}

		if err := c.Sandbox.CreateContainer(c.ID, tty); err != nil {
			return nil, err
		}
	}
	c.changeStatus(Created)

	// Save the metadata file.
	if err := c.saveLocked(); err != nil {
		return nil, err
	}

	// Write the PID file. Containerd considers the create complete after
	// this file is created, so it must be the last thing we do.
	if args.PIDFile != "" {
		if err := ioutil.WriteFile(args.PIDFile, []byte(strconv.Itoa(c.SandboxPid())), 0644); err != nil {
			return nil, fmt.Errorf("error writing PID file: %v", err)
		}
	}

	cu.Release()
	return c, nil
}

// Start starts running the containerized process inside the sandbox.
func (c *Container) Start(conf *config.Config) error {
	log.Debugf("Start container, cid: %s", c.ID)

	if err := c.Saver.lock(); err != nil {
		return err
	}
	unlock := cleanup.Make(func() { c.Saver.unlock() })
	defer unlock.Clean()

	if err := c.requireStatus("start", Created); err != nil {
		return err
	}

	// "If any prestart hook fails, the runtime MUST generate an error,
	// stop and destroy the container" -OCI spec.
	if c.Spec.Hooks != nil {
		if err := executeHooks(c.Spec.Hooks.Prestart, c.State()); err != nil {
			return err
		}
	}

	if isRoot(c.Spec) {
		if err := c.Sandbox.StartRoot(c.Spec, conf); err != nil {
			return err
		}
	} else {
		// Join the cgroup to start the gofer process, to ensure it's part of the
		// cgroup from the start (and all its child processes).
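		// Note that for subcontainers the gofer is created only here, at Start
		// time; for the root container it was already created in New, together
		// with the sandbox process.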
		if err := runInCgroup(c.Sandbox.Cgroup, func() error {
			// Create the gofer process.
			goferFiles, mountsFile, err := c.createGoferProcess(c.Spec, conf, c.BundleDir, false)
			if err != nil {
				return err
			}
			defer func() {
				_ = mountsFile.Close()
				for _, f := range goferFiles {
					_ = f.Close()
				}
			}()

			cleanMounts, err := specutils.ReadMounts(mountsFile)
			if err != nil {
				return fmt.Errorf("reading mounts file: %v", err)
			}
			c.Spec.Mounts = cleanMounts

			// Set up stdios if the container is not using a terminal. Otherwise the
			// TTY was already set up in create.
			var stdios []*os.File
			if !c.Spec.Process.Terminal {
				stdios = []*os.File{os.Stdin, os.Stdout, os.Stderr}
			}

			return c.Sandbox.StartContainer(c.Spec, conf, c.ID, stdios, goferFiles)
		}); err != nil {
			return err
		}
	}

	// "If any poststart hook fails, the runtime MUST log a warning, but
	// the remaining hooks and lifecycle continue as if the hook had
	// succeeded" -OCI spec.
	if c.Spec.Hooks != nil {
		executeHooksBestEffort(c.Spec.Hooks.Poststart, c.State())
	}

	c.changeStatus(Running)
	if err := c.saveLocked(); err != nil {
		return err
	}

	// Release the lock before adjusting the OOM score because the lock is acquired there.
	unlock.Clean()

	// Adjust the oom_score_adj for the sandbox. This must be done after saveLocked().
	if err := adjustSandboxOOMScoreAdj(c.Sandbox, c.Spec, c.Saver.RootDir, false); err != nil {
		return err
	}

	// Set the container's oom_score_adj on the gofer since it is dedicated to
	// the container, in case the gofer uses up too much memory.
	return c.adjustGoferOOMScoreAdj()
}

// Restore takes a container and replaces its kernel and file system
// to restore the container from its state file.
func (c *Container) Restore(spec *specs.Spec, conf *config.Config, restoreFile string) error {
	log.Debugf("Restore container, cid: %s", c.ID)
	if err := c.Saver.lock(); err != nil {
		return err
	}
	defer c.Saver.unlock()

	if err := c.requireStatus("restore", Created); err != nil {
		return err
	}

	// "If any prestart hook fails, the runtime MUST generate an error,
	// stop and destroy the container" -OCI spec.
	if c.Spec.Hooks != nil {
		if err := executeHooks(c.Spec.Hooks.Prestart, c.State()); err != nil {
			return err
		}
	}

	if err := c.Sandbox.Restore(c.ID, spec, conf, restoreFile); err != nil {
		return err
	}
	c.changeStatus(Running)
	return c.saveLocked()
}

// Run is a helper that calls Create + Start + Wait.
func Run(conf *config.Config, args Args) (unix.WaitStatus, error) {
	log.Debugf("Run container, cid: %s, rootDir: %q", args.ID, conf.RootDir)
	c, err := New(conf, args)
	if err != nil {
		return 0, fmt.Errorf("creating container: %v", err)
	}
	// Clean up the partially created container if an error occurs.
	// Any errors returned by Destroy() itself are ignored.
	cu := cleanup.Make(func() {
		c.Destroy()
	})
	defer cu.Clean()

	if conf.RestoreFile != "" {
		log.Debugf("Restore: %v", conf.RestoreFile)
		if err := c.Restore(args.Spec, conf, conf.RestoreFile); err != nil {
			return 0, fmt.Errorf("starting container: %v", err)
		}
	} else {
		if err := c.Start(conf); err != nil {
			return 0, fmt.Errorf("starting container: %v", err)
		}
	}
	if args.Attached {
		return c.Wait()
	}
	cu.Release()
	return 0, nil
}

// Execute runs the specified command in the container. It returns the PID of
// the newly created process.
func (c *Container) Execute(args *control.ExecArgs) (int32, error) {
	log.Debugf("Execute in container, cid: %s, args: %+v", c.ID, args)
	if err := c.requireStatus("execute in", Created, Running); err != nil {
		return 0, err
	}
	args.ContainerID = c.ID
	return c.Sandbox.Execute(args)
}

// Event returns events for the container.
func (c *Container) Event() (*boot.EventOut, error) {
	log.Debugf("Getting events for container, cid: %s", c.ID)
	if err := c.requireStatus("get events for", Created, Running, Paused); err != nil {
		return nil, err
	}
	event, err := c.Sandbox.Event(c.ID)
	if err != nil {
		return nil, err
	}

	// Some stats can utilize host cgroups for accuracy.
	c.populateStats(event)

	return event, nil
}

// SandboxPid returns the Pid of the sandbox the container is running in, or -1 if the
// container is not running.
func (c *Container) SandboxPid() int {
	if err := c.requireStatus("get PID", Created, Running, Paused); err != nil {
		return -1
	}
	return c.Sandbox.Pid
}

// Wait waits for the container to exit, and returns its WaitStatus.
// Waiting on a stopped container is needed to retrieve its exit status; in
// that case Wait returns immediately.
func (c *Container) Wait() (unix.WaitStatus, error) {
	log.Debugf("Wait on container, cid: %s", c.ID)
	ws, err := c.Sandbox.Wait(c.ID)
	if err == nil {
		// Wait succeeded, container is not running anymore.
		c.changeStatus(Stopped)
	}
	return ws, err
}

// WaitRootPID waits for process 'pid' in the sandbox's PID namespace and
// returns its WaitStatus.
func (c *Container) WaitRootPID(pid int32) (unix.WaitStatus, error) {
	log.Debugf("Wait on process %d in sandbox, cid: %s", pid, c.Sandbox.ID)
	if !c.IsSandboxRunning() {
		return 0, fmt.Errorf("sandbox is not running")
	}
	return c.Sandbox.WaitPID(c.Sandbox.ID, pid)
}

// WaitPID waits for process 'pid' in the container's PID namespace and returns
// its WaitStatus.
func (c *Container) WaitPID(pid int32) (unix.WaitStatus, error) {
	log.Debugf("Wait on process %d in container, cid: %s", pid, c.ID)
	if !c.IsSandboxRunning() {
		return 0, fmt.Errorf("sandbox is not running")
	}
	return c.Sandbox.WaitPID(c.ID, pid)
}

// SignalContainer sends the signal to the container. If all is true and signal
// is SIGKILL, then waits for all processes to exit before returning.
// SignalContainer returns an error if the container is already stopped.
// TODO(b/113680494): Distinguish different error types.
func (c *Container) SignalContainer(sig unix.Signal, all bool) error {
	log.Debugf("Signal container, cid: %s, signal: %v (%d)", c.ID, sig, sig)
	// Signaling a container in Stopped state is allowed.
	// When all=false, an error will be returned anyway; when all=true, this
	// allows sending signals to other processes inside the container even
	// after the init process exits. This is especially useful for
	// container cleanup.
	if err := c.requireStatus("signal", Running, Stopped); err != nil {
		return err
	}
	if !c.IsSandboxRunning() {
		return fmt.Errorf("sandbox is not running")
	}
	return c.Sandbox.SignalContainer(c.ID, sig, all)
}

// SignalProcess sends sig to a specific process in the container.
func (c *Container) SignalProcess(sig unix.Signal, pid int32) error {
	log.Debugf("Signal process %d in container, cid: %s, signal: %v (%d)", pid, c.ID, sig, sig)
	if err := c.requireStatus("signal a process inside", Running); err != nil {
		return err
	}
	if !c.IsSandboxRunning() {
		return fmt.Errorf("sandbox is not running")
	}
	return c.Sandbox.SignalProcess(c.ID, int32(pid), sig, false)
}

// ForwardSignals forwards all signals received by the current process to the
// container process inside the sandbox. It returns a function that will stop
// forwarding signals.
func (c *Container) ForwardSignals(pid int32, fgProcess bool) func() {
	log.Debugf("Forwarding all signals to container, cid: %s, PID: %d, fgProcess: %t", c.ID, pid, fgProcess)
	stop := sighandling.StartSignalForwarding(func(sig linux.Signal) {
		log.Debugf("Forwarding signal %d to container, cid: %s, PID: %d, fgProcess: %t", sig, c.ID, pid, fgProcess)
		if err := c.Sandbox.SignalProcess(c.ID, pid, unix.Signal(sig), fgProcess); err != nil {
			log.Warningf("error forwarding signal %d to container %q: %v", sig, c.ID, err)
		}
	})
	return func() {
		log.Debugf("Done forwarding signals to container, cid: %s, PID: %d, fgProcess: %t", c.ID, pid, fgProcess)
		stop()
	}
}

// Checkpoint sends the checkpoint call to the container.
// The statefile will be written to f, the file at the specified image-path.
func (c *Container) Checkpoint(f *os.File) error {
	log.Debugf("Checkpoint container, cid: %s", c.ID)
	if err := c.requireStatus("checkpoint", Created, Running, Paused); err != nil {
		return err
	}
	return c.Sandbox.Checkpoint(c.ID, f)
}

// Pause suspends the container and its kernel.
// The call only succeeds if the container's status is created or running.
func (c *Container) Pause() error {
	log.Debugf("Pausing container, cid: %s", c.ID)
	if err := c.Saver.lock(); err != nil {
		return err
	}
	defer c.Saver.unlock()

	if c.Status != Created && c.Status != Running {
		return fmt.Errorf("cannot pause container %q in state %v", c.ID, c.Status)
	}

	if err := c.Sandbox.Pause(c.ID); err != nil {
		return fmt.Errorf("pausing container %q: %v", c.ID, err)
	}
	c.changeStatus(Paused)
	return c.saveLocked()
}

// Resume unpauses the container and its kernel.
// The call only succeeds if the container's status is paused.
func (c *Container) Resume() error {
	log.Debugf("Resuming container, cid: %s", c.ID)
	if err := c.Saver.lock(); err != nil {
		return err
	}
	defer c.Saver.unlock()

	if c.Status != Paused {
		return fmt.Errorf("cannot resume container %q in state %v", c.ID, c.Status)
	}
	if err := c.Sandbox.Resume(c.ID); err != nil {
		return fmt.Errorf("resuming container: %v", err)
	}
	c.changeStatus(Running)
	return c.saveLocked()
}

// State returns the metadata of the container.
func (c *Container) State() specs.State {
	return specs.State{
		Version: specs.Version,
		ID:      c.ID,
		Status:  c.Status.String(),
		Pid:     c.SandboxPid(),
		Bundle:  c.BundleDir,
	}
}

// Processes retrieves the list of processes and associated metadata inside a
// container.
func (c *Container) Processes() ([]*control.Process, error) {
	if err := c.requireStatus("get processes of", Running, Paused); err != nil {
		return nil, err
	}
	return c.Sandbox.Processes(c.ID)
}

// Destroy stops all processes and frees all resources associated with the
// container.
func (c *Container) Destroy() error {
	log.Debugf("Destroy container, cid: %s", c.ID)

	if err := c.Saver.lock(); err != nil {
		return err
	}
	defer func() {
		c.Saver.unlock()
		c.Saver.close()
	}()

	// Stored for later use as stop() sets c.Sandbox to nil.
	sb := c.Sandbox

	// We must perform the following cleanup steps:
	//   * stop the container and gofer processes,
	//   * remove the container filesystem on the host, and
	//   * delete the container metadata directory.
	//
	// It's possible for one or more of these steps to fail, but we should
	// do our best to perform all of the cleanups. Hence, we keep a slice
	// of errors and return their concatenation.
	var errs []string
	if err := c.stop(); err != nil {
		err = fmt.Errorf("stopping container: %v", err)
		log.Warningf("%v", err)
		errs = append(errs, err.Error())
	}

	if err := c.Saver.destroy(); err != nil {
		err = fmt.Errorf("deleting container state files: %v", err)
		log.Warningf("%v", err)
		errs = append(errs, err.Error())
	}

	c.changeStatus(Stopped)

	// Adjust oom_score_adj for the sandbox. This must be done after the container
	// is stopped and the directory at c.Root is removed.
	//
	// Use 'sb' to tell whether it has been executed before because Destroy must
	// be idempotent.
	if sb != nil {
		if err := adjustSandboxOOMScoreAdj(sb, c.Spec, c.Saver.RootDir, true); err != nil {
			errs = append(errs, err.Error())
		}
	}

	// "If any poststop hook fails, the runtime MUST log a warning, but the
	// remaining hooks and lifecycle continue as if the hook had
	// succeeded" - OCI spec.
	//
	// Based on the OCI, "The post-stop hooks MUST be called after the container
	// is deleted but before the delete operation returns".
	// Run it here to:
	//   1) Conform to the OCI.
	//   2) Make sure it only runs once, because once the root has been deleted,
	//      the container can't be loaded again.
	if c.Spec.Hooks != nil {
		executeHooksBestEffort(c.Spec.Hooks.Poststop, c.State())
	}

	if len(errs) == 0 {
		return nil
	}
	return fmt.Errorf(strings.Join(errs, "\n"))
}

// saveLocked saves the container metadata to a file.
//
// Precondition: container must be locked with container.lock().
func (c *Container) saveLocked() error {
	log.Debugf("Save container, cid: %s", c.ID)
	if err := c.Saver.saveLocked(c); err != nil {
		return fmt.Errorf("saving container metadata: %v", err)
	}
	return nil
}

// stop stops the container (for regular containers) or the sandbox (for
// root containers), and waits for the container or sandbox and the gofer
// to stop. If any of them doesn't stop before the timeout, an error is returned.
func (c *Container) stop() error {
	var cgroup *cgroup.Cgroup

	if c.Sandbox != nil {
		log.Debugf("Destroying container, cid: %s", c.ID)
		if err := c.Sandbox.DestroyContainer(c.ID); err != nil {
			return fmt.Errorf("destroying container %q: %v", c.ID, err)
		}
		// Only uninstall cgroup for sandbox stop.
		if c.Sandbox.IsRootContainer(c.ID) {
			cgroup = c.Sandbox.Cgroup
		}
		// Only set sandbox to nil after it has been told to destroy the container.
		c.Sandbox = nil
	}

	// Try killing the gofer if it does not exit with the container.
	if c.GoferPid != 0 {
		log.Debugf("Killing gofer for container, cid: %s, PID: %d", c.ID, c.GoferPid)
		if err := unix.Kill(c.GoferPid, unix.SIGKILL); err != nil {
			// The gofer may already be stopped, log the error.
			log.Warningf("Error sending signal %d to gofer %d: %v", unix.SIGKILL, c.GoferPid, err)
		}
	}

	if err := c.waitForStopped(); err != nil {
		return err
	}

	// The gofer is running in the cgroup, so Cgroup.Uninstall has to be called
	// after it stops.
	if cgroup != nil {
		if err := cgroup.Uninstall(); err != nil {
			return err
		}
	}
	return nil
}

func (c *Container) waitForStopped() error {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
	op := func() error {
		if c.IsSandboxRunning() {
			if err := c.SignalContainer(unix.Signal(0), false); err == nil {
				return fmt.Errorf("container is still running")
			}
		}
		if c.GoferPid == 0 {
			return nil
		}
		if c.goferIsChild {
			// The gofer process is a child of the current process,
			// so we can wait on it and collect its zombie.
			wpid, err := unix.Wait4(int(c.GoferPid), nil, unix.WNOHANG, nil)
			if err != nil {
				return fmt.Errorf("error waiting on the gofer process: %v", err)
			}
			if wpid == 0 {
				return fmt.Errorf("gofer is still running")
			}

		} else if err := unix.Kill(c.GoferPid, 0); err == nil {
			return fmt.Errorf("gofer is still running")
		}
		c.GoferPid = 0
		return nil
	}
	return backoff.Retry(op, b)
}

func (c *Container) createGoferProcess(spec *specs.Spec, conf *config.Config, bundleDir string, attached bool) ([]*os.File, *os.File, error) {
	// Start with the general config flags.
	args := conf.ToFlags()

	var goferEnds []*os.File

	// nextFD is the next available file descriptor for the gofer process.
	// It starts at 3 because 0-2 are used by stdin/stdout/stderr.
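	// With both log files configured, the resulting command line looks roughly
	// like:
	//   runsc --log-fd=3 --debug-log-fd=4 gofer --bundle <dir> --spec-fd=5 --mounts-fd=6 --io-fds=7 ...
	// (FD numbers shift when a log file is not configured; the flags are
	// appended below in the same order as the files are added to goferEnds.)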
	nextFD := 3

	if conf.LogFilename != "" {
		logFile, err := os.OpenFile(conf.LogFilename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
		if err != nil {
			return nil, nil, fmt.Errorf("opening log file %q: %v", conf.LogFilename, err)
		}
		defer logFile.Close()
		goferEnds = append(goferEnds, logFile)
		args = append(args, "--log-fd="+strconv.Itoa(nextFD))
		nextFD++
	}

	if conf.DebugLog != "" {
		test := ""
		if len(conf.TestOnlyTestNameEnv) != 0 {
			// Fetch the test name if one is provided and the test only flag was set.
			if t, ok := specutils.EnvVar(spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
				test = t
			}
		}
		debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "gofer", test)
		if err != nil {
			return nil, nil, fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err)
		}
		defer debugLogFile.Close()
		goferEnds = append(goferEnds, debugLogFile)
		args = append(args, "--debug-log-fd="+strconv.Itoa(nextFD))
		nextFD++
	}

	args = append(args, "gofer", "--bundle", bundleDir)

	// Open the spec file to donate to the sandbox.
	specFile, err := specutils.OpenSpec(bundleDir)
	if err != nil {
		return nil, nil, fmt.Errorf("opening spec file: %v", err)
	}
	defer specFile.Close()
	goferEnds = append(goferEnds, specFile)
	args = append(args, "--spec-fd="+strconv.Itoa(nextFD))
	nextFD++

	// Create a pipe that allows the gofer to send the mount list to the sandbox
	// after all paths have been resolved.
	mountsSand, mountsGofer, err := os.Pipe()
	if err != nil {
		return nil, nil, err
	}
	defer mountsGofer.Close()
	goferEnds = append(goferEnds, mountsGofer)
	args = append(args, fmt.Sprintf("--mounts-fd=%d", nextFD))
	nextFD++

	// Add the root mount and then add any other additional mounts.
	mountCount := 1
	for _, m := range spec.Mounts {
		if specutils.Is9PMount(m, conf.VFS2) {
			mountCount++
		}
	}

	sandEnds := make([]*os.File, 0, mountCount)
	for i := 0; i < mountCount; i++ {
		fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
		if err != nil {
			return nil, nil, err
		}
		sandEnds = append(sandEnds, os.NewFile(uintptr(fds[0]), "sandbox IO FD"))

		goferEnd := os.NewFile(uintptr(fds[1]), "gofer IO FD")
		defer goferEnd.Close()
		goferEnds = append(goferEnds, goferEnd)

		args = append(args, fmt.Sprintf("--io-fds=%d", nextFD))
		nextFD++
	}

	binPath := specutils.ExePath
	cmd := exec.Command(binPath, args...)
	cmd.ExtraFiles = goferEnds
	cmd.Args[0] = "runsc-gofer"

	if attached {
		// The gofer is attached to the lifetime of this process, so it
		// should synchronously die when this process dies.
		cmd.SysProcAttr = &unix.SysProcAttr{
			Pdeathsig: unix.SIGKILL,
		}
	}

	// Enter new namespaces to isolate from the rest of the system. Don't unshare
	// cgroup because the gofer is added to a cgroup in the caller's namespace.
	nss := []specs.LinuxNamespace{
		{Type: specs.IPCNamespace},
		{Type: specs.MountNamespace},
		{Type: specs.NetworkNamespace},
		{Type: specs.PIDNamespace},
		{Type: specs.UTSNamespace},
	}

	// Set up any uid/gid mappings, and create or join the configured user
	// namespace so the gofer's view of the filesystem aligns with the
	// users in the sandbox.
	userNS := specutils.FilterNS([]specs.LinuxNamespaceType{specs.UserNamespace}, spec)
	nss = append(nss, userNS...)
	specutils.SetUIDGIDMappings(cmd, spec)
	if len(userNS) != 0 {
		// We need to set UID and GID to have capabilities in a new user namespace.
		cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0}
	}

	// Start the gofer in the given namespace.
	log.Debugf("Starting gofer: %s %v", binPath, args)
	if err := specutils.StartInNS(cmd, nss); err != nil {
		return nil, nil, fmt.Errorf("gofer: %v", err)
	}
	log.Infof("Gofer started, PID: %d", cmd.Process.Pid)
	c.GoferPid = cmd.Process.Pid
	c.goferIsChild = true
	return sandEnds, mountsSand, nil
}

// changeStatus transitions from one status to another ensuring that the
// transition is valid.
func (c *Container) changeStatus(s Status) {
	switch s {
	case Creating:
		// Initial state, never transitions to it.
		panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))

	case Created:
		if c.Status != Creating {
			panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
		}
		if c.Sandbox == nil {
			panic("sandbox cannot be nil")
		}

	case Paused:
		if c.Status != Running {
			panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
		}
		if c.Sandbox == nil {
			panic("sandbox cannot be nil")
		}

	case Running:
		if c.Status != Created && c.Status != Paused {
			panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
		}
		if c.Sandbox == nil {
			panic("sandbox cannot be nil")
		}

	case Stopped:
		if c.Status != Creating && c.Status != Created && c.Status != Running && c.Status != Stopped {
			panic(fmt.Sprintf("invalid state transition: %v => %v", c.Status, s))
		}

	default:
		panic(fmt.Sprintf("invalid new state: %v", s))
	}
	c.Status = s
}

// IsSandboxRunning returns true if the sandbox exists and is running.
func (c *Container) IsSandboxRunning() bool {
	return c.Sandbox != nil && c.Sandbox.IsRunning()
}

func (c *Container) requireStatus(action string, statuses ...Status) error {
	for _, s := range statuses {
		if c.Status == s {
			return nil
		}
	}
	return fmt.Errorf("cannot %s container %q in state %s", action, c.ID, c.Status)
}

func isRoot(spec *specs.Spec) bool {
	return specutils.SpecContainerType(spec) != specutils.ContainerTypeContainer
}

// runInCgroup executes fn inside the specified cgroup. If cg is nil, execute
// it in the current context.
func runInCgroup(cg *cgroup.Cgroup, fn func() error) error {
	if cg == nil {
		return fn()
	}
	restore, err := cg.Join()
	defer restore()
	if err != nil {
		return err
	}
	return fn()
}

// adjustGoferOOMScoreAdj sets the oom_score_adj for the container's gofer.
func (c *Container) adjustGoferOOMScoreAdj() error {
	if c.GoferPid == 0 || c.Spec.Process.OOMScoreAdj == nil {
		return nil
	}
	return setOOMScoreAdj(c.GoferPid, *c.Spec.Process.OOMScoreAdj)
}

// adjustSandboxOOMScoreAdj sets the oom_score_adj for the sandbox.
// oom_score_adj is set to the lowest oom_score_adj among the containers
// running in the sandbox.
//
// TODO(github.com/SagerNet/issue/238): This call could race with other containers being
// created at the same time and end up setting the wrong oom_score_adj to the
// sandbox. Use rpc client to synchronize.
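//
// For example, with one subcontainer requesting oom_score_adj 500, another
// requesting -200, and a CRI pause container (which is skipped), the sandbox
// process gets oom_score_adj -200.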
func adjustSandboxOOMScoreAdj(s *sandbox.Sandbox, spec *specs.Spec, rootDir string, destroy bool) error {
	// Adjustment can be skipped if the root container is exiting, because it
	// brings down the entire sandbox.
	if isRoot(spec) && destroy {
		return nil
	}

	containers, err := loadSandbox(rootDir, s.ID)
	if err != nil {
		return fmt.Errorf("loading sandbox containers: %v", err)
	}

	// Do nothing if the sandbox has been terminated.
	if len(containers) == 0 {
		return nil
	}

	// Get the lowest score for all containers.
	var lowScore int
	scoreFound := false
	for _, container := range containers {
		// Special multi-container support for CRI. Ignore the root container when
		// calculating oom_score_adj for the sandbox because it is the
		// infrastructure (pause) container and always has a very low oom_score_adj.
		//
		// We will use OOMScoreAdj in the single-container case where the
		// containerd container-type annotation is not present.
		if specutils.SpecContainerType(container.Spec) == specutils.ContainerTypeSandbox {
			continue
		}

		if container.Spec.Process.OOMScoreAdj != nil && (!scoreFound || *container.Spec.Process.OOMScoreAdj < lowScore) {
			scoreFound = true
			lowScore = *container.Spec.Process.OOMScoreAdj
		}
	}

	// If the container is destroyed and remaining containers have no
	// oomScoreAdj specified then we must revert to the original oom_score_adj
	// saved with the root container.
	if !scoreFound && destroy {
		lowScore = containers[0].Sandbox.OriginalOOMScoreAdj
		scoreFound = true
	}

	// Only set oom_score_adj if one of the containers has oom_score_adj set. If
	// not, oom_score_adj is inherited from the parent process.
	//
	// See: https://github.com/opencontainers/runtime-spec/blob/master/config.md#linux-process
	if !scoreFound {
		return nil
	}

	// Set the lowest of all containers' oom_score_adj to the sandbox.
	return setOOMScoreAdj(s.Pid, lowScore)
}

// setOOMScoreAdj sets oom_score_adj to the given value for the given PID.
// /proc must be available and mounted read-write. scoreAdj should be between
// -1000 and 1000. It's a noop if the process has already exited.
func setOOMScoreAdj(pid int, scoreAdj int) error {
	f, err := os.OpenFile(fmt.Sprintf("/proc/%d/oom_score_adj", pid), os.O_WRONLY, 0644)
	if err != nil {
		// Ignore NotExist errors because it can race with process exit.
		if os.IsNotExist(err) {
			log.Warningf("Process (%d) not found setting oom_score_adj", pid)
			return nil
		}
		return err
	}
	defer f.Close()
	if _, err := f.WriteString(strconv.Itoa(scoreAdj)); err != nil {
		if errors.Is(err, unix.ESRCH) {
			log.Warningf("Process (%d) exited while setting oom_score_adj", pid)
			return nil
		}
		return fmt.Errorf("setting oom_score_adj to %q: %v", scoreAdj, err)
	}
	return nil
}

// populateStats populates event with stats estimates based on cgroups and the
// sentry's accounting.
// TODO(github.com/SagerNet/issue/172): This is an estimation; we should do more
// detailed accounting.
func (c *Container) populateStats(event *boot.EventOut) {
	// The events command, when run for all running containers, should
	// account for the full cgroup CPU usage.
	// We split cgroup usage proportionally according to the sentry-internal
	// usage measurements, only counting Running containers.
	log.Debugf("event.ContainerUsage: %v", event.ContainerUsage)
	var containerUsage uint64
	var allContainersUsage uint64
	for ID, usage := range event.ContainerUsage {
		allContainersUsage += usage
		if ID == c.ID {
			containerUsage = usage
		}
	}

	cgroup, err := c.Sandbox.NewCGroup()
	if err != nil {
		// No cgroup, so rely purely on the sentry's accounting.
		log.Warningf("events: no cgroups")
		event.Event.Data.CPU.Usage.Total = containerUsage
		return
	}

	// Get the host cgroup CPU usage.
	cgroupsUsage, err := cgroup.CPUUsage()
	if err != nil {
		// No cgroup usage, so rely purely on the sentry's accounting.
		log.Warningf("events: failed when getting cgroup CPU usage for container: %v", err)
		event.Event.Data.CPU.Usage.Total = containerUsage
		return
	}

	// If the sentry reports no CPU usage, fall back on cgroups and split usage
	// equally across containers.
	if allContainersUsage == 0 {
		log.Warningf("events: no sentry CPU usage reported")
		allContainersUsage = cgroupsUsage
		containerUsage = cgroupsUsage / uint64(len(event.ContainerUsage))
	}

	// Scaling can easily overflow a uint64 (e.g. a containerUsage and
	// cgroupsUsage of 16 seconds each will overflow), so use floats.
	total := float64(containerUsage) * (float64(cgroupsUsage) / float64(allContainersUsage))
	log.Debugf("Usage, container: %d, cgroups: %d, all: %d, total: %.0f", containerUsage, cgroupsUsage, allContainersUsage, total)
	event.Event.Data.CPU.Usage.Total = uint64(total)
	return
}
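
// The function below is an illustrative sketch, not part of the runsc API: it
// mirrors, in terms of this package, the containerd-style stop sequence
// described in the Container doc comment above (SIGTERM the init process,
// escalate to SIGKILL for every process, collect the exit status, then delete
// the container). Timeouts and error handling are intentionally simplified.
func exampleStopSequence(c *Container) error {
	// Ask the container's init process to terminate gracefully.
	if err := c.SignalContainer(unix.SIGTERM, false /* all */); err != nil {
		log.Warningf("SIGTERM failed, escalating to SIGKILL: %v", err)
	}
	// Escalate: SIGKILL every process that belongs to the container. With
	// all=true, this waits for the processes to exit before returning.
	if err := c.SignalContainer(unix.SIGKILL, true /* all */); err != nil {
		log.Warningf("SIGKILL --all failed: %v", err)
	}
	// Retrieve the container's exit status.
	if _, err := c.Wait(); err != nil {
		log.Warningf("waiting on container: %v", err)
	}
	// Tear down remaining resources and delete the saved state.
	return c.Destroy()
}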