github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/runsc/sandbox/sandbox.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package sandbox creates and manipulates sandboxes.
package sandbox

import (
	"context"
	"fmt"
	"io"
	"math"
	"os"
	"os/exec"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/cenkalti/backoff"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/syndtr/gocapability/capability"
	"golang.org/x/sys/unix"

	"github.com/SagerNet/gvisor/pkg/cleanup"
	"github.com/SagerNet/gvisor/pkg/control/client"
	"github.com/SagerNet/gvisor/pkg/control/server"
	"github.com/SagerNet/gvisor/pkg/coverage"
	"github.com/SagerNet/gvisor/pkg/log"
	"github.com/SagerNet/gvisor/pkg/sentry/control"
	"github.com/SagerNet/gvisor/pkg/sentry/platform"
	"github.com/SagerNet/gvisor/pkg/sync"
	"github.com/SagerNet/gvisor/pkg/urpc"
	"github.com/SagerNet/gvisor/runsc/boot"
	"github.com/SagerNet/gvisor/runsc/boot/platforms"
	"github.com/SagerNet/gvisor/runsc/cgroup"
	"github.com/SagerNet/gvisor/runsc/config"
	"github.com/SagerNet/gvisor/runsc/console"
	"github.com/SagerNet/gvisor/runsc/specutils"
)

// Sandbox wraps a sandbox process.
//
// It is used to start/stop the sandbox process (and associated processes like
// gofers), as well as for running and manipulating containers inside a running
// sandbox.
//
// Note: Sandbox must be immutable because a copy of it is saved for each
// container and changes would not be synchronized to all of them.
type Sandbox struct {
	// ID is the id of the sandbox (immutable). By convention, this is the same
	// ID as the first container run in the sandbox.
	ID string `json:"id"`

	// Pid is the pid of the running sandbox (immutable). May be 0 if the sandbox
	// is not running.
	Pid int `json:"pid"`

	// Cgroup has the cgroup configuration for the sandbox.
	Cgroup *cgroup.Cgroup `json:"cgroup"`

	// OriginalOOMScoreAdj stores the value of oom_score_adj when the sandbox
	// started, before it may be modified.
	OriginalOOMScoreAdj int `json:"originalOomScoreAdj"`

	// child is set if the sandbox process is a child of the current process.
	//
	// This field isn't saved to json, because only the creator of the sandbox
	// will have it as a child process.
	child bool

	// statusMu protects status.
	statusMu sync.Mutex

	// status is the exit status of the sandbox process. It's only set if
	// child==true and the sandbox was waited on. This field allows multiple
	// threads to wait on the sandbox and get its exit code, since Linux will
	// return the WaitStatus to only one of the waiters.
	status unix.WaitStatus
}

// Args is used to configure a new sandbox.
type Args struct {
	// ID is the sandbox unique identifier.
	ID string

	// Spec is the OCI spec that describes the container.
	Spec *specs.Spec

	// BundleDir is the directory containing the container bundle.
	BundleDir string

	// ConsoleSocket is the path to a unix domain socket that will receive
	// the console FD. It may be empty.
	ConsoleSocket string

	// UserLog is the filename to send user-visible logs to. It may be empty.
	UserLog string

	// IOFiles is the list of files that connect to a 9P endpoint for the mount
	// points using Gofers. They must be in the same order as mounts appear in
	// the spec.
	IOFiles []*os.File

	// MountsFile is a file containing mount information from the spec. It's
	// equivalent to the mounts from the spec, except that all paths have been
	// resolved to their final absolute location.
	MountsFile *os.File

	// Cgroup is the cgroup that the sandbox is part of.
	Cgroup *cgroup.Cgroup

	// Attached indicates that the sandbox lifecycle is attached to the caller.
	// If the caller exits, the sandbox should exit too.
	Attached bool
}

// New creates the sandbox process. The caller must eventually destroy the
// sandbox (see DestroyContainer).
func New(conf *config.Config, args *Args) (*Sandbox, error) {
	s := &Sandbox{ID: args.ID, Cgroup: args.Cgroup}
	// The Cleanup object cleans up partially created sandboxes when an error
	// occurs. Any errors occurring during cleanup itself are ignored.
	c := cleanup.Make(func() {
		if err := s.destroy(); err != nil {
			log.Warningf("error destroying sandbox: %v", err)
		}
	})
	defer c.Clean()

	// Create a pipe to synchronize on when the sandbox process has booted.
	clientSyncFile, sandboxSyncFile, err := os.Pipe()
	if err != nil {
		return nil, fmt.Errorf("creating pipe for sandbox %q: %v", s.ID, err)
	}
	defer clientSyncFile.Close()

	// Create the sandbox process.
	err = s.createSandboxProcess(conf, args, sandboxSyncFile)
	// sandboxSyncFile has to be closed to be able to detect when the sandbox
	// process exits unexpectedly.
	sandboxSyncFile.Close()
	if err != nil {
		return nil, err
	}

	// Wait until the sandbox has booted.
	b := make([]byte, 1)
	if l, err := clientSyncFile.Read(b); err != nil || l != 1 {
		err := fmt.Errorf("waiting for sandbox to start: %v", err)
		// If the sandbox failed to start, it may be because the binary
		// permissions were incorrect. Check the bits and return a more helpful
		// error message.
		//
		// NOTE: The error message is checked because error types are lost over
		// rpc calls.
		if strings.Contains(err.Error(), io.EOF.Error()) {
			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
				return nil, fmt.Errorf("%v: %v", err, permsErr)
			}
		}
		return nil, err
	}

	c.Release()
	return s, nil
}
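
// The typical lifecycle driven by the runsc/container package looks roughly
// like the sketch below (illustrative only; error handling is elided and the
// variable names are hypothetical, not from this file):
//
//	sb, err := sandbox.New(conf, &sandbox.Args{
//		ID:        cid,
//		Spec:      spec,
//		BundleDir: bundleDir,
//	})
//	if err != nil { ... }
//	if err := sb.StartRoot(spec, conf); err != nil { ... }
//	ws, err := sb.Wait(cid)      // root container's WaitStatus
//	_ = sb.DestroyContainer(cid) // destroying the root destroys the sandbox
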
// CreateContainer creates a non-root container inside the sandbox.
func (s *Sandbox) CreateContainer(cid string, tty *os.File) error {
	log.Debugf("Create non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid)
	sandboxConn, err := s.sandboxConnect()
	if err != nil {
		return fmt.Errorf("couldn't connect to sandbox: %v", err)
	}
	defer sandboxConn.Close()

	var files []*os.File
	if tty != nil {
		files = []*os.File{tty}
	}

	args := boot.CreateArgs{
		CID:         cid,
		FilePayload: urpc.FilePayload{Files: files},
	}
	if err := sandboxConn.Call(boot.ContainerCreate, &args, nil); err != nil {
		return fmt.Errorf("creating non-root container %q: %v", cid, err)
	}
	return nil
}

// StartRoot starts running the root container process inside the sandbox.
func (s *Sandbox) StartRoot(spec *specs.Spec, conf *config.Config) error {
	log.Debugf("Start root sandbox %q, PID: %d", s.ID, s.Pid)
	conn, err := s.sandboxConnect()
	if err != nil {
		return err
	}
	defer conn.Close()

	// Configure the network.
	if err := setupNetwork(conn, s.Pid, spec, conf); err != nil {
		return fmt.Errorf("setting up network: %v", err)
	}

	// Send a message to the sandbox control server to start the root
	// container.
	if err := conn.Call(boot.RootContainerStart, &s.ID, nil); err != nil {
		return fmt.Errorf("starting root container: %v", err)
	}

	return nil
}

// StartContainer starts running a non-root container inside the sandbox.
func (s *Sandbox) StartContainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles []*os.File) error {
	log.Debugf("Start non-root container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid)
	sandboxConn, err := s.sandboxConnect()
	if err != nil {
		return fmt.Errorf("couldn't connect to sandbox: %v", err)
	}
	defer sandboxConn.Close()

	// The payload must contain stdin/stdout/stderr (which may be empty if using
	// TTY) followed by gofer files.
	payload := urpc.FilePayload{}
	payload.Files = append(payload.Files, stdios...)
	payload.Files = append(payload.Files, goferFiles...)

	// Start running the container.
	args := boot.StartArgs{
		Spec:        spec,
		Conf:        conf,
		CID:         cid,
		FilePayload: payload,
	}
	if err := sandboxConn.Call(boot.ContainerStart, &args, nil); err != nil {
		return fmt.Errorf("starting non-root container %v: %v", spec.Process.Args, err)
	}
	return nil
}
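
// For multi-container sandboxes, creation and start are two separate control
// RPCs. A hypothetical caller (names are illustrative, not from this file)
// might do:
//
//	if err := sb.CreateContainer(subCID, nil /* tty */); err != nil { ... }
//	stdios := []*os.File{os.Stdin, os.Stdout, os.Stderr}
//	if err := sb.StartContainer(subSpec, conf, subCID, stdios, goferFDs); err != nil { ... }
//
// The stdio files come first in the payload, followed by the gofer FDs, which
// is the order StartContainer's payload comment above requires.
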
// Restore sends the restore call for a container in the sandbox.
func (s *Sandbox) Restore(cid string, spec *specs.Spec, conf *config.Config, filename string) error {
	log.Debugf("Restore sandbox %q", s.ID)

	rf, err := os.Open(filename)
	if err != nil {
		return fmt.Errorf("opening restore file %q failed: %v", filename, err)
	}
	defer rf.Close()

	opt := boot.RestoreOpts{
		FilePayload: urpc.FilePayload{
			Files: []*os.File{rf},
		},
		SandboxID: s.ID,
	}

	// If the platform needs a device FD we must pass it in.
	if deviceFile, err := deviceFileForPlatform(conf.Platform); err != nil {
		return err
	} else if deviceFile != nil {
		defer deviceFile.Close()
		opt.FilePayload.Files = append(opt.FilePayload.Files, deviceFile)
	}

	conn, err := s.sandboxConnect()
	if err != nil {
		return err
	}
	defer conn.Close()

	// Configure the network.
	if err := setupNetwork(conn, s.Pid, spec, conf); err != nil {
		return fmt.Errorf("setting up network: %v", err)
	}

	// Restore the container and start the root container.
	if err := conn.Call(boot.ContainerRestore, &opt, nil); err != nil {
		return fmt.Errorf("restoring container %q: %v", cid, err)
	}

	return nil
}

// Processes retrieves the list of processes and associated metadata for a
// given container in this sandbox.
func (s *Sandbox) Processes(cid string) ([]*control.Process, error) {
	log.Debugf("Getting processes for container %q in sandbox %q", cid, s.ID)
	conn, err := s.sandboxConnect()
	if err != nil {
		return nil, err
	}
	defer conn.Close()

	var pl []*control.Process
	if err := conn.Call(boot.ContainerProcesses, &cid, &pl); err != nil {
		return nil, fmt.Errorf("retrieving process data from sandbox: %v", err)
	}
	return pl, nil
}

// NewCGroup returns the sandbox's Cgroup, or an error if it does not have one.
func (s *Sandbox) NewCGroup() (*cgroup.Cgroup, error) {
	return cgroup.NewFromPid(s.Pid)
}

// Execute runs the specified command in the container. It returns the PID of
// the newly created process.
func (s *Sandbox) Execute(args *control.ExecArgs) (int32, error) {
	log.Debugf("Executing new process in container %q in sandbox %q", args.ContainerID, s.ID)
	conn, err := s.sandboxConnect()
	if err != nil {
		return 0, s.connError(err)
	}
	defer conn.Close()

	// Send a message to the sandbox control server to start the container.
	var pid int32
	if err := conn.Call(boot.ContainerExecuteAsync, args, &pid); err != nil {
		return 0, fmt.Errorf("executing command %q in sandbox: %v", args, err)
	}
	return pid, nil
}

// Event retrieves stats about the sandbox such as memory and CPU utilization.
func (s *Sandbox) Event(cid string) (*boot.EventOut, error) {
	log.Debugf("Getting events for container %q in sandbox %q", cid, s.ID)
	conn, err := s.sandboxConnect()
	if err != nil {
		return nil, err
	}
	defer conn.Close()

	var e boot.EventOut
	// TODO(b/129292330): Pass in the container id (cid) here. The sandbox
	// should return events only for that container.
	if err := conn.Call(boot.ContainerEvent, nil, &e); err != nil {
		return nil, fmt.Errorf("retrieving event data from sandbox: %v", err)
	}
	e.Event.ID = cid
	return &e, nil
}

func (s *Sandbox) sandboxConnect() (*urpc.Client, error) {
	log.Debugf("Connecting to sandbox %q", s.ID)
	conn, err := client.ConnectTo(boot.ControlSocketAddr(s.ID))
	if err != nil {
		return nil, s.connError(err)
	}
	return conn, nil
}

func (s *Sandbox) connError(err error) error {
	return fmt.Errorf("connecting to control server at PID %d: %v", s.Pid, err)
}

// createSandboxProcess starts the sandbox as a subprocess by running the "boot"
// command, passing in the bundle dir.
func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyncFile *os.File) error {
	// nextFD is used to get unused FDs that we can pass to the sandbox. It
	// starts at 3 because 0, 1, and 2 are taken by stdin/out/err.
	nextFD := 3

	binPath := specutils.ExePath
	cmd := exec.Command(binPath, conf.ToFlags()...)
	cmd.SysProcAttr = &unix.SysProcAttr{}
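
	// The resulting command line has the following shape (hypothetical values,
	// for illustration only):
	//
	//	runsc-sandbox <global flags from conf.ToFlags()> --log-fd=3 \
	//		boot --bundle=<dir> --controller-fd=4 ... <sandbox ID>
	//
	// Global flags, including the donated log FDs set up below, must precede
	// the "boot" subcommand; everything after "boot" is parsed by the boot
	// command itself.
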
	// Open the log files to pass to the sandbox as FDs.
	//
	// These flags must come BEFORE the "boot" command in cmd.Args.
	if conf.LogFilename != "" {
		logFile, err := os.OpenFile(conf.LogFilename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
		if err != nil {
			return fmt.Errorf("opening log file %q: %v", conf.LogFilename, err)
		}
		defer logFile.Close()
		cmd.ExtraFiles = append(cmd.ExtraFiles, logFile)
		cmd.Args = append(cmd.Args, "--log-fd="+strconv.Itoa(nextFD))
		nextFD++
	}

	test := ""
	if len(conf.TestOnlyTestNameEnv) != 0 {
		// Fetch the test name if one is provided and the test-only flag was set.
		if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
			test = t
		}
	}
	if conf.DebugLog != "" {
		debugLogFile, err := specutils.DebugLogFile(conf.DebugLog, "boot", test)
		if err != nil {
			return fmt.Errorf("opening debug log file in %q: %v", conf.DebugLog, err)
		}
		defer debugLogFile.Close()
		cmd.ExtraFiles = append(cmd.ExtraFiles, debugLogFile)
		cmd.Args = append(cmd.Args, "--debug-log-fd="+strconv.Itoa(nextFD))
		nextFD++
	}
	if conf.PanicLog != "" {
		panicLogFile, err := specutils.DebugLogFile(conf.PanicLog, "panic", test)
		if err != nil {
			return fmt.Errorf("opening panic log file in %q: %v", conf.PanicLog, err)
		}
		defer panicLogFile.Close()
		cmd.ExtraFiles = append(cmd.ExtraFiles, panicLogFile)
		cmd.Args = append(cmd.Args, "--panic-log-fd="+strconv.Itoa(nextFD))
		nextFD++
	}
	covFilename := conf.CoverageReport
	if covFilename == "" {
		covFilename = os.Getenv("GO_COVERAGE_FILE")
	}
	if covFilename != "" && coverage.Available() {
		covFile, err := specutils.DebugLogFile(covFilename, "cov", test)
		if err != nil {
			return fmt.Errorf("opening coverage file in %q: %v", covFilename, err)
		}
		defer covFile.Close()
		cmd.ExtraFiles = append(cmd.ExtraFiles, covFile)
		cmd.Args = append(cmd.Args, "--coverage-fd="+strconv.Itoa(nextFD))
		nextFD++
	}

	// Add the "boot" command to the args.
	//
	// All flags after this must be for the boot command.
	cmd.Args = append(cmd.Args, "boot", "--bundle="+args.BundleDir)

	// Create a socket for the control server and donate it to the sandbox.
	addr := boot.ControlSocketAddr(s.ID)
	sockFD, err := server.CreateSocket(addr)
	log.Infof("Creating sandbox process with addr: %s", addr[1:]) // skip "\x00".
	if err != nil {
		return fmt.Errorf("creating control server socket for sandbox %q: %v", s.ID, err)
	}
	controllerFile := os.NewFile(uintptr(sockFD), "control_server_socket")
	defer controllerFile.Close()
	cmd.ExtraFiles = append(cmd.ExtraFiles, controllerFile)
	cmd.Args = append(cmd.Args, "--controller-fd="+strconv.Itoa(nextFD))
	nextFD++

	defer args.MountsFile.Close()
	cmd.ExtraFiles = append(cmd.ExtraFiles, args.MountsFile)
	cmd.Args = append(cmd.Args, "--mounts-fd="+strconv.Itoa(nextFD))
	nextFD++

	specFile, err := specutils.OpenSpec(args.BundleDir)
	if err != nil {
		return err
	}
	defer specFile.Close()
	cmd.ExtraFiles = append(cmd.ExtraFiles, specFile)
	cmd.Args = append(cmd.Args, "--spec-fd="+strconv.Itoa(nextFD))
	nextFD++

	cmd.ExtraFiles = append(cmd.ExtraFiles, startSyncFile)
	cmd.Args = append(cmd.Args, "--start-sync-fd="+strconv.Itoa(nextFD))
	nextFD++
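
	// The pattern above relies on os/exec's ExtraFiles contract: entry i of
	// cmd.ExtraFiles becomes FD 3+i in the child, which is why nextFD starts
	// at 3 and is incremented in lockstep with each append. A minimal
	// standalone sketch of the same technique (illustrative, not part of
	// runsc):
	//
	//	f, _ := os.Open("/etc/hostname")
	//	c := exec.Command("/bin/sh", "-c", "cat /proc/self/fd/3")
	//	c.ExtraFiles = []*os.File{f} // donated as FD 3 in the child
	//	out, _ := c.Output()         // prints the file's contents
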
	// If there is a gofer, send all socket ends to the sandbox.
	for _, f := range args.IOFiles {
		defer f.Close()
		cmd.ExtraFiles = append(cmd.ExtraFiles, f)
		cmd.Args = append(cmd.Args, "--io-fds="+strconv.Itoa(nextFD))
		nextFD++
	}

	gPlatform, err := platform.Lookup(conf.Platform)
	if err != nil {
		return err
	}

	if deviceFile, err := gPlatform.OpenDevice(); err != nil {
		return fmt.Errorf("opening device file for platform %q: %v", conf.Platform, err)
	} else if deviceFile != nil {
		defer deviceFile.Close()
		cmd.ExtraFiles = append(cmd.ExtraFiles, deviceFile)
		cmd.Args = append(cmd.Args, "--device-fd="+strconv.Itoa(nextFD))
		nextFD++
	}

	// TODO(b/151157106): syscall tests fail by timeout if asyncpreemptoff
	// isn't set.
	if conf.Platform == "kvm" {
		cmd.Env = append(cmd.Env, "GODEBUG=asyncpreemptoff=1")
	}

	// The current process' stdio must be passed to the application via the
	// --stdio-fds flag. The stdio of the sandbox process itself must not
	// be connected to the same FDs, otherwise we risk leaking sandbox
	// errors to the application, so we set the sandbox stdio to nil,
	// causing them to read/write from the null device.
	cmd.Stdin = nil
	cmd.Stdout = nil
	cmd.Stderr = nil

	// If the console control socket file is provided, then create a new
	// pty master/replica pair and set the TTY on the sandbox process.
	if args.Spec.Process.Terminal && args.ConsoleSocket != "" {
		// console.NewWithSocket will send the master on the given
		// socket, and return the replica.
		tty, err := console.NewWithSocket(args.ConsoleSocket)
		if err != nil {
			return fmt.Errorf("setting up console with socket %q: %v", args.ConsoleSocket, err)
		}
		defer tty.Close()

		// Set the TTY as a controlling TTY on the sandbox process.
		cmd.SysProcAttr.Setctty = true
		// The Ctty FD must be the FD in the child process's FD table,
		// which will be nextFD in this case.
		// See https://github.com/golang/go/issues/29458.
		cmd.SysProcAttr.Ctty = nextFD

		// Pass the tty as all stdio fds to the sandbox.
		for i := 0; i < 3; i++ {
			cmd.ExtraFiles = append(cmd.ExtraFiles, tty)
			cmd.Args = append(cmd.Args, "--stdio-fds="+strconv.Itoa(nextFD))
			nextFD++
		}

		if conf.Debug {
			// If debugging, send the boot process stdio to the
			// TTY, so that it is easier to find.
			cmd.Stdin = tty
			cmd.Stdout = tty
			cmd.Stderr = tty
		}
	} else {
		// If not using a console, pass our current stdio as the
		// container stdio via flags.
		for _, f := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
			cmd.ExtraFiles = append(cmd.ExtraFiles, f)
			cmd.Args = append(cmd.Args, "--stdio-fds="+strconv.Itoa(nextFD))
			nextFD++
		}

		if conf.Debug {
			// If debugging, send the boot process stdio to this
			// process' stdio, so that it is easier to find.
			cmd.Stdin = os.Stdin
			cmd.Stdout = os.Stdout
			cmd.Stderr = os.Stderr
		}
	}

	// Detach from this session, otherwise cmd will get SIGHUP and SIGCONT
	// when re-parented.
	cmd.SysProcAttr.Setsid = true

	// nss is the set of namespaces to join or create before starting the sandbox
	// process. Mount, IPC and UTS namespaces from the host are not used as they
	// are virtualized inside the sandbox. Be paranoid and run inside an empty
	// namespace for these. Don't unshare the cgroup namespace because the
	// sandbox is added to a cgroup in the caller's namespace.
	log.Infof("Sandbox will be started in new mount, IPC and UTS namespaces")
	nss := []specs.LinuxNamespace{
		{Type: specs.IPCNamespace},
		{Type: specs.MountNamespace},
		{Type: specs.UTSNamespace},
	}

	if gPlatform.Requirements().RequiresCurrentPIDNS {
		// TODO(b/75837838): Also set a new PID namespace so that we limit
		// access to other host processes.
		log.Infof("Sandbox will be started in the current PID namespace")
	} else {
		log.Infof("Sandbox will be started in a new PID namespace")
		nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace})
		cmd.Args = append(cmd.Args, "--pidns=true")
	}

	// Join the network namespace if networking is enabled. The sandbox talks
	// directly to the host network, which may have been configured in the
	// namespace.
	if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != config.NetworkNone {
		log.Infof("Sandbox will be started in the container's network namespace: %+v", ns)
		nss = append(nss, ns)
	} else if conf.Network == config.NetworkHost {
		log.Infof("Sandbox will be started in the host network namespace")
	} else {
		log.Infof("Sandbox will be started in new network namespace")
		nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace})
	}
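
	// In an OCI spec, a container-provided network namespace appears as a
	// namespaces entry with a path, e.g. (illustrative JSON, path is
	// hypothetical):
	//
	//	"namespaces": [
	//		{ "type": "network", "path": "/var/run/netns/sandbox-ns" }
	//	]
	//
	// GetNS surfaces this as a specs.LinuxNamespace with Path set; joining it
	// is what "started in the container's network namespace" means above.
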
	// The user namespace depends on the network type. Host network requires
	// running inside the user namespace specified in the spec, or the current
	// namespace if none is configured.
	if conf.Network == config.NetworkHost {
		if userns, ok := specutils.GetNS(specs.UserNamespace, args.Spec); ok {
			log.Infof("Sandbox will be started in container's user namespace: %+v", userns)
			nss = append(nss, userns)
			specutils.SetUIDGIDMappings(cmd, args.Spec)
		} else {
			log.Infof("Sandbox will be started in the current user namespace")
		}
		// When running in the caller's defined user namespace, apply the same
		// capabilities to the sandbox process to ensure it abides by the same
		// rules.
		cmd.Args = append(cmd.Args, "--apply-caps=true")

		// If we have CAP_SYS_ADMIN, we can create an empty chroot and
		// bind-mount the executable inside it.
		if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
			log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!")
		} else if specutils.HasCapabilities(capability.CAP_SYS_ADMIN) {
			log.Infof("Sandbox will be started in minimal chroot")
			cmd.Args = append(cmd.Args, "--setup-root")
		} else {
			return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN")
		}
	} else {
		// If we have CAP_SETUID and CAP_SETGID, then we can also run
		// as user nobody.
		if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
			log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid())
			log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!")
		} else if specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) {
			log.Infof("Sandbox will be started in new user namespace")
			nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace})
			cmd.Args = append(cmd.Args, "--setup-root")

			const nobody = 65534
			if conf.Rootless {
				log.Infof("Rootless mode: sandbox will run as nobody inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid())
				cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
					{
						ContainerID: nobody,
						HostID:      os.Getuid(),
						Size:        1,
					},
				}
				cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
					{
						ContainerID: nobody,
						HostID:      os.Getgid(),
						Size:        1,
					},
				}
			} else {
				// Map nobody in the new namespace to nobody in the parent
				// namespace.
				//
				// The sandbox process will construct an empty root for itself,
				// so it has to have the CAP_SYS_ADMIN and CAP_SYS_CHROOT
				// capabilities.
				cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
					{
						ContainerID: nobody,
						HostID:      nobody,
						Size:        1,
					},
				}
				cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
					{
						ContainerID: nobody,
						HostID:      nobody,
						Size:        1,
					},
				}
			}

			// Set credentials to run as user and group nobody.
			cmd.SysProcAttr.Credential = &syscall.Credential{Uid: nobody, Gid: nobody}
			cmd.SysProcAttr.AmbientCaps = append(cmd.SysProcAttr.AmbientCaps, uintptr(capability.CAP_SYS_ADMIN), uintptr(capability.CAP_SYS_CHROOT))
		} else {
			return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID")
		}
	}

	cmd.Args[0] = "runsc-sandbox"

	if s.Cgroup != nil {
		cpuNum, err := s.Cgroup.NumCPU()
		if err != nil {
			return fmt.Errorf("getting cpu count from cgroups: %v", err)
		}
		if conf.CPUNumFromQuota {
			// Dropping below 2 CPUs can trigger applications to disable
			// locks, which can lead to hard-to-debug errors, so leave two
			// cores as a reasonable default.
			const minCPUs = 2

			quota, err := s.Cgroup.CPUQuota()
			if err != nil {
				return fmt.Errorf("getting cpu quota from cgroups: %v", err)
			}
			if n := int(math.Ceil(quota)); n > 0 {
				if n < minCPUs {
					n = minCPUs
				}
				if n < cpuNum {
					// Only lower the cpu number.
					cpuNum = n
				}
			}
		}
		cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum))

		mem, err := s.Cgroup.MemoryLimit()
		if err != nil {
			return fmt.Errorf("getting memory limit from cgroups: %v", err)
		}
		// When the memory limit is unset, a "large" number is returned. In
		// that case, just stick with the default.
		if mem < 0x7ffffffffffff000 {
			cmd.Args = append(cmd.Args, "--total-memory", strconv.FormatUint(mem, 10))
		}
	}
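
	// To make the quota math above concrete (values are illustrative examples
	// only): with cpu.cfs_quota_us=150000 and cpu.cfs_period_us=100000,
	// CPUQuota() yields 1.5, math.Ceil gives 2, and the minCPUs clamp keeps it
	// at 2. A quota of 3.2 CPUs in a cgroup with 8 available cores would yield
	// --cpu-num=4, since the quota can only lower the CPU count, never raise
	// it.
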
	if args.UserLog != "" {
		f, err := os.OpenFile(args.UserLog, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0664)
		if err != nil {
			return fmt.Errorf("opening compat log file: %v", err)
		}
		defer f.Close()

		cmd.ExtraFiles = append(cmd.ExtraFiles, f)
		cmd.Args = append(cmd.Args, "--user-log-fd", strconv.Itoa(nextFD))
		nextFD++
	}

	_ = nextFD // All FD assignment is finished.

	if args.Attached {
		// Kill the sandbox if the parent process exits in attached mode.
		cmd.SysProcAttr.Pdeathsig = unix.SIGKILL
		// Tells boot that any process it creates must have pdeathsig set.
		cmd.Args = append(cmd.Args, "--attached")
	}

	// Add the container ID as the last argument.
	cmd.Args = append(cmd.Args, s.ID)

	// Log the FDs we are donating to the sandbox process.
	for i, f := range cmd.ExtraFiles {
		log.Debugf("Donating FD %d: %q", i+3, f.Name())
	}

	log.Debugf("Starting sandbox: %s %v", binPath, cmd.Args)
	log.Debugf("SysProcAttr: %+v", cmd.SysProcAttr)
	if err := specutils.StartInNS(cmd, nss); err != nil {
		err := fmt.Errorf("starting sandbox: %v", err)
		// If the sandbox failed to start, it may be because the binary
		// permissions were incorrect. Check the bits and return a more helpful
		// error message.
		//
		// NOTE: The error message is checked because error types are lost over
		// rpc calls.
		if strings.Contains(err.Error(), unix.EACCES.Error()) {
			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
				return fmt.Errorf("%v: %v", err, permsErr)
			}
		}
		return err
	}
	s.OriginalOOMScoreAdj, err = specutils.GetOOMScoreAdj(cmd.Process.Pid)
	if err != nil {
		return err
	}

	s.child = true
	s.Pid = cmd.Process.Pid
	log.Infof("Sandbox started, PID: %d", s.Pid)

	return nil
}

// Wait waits for the containerized process to exit, and returns its WaitStatus.
func (s *Sandbox) Wait(cid string) (unix.WaitStatus, error) {
	log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID)

	if conn, err := s.sandboxConnect(); err != nil {
		// The sandbox may have exited before we had a chance to wait on it.
		// There is nothing we can do for subcontainers. For the init
		// container, we can try to get the sandbox exit code.
		if !s.IsRootContainer(cid) {
			return unix.WaitStatus(0), err
		}
		log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
	} else {
		// Try the Wait RPC to the sandbox. Close the connection as soon as the
		// call completes, so it isn't held open while waiting on the sandbox
		// process below.
		var ws unix.WaitStatus
		err = conn.Call(boot.ContainerWait, &cid, &ws)
		conn.Close()
		if err == nil {
			if s.IsRootContainer(cid) {
				if err := s.waitForStopped(); err != nil {
					return unix.WaitStatus(0), err
				}
			}
			// It worked!
			return ws, nil
		}
		// See comment above.
		if !s.IsRootContainer(cid) {
			return unix.WaitStatus(0), err
		}

		// The sandbox may have exited after we connected, but before
		// or during the Wait RPC.
		log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
	}

	// The sandbox may have already exited, or exited while handling the Wait
	// RPC. The best we can do is ask Linux what the sandbox exit status was,
	// since in most cases that will be the same as the container exit status.
	if err := s.waitForStopped(); err != nil {
		return unix.WaitStatus(0), err
	}
	if !s.child {
		return unix.WaitStatus(0), fmt.Errorf("sandbox no longer running and its exit status is unavailable")
	}

	s.statusMu.Lock()
	defer s.statusMu.Unlock()
	return s.status, nil
}
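
// Wait returns a raw unix.WaitStatus, which callers decode with the standard
// wait(2) accessors. A caller-side sketch (illustrative only):
//
//	ws, err := sb.Wait(cid)
//	if err != nil { ... }
//	switch {
//	case ws.Exited():
//		os.Exit(ws.ExitStatus())
//	case ws.Signaled():
//		// Mirror the shell convention of 128 + signal number.
//		os.Exit(128 + int(ws.Signal()))
//	}
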
// WaitPID waits for process 'pid' in the container's sandbox and returns its
// WaitStatus.
func (s *Sandbox) WaitPID(cid string, pid int32) (unix.WaitStatus, error) {
	log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID)
	var ws unix.WaitStatus
	conn, err := s.sandboxConnect()
	if err != nil {
		return ws, err
	}
	defer conn.Close()

	args := &boot.WaitPIDArgs{
		PID: pid,
		CID: cid,
	}
	if err := conn.Call(boot.ContainerWaitPID, args, &ws); err != nil {
		return ws, fmt.Errorf("waiting on PID %d in sandbox %q: %v", pid, s.ID, err)
	}
	return ws, nil
}

// IsRootContainer returns true if the specified container ID belongs to the
// root container.
func (s *Sandbox) IsRootContainer(cid string) bool {
	return s.ID == cid
}

// destroy frees all resources associated with the sandbox. It fails fast and
// is idempotent.
func (s *Sandbox) destroy() error {
	log.Debugf("Destroy sandbox %q", s.ID)
	if s.Pid != 0 {
		log.Debugf("Killing sandbox %q", s.ID)
		if err := unix.Kill(s.Pid, unix.SIGKILL); err != nil && err != unix.ESRCH {
			return fmt.Errorf("killing sandbox %q PID %d: %v", s.ID, s.Pid, err)
		}
		if err := s.waitForStopped(); err != nil {
			return fmt.Errorf("waiting for sandbox %q to stop: %v", s.ID, err)
		}
	}

	return nil
}

// SignalContainer sends the signal to a container in the sandbox. If all is
// true and the signal is SIGKILL, it waits for all processes to exit before
// returning.
func (s *Sandbox) SignalContainer(cid string, sig unix.Signal, all bool) error {
	log.Debugf("Signal sandbox %q", s.ID)
	conn, err := s.sandboxConnect()
	if err != nil {
		return err
	}
	defer conn.Close()

	mode := boot.DeliverToProcess
	if all {
		mode = boot.DeliverToAllProcesses
	}

	args := boot.SignalArgs{
		CID:   cid,
		Signo: int32(sig),
		Mode:  mode,
	}
	if err := conn.Call(boot.ContainerSignal, &args, nil); err != nil {
		return fmt.Errorf("signaling container %q: %v", cid, err)
	}
	return nil
}

// SignalProcess sends the signal to a particular process in the container. If
// fgProcess is true, then the signal is sent to the foreground process group
// in the same session that PID belongs to. This is only valid if the process
// is attached to a host TTY.
func (s *Sandbox) SignalProcess(cid string, pid int32, sig unix.Signal, fgProcess bool) error {
	log.Debugf("Signal sandbox %q", s.ID)
	conn, err := s.sandboxConnect()
	if err != nil {
		return err
	}
	defer conn.Close()

	mode := boot.DeliverToProcess
	if fgProcess {
		mode = boot.DeliverToForegroundProcessGroup
	}

	args := boot.SignalArgs{
		CID:   cid,
		Signo: int32(sig),
		PID:   pid,
		Mode:  mode,
	}
	if err := conn.Call(boot.ContainerSignal, &args, nil); err != nil {
		return fmt.Errorf("signaling container %q PID %d: %v", cid, pid, err)
	}
	return nil
}
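
// The delivery modes map onto common "runsc kill" use cases. A caller-side
// sketch (illustrative only):
//
//	// Graceful stop: SIGTERM to the container's init process.
//	_ = sb.SignalContainer(cid, unix.SIGTERM, false /* all */)
//
//	// Force stop: SIGKILL to every process in the container.
//	_ = sb.SignalContainer(cid, unix.SIGKILL, true /* all */)
//
//	// Ctrl-C semantics for an exec'd process on a TTY: signal its
//	// foreground process group.
//	_ = sb.SignalProcess(cid, pid, unix.SIGINT, true /* fgProcess */)
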
// Checkpoint sends the checkpoint call for a container in the sandbox.
// The statefile will be written to f.
func (s *Sandbox) Checkpoint(cid string, f *os.File) error {
	log.Debugf("Checkpoint sandbox %q", s.ID)
	conn, err := s.sandboxConnect()
	if err != nil {
		return err
	}
	defer conn.Close()

	opt := control.SaveOpts{
		FilePayload: urpc.FilePayload{
			Files: []*os.File{f},
		},
	}

	if err := conn.Call(boot.ContainerCheckpoint, &opt, nil); err != nil {
		return fmt.Errorf("checkpointing container %q: %v", cid, err)
	}
	return nil
}

// Pause sends the pause call for a container in the sandbox.
func (s *Sandbox) Pause(cid string) error {
	log.Debugf("Pause sandbox %q", s.ID)
	conn, err := s.sandboxConnect()
	if err != nil {
		return err
	}
	defer conn.Close()

	if err := conn.Call(boot.ContainerPause, nil, nil); err != nil {
		return fmt.Errorf("pausing container %q: %v", cid, err)
	}
	return nil
}

// Resume sends the resume call for a container in the sandbox.
func (s *Sandbox) Resume(cid string) error {
	log.Debugf("Resume sandbox %q", s.ID)
	conn, err := s.sandboxConnect()
	if err != nil {
		return err
	}
	defer conn.Close()

	if err := conn.Call(boot.ContainerResume, nil, nil); err != nil {
		return fmt.Errorf("resuming container %q: %v", cid, err)
	}
	return nil
}

// IsRunning returns true if the sandbox or gofer process is running.
func (s *Sandbox) IsRunning() bool {
	if s.Pid != 0 {
		// Send signal 0 to the sandbox process.
		if err := unix.Kill(s.Pid, 0); err == nil {
			// Succeeded, the process is running.
			return true
		}
	}
	return false
}

// Stacks collects and returns all stacks for the sandbox.
func (s *Sandbox) Stacks() (string, error) {
	log.Debugf("Stacks sandbox %q", s.ID)
	conn, err := s.sandboxConnect()
	if err != nil {
		return "", err
	}
	defer conn.Close()

	var stacks string
	if err := conn.Call(boot.SandboxStacks, nil, &stacks); err != nil {
		return "", fmt.Errorf("getting sandbox %q stacks: %v", s.ID, err)
	}
	return stacks, nil
}

// HeapProfile writes a heap profile to the given file.
func (s *Sandbox) HeapProfile(f *os.File, delay time.Duration) error {
	log.Debugf("Heap profile %q", s.ID)
	conn, err := s.sandboxConnect()
	if err != nil {
		return err
	}
	defer conn.Close()

	opts := control.HeapProfileOpts{
		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
		Delay:       delay,
	}
	return conn.Call(boot.HeapProfile, &opts, nil)
}

// CPUProfile collects a CPU profile.
func (s *Sandbox) CPUProfile(f *os.File, duration time.Duration) error {
	log.Debugf("CPU profile %q", s.ID)
	conn, err := s.sandboxConnect()
	if err != nil {
		return err
	}
	defer conn.Close()

	opts := control.CPUProfileOpts{
		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
		Duration:    duration,
	}
	return conn.Call(boot.CPUProfile, &opts, nil)
}
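
// All of the profile collectors follow the same shape: the caller opens a
// destination file and donates it via the FilePayload, and the sentry writes
// pprof output to it. A hedged usage sketch (illustrative only):
//
//	f, err := os.Create("/tmp/cpu.pprof")
//	if err != nil { ... }
//	defer f.Close()
//	if err := sb.CPUProfile(f, 30*time.Second); err != nil { ... }
//	// Inspect the result with: go tool pprof /tmp/cpu.pprof
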
// BlockProfile writes a block profile to the given file.
func (s *Sandbox) BlockProfile(f *os.File, duration time.Duration) error {
	log.Debugf("Block profile %q", s.ID)
	conn, err := s.sandboxConnect()
	if err != nil {
		return err
	}
	defer conn.Close()

	opts := control.BlockProfileOpts{
		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
		Duration:    duration,
	}
	return conn.Call(boot.BlockProfile, &opts, nil)
}

// MutexProfile writes a mutex profile to the given file.
func (s *Sandbox) MutexProfile(f *os.File, duration time.Duration) error {
	log.Debugf("Mutex profile %q", s.ID)
	conn, err := s.sandboxConnect()
	if err != nil {
		return err
	}
	defer conn.Close()

	opts := control.MutexProfileOpts{
		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
		Duration:    duration,
	}
	return conn.Call(boot.MutexProfile, &opts, nil)
}

// Trace collects an execution trace.
func (s *Sandbox) Trace(f *os.File, duration time.Duration) error {
	log.Debugf("Trace %q", s.ID)
	conn, err := s.sandboxConnect()
	if err != nil {
		return err
	}
	defer conn.Close()

	opts := control.TraceProfileOpts{
		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
		Duration:    duration,
	}
	return conn.Call(boot.Trace, &opts, nil)
}

// ChangeLogging changes logging options.
func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error {
	log.Debugf("Change logging start %q", s.ID)
	conn, err := s.sandboxConnect()
	if err != nil {
		return err
	}
	defer conn.Close()

	if err := conn.Call(boot.ChangeLogging, &args, nil); err != nil {
		return fmt.Errorf("changing sandbox %q logging: %v", s.ID, err)
	}
	return nil
}

// DestroyContainer destroys the given container. If it is the root container,
// then the entire sandbox is destroyed.
func (s *Sandbox) DestroyContainer(cid string) error {
	if err := s.destroyContainer(cid); err != nil {
		// If the sandbox isn't running, the container has already been
		// destroyed, so ignore the error in this case.
		if s.IsRunning() {
			return err
		}
	}
	return nil
}

func (s *Sandbox) destroyContainer(cid string) error {
	if s.IsRootContainer(cid) {
		log.Debugf("Destroying root container by destroying sandbox, cid: %s", cid)
		return s.destroy()
	}

	log.Debugf("Destroying container, cid: %s, sandbox: %s", cid, s.ID)
	conn, err := s.sandboxConnect()
	if err != nil {
		return err
	}
	defer conn.Close()
	if err := conn.Call(boot.ContainerDestroy, &cid, nil); err != nil {
		return fmt.Errorf("destroying container %q: %v", cid, err)
	}
	return nil
}

func (s *Sandbox) waitForStopped() error {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
	op := func() error {
		if s.child {
			s.statusMu.Lock()
			defer s.statusMu.Unlock()
			if s.Pid == 0 {
				return nil
			}
			// The sandbox process is a child of the current process,
			// so we can wait on it and collect its zombie.
			wpid, err := unix.Wait4(int(s.Pid), &s.status, unix.WNOHANG, nil)
			if err != nil {
				return fmt.Errorf("error waiting on the sandbox process: %v", err)
			}
			if wpid == 0 {
				return fmt.Errorf("sandbox is still running")
			}
			s.Pid = 0
		} else if s.IsRunning() {
			return fmt.Errorf("sandbox is still running")
		}
		return nil
	}
	return backoff.Retry(op, b)
}

// deviceFileForPlatform opens the device file for the given platform. If the
// platform does not need a device file, then nil is returned.
func deviceFileForPlatform(name string) (*os.File, error) {
	p, err := platform.Lookup(name)
	if err != nil {
		return nil, err
	}

	f, err := p.OpenDevice()
	if err != nil {
		return nil, fmt.Errorf("opening device file for platform %q: %w", name, err)
	}
	return f, nil
}

// checkBinaryPermissions verifies that the required binary bits are set on
// the runsc executable.
func checkBinaryPermissions(conf *config.Config) error {
	// All platforms need the others-execute bit.
	neededBits := os.FileMode(0001)
	if conf.Platform == platforms.Ptrace {
		// Ptrace also needs the others-read bit.
		neededBits |= os.FileMode(0004)
	}

	exePath, err := os.Executable()
	if err != nil {
		return fmt.Errorf("getting exe path: %v", err)
	}

	// Check the permissions of the runsc binary and return an error if they
	// don't match expectations.
	info, err := os.Stat(exePath)
	if err != nil {
		return fmt.Errorf("stat file: %v", err)
	}

	if info.Mode().Perm()&neededBits != neededBits {
		return fmt.Errorf(specutils.FaqErrorMsg("runsc-perms", fmt.Sprintf("%s does not have the correct permissions", exePath)))
	}
	return nil
}
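
// For the permission check above: 0001 is the others-execute bit and 0004 the
// others-read bit, so a runsc binary that must work for non-root users under
// the ptrace platform needs at least o+rx. An illustrative shell command
// (binary path is an example; see the "runsc-perms" FAQ entry referenced
// above):
//
//	chmod o+rx /usr/local/bin/runsc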