github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/runsc/sandbox/sandbox.go 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package sandbox creates and manipulates sandboxes. 16 package sandbox 17 18 import ( 19 "context" 20 "encoding/json" 21 "errors" 22 "fmt" 23 "io" 24 "math" 25 "os" 26 "os/exec" 27 "path/filepath" 28 "strconv" 29 "strings" 30 "syscall" 31 "time" 32 33 "github.com/cenkalti/backoff" 34 specs "github.com/opencontainers/runtime-spec/specs-go" 35 "github.com/syndtr/gocapability/capability" 36 "golang.org/x/sys/unix" 37 "github.com/metacubex/gvisor/pkg/abi/linux" 38 "github.com/metacubex/gvisor/pkg/atomicbitops" 39 "github.com/metacubex/gvisor/pkg/cleanup" 40 "github.com/metacubex/gvisor/pkg/control/client" 41 "github.com/metacubex/gvisor/pkg/control/server" 42 "github.com/metacubex/gvisor/pkg/coverage" 43 "github.com/metacubex/gvisor/pkg/log" 44 metricpb "github.com/metacubex/gvisor/pkg/metric/metric_go_proto" 45 "github.com/metacubex/gvisor/pkg/prometheus" 46 "github.com/metacubex/gvisor/pkg/sentry/control" 47 "github.com/metacubex/gvisor/pkg/sentry/devices/nvproxy" 48 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/erofs" 49 "github.com/metacubex/gvisor/pkg/sentry/platform" 50 "github.com/metacubex/gvisor/pkg/sentry/seccheck" 51 "github.com/metacubex/gvisor/pkg/state/statefile" 52 "github.com/metacubex/gvisor/pkg/sync" 53 "github.com/metacubex/gvisor/pkg/urpc" 54 "github.com/metacubex/gvisor/runsc/boot" 55 "github.com/metacubex/gvisor/runsc/boot/procfs" 56 "github.com/metacubex/gvisor/runsc/cgroup" 57 "github.com/metacubex/gvisor/runsc/config" 58 "github.com/metacubex/gvisor/runsc/console" 59 "github.com/metacubex/gvisor/runsc/donation" 60 "github.com/metacubex/gvisor/runsc/specutils" 61 ) 62 63 const ( 64 // podNameAnnotation is a pod annotation populated by containerd. 65 // It contains the name of the pod that a sandbox is in when running in Kubernetes. 66 podNameAnnotation = "io.kubernetes.cri.sandbox-name" 67 68 // namespaceAnnotation is a pod annotation populated by containerd. 69 // It contains the namespace of the pod that a sandbox is in when running in Kubernetes. 70 namespaceAnnotation = "io.kubernetes.cri.sandbox-namespace" 71 ) 72 73 // createControlSocket finds a location and creates the socket used to 74 // communicate with the sandbox. The socket is a UDS on the host filesystem. 75 // 76 // Note that abstract sockets are *not* used, because any user can connect to 77 // them. There is no file mode protecting abstract sockets. 78 func createControlSocket(rootDir, id string) (string, int, error) { 79 name := fmt.Sprintf("runsc-%s.sock", id) 80 81 // Only use absolute paths to guarantee resolution from anywhere.
82 for _, dir := range []string{rootDir, "/var/run", "/run", "/tmp"} { 83 path := filepath.Join(dir, name) 84 log.Debugf("Attempting to create socket file %q", path) 85 fd, err := server.CreateSocket(path) 86 if err == nil { 87 log.Debugf("Using socket file %q", path) 88 return path, fd, nil 89 } 90 log.Debugf("Failed to create socket file %q: %v", path, err) 91 } 92 return "", -1, fmt.Errorf("unable to find location to write socket file") 93 } 94 95 // pid is an atomic type that implements JSON marshal/unmarshal interfaces. 96 type pid struct { 97 val atomicbitops.Int64 98 } 99 100 func (p *pid) store(pid int) { 101 p.val.Store(int64(pid)) 102 } 103 104 func (p *pid) load() int { 105 return int(p.val.Load()) 106 } 107 108 // UnmarshalJSON implements json.Unmarshaler.UnmarshalJSON. 109 func (p *pid) UnmarshalJSON(b []byte) error { 110 var pid int 111 112 if err := json.Unmarshal(b, &pid); err != nil { 113 return err 114 } 115 p.store(pid) 116 return nil 117 } 118 119 // MarshalJSON implements json.Marshaler.MarshalJSON 120 func (p *pid) MarshalJSON() ([]byte, error) { 121 return json.Marshal(p.load()) 122 } 123 124 // Sandbox wraps a sandbox process. 125 // 126 // It is used to start/stop sandbox process (and associated processes like 127 // gofers), as well as for running and manipulating containers inside a running 128 // sandbox. 129 // 130 // Note: Sandbox must be immutable because a copy of it is saved for each 131 // container and changes would not be synchronized to all of them. 132 type Sandbox struct { 133 // ID is the id of the sandbox (immutable). By convention, this is the same 134 // ID as the first container run in the sandbox. 135 ID string `json:"id"` 136 137 // PodName is the name of the Kubernetes Pod (if any) that this sandbox 138 // represents. Unset if not running under containerd or Kubernetes. 139 PodName string `json:"podName"` 140 141 // Namespace is the Kubernetes namespace (if any) of the pod that this 142 // sandbox represents. Unset if not running under containerd or Kubernetes. 143 Namespace string `json:"namespace"` 144 145 // Pid is the pid of the running sandbox. May be 0 if the sandbox 146 // is not running. 147 Pid pid `json:"pid"` 148 149 // UID is the user ID in the parent namespace that the sandbox is running as. 150 UID int `json:"uid"` 151 // GID is the group ID in the parent namespace that the sandbox is running as. 152 GID int `json:"gid"` 153 154 // CgroupJSON contains the cgroup configuration that the sandbox is part of 155 // and allow serialization of the configuration into json 156 CgroupJSON cgroup.CgroupJSON `json:"cgroup"` 157 158 // OriginalOOMScoreAdj stores the value of oom_score_adj when the sandbox 159 // started, before it may be modified. 160 OriginalOOMScoreAdj int `json:"originalOomScoreAdj"` 161 162 // RegisteredMetrics is the set of metrics registered in the sandbox. 163 // Used for verifying metric data integrity after containers are started. 164 // Only populated if exporting metrics was requested when the sandbox was 165 // created. 166 RegisteredMetrics *metricpb.MetricRegistration `json:"registeredMetrics"` 167 168 // MetricMetadata are key-value pairs that are useful to export about this 169 // sandbox, but not part of the set of labels that uniquely identify it. 170 // They are static once initialized, and typically contain high-level 171 // configuration information about the sandbox. 
172 MetricMetadata map[string]string `json:"metricMetadata"` 173 174 // MetricServerAddress is the address of the metric server that this sandbox 175 // intends to export metrics for. 176 // Only populated if exporting metrics was requested when the sandbox was 177 // created. 178 MetricServerAddress string `json:"metricServerAddress"` 179 180 // ControlSocketPath is the path to the sandbox's uRPC server socket. 181 // Connections to the sandbox are made through this. 182 ControlSocketPath string `json:"controlSocketPath"` 183 184 // MountHints provides extra information about container mounts that apply 185 // to the entire pod. 186 MountHints *boot.PodMountHints `json:"mountHints"` 187 188 // child is set if a sandbox process is a child of the current process. 189 // 190 // This field isn't saved to json, because only the creator of the sandbox 191 // will have it as a child process. 192 child bool `nojson:"true"` 193 194 // statusMu protects status. 195 statusMu sync.Mutex `nojson:"true"` 196 197 // status is the exit status of a sandbox process. It's only set if 198 // child==true and the sandbox was waited on. This field allows for multiple 199 // threads to wait on the sandbox and get the exit code, since Linux will return 200 // WaitStatus to one of the waiters only. 201 status unix.WaitStatus `nojson:"true"` 202 } 203 204 // Getpid returns the process ID of the sandbox process. 205 func (s *Sandbox) Getpid() int { 206 return s.Pid.load() 207 } 208 209 // Args is used to configure a new sandbox. 210 type Args struct { 211 // ID is the sandbox unique identifier. 212 ID string 213 214 // Spec is the OCI spec that describes the container. 215 Spec *specs.Spec 216 217 // BundleDir is the directory containing the container bundle. 218 BundleDir string 219 220 // ConsoleSocket is the path to a unix domain socket that will receive 221 // the console FD. It may be empty. 222 ConsoleSocket string 223 224 // UserLog is the filename to send user-visible logs to. It may be empty. 225 UserLog string 226 227 // IOFiles is the list of image files and/or socket files that connect to 228 // a gofer endpoint for the mount points using Gofers. They must be in the 229 // same order as mounts appear in the spec. 230 IOFiles []*os.File 231 232 // File that connects to a gofer endpoint for a device mount point at /dev. 233 DevIOFile *os.File 234 235 // GoferFilestoreFiles are the regular files that will back the overlayfs or 236 // tmpfs mount if a gofer mount is to be overlaid. 237 GoferFilestoreFiles []*os.File 238 239 // GoferMountConfs contains information about how the gofer mounts have been 240 // configured. The first entry is for rootfs and the following entries are 241 // for bind mounts in Spec.Mounts (in the same order). 242 GoferMountConfs boot.GoferMountConfFlags 243 244 // MountHints provides extra information about container mounts that apply 245 // to the entire pod. 246 MountHints *boot.PodMountHints 247 248 // MountsFile is a file containing mount information from the spec. It's 249 // equivalent to the mounts from the spec, except that all paths have been 250 // resolved to their final absolute location. 251 MountsFile *os.File 252 253 // Cgroup is the cgroup that the sandbox is part of. 254 Cgroup cgroup.Cgroup 255 256 // Attached indicates that the sandbox lifecycle is attached to the caller. 257 // If the caller exits, the sandbox should exit too.
258 Attached bool 259 260 // SinkFiles is the an ordered array of files to be used by seccheck sinks 261 // configured from the --pod-init-config file. 262 SinkFiles []*os.File 263 264 // PassFiles are user-supplied files from the host to be exposed to the 265 // sandboxed app. 266 PassFiles map[int]*os.File 267 268 // ExecFile is the file from the host used for program execution. 269 ExecFile *os.File 270 } 271 272 // New creates the sandbox process. The caller must call Destroy() on the 273 // sandbox. 274 func New(conf *config.Config, args *Args) (*Sandbox, error) { 275 s := &Sandbox{ 276 ID: args.ID, 277 CgroupJSON: cgroup.CgroupJSON{ 278 Cgroup: args.Cgroup, 279 }, 280 UID: -1, // prevent usage before it's set. 281 GID: -1, // prevent usage before it's set. 282 MetricMetadata: conf.MetricMetadata(), 283 MetricServerAddress: conf.MetricServer, 284 MountHints: args.MountHints, 285 } 286 if args.Spec != nil && args.Spec.Annotations != nil { 287 s.PodName = args.Spec.Annotations[podNameAnnotation] 288 s.Namespace = args.Spec.Annotations[namespaceAnnotation] 289 } 290 291 // The Cleanup object cleans up partially created sandboxes when an error 292 // occurs. Any errors occurring during cleanup itself are ignored. 293 c := cleanup.Make(func() { 294 if err := s.destroy(); err != nil { 295 log.Warningf("error destroying sandbox: %v", err) 296 } 297 }) 298 defer c.Clean() 299 300 if len(conf.PodInitConfig) > 0 { 301 initConf, err := boot.LoadInitConfig(conf.PodInitConfig) 302 if err != nil { 303 return nil, fmt.Errorf("loading init config file: %w", err) 304 } 305 args.SinkFiles, err = initConf.Setup() 306 if err != nil { 307 return nil, fmt.Errorf("cannot init config: %w", err) 308 } 309 } 310 311 // Create pipe to synchronize when sandbox process has been booted. 312 clientSyncFile, sandboxSyncFile, err := os.Pipe() 313 if err != nil { 314 return nil, fmt.Errorf("creating pipe for sandbox %q: %v", s.ID, err) 315 } 316 defer clientSyncFile.Close() 317 318 // Create the sandbox process. 319 err = s.createSandboxProcess(conf, args, sandboxSyncFile) 320 // sandboxSyncFile has to be closed to be able to detect when the sandbox 321 // process exits unexpectedly. 322 sandboxSyncFile.Close() 323 if err != nil { 324 return nil, fmt.Errorf("cannot create sandbox process: %w", err) 325 } 326 327 // Wait until the sandbox has booted. 328 b := make([]byte, 1) 329 if l, err := clientSyncFile.Read(b); err != nil || l != 1 { 330 err := fmt.Errorf("waiting for sandbox to start: %v", err) 331 // If the sandbox failed to start, it may be because the binary 332 // permissions were incorrect. Check the bits and return a more helpful 333 // error message. 334 // 335 // NOTE: The error message is checked because error types are lost over 336 // rpc calls. 337 if strings.Contains(err.Error(), io.EOF.Error()) { 338 if permsErr := checkBinaryPermissions(conf); permsErr != nil { 339 return nil, fmt.Errorf("%v: %v", err, permsErr) 340 } 341 } 342 return nil, fmt.Errorf("cannot read client sync file: %w", err) 343 } 344 345 if conf.MetricServer != "" { 346 // The control server is up and the sandbox was configured to export metrics. 347 // We must gather data about registered metrics prior to any process starting in the sandbox. 
348 log.Debugf("Getting metric registration information from sandbox %q", s.ID) 349 var registeredMetrics control.MetricsRegistrationResponse 350 if err := s.call(boot.MetricsGetRegistered, nil, &registeredMetrics); err != nil { 351 return nil, fmt.Errorf("cannot get registered metrics: %v", err) 352 } 353 s.RegisteredMetrics = registeredMetrics.RegisteredMetrics 354 } 355 356 c.Release() 357 return s, nil 358 } 359 360 // CreateSubcontainer creates a container inside the sandbox. 361 func (s *Sandbox) CreateSubcontainer(conf *config.Config, cid string, tty *os.File) error { 362 log.Debugf("Create sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid.load()) 363 364 var files []*os.File 365 if tty != nil { 366 files = []*os.File{tty} 367 } 368 if err := s.configureStdios(conf, files); err != nil { 369 return err 370 } 371 372 args := boot.CreateArgs{ 373 CID: cid, 374 FilePayload: urpc.FilePayload{Files: files}, 375 } 376 if err := s.call(boot.ContMgrCreateSubcontainer, &args, nil); err != nil { 377 return fmt.Errorf("creating sub-container %q: %w", cid, err) 378 } 379 return nil 380 } 381 382 // StartRoot starts running the root container process inside the sandbox. 383 func (s *Sandbox) StartRoot(conf *config.Config) error { 384 pid := s.Pid.load() 385 log.Debugf("Start root sandbox %q, PID: %d", s.ID, pid) 386 conn, err := s.sandboxConnect() 387 if err != nil { 388 return err 389 } 390 defer conn.Close() 391 392 // Configure the network. 393 if err := setupNetwork(conn, pid, conf); err != nil { 394 return fmt.Errorf("setting up network: %w", err) 395 } 396 397 // Send a message to the sandbox control server to start the root container. 398 if err := conn.Call(boot.ContMgrRootContainerStart, &s.ID, nil); err != nil { 399 return fmt.Errorf("starting root container: %w", err) 400 } 401 402 return nil 403 } 404 405 // StartSubcontainer starts running a sub-container inside the sandbox. 406 func (s *Sandbox) StartSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles, goferFilestores []*os.File, devIOFile *os.File, goferConfs []boot.GoferMountConf) error { 407 log.Debugf("Start sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid.load()) 408 409 if err := s.configureStdios(conf, stdios); err != nil { 410 return err 411 } 412 s.fixPidns(spec) 413 414 // The payload contains (in this specific order): 415 // * stdin/stdout/stderr (optional: only present when not using TTY) 416 // * The subcontainer's gofer filestore files (optional) 417 // * The subcontainer's dev gofer file (optional) 418 // * Gofer files. 419 payload := urpc.FilePayload{} 420 payload.Files = append(payload.Files, stdios...) 421 payload.Files = append(payload.Files, goferFilestores...) 422 if devIOFile != nil { 423 payload.Files = append(payload.Files, devIOFile) 424 } 425 payload.Files = append(payload.Files, goferFiles...) 426 427 // Start running the container. 428 args := boot.StartArgs{ 429 Spec: spec, 430 Conf: conf, 431 CID: cid, 432 NumGoferFilestoreFDs: len(goferFilestores), 433 IsDevIoFilePresent: devIOFile != nil, 434 GoferMountConfs: goferConfs, 435 FilePayload: payload, 436 } 437 if err := s.call(boot.ContMgrStartSubcontainer, &args, nil); err != nil { 438 return fmt.Errorf("starting sub-container %v: %w", spec.Process.Args, err) 439 } 440 return nil 441 } 442 443 // Restore sends the restore call for a container in the sandbox.
444 func (s *Sandbox) Restore(conf *config.Config, cid string, filename string) error { 445 log.Debugf("Restore sandbox %q", s.ID) 446 447 rf, err := os.Open(filename) 448 if err != nil { 449 return fmt.Errorf("opening restore file %q failed: %v", filename, err) 450 } 451 defer rf.Close() 452 453 opt := boot.RestoreOpts{ 454 FilePayload: urpc.FilePayload{ 455 Files: []*os.File{rf}, 456 }, 457 SandboxID: s.ID, 458 } 459 460 // If the platform needs a device FD we must pass it in. 461 if deviceFile, err := deviceFileForPlatform(conf.Platform, conf.PlatformDevicePath); err != nil { 462 return err 463 } else if deviceFile != nil { 464 defer deviceFile.Close() 465 opt.FilePayload.Files = append(opt.FilePayload.Files, deviceFile) 466 } 467 468 conn, err := s.sandboxConnect() 469 if err != nil { 470 return err 471 } 472 defer conn.Close() 473 474 // Configure the network. 475 if err := setupNetwork(conn, s.Pid.load(), conf); err != nil { 476 return fmt.Errorf("setting up network: %v", err) 477 } 478 479 // Restore the container and start the root container. 480 if err := conn.Call(boot.ContMgrRestore, &opt, nil); err != nil { 481 return fmt.Errorf("restoring container %q: %v", cid, err) 482 } 483 484 return nil 485 } 486 487 // Processes retrieves the list of processes and associated metadata for a 488 // given container in this sandbox. 489 func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { 490 log.Debugf("Getting processes for container %q in sandbox %q", cid, s.ID) 491 var pl []*control.Process 492 if err := s.call(boot.ContMgrProcesses, &cid, &pl); err != nil { 493 return nil, fmt.Errorf("retrieving process data from sandbox: %v", err) 494 } 495 return pl, nil 496 } 497 498 // CreateTraceSession creates a new trace session. 499 func (s *Sandbox) CreateTraceSession(config *seccheck.SessionConfig, force bool) error { 500 log.Debugf("Creating trace session in sandbox %q", s.ID) 501 502 sinkFiles, err := seccheck.SetupSinks(config.Sinks) 503 if err != nil { 504 return err 505 } 506 defer func() { 507 for _, f := range sinkFiles { 508 _ = f.Close() 509 } 510 }() 511 512 arg := boot.CreateTraceSessionArgs{ 513 Config: *config, 514 Force: force, 515 FilePayload: urpc.FilePayload{ 516 Files: sinkFiles, 517 }, 518 } 519 if err := s.call(boot.ContMgrCreateTraceSession, &arg, nil); err != nil { 520 return fmt.Errorf("creating trace session: %w", err) 521 } 522 return nil 523 } 524 525 // DeleteTraceSession deletes an existing trace session. 526 func (s *Sandbox) DeleteTraceSession(name string) error { 527 log.Debugf("Deleting trace session %q in sandbox %q", name, s.ID) 528 if err := s.call(boot.ContMgrDeleteTraceSession, name, nil); err != nil { 529 return fmt.Errorf("deleting trace session: %w", err) 530 } 531 return nil 532 } 533 534 // ListTraceSessions lists all trace sessions. 535 func (s *Sandbox) ListTraceSessions() ([]seccheck.SessionConfig, error) { 536 log.Debugf("Listing trace sessions in sandbox %q", s.ID) 537 var sessions []seccheck.SessionConfig 538 if err := s.call(boot.ContMgrListTraceSessions, nil, &sessions); err != nil { 539 return nil, fmt.Errorf("listing trace session: %w", err) 540 } 541 return sessions, nil 542 } 543 544 // ProcfsDump collects and returns a procfs dump for the sandbox. 
545 func (s *Sandbox) ProcfsDump() ([]procfs.ProcessProcfsDump, error) { 546 log.Debugf("Procfs dump %q", s.ID) 547 var procfsDump []procfs.ProcessProcfsDump 548 if err := s.call(boot.ContMgrProcfsDump, nil, &procfsDump); err != nil { 549 return nil, fmt.Errorf("getting sandbox %q stacks: %w", s.ID, err) 550 } 551 return procfsDump, nil 552 } 553 554 // NewCGroup returns the sandbox's Cgroup, or an error if it does not have one. 555 func (s *Sandbox) NewCGroup() (cgroup.Cgroup, error) { 556 return cgroup.NewFromPid(s.Pid.load(), false /* useSystemd */) 557 } 558 559 // Execute runs the specified command in the container. It returns the PID of 560 // the newly created process. 561 func (s *Sandbox) Execute(conf *config.Config, args *control.ExecArgs) (int32, error) { 562 log.Debugf("Executing new process in container %q in sandbox %q", args.ContainerID, s.ID) 563 564 // Stdios are those files which have an FD <= 2 in the process. We do not 565 // want the ownership of other files to be changed by configureStdios. 566 var stdios []*os.File 567 for i, fd := range args.GuestFDs { 568 if fd > 2 || i >= len(args.Files) { 569 continue 570 } 571 stdios = append(stdios, args.Files[i]) 572 } 573 574 if err := s.configureStdios(conf, stdios); err != nil { 575 return 0, err 576 } 577 578 // Send a message to the sandbox control server to start the container. 579 var pid int32 580 if err := s.call(boot.ContMgrExecuteAsync, args, &pid); err != nil { 581 return 0, fmt.Errorf("executing command %q in sandbox: %w", args, err) 582 } 583 return pid, nil 584 } 585 586 // Event retrieves stats about the sandbox such as memory and CPU utilization. 587 func (s *Sandbox) Event(cid string) (*boot.EventOut, error) { 588 log.Debugf("Getting events for container %q in sandbox %q", cid, s.ID) 589 var e boot.EventOut 590 if err := s.call(boot.ContMgrEvent, &cid, &e); err != nil { 591 return nil, fmt.Errorf("retrieving event data from sandbox: %w", err) 592 } 593 return &e, nil 594 } 595 596 // PortForward starts port forwarding to the sandbox. 597 func (s *Sandbox) PortForward(opts *boot.PortForwardOpts) error { 598 log.Debugf("Requesting port forward for container %q in sandbox %q: %+v", opts.ContainerID, s.ID, opts) 599 conn, err := s.sandboxConnect() 600 if err != nil { 601 return err 602 } 603 defer conn.Close() 604 605 if err := conn.Call(boot.ContMgrPortForward, opts, nil); err != nil { 606 return fmt.Errorf("port forwarding to sandbox: %v", err) 607 } 608 609 return nil 610 } 611 612 func (s *Sandbox) sandboxConnect() (*urpc.Client, error) { 613 log.Debugf("Connecting to sandbox %q", s.ID) 614 path := s.ControlSocketPath 615 if len(path) >= linux.UnixPathMax { 616 // This is not an abstract socket path. It is a filesystem path. 617 // UDS connect fails when the len(socket path) >= UNIX_PATH_MAX. Instead 618 // open the socket using open(2) and use /proc to refer to the open FD. 
619 sockFD, err := unix.Open(path, unix.O_PATH, 0) 620 if err != nil { 621 return nil, fmt.Errorf("failed to open socket at %q", path) 622 } 623 defer unix.Close(sockFD) 624 path = filepath.Join("/proc/self/fd", fmt.Sprintf("%d", sockFD)) 625 } 626 conn, err := client.ConnectTo(path) 627 if err != nil { 628 return nil, s.connError(err) 629 } 630 return conn, nil 631 } 632 633 func (s *Sandbox) call(method string, arg, result any) error { 634 conn, err := s.sandboxConnect() 635 if err != nil { 636 return err 637 } 638 defer conn.Close() 639 640 return conn.Call(method, arg, result) 641 } 642 643 func (s *Sandbox) connError(err error) error { 644 return fmt.Errorf("connecting to control server at PID %d: %v", s.Pid.load(), err) 645 } 646 647 // createSandboxProcess starts the sandbox as a subprocess by running the "boot" 648 // command, passing in the bundle dir. 649 func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyncFile *os.File) error { 650 donations := donation.Agency{} 651 defer donations.Close() 652 653 // pgalloc.MemoryFile (which provides application memory) sometimes briefly 654 // mlock(2)s ranges of memory in order to fault in a large number of pages at 655 // a time. Try to make RLIMIT_MEMLOCK unlimited so that it can do so. runsc 656 // expects to run in a memory cgroup that limits its memory usage as 657 // required. 658 // This needs to be done before exec'ing `runsc boot`, as that subcommand 659 // runs as an unprivileged user that will not be able to call `setrlimit` 660 // by itself. Calling `setrlimit` here will have the side-effect of setting 661 // the limit on the currently-running `runsc` process as well, but that 662 // should be OK too. 663 var rlim unix.Rlimit 664 if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil { 665 log.Warningf("Failed to get RLIMIT_MEMLOCK: %v", err) 666 } else if rlim.Cur != unix.RLIM_INFINITY || rlim.Max != unix.RLIM_INFINITY { 667 rlim.Cur = unix.RLIM_INFINITY 668 rlim.Max = unix.RLIM_INFINITY 669 if err := unix.Setrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil { 670 // We may not have CAP_SYS_RESOURCE, so this failure may be expected. 671 log.Infof("Failed to set RLIMIT_MEMLOCK: %v", err) 672 } 673 } 674 675 // 676 // These flags must come BEFORE the "boot" command in cmd.Args. 677 // 678 679 // Open the log files to pass to the sandbox as FDs. 680 if err := donations.OpenAndDonate("log-fd", conf.LogFilename, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil { 681 return err 682 } 683 684 test := "" 685 if len(conf.TestOnlyTestNameEnv) != 0 { 686 // Fetch test name if one is provided and the test only flag was set. 
687 if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok { 688 test = t 689 } 690 } 691 if specutils.IsDebugCommand(conf, "boot") { 692 if err := donations.DonateDebugLogFile("debug-log-fd", conf.DebugLog, "boot", test); err != nil { 693 return err 694 } 695 } 696 if err := donations.DonateDebugLogFile("panic-log-fd", conf.PanicLog, "panic", test); err != nil { 697 return err 698 } 699 covFilename := conf.CoverageReport 700 if covFilename == "" { 701 covFilename = os.Getenv("GO_COVERAGE_FILE") 702 } 703 if covFilename != "" && coverage.Available() { 704 if err := donations.DonateDebugLogFile("coverage-fd", covFilename, "cov", test); err != nil { 705 return err 706 } 707 } 708 if err := donations.DonateDebugLogFile("profiling-metrics-fd", conf.ProfilingMetricsLog, "metrics", test); err != nil { 709 return err 710 } 711 712 // Relay all the config flags to the sandbox process. 713 cmd := exec.Command(specutils.ExePath, conf.ToFlags()...) 714 cmd.SysProcAttr = &unix.SysProcAttr{ 715 // Detach from this session, otherwise cmd will get SIGHUP and SIGCONT 716 // when re-parented. 717 Setsid: true, 718 } 719 720 // Set Args[0] to make it easier to spot the sandbox process. Otherwise it's 721 // shown as `exe`. 722 cmd.Args[0] = "runsc-sandbox" 723 724 // Transfer FDs that need to be present before the "boot" command. 725 // Start at 3 because 0, 1, and 2 are taken by stdin/out/err. 726 nextFD := donations.Transfer(cmd, 3) 727 728 // Add the "boot" command to the args. 729 // 730 // All flags after this must be for the boot command. 731 cmd.Args = append(cmd.Args, "boot", "--bundle="+args.BundleDir) 732 733 // Clear environment variables, unless --TESTONLY-unsafe-nonroot is set. 734 if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { 735 // Setting cmd.Env = nil causes cmd to inherit the current process's env. 736 cmd.Env = []string{} 737 } 738 739 // If there is a gofer, send all socket ends to the sandbox. 740 donations.DonateAndClose("io-fds", args.IOFiles...) 741 donations.DonateAndClose("dev-io-fd", args.DevIOFile) 742 donations.DonateAndClose("gofer-filestore-fds", args.GoferFilestoreFiles...) 743 donations.DonateAndClose("mounts-fd", args.MountsFile) 744 donations.Donate("start-sync-fd", startSyncFile) 745 if err := donations.OpenAndDonate("user-log-fd", args.UserLog, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil { 746 return err 747 } 748 const profFlags = os.O_CREATE | os.O_WRONLY | os.O_TRUNC 749 if err := donations.OpenAndDonate("profile-block-fd", conf.ProfileBlock, profFlags); err != nil { 750 return err 751 } 752 if err := donations.OpenAndDonate("profile-cpu-fd", conf.ProfileCPU, profFlags); err != nil { 753 return err 754 } 755 if err := donations.OpenAndDonate("profile-heap-fd", conf.ProfileHeap, profFlags); err != nil { 756 return err 757 } 758 if err := donations.OpenAndDonate("profile-mutex-fd", conf.ProfileMutex, profFlags); err != nil { 759 return err 760 } 761 if err := donations.OpenAndDonate("trace-fd", conf.TraceFile, profFlags); err != nil { 762 return err 763 } 764 765 // Pass gofer mount configs. 766 cmd.Args = append(cmd.Args, "--gofer-mount-confs="+args.GoferMountConfs.String()) 767 768 // Create a socket for the control server and donate it to the sandbox.
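// The chosen socket path is recorded in s.ControlSocketPath below so that later calls (see sandboxConnect) can reach the control server, and so that destroy() can remove the socket file when the sandbox is torn down.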
769 controlSocketPath, sockFD, err := createControlSocket(conf.RootDir, s.ID) 770 if err != nil { 771 return fmt.Errorf("failed to create control socket: %v", err) 772 } 773 s.ControlSocketPath = controlSocketPath 774 log.Infof("Control socket path: %q", s.ControlSocketPath) 775 donations.DonateAndClose("controller-fd", os.NewFile(uintptr(sockFD), "control_server_socket")) 776 777 specFile, err := specutils.OpenSpec(args.BundleDir) 778 if err != nil { 779 return fmt.Errorf("cannot open spec file in bundle dir %v: %w", args.BundleDir, err) 780 } 781 donations.DonateAndClose("spec-fd", specFile) 782 783 if err := donations.OpenAndDonate("pod-init-config-fd", conf.PodInitConfig, os.O_RDONLY); err != nil { 784 return err 785 } 786 donations.DonateAndClose("sink-fds", args.SinkFiles...) 787 788 gPlatform, err := platform.Lookup(conf.Platform) 789 if err != nil { 790 return fmt.Errorf("cannot look up platform: %w", err) 791 } 792 if deviceFile, err := gPlatform.OpenDevice(conf.PlatformDevicePath); err != nil { 793 return fmt.Errorf("opening device file for platform %q: %v", conf.Platform, err) 794 } else if deviceFile != nil { 795 donations.DonateAndClose("device-fd", deviceFile) 796 } 797 798 // TODO(b/151157106): syscall tests fail by timeout if asyncpreemptoff 799 // isn't set. 800 if conf.Platform == "kvm" { 801 cmd.Env = append(cmd.Env, "GODEBUG=asyncpreemptoff=1") 802 } 803 804 // nss is the set of namespaces to join or create before starting the sandbox 805 // process. Mount, IPC and UTS namespaces from the host are not used as they 806 // are virtualized inside the sandbox. Be paranoid and run inside an empty 807 // namespace for these. Don't unshare cgroup because sandbox is added to a 808 // cgroup in the caller's namespace. 809 log.Infof("Sandbox will be started in new mount, IPC and UTS namespaces") 810 nss := []specs.LinuxNamespace{ 811 {Type: specs.IPCNamespace}, 812 {Type: specs.MountNamespace}, 813 {Type: specs.UTSNamespace}, 814 } 815 816 if gPlatform.Requirements().RequiresCurrentPIDNS { 817 // TODO(b/75837838): Also set a new PID namespace so that we limit 818 // access to other host processes. 819 log.Infof("Sandbox will be started in the current PID namespace") 820 } else { 821 log.Infof("Sandbox will be started in a new PID namespace") 822 nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace}) 823 cmd.Args = append(cmd.Args, "--pidns=true") 824 } 825 826 if specutils.NVProxyEnabled(args.Spec, conf) { 827 nvidiaDriverVersion, err := nvproxy.HostDriverVersion() 828 if err != nil { 829 return fmt.Errorf("failed to get Nvidia driver version: %w", err) 830 } 831 cmd.Args = append(cmd.Args, "--nvidia-driver-version="+nvidiaDriverVersion) 832 } 833 834 // Joins the network namespace if network is enabled. the sandbox talks 835 // directly to the host network, which may have been configured in the 836 // namespace. 837 if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != config.NetworkNone { 838 log.Infof("Sandbox will be started in the container's network namespace: %+v", ns) 839 nss = append(nss, ns) 840 } else if conf.Network == config.NetworkHost { 841 log.Infof("Sandbox will be started in the host network namespace") 842 } else { 843 log.Infof("Sandbox will be started in new network namespace") 844 nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace}) 845 } 846 847 // These are set to the uid/gid that the sandbox process will use. May be 848 // overriden below. 
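// The decision tree below chooses the sandbox's user namespace and credentials: with host networking or directFS, the sandbox joins the user namespace from the spec (or stays in the current one) and has the caller's capabilities applied; otherwise it is started in a brand new user namespace and, when the required capabilities are available, runs as user/group nobody inside a minimal chroot.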
849 s.UID = os.Getuid() 850 s.GID = os.Getgid() 851 852 // User namespace depends on the network type or whether access to the host 853 // filesystem is required. These features require to run inside the user 854 // namespace specified in the spec or the current namespace if none is 855 // configured. 856 rootlessEUID := unix.Geteuid() != 0 857 setUserMappings := false 858 if conf.Network == config.NetworkHost || conf.DirectFS { 859 if userns, ok := specutils.GetNS(specs.UserNamespace, args.Spec); ok { 860 log.Infof("Sandbox will be started in container's user namespace: %+v", userns) 861 nss = append(nss, userns) 862 if rootlessEUID { 863 syncFile, err := ConfigureCmdForRootless(cmd, &donations) 864 if err != nil { 865 return err 866 } 867 defer syncFile.Close() 868 setUserMappings = true 869 } else { 870 specutils.SetUIDGIDMappings(cmd, args.Spec) 871 // We need to set UID and GID to have capabilities in a new user namespace. 872 cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0} 873 } 874 } else { 875 if rootlessEUID { 876 return fmt.Errorf("unable to run a rootless container without userns") 877 } 878 log.Infof("Sandbox will be started in the current user namespace") 879 } 880 // When running in the caller's defined user namespace, apply the same 881 // capabilities to the sandbox process to ensure it abides to the same 882 // rules. 883 cmd.Args = append(cmd.Args, "--apply-caps=true") 884 885 // If we have CAP_SYS_ADMIN, we can create an empty chroot and 886 // bind-mount the executable inside it. 887 if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { 888 log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!") 889 } else if specutils.HasCapabilities(capability.CAP_SYS_ADMIN) || rootlessEUID { 890 log.Infof("Sandbox will be started in minimal chroot") 891 cmd.Args = append(cmd.Args, "--setup-root") 892 } else { 893 return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN") 894 } 895 } else { 896 // If we have CAP_SETUID and CAP_SETGID, then we can also run 897 // as user nobody. 898 if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { 899 log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid()) 900 log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!") 901 } else if rootlessEUID || specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) { 902 log.Infof("Sandbox will be started in new user namespace") 903 nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) 904 cmd.Args = append(cmd.Args, "--setup-root") 905 906 const nobody = 65534 907 if rootlessEUID || conf.Rootless { 908 log.Infof("Rootless mode: sandbox will run as nobody inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid()) 909 } else { 910 // Map nobody in the new namespace to nobody in the parent namespace. 911 s.UID = nobody 912 s.GID = nobody 913 } 914 915 // Set credentials to run as user and group nobody. 
916 cmd.SysProcAttr.Credential = &syscall.Credential{Uid: nobody, Gid: nobody} 917 cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{ 918 { 919 ContainerID: nobody, 920 HostID: s.UID, 921 Size: 1, 922 }, 923 } 924 cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{ 925 { 926 ContainerID: nobody, 927 HostID: s.GID, 928 Size: 1, 929 }, 930 } 931 932 // A sandbox process will construct an empty root for itself, so it has 933 // to have CAP_SYS_ADMIN and CAP_SYS_CHROOT capabilities. 934 cmd.SysProcAttr.AmbientCaps = append(cmd.SysProcAttr.AmbientCaps, 935 uintptr(capability.CAP_SYS_ADMIN), 936 uintptr(capability.CAP_SYS_CHROOT), 937 // CAP_SETPCAP is required to clear the bounding set. 938 uintptr(capability.CAP_SETPCAP), 939 ) 940 941 } else { 942 return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID") 943 } 944 } 945 946 // The current process' stdio must be passed to the application via the 947 // --stdio-fds flag. The stdio of the sandbox process itself must not 948 // be connected to the same FDs, otherwise we risk leaking sandbox 949 // errors to the application, so we set the sandbox stdio to nil, 950 // causing them to read/write from the null device. 951 cmd.Stdin = nil 952 cmd.Stdout = nil 953 cmd.Stderr = nil 954 var stdios [3]*os.File 955 956 // If the console control socket file is provided, then create a new 957 // pty master/replica pair and set the TTY on the sandbox process. 958 if args.Spec.Process.Terminal && args.ConsoleSocket != "" { 959 // console.NewWithSocket will send the master on the given 960 // socket, and return the replica. 961 tty, err := console.NewWithSocket(args.ConsoleSocket) 962 if err != nil { 963 return fmt.Errorf("setting up console with socket %q: %v", args.ConsoleSocket, err) 964 } 965 defer tty.Close() 966 967 // Set the TTY as a controlling TTY on the sandbox process. 968 cmd.SysProcAttr.Setctty = true 969 970 // Inconveniently, the Ctty must be the FD in the *child* process's FD 971 // table. So transfer all files we have so far and make sure the next file 972 // added to donations is stdin. 973 // 974 // See https://github.com/golang/go/issues/29458. 975 nextFD = donations.Transfer(cmd, nextFD) 976 cmd.SysProcAttr.Ctty = nextFD 977 978 // Pass the tty as all stdio fds to sandbox. 979 stdios[0] = tty 980 stdios[1] = tty 981 stdios[2] = tty 982 983 if conf.Debug { 984 // If debugging, send the boot process stdio to the 985 // TTY, so that it is easier to find. 986 cmd.Stdin = tty 987 cmd.Stdout = tty 988 cmd.Stderr = tty 989 } 990 } else { 991 // If not using a console, pass our current stdio as the 992 // container stdio via flags. 993 stdios[0] = os.Stdin 994 stdios[1] = os.Stdout 995 stdios[2] = os.Stderr 996 997 if conf.Debug { 998 // If debugging, send the boot process stdio to the 999 // this process' stdio, so that is is easier to find. 1000 cmd.Stdin = os.Stdin 1001 cmd.Stdout = os.Stdout 1002 cmd.Stderr = os.Stderr 1003 } 1004 } 1005 if err := s.configureStdios(conf, stdios[:]); err != nil { 1006 return fmt.Errorf("configuring stdios: %w", err) 1007 } 1008 // Note: this must be done right after "cmd.SysProcAttr.Ctty" is set above 1009 // because it relies on stdin being the next FD donated. 1010 donations.Donate("stdio-fds", stdios[:]...) 
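// Tell the sandbox how much memory and how many CPUs it may use: the host totals are passed as upper bounds and are narrowed by the cgroup's CPU quota and memory limit when a cgroup is configured.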
1011 1012 totalSysMem, err := totalSystemMemory() 1013 if err != nil { 1014 return err 1015 } 1016 cmd.Args = append(cmd.Args, "--total-host-memory", strconv.FormatUint(totalSysMem, 10)) 1017 1018 mem := totalSysMem 1019 if s.CgroupJSON.Cgroup != nil { 1020 cpuNum, err := s.CgroupJSON.Cgroup.NumCPU() 1021 if err != nil { 1022 return fmt.Errorf("getting cpu count from cgroups: %v", err) 1023 } 1024 if conf.CPUNumFromQuota { 1025 // Dropping below 2 CPUs can trigger applications to disable 1026 // locks, which can lead to hard-to-debug errors, so just 1027 // leave two cores as a reasonable default. 1028 const minCPUs = 2 1029 1030 quota, err := s.CgroupJSON.Cgroup.CPUQuota() 1031 if err != nil { 1032 return fmt.Errorf("getting cpu quota from cgroups: %v", err) 1033 } 1034 if n := int(math.Ceil(quota)); n > 0 { 1035 if n < minCPUs { 1036 n = minCPUs 1037 } 1038 if n < cpuNum { 1039 // Only lower the cpu number. 1040 cpuNum = n 1041 } 1042 } 1043 } 1044 cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum)) 1045 1046 memLimit, err := s.CgroupJSON.Cgroup.MemoryLimit() 1047 if err != nil { 1048 return fmt.Errorf("getting memory limit from cgroups: %v", err) 1049 } 1050 if memLimit < mem { 1051 mem = memLimit 1052 } 1053 } 1054 cmd.Args = append(cmd.Args, "--total-memory", strconv.FormatUint(mem, 10)) 1055 1056 if args.Attached { 1057 // Kill sandbox if parent process exits in attached mode. 1058 cmd.SysProcAttr.Pdeathsig = unix.SIGKILL 1059 // Tells boot that any process it creates must have pdeathsig set. 1060 cmd.Args = append(cmd.Args, "--attached") 1061 } 1062 1063 if args.ExecFile != nil { 1064 donations.Donate("exec-fd", args.ExecFile) 1065 } 1066 1067 nextFD = donations.Transfer(cmd, nextFD) 1068 1069 _ = donation.DonateAndTransferCustomFiles(cmd, nextFD, args.PassFiles) 1070 1071 // Add container ID as the last argument. 1072 cmd.Args = append(cmd.Args, s.ID) 1073 1074 donation.LogDonations(cmd) 1075 log.Debugf("Starting sandbox: %s %v", cmd.Path, cmd.Args) 1076 log.Debugf("SysProcAttr: %+v", cmd.SysProcAttr) 1077 if err := specutils.StartInNS(cmd, nss); err != nil { 1078 err := fmt.Errorf("starting sandbox: %v", err) 1079 // If the sandbox failed to start, it may be because the binary 1080 // permissions were incorrect. Check the bits and return a more helpful 1081 // error message. 1082 // 1083 // NOTE: The error message is checked because error types are lost over 1084 // rpc calls. 1085 if strings.Contains(err.Error(), unix.EACCES.Error()) { 1086 if permsErr := checkBinaryPermissions(conf); permsErr != nil { 1087 return fmt.Errorf("%v: %v", err, permsErr) 1088 } 1089 } 1090 return err 1091 } 1092 s.OriginalOOMScoreAdj, err = specutils.GetOOMScoreAdj(cmd.Process.Pid) 1093 if err != nil { 1094 return err 1095 } 1096 if setUserMappings { 1097 if err := SetUserMappings(args.Spec, cmd.Process.Pid); err != nil { 1098 return err 1099 } 1100 } 1101 1102 s.child = true 1103 s.Pid.store(cmd.Process.Pid) 1104 log.Infof("Sandbox started, PID: %d", cmd.Process.Pid) 1105 1106 return nil 1107 } 1108 1109 // Wait waits for the containerized process to exit, and returns its WaitStatus. 1110 func (s *Sandbox) Wait(cid string) (unix.WaitStatus, error) { 1111 log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID) 1112 1113 if conn, err := s.sandboxConnect(); err != nil { 1114 // The sandbox may have exited before we had a chance to wait on it. 1115 // There is nothing we can do for subcontainers. For the init container, we 1116 // can try to get the sandbox exit code.
1117 if !s.IsRootContainer(cid) { 1118 return unix.WaitStatus(0), err 1119 } 1120 log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err) 1121 } else { 1122 defer conn.Close() 1123 1124 // Try the Wait RPC to the sandbox. 1125 var ws unix.WaitStatus 1126 err = conn.Call(boot.ContMgrWait, &cid, &ws) 1127 conn.Close() 1128 if err == nil { 1129 if s.IsRootContainer(cid) { 1130 if err := s.waitForStopped(); err != nil { 1131 return unix.WaitStatus(0), err 1132 } 1133 } 1134 // It worked! 1135 return ws, nil 1136 } 1137 // See comment above. 1138 if !s.IsRootContainer(cid) { 1139 return unix.WaitStatus(0), err 1140 } 1141 1142 // The sandbox may have exited after we connected, but before 1143 // or during the Wait RPC. 1144 log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err) 1145 } 1146 1147 // The sandbox may have already exited, or exited while handling the Wait RPC. 1148 // The best we can do is ask Linux what the sandbox exit status was, since in 1149 // most cases that will be the same as the container exit status. 1150 if err := s.waitForStopped(); err != nil { 1151 return unix.WaitStatus(0), err 1152 } 1153 if !s.child { 1154 return unix.WaitStatus(0), fmt.Errorf("sandbox no longer running and its exit status is unavailable") 1155 } 1156 1157 s.statusMu.Lock() 1158 defer s.statusMu.Unlock() 1159 return s.status, nil 1160 } 1161 1162 // WaitPID waits for process 'pid' in the container's sandbox and returns its 1163 // WaitStatus. 1164 func (s *Sandbox) WaitPID(cid string, pid int32) (unix.WaitStatus, error) { 1165 log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID) 1166 var ws unix.WaitStatus 1167 args := &boot.WaitPIDArgs{ 1168 PID: pid, 1169 CID: cid, 1170 } 1171 if err := s.call(boot.ContMgrWaitPID, args, &ws); err != nil { 1172 return ws, fmt.Errorf("waiting on PID %d in sandbox %q: %w", pid, s.ID, err) 1173 } 1174 return ws, nil 1175 } 1176 1177 // IsRootContainer returns true if the specified container ID belongs to the 1178 // root container. 1179 func (s *Sandbox) IsRootContainer(cid string) bool { 1180 return s.ID == cid 1181 } 1182 1183 // Destroy frees all resources associated with the sandbox. It fails fast and 1184 // is idempotent. 1185 func (s *Sandbox) destroy() error { 1186 log.Debugf("Destroying sandbox %q", s.ID) 1187 // Only delete the control file if it exists. 1188 if len(s.ControlSocketPath) > 0 { 1189 if err := os.Remove(s.ControlSocketPath); err != nil { 1190 log.Warningf("failed to delete control socket file %q: %v", s.ControlSocketPath, err) 1191 } 1192 } 1193 pid := s.Pid.load() 1194 if pid != 0 { 1195 log.Debugf("Killing sandbox %q", s.ID) 1196 if err := unix.Kill(pid, unix.SIGKILL); err != nil && err != unix.ESRCH { 1197 return fmt.Errorf("killing sandbox %q PID %q: %w", s.ID, pid, err) 1198 } 1199 if err := s.waitForStopped(); err != nil { 1200 return fmt.Errorf("waiting sandbox %q stop: %w", s.ID, err) 1201 } 1202 } 1203 1204 return nil 1205 } 1206 1207 // SignalContainer sends the signal to a container in the sandbox. If all is 1208 // true and signal is SIGKILL, then waits for all processes to exit before 1209 // returning. 
1210 func (s *Sandbox) SignalContainer(cid string, sig unix.Signal, all bool) error { 1211 log.Debugf("Signal sandbox %q", s.ID) 1212 mode := boot.DeliverToProcess 1213 if all { 1214 mode = boot.DeliverToAllProcesses 1215 } 1216 1217 args := boot.SignalArgs{ 1218 CID: cid, 1219 Signo: int32(sig), 1220 Mode: mode, 1221 } 1222 if err := s.call(boot.ContMgrSignal, &args, nil); err != nil { 1223 return fmt.Errorf("signaling container %q: %w", cid, err) 1224 } 1225 return nil 1226 } 1227 1228 // SignalProcess sends the signal to a particular process in the container. If 1229 // fgProcess is true, then the signal is sent to the foreground process group 1230 // in the same session that PID belongs to. This is only valid if the process 1231 // is attached to a host TTY. 1232 func (s *Sandbox) SignalProcess(cid string, pid int32, sig unix.Signal, fgProcess bool) error { 1233 log.Debugf("Signal sandbox %q", s.ID) 1234 1235 mode := boot.DeliverToProcess 1236 if fgProcess { 1237 mode = boot.DeliverToForegroundProcessGroup 1238 } 1239 1240 args := boot.SignalArgs{ 1241 CID: cid, 1242 Signo: int32(sig), 1243 PID: pid, 1244 Mode: mode, 1245 } 1246 if err := s.call(boot.ContMgrSignal, &args, nil); err != nil { 1247 return fmt.Errorf("signaling container %q PID %d: %v", cid, pid, err) 1248 } 1249 return nil 1250 } 1251 1252 // Checkpoint sends the checkpoint call for a container in the sandbox. 1253 // The statefile will be written to f. 1254 func (s *Sandbox) Checkpoint(cid string, f *os.File, options statefile.Options) error { 1255 log.Debugf("Checkpoint sandbox %q, options %+v", s.ID, options) 1256 opt := control.SaveOpts{ 1257 Metadata: options.WriteToMetadata(map[string]string{}), 1258 FilePayload: urpc.FilePayload{ 1259 Files: []*os.File{f}, 1260 }, 1261 } 1262 1263 if err := s.call(boot.ContMgrCheckpoint, &opt, nil); err != nil { 1264 return fmt.Errorf("checkpointing container %q: %w", cid, err) 1265 } 1266 return nil 1267 } 1268 1269 // Pause sends the pause call for a container in the sandbox. 1270 func (s *Sandbox) Pause(cid string) error { 1271 log.Debugf("Pause sandbox %q", s.ID) 1272 if err := s.call(boot.LifecyclePause, nil, nil); err != nil { 1273 return fmt.Errorf("pausing container %q: %w", cid, err) 1274 } 1275 return nil 1276 } 1277 1278 // Resume sends the resume call for a container in the sandbox. 1279 func (s *Sandbox) Resume(cid string) error { 1280 log.Debugf("Resume sandbox %q", s.ID) 1281 if err := s.call(boot.LifecycleResume, nil, nil); err != nil { 1282 return fmt.Errorf("resuming container %q: %w", cid, err) 1283 } 1284 return nil 1285 } 1286 1287 // Usage sends the collect call for a container in the sandbox. 1288 func (s *Sandbox) Usage(Full bool) (control.MemoryUsage, error) { 1289 log.Debugf("Usage sandbox %q", s.ID) 1290 opts := control.MemoryUsageOpts{Full: Full} 1291 var m control.MemoryUsage 1292 if err := s.call(boot.UsageCollect, &opts, &m); err != nil { 1293 return control.MemoryUsage{}, fmt.Errorf("collecting usage: %w", err) 1294 } 1295 return m, nil 1296 } 1297 1298 // UsageFD sends the usagefd call for a container in the sandbox. 
1299 func (s *Sandbox) UsageFD() (*control.MemoryUsageRecord, error) { 1300 log.Debugf("Usage sandbox %q", s.ID) 1301 opts := control.MemoryUsageFileOpts{Version: 1} 1302 var m control.MemoryUsageFile 1303 if err := s.call(boot.UsageUsageFD, &opts, &m); err != nil { 1304 return nil, fmt.Errorf("collecting usage FD: %w", err) 1305 } 1306 1307 if len(m.FilePayload.Files) != 2 { 1308 return nil, fmt.Errorf("wants exactly two fds") 1309 } 1310 return control.NewMemoryUsageRecord(*m.FilePayload.Files[0], *m.FilePayload.Files[1]) 1311 } 1312 1313 // GetRegisteredMetrics returns metric registration data from the sandbox. 1314 // This data is meant to be used as a way to sanity-check any exported metrics data during the 1315 // lifetime of the sandbox in order to avoid a compromised sandbox from being able to produce 1316 // bogus metrics. 1317 // This returns an error if the sandbox has not requested instrumentation during creation time. 1318 func (s *Sandbox) GetRegisteredMetrics() (*metricpb.MetricRegistration, error) { 1319 if s.RegisteredMetrics == nil { 1320 return nil, errors.New("sandbox did not request instrumentation when it was created") 1321 } 1322 return s.RegisteredMetrics, nil 1323 } 1324 1325 // ExportMetrics returns a snapshot of metric values from the sandbox in Prometheus format. 1326 func (s *Sandbox) ExportMetrics(opts control.MetricsExportOpts) (*prometheus.Snapshot, error) { 1327 log.Debugf("Metrics export sandbox %q", s.ID) 1328 var data control.MetricsExportData 1329 if err := s.call(boot.MetricsExport, &opts, &data); err != nil { 1330 return nil, err 1331 } 1332 // Since we do not trust the output of the sandbox as-is, double-check that the options were 1333 // respected. 1334 if err := opts.Verify(&data); err != nil { 1335 return nil, err 1336 } 1337 return data.Snapshot, nil 1338 } 1339 1340 // IsRunning returns true if the sandbox or gofer process is running. 1341 func (s *Sandbox) IsRunning() bool { 1342 pid := s.Pid.load() 1343 if pid != 0 { 1344 // Send a signal 0 to the sandbox process. 1345 if err := unix.Kill(pid, 0); err == nil { 1346 // Succeeded, process is running. 1347 return true 1348 } 1349 } 1350 return false 1351 } 1352 1353 // Stacks collects and returns all stacks for the sandbox. 1354 func (s *Sandbox) Stacks() (string, error) { 1355 log.Debugf("Stacks sandbox %q", s.ID) 1356 var stacks string 1357 if err := s.call(boot.DebugStacks, nil, &stacks); err != nil { 1358 return "", fmt.Errorf("getting sandbox %q stacks: %w", s.ID, err) 1359 } 1360 return stacks, nil 1361 } 1362 1363 // HeapProfile writes a heap profile to the given file. 1364 func (s *Sandbox) HeapProfile(f *os.File, delay time.Duration) error { 1365 log.Debugf("Heap profile %q", s.ID) 1366 opts := control.HeapProfileOpts{ 1367 FilePayload: urpc.FilePayload{Files: []*os.File{f}}, 1368 Delay: delay, 1369 } 1370 return s.call(boot.ProfileHeap, &opts, nil) 1371 } 1372 1373 // CPUProfile collects a CPU profile. 1374 func (s *Sandbox) CPUProfile(f *os.File, duration time.Duration) error { 1375 log.Debugf("CPU profile %q", s.ID) 1376 opts := control.CPUProfileOpts{ 1377 FilePayload: urpc.FilePayload{Files: []*os.File{f}}, 1378 Duration: duration, 1379 } 1380 return s.call(boot.ProfileCPU, &opts, nil) 1381 } 1382 1383 // BlockProfile writes a block profile to the given file. 
1384 func (s *Sandbox) BlockProfile(f *os.File, duration time.Duration) error { 1385 log.Debugf("Block profile %q", s.ID) 1386 opts := control.BlockProfileOpts{ 1387 FilePayload: urpc.FilePayload{Files: []*os.File{f}}, 1388 Duration: duration, 1389 } 1390 return s.call(boot.ProfileBlock, &opts, nil) 1391 } 1392 1393 // MutexProfile writes a mutex profile to the given file. 1394 func (s *Sandbox) MutexProfile(f *os.File, duration time.Duration) error { 1395 log.Debugf("Mutex profile %q", s.ID) 1396 opts := control.MutexProfileOpts{ 1397 FilePayload: urpc.FilePayload{Files: []*os.File{f}}, 1398 Duration: duration, 1399 } 1400 return s.call(boot.ProfileMutex, &opts, nil) 1401 } 1402 1403 // Trace collects an execution trace. 1404 func (s *Sandbox) Trace(f *os.File, duration time.Duration) error { 1405 log.Debugf("Trace %q", s.ID) 1406 opts := control.TraceProfileOpts{ 1407 FilePayload: urpc.FilePayload{Files: []*os.File{f}}, 1408 Duration: duration, 1409 } 1410 return s.call(boot.ProfileTrace, &opts, nil) 1411 } 1412 1413 // ChangeLogging changes logging options. 1414 func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error { 1415 log.Debugf("Change logging start %q", s.ID) 1416 if err := s.call(boot.LoggingChange, &args, nil); err != nil { 1417 return fmt.Errorf("changing sandbox %q logging: %w", s.ID, err) 1418 } 1419 return nil 1420 } 1421 1422 // DestroyContainer destroys the given container. If it is the root container, 1423 // then the entire sandbox is destroyed. 1424 func (s *Sandbox) DestroyContainer(cid string) error { 1425 if err := s.destroyContainer(cid); err != nil { 1426 // If the sandbox isn't running, the container has already been destroyed, 1427 // ignore the error in this case. 1428 if s.IsRunning() { 1429 return err 1430 } 1431 } 1432 return nil 1433 } 1434 1435 func (s *Sandbox) destroyContainer(cid string) error { 1436 if s.IsRootContainer(cid) { 1437 log.Debugf("Destroying root container by destroying sandbox, cid: %s", cid) 1438 return s.destroy() 1439 } 1440 1441 log.Debugf("Destroying container, cid: %s, sandbox: %s", cid, s.ID) 1442 if err := s.call(boot.ContMgrDestroySubcontainer, &cid, nil); err != nil { 1443 return fmt.Errorf("destroying container %q: %w", cid, err) 1444 } 1445 return nil 1446 } 1447 1448 func (s *Sandbox) waitForStopped() error { 1449 if s.child { 1450 s.statusMu.Lock() 1451 defer s.statusMu.Unlock() 1452 pid := s.Pid.load() 1453 if pid == 0 { 1454 return nil 1455 } 1456 // The sandbox process is a child of the current process, 1457 // so we can wait on it to terminate and collect its zombie. 1458 if _, err := unix.Wait4(int(pid), &s.status, 0, nil); err != nil { 1459 return fmt.Errorf("error waiting the sandbox process: %v", err) 1460 } 1461 s.Pid.store(0) 1462 return nil 1463 } 1464 1465 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 1466 defer cancel() 1467 b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) 1468 op := func() error { 1469 if s.IsRunning() { 1470 return fmt.Errorf("sandbox is still running") 1471 } 1472 return nil 1473 } 1474 return backoff.Retry(op, b) 1475 } 1476 1477 // configureStdios change stdios ownership to give access to the sandbox 1478 // process. This may be skipped depending on the configuration. 1479 func (s *Sandbox) configureStdios(conf *config.Config, stdios []*os.File) error { 1480 if conf.Rootless || conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { 1481 // Cannot change ownership without CAP_CHOWN. 
1482 return nil 1483 } 1484 1485 if s.UID < 0 || s.GID < 0 { 1486 panic(fmt.Sprintf("sandbox UID/GID is not set: %d/%d", s.UID, s.GID)) 1487 } 1488 for _, file := range stdios { 1489 log.Debugf("Changing %q ownership to %d/%d", file.Name(), s.UID, s.GID) 1490 if err := file.Chown(s.UID, s.GID); err != nil { 1491 if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) { 1492 log.Warningf("can't change an owner of %s: %s", file.Name(), err) 1493 continue 1494 } 1495 return err 1496 } 1497 } 1498 return nil 1499 } 1500 1501 // deviceFileForPlatform opens the device file for the given platform. If the 1502 // platform does not need a device file, then nil is returned. 1503 // devicePath may be empty to use a sane platform-specific default. 1504 func deviceFileForPlatform(name, devicePath string) (*os.File, error) { 1505 p, err := platform.Lookup(name) 1506 if err != nil { 1507 return nil, err 1508 } 1509 1510 f, err := p.OpenDevice(devicePath) 1511 if err != nil { 1512 return nil, fmt.Errorf("opening device file for platform %q: %w", name, err) 1513 } 1514 return f, nil 1515 } 1516 1517 // checkBinaryPermissions verifies that the required binary bits are set on 1518 // the runsc executable. 1519 func checkBinaryPermissions(conf *config.Config) error { 1520 // All platforms need the other exe bit 1521 neededBits := os.FileMode(0001) 1522 if conf.Platform == "ptrace" { 1523 // Ptrace needs the other read bit 1524 neededBits |= os.FileMode(0004) 1525 } 1526 1527 exePath, err := os.Executable() 1528 if err != nil { 1529 return fmt.Errorf("getting exe path: %v", err) 1530 } 1531 1532 // Check the permissions of the runsc binary and print an error if it 1533 // doesn't match expectations. 1534 info, err := os.Stat(exePath) 1535 if err != nil { 1536 return fmt.Errorf("stat file: %v", err) 1537 } 1538 1539 if info.Mode().Perm()&neededBits != neededBits { 1540 return fmt.Errorf(specutils.FaqErrorMsg("runsc-perms", fmt.Sprintf("%s does not have the correct permissions", exePath))) 1541 } 1542 return nil 1543 } 1544 1545 // CgroupsReadControlFile reads a single cgroupfs control file in the sandbox. 1546 func (s *Sandbox) CgroupsReadControlFile(file control.CgroupControlFile) (string, error) { 1547 log.Debugf("CgroupsReadControlFiles sandbox %q", s.ID) 1548 args := control.CgroupsReadArgs{ 1549 Args: []control.CgroupsReadArg{ 1550 { 1551 File: file, 1552 }, 1553 }, 1554 } 1555 var out control.CgroupsResults 1556 if err := s.call(boot.CgroupsReadControlFiles, &args, &out); err != nil { 1557 return "", err 1558 } 1559 if len(out.Results) != 1 { 1560 return "", fmt.Errorf("expected 1 result, got %d, raw: %+v", len(out.Results), out) 1561 } 1562 return out.Results[0].Unpack() 1563 } 1564 1565 // CgroupsWriteControlFile writes a single cgroupfs control file in the sandbox. 1566 func (s *Sandbox) CgroupsWriteControlFile(file control.CgroupControlFile, value string) error { 1567 log.Debugf("CgroupsReadControlFiles sandbox %q", s.ID) 1568 args := control.CgroupsWriteArgs{ 1569 Args: []control.CgroupsWriteArg{ 1570 { 1571 File: file, 1572 Value: value, 1573 }, 1574 }, 1575 } 1576 var out control.CgroupsResults 1577 if err := s.call(boot.CgroupsWriteControlFiles, &args, &out); err != nil { 1578 return err 1579 } 1580 if len(out.Results) != 1 { 1581 return fmt.Errorf("expected 1 result, got %d, raw: %+v", len(out.Results), out) 1582 } 1583 return out.Results[0].AsError() 1584 } 1585 1586 // fixPidns looks at the PID namespace path. 
If that path corresponds to the 1587 // sandbox process PID namespace, then change the spec so that the container 1588 // joins the sandbox root namespace. 1589 func (s *Sandbox) fixPidns(spec *specs.Spec) { 1590 pidns, ok := specutils.GetNS(specs.PIDNamespace, spec) 1591 if !ok { 1592 // pidns was not set, nothing to fix. 1593 return 1594 } 1595 if pidns.Path != fmt.Sprintf("/proc/%d/ns/pid", s.Pid.load()) { 1596 // The path does not correspond to the sandbox's PID namespace; nothing to fix. 1597 return 1598 } 1599 1600 for i := range spec.Linux.Namespaces { 1601 if spec.Linux.Namespaces[i].Type == specs.PIDNamespace { 1602 // Removing the namespace makes the container join the sandbox root 1603 // namespace. 1604 log.Infof("Fixing PID namespace in spec from %q to make the container join the sandbox root namespace", pidns.Path) 1605 spec.Linux.Namespaces = append(spec.Linux.Namespaces[:i], spec.Linux.Namespaces[i+1:]...) 1606 return 1607 } 1608 } 1609 panic("unreachable") 1610 } 1611 1612 // ConfigureCmdForRootless configures cmd to donate a socket FD that can be 1613 // used to synchronize userns configuration. 1614 func ConfigureCmdForRootless(cmd *exec.Cmd, donations *donation.Agency) (*os.File, error) { 1615 fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) 1616 if err != nil { 1617 return nil, err 1618 } 1619 f := os.NewFile(uintptr(fds[1]), "userns sync other FD") 1620 donations.DonateAndClose("sync-userns-fd", f) 1621 if cmd.SysProcAttr == nil { 1622 cmd.SysProcAttr = &unix.SysProcAttr{} 1623 } 1624 cmd.SysProcAttr.AmbientCaps = []uintptr{ 1625 // Same as `cap` in cmd/gofer.go. 1626 unix.CAP_CHOWN, 1627 unix.CAP_DAC_OVERRIDE, 1628 unix.CAP_DAC_READ_SEARCH, 1629 unix.CAP_FOWNER, 1630 unix.CAP_FSETID, 1631 unix.CAP_SYS_CHROOT, 1632 // Needed for setuid(2)/setgid(2). 1633 unix.CAP_SETUID, 1634 unix.CAP_SETGID, 1635 // Needed for chroot. 1636 unix.CAP_SYS_ADMIN, 1637 // Needed to be able to clear bounding set (PR_CAPBSET_DROP). 1638 unix.CAP_SETPCAP, 1639 } 1640 return os.NewFile(uintptr(fds[0]), "userns sync FD"), nil 1641 } 1642 1643 // SetUserMappings uses newuidmap/newgidmap programs to set up user ID mappings 1644 // for process pid. 1645 func SetUserMappings(spec *specs.Spec, pid int) error { 1646 log.Debugf("Setting user mappings") 1647 args := []string{strconv.Itoa(pid)} 1648 for _, idMap := range spec.Linux.UIDMappings { 1649 log.Infof("Mapping host uid %d to container uid %d (size=%d)", 1650 idMap.HostID, idMap.ContainerID, idMap.Size) 1651 args = append(args, 1652 strconv.Itoa(int(idMap.ContainerID)), 1653 strconv.Itoa(int(idMap.HostID)), 1654 strconv.Itoa(int(idMap.Size)), 1655 ) 1656 } 1657 1658 out, err := exec.Command("newuidmap", args...).CombinedOutput() 1659 log.Debugf("newuidmap: %#v\n%s", args, out) 1660 if err != nil { 1661 return fmt.Errorf("newuidmap failed: %w", err) 1662 } 1663 1664 args = []string{strconv.Itoa(pid)} 1665 for _, idMap := range spec.Linux.GIDMappings { 1666 log.Infof("Mapping host gid %d to container gid %d (size=%d)", 1667 idMap.HostID, idMap.ContainerID, idMap.Size) 1668 args = append(args, 1669 strconv.Itoa(int(idMap.ContainerID)), 1670 strconv.Itoa(int(idMap.HostID)), 1671 strconv.Itoa(int(idMap.Size)), 1672 ) 1673 } 1674 out, err = exec.Command("newgidmap", args...).CombinedOutput() 1675 log.Debugf("newgidmap: %#v\n%s", args, out) 1676 if err != nil { 1677 return fmt.Errorf("newgidmap failed: %w", err) 1678 } 1679 return nil 1680 } 1681 1682 // Mount mounts a filesystem in a container.
1683 func (s *Sandbox) Mount(cid, fstype, src, dest string) error { 1684 var files []*os.File 1685 switch fstype { 1686 case erofs.Name: 1687 if imageFile, err := os.Open(src); err != nil { 1688 return fmt.Errorf("opening %s: %v", src, err) 1689 } else { 1690 files = append(files, imageFile) 1691 } 1692 1693 default: 1694 return fmt.Errorf("unsupported filesystem type: %v", fstype) 1695 } 1696 1697 args := boot.MountArgs{ 1698 ContainerID: cid, 1699 Source: src, 1700 Destination: dest, 1701 FsType: fstype, 1702 FilePayload: urpc.FilePayload{Files: files}, 1703 } 1704 return s.call(boot.ContMgrMount, &args, nil) 1705 }
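// The sketch below is illustrative and not part of the original file: it shows
// how a caller (for example, runsc's container package) might drive a sandbox
// through a typical lifecycle using the API above. The conf, spec and bundleDir
// values are assumed to have been prepared by the caller; a real caller would
// also populate gofer FDs, mount hints and the other Args fields.
func exampleLifecycle(conf *config.Config, spec *specs.Spec, bundleDir, cid string) error {
	// Create the sandbox process. By convention the sandbox ID matches the
	// ID of the first (root) container run inside it.
	s, err := New(conf, &Args{
		ID:        cid,
		Spec:      spec,
		BundleDir: bundleDir,
	})
	if err != nil {
		return fmt.Errorf("creating sandbox: %w", err)
	}
	// Start the root container and wait for it to exit.
	if err := s.StartRoot(conf); err != nil {
		_ = s.DestroyContainer(cid)
		return fmt.Errorf("starting root container: %w", err)
	}
	ws, err := s.Wait(cid)
	if err != nil {
		_ = s.DestroyContainer(cid)
		return fmt.Errorf("waiting for root container: %w", err)
	}
	log.Infof("Root container exited with status %d", ws.ExitStatus())
	// Destroying the root container tears down the whole sandbox.
	return s.DestroyContainer(cid)
}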