github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/runsc/sandbox/sandbox.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package sandbox creates and manipulates sandboxes.
package sandbox

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"math"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
	"github.com/MerlinKodo/gvisor/pkg/cleanup"
	"github.com/MerlinKodo/gvisor/pkg/control/client"
	"github.com/MerlinKodo/gvisor/pkg/control/server"
	"github.com/MerlinKodo/gvisor/pkg/coverage"
	"github.com/MerlinKodo/gvisor/pkg/log"
	metricpb "github.com/MerlinKodo/gvisor/pkg/metric/metric_go_proto"
	"github.com/MerlinKodo/gvisor/pkg/prometheus"
	"github.com/MerlinKodo/gvisor/pkg/sentry/control"
	"github.com/MerlinKodo/gvisor/pkg/sentry/platform"
	"github.com/MerlinKodo/gvisor/pkg/sentry/seccheck"
	"github.com/MerlinKodo/gvisor/pkg/state/statefile"
	"github.com/MerlinKodo/gvisor/pkg/sync"
	"github.com/MerlinKodo/gvisor/pkg/urpc"
	"github.com/MerlinKodo/gvisor/runsc/boot"
	"github.com/MerlinKodo/gvisor/runsc/boot/procfs"
	"github.com/MerlinKodo/gvisor/runsc/cgroup"
	"github.com/MerlinKodo/gvisor/runsc/config"
	"github.com/MerlinKodo/gvisor/runsc/console"
	"github.com/MerlinKodo/gvisor/runsc/donation"
	"github.com/MerlinKodo/gvisor/runsc/specutils"
	"github.com/cenkalti/backoff"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/syndtr/gocapability/capability"
	"golang.org/x/sys/unix"
)

const (
	// podNameAnnotation is a pod annotation populated by containerd.
	// It contains the name of the pod that a sandbox is in when running in Kubernetes.
	podNameAnnotation = "io.kubernetes.cri.sandbox-name"

	// namespaceAnnotation is a pod annotation populated by containerd.
	// It contains the namespace of the pod that a sandbox is in when running in Kubernetes.
	namespaceAnnotation = "io.kubernetes.cri.sandbox-namespace"
)

// createControlSocket finds a location and creates the socket used to
// communicate with the sandbox. The socket is a UDS on the host filesystem.
//
// Note that abstract sockets are *not* used, because any user can connect to
// them. There is no file mode protecting abstract sockets.
func createControlSocket(rootDir, id string) (string, int, error) {
	name := fmt.Sprintf("runsc-%s.sock", id)

	// Only use absolute paths to guarantee resolution from anywhere.
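	// For example, with rootDir "/var/run/runsc" and id "abc" (illustrative
	// values), the first candidate path is "/var/run/runsc/runsc-abc.sock".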
	for _, dir := range []string{rootDir, "/var/run", "/run", "/tmp"} {
		path := filepath.Join(dir, name)
		log.Debugf("Attempting to create socket file %q", path)
		fd, err := server.CreateSocket(path)
		if err == nil {
			log.Debugf("Using socket file %q", path)
			return path, fd, nil
		}
		log.Debugf("Failed to create socket file %q: %v", path, err)
	}
	return "", -1, fmt.Errorf("unable to find location to write socket file")
}

// pid is an atomic type that implements JSON marshal/unmarshal interfaces.
type pid struct {
	val atomicbitops.Int64
}

func (p *pid) store(pid int) {
	p.val.Store(int64(pid))
}

func (p *pid) load() int {
	return int(p.val.Load())
}

// UnmarshalJSON implements json.Unmarshaler.UnmarshalJSON.
func (p *pid) UnmarshalJSON(b []byte) error {
	var pid int
	if err := json.Unmarshal(b, &pid); err != nil {
		return err
	}
	p.store(pid)
	return nil
}

// MarshalJSON implements json.Marshaler.MarshalJSON.
func (p *pid) MarshalJSON() ([]byte, error) {
	return json.Marshal(p.load())
}

// Sandbox wraps a sandbox process.
//
// It is used to start/stop the sandbox process (and associated processes like
// gofers), as well as for running and manipulating containers inside a running
// sandbox.
//
// Note: Sandbox must be immutable because a copy of it is saved for each
// container and changes would not be synchronized to all of them.
type Sandbox struct {
	// ID is the id of the sandbox (immutable). By convention, this is the same
	// ID as the first container run in the sandbox.
	ID string `json:"id"`

	// PodName is the name of the Kubernetes Pod (if any) that this sandbox
	// represents. Unset if not running under containerd or Kubernetes.
	PodName string `json:"podName"`

	// Namespace is the Kubernetes namespace (if any) of the pod that this
	// sandbox represents. Unset if not running under containerd or Kubernetes.
	Namespace string `json:"namespace"`

	// Pid is the pid of the running sandbox. May be 0 if the sandbox
	// is not running.
	Pid pid `json:"pid"`

	// UID is the user ID in the parent namespace that the sandbox is running as.
	UID int `json:"uid"`
	// GID is the group ID in the parent namespace that the sandbox is running as.
	GID int `json:"gid"`

	// CgroupJSON contains the cgroup configuration that the sandbox is part of
	// and allows serialization of the configuration into JSON.
	CgroupJSON cgroup.CgroupJSON `json:"cgroup"`

	// OriginalOOMScoreAdj stores the value of oom_score_adj when the sandbox
	// started, before it may be modified.
	OriginalOOMScoreAdj int `json:"originalOomScoreAdj"`

	// RegisteredMetrics is the set of metrics registered in the sandbox.
	// Used for verifying metric data integrity after containers are started.
	// Only populated if exporting metrics was requested when the sandbox was
	// created.
	RegisteredMetrics *metricpb.MetricRegistration `json:"registeredMetrics"`

	// MetricMetadata are key-value pairs that are useful to export about this
	// sandbox, but not part of the set of labels that uniquely identify it.
	// They are static once initialized, and typically contain high-level
	// configuration information about the sandbox.
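	// (Hypothetical example of contents: {"platform": "...", "network": "..."};
	// the exact keys depend on the runsc configuration.)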
	MetricMetadata map[string]string `json:"metricMetadata"`

	// MetricServerAddress is the address of the metric server that this sandbox
	// intends to export metrics for.
	// Only populated if exporting metrics was requested when the sandbox was
	// created.
	MetricServerAddress string `json:"metricServerAddress"`

	// ControlSocketPath is the path to the sandbox's uRPC server socket.
	// Connections to the sandbox are made through this.
	ControlSocketPath string `json:"controlSocketPath"`

	// MountHints provides extra information about container mounts that apply
	// to the entire pod.
	MountHints *boot.PodMountHints `json:"mountHints"`

	// child is set if the sandbox process is a child of the current process.
	//
	// This field isn't saved to JSON, because only the creator of the sandbox
	// will have it as a child process.
	child bool `nojson:"true"`

	// statusMu protects status.
	statusMu sync.Mutex `nojson:"true"`

	// status is the exit status of the sandbox process. It's only set if
	// child==true and the sandbox was waited on. This field allows multiple
	// threads to wait on the sandbox and get its exit code, since Linux
	// returns the WaitStatus to only one of the waiters.
	status unix.WaitStatus `nojson:"true"`
}

// Getpid returns the process ID of the sandbox process.
func (s *Sandbox) Getpid() int {
	return s.Pid.load()
}

// Args is used to configure a new sandbox.
type Args struct {
	// ID is the sandbox unique identifier.
	ID string

	// Spec is the OCI spec that describes the container.
	Spec *specs.Spec

	// BundleDir is the directory containing the container bundle.
	BundleDir string

	// ConsoleSocket is the path to a unix domain socket that will receive
	// the console FD. It may be empty.
	ConsoleSocket string

	// UserLog is the filename to send user-visible logs to. It may be empty.
	UserLog string

	// IOFiles is the list of files that connect to a gofer endpoint for the
	// mount points using gofers. They must be in the same order as the mounts
	// appear in the spec.
	IOFiles []*os.File

	// OverlayFilestoreFiles are the regular files that will back the tmpfs upper
	// mount in the overlay mounts.
	OverlayFilestoreFiles []*os.File

	// OverlayMediums contains information about how the gofer mounts have been
	// overlaid. The first entry is for rootfs and the following entries are for
	// bind mounts in Spec.Mounts (in the same order).
	OverlayMediums boot.OverlayMediumFlags

	// MountHints provides extra information about container mounts that apply
	// to the entire pod.
	MountHints *boot.PodMountHints

	// MountsFile is a file containing the mount information from the spec.
	// It's equivalent to the mounts from the spec, except that all paths have
	// been resolved to their final absolute location.
	MountsFile *os.File

	// Cgroup is the cgroup that the sandbox is part of.
	Cgroup cgroup.Cgroup

	// Attached indicates that the sandbox lifecycle is attached to the caller.
	// If the caller exits, the sandbox should exit too.
	Attached bool

	// SinkFiles is an ordered list of files to be used by seccheck sinks
	// configured from the --pod-init-config file.
	SinkFiles []*os.File

	// PassFiles are user-supplied files from the host to be exposed to the
	// sandboxed app.
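	// Each map key is assumed here to be the FD number at which the
	// corresponding file is made available inside the sandbox (see the
	// DonateAndTransferCustomFiles call in createSandboxProcess below).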
	PassFiles map[int]*os.File

	// ExecFile is the file from the host used for program execution.
	ExecFile *os.File

	// NvidiaDevMinors is the list of device minors for Nvidia GPU devices
	// exposed to the sandbox.
	NvidiaDevMinors boot.NvidiaDevMinors
}

// New creates the sandbox process. The caller must call Destroy() on the
// sandbox.
func New(conf *config.Config, args *Args) (*Sandbox, error) {
	s := &Sandbox{
		ID: args.ID,
		CgroupJSON: cgroup.CgroupJSON{
			Cgroup: args.Cgroup,
		},
		UID:                 -1, // prevent usage before it's set.
		GID:                 -1, // prevent usage before it's set.
		MetricMetadata:      conf.MetricMetadata(),
		MetricServerAddress: conf.MetricServer,
		MountHints:          args.MountHints,
	}
	if args.Spec != nil && args.Spec.Annotations != nil {
		s.PodName = args.Spec.Annotations[podNameAnnotation]
		s.Namespace = args.Spec.Annotations[namespaceAnnotation]
	}

	// The Cleanup object cleans up partially created sandboxes when an error
	// occurs. Any errors occurring during cleanup itself are ignored.
	c := cleanup.Make(func() {
		if err := s.destroy(); err != nil {
			log.Warningf("error destroying sandbox: %v", err)
		}
	})
	defer c.Clean()

	if len(conf.PodInitConfig) > 0 {
		initConf, err := boot.LoadInitConfig(conf.PodInitConfig)
		if err != nil {
			return nil, fmt.Errorf("loading init config file: %w", err)
		}
		args.SinkFiles, err = initConf.Setup()
		if err != nil {
			return nil, fmt.Errorf("cannot init config: %w", err)
		}
	}

	// Create a pipe to synchronize when the sandbox process has been booted.
	clientSyncFile, sandboxSyncFile, err := os.Pipe()
	if err != nil {
		return nil, fmt.Errorf("creating pipe for sandbox %q: %v", s.ID, err)
	}
	defer clientSyncFile.Close()

	// Create the sandbox process.
	err = s.createSandboxProcess(conf, args, sandboxSyncFile)
	// sandboxSyncFile has to be closed to be able to detect when the sandbox
	// process exits unexpectedly.
	sandboxSyncFile.Close()
	if err != nil {
		return nil, fmt.Errorf("cannot create sandbox process: %w", err)
	}

	// Wait until the sandbox has booted.
	b := make([]byte, 1)
	if l, err := clientSyncFile.Read(b); err != nil || l != 1 {
		err := fmt.Errorf("waiting for sandbox to start: %v", err)
		// If the sandbox failed to start, it may be because the binary
		// permissions were incorrect. Check the bits and return a more helpful
		// error message.
		//
		// NOTE: The error message is checked because error types are lost over
		// rpc calls.
		if strings.Contains(err.Error(), io.EOF.Error()) {
			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
				return nil, fmt.Errorf("%v: %v", err, permsErr)
			}
		}
		return nil, fmt.Errorf("cannot read client sync file: %w", err)
	}

	if conf.MetricServer != "" {
		// The control server is up and the sandbox was configured to export
		// metrics. We must gather data about registered metrics prior to any
		// process starting in the sandbox.
		log.Debugf("Getting metric registration information from sandbox %q", s.ID)
		var registeredMetrics control.MetricsRegistrationResponse
		if err := s.call(boot.MetricsGetRegistered, nil, &registeredMetrics); err != nil {
			return nil, fmt.Errorf("cannot get registered metrics: %v", err)
		}
		s.RegisteredMetrics = registeredMetrics.RegisteredMetrics
	}

	c.Release()
	return s, nil
}

// CreateSubcontainer creates a container inside the sandbox.
func (s *Sandbox) CreateSubcontainer(conf *config.Config, cid string, tty *os.File) error {
	log.Debugf("Create sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid.load())

	var files []*os.File
	if tty != nil {
		files = []*os.File{tty}
	}
	if err := s.configureStdios(conf, files); err != nil {
		return err
	}

	args := boot.CreateArgs{
		CID:         cid,
		FilePayload: urpc.FilePayload{Files: files},
	}
	if err := s.call(boot.ContMgrCreateSubcontainer, &args, nil); err != nil {
		return fmt.Errorf("creating sub-container %q: %w", cid, err)
	}
	return nil
}

// StartRoot starts running the root container process inside the sandbox.
func (s *Sandbox) StartRoot(conf *config.Config) error {
	pid := s.Pid.load()
	log.Debugf("Start root sandbox %q, PID: %d", s.ID, pid)
	conn, err := s.sandboxConnect()
	if err != nil {
		return err
	}
	defer conn.Close()

	// Configure the network.
	if err := setupNetwork(conn, pid, conf); err != nil {
		return fmt.Errorf("setting up network: %w", err)
	}

	// Send a message to the sandbox control server to start the root container.
	if err := conn.Call(boot.ContMgrRootContainerStart, &s.ID, nil); err != nil {
		return fmt.Errorf("starting root container: %w", err)
	}

	return nil
}

// StartSubcontainer starts running a sub-container inside the sandbox.
func (s *Sandbox) StartSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles, overlayFilestoreFiles []*os.File, overlayMediums []boot.OverlayMedium) error {
	log.Debugf("Start sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid.load())

	if err := s.configureStdios(conf, stdios); err != nil {
		return err
	}
	s.fixPidns(spec)

	// The payload contains (in this specific order):
	// * stdin/stdout/stderr (optional: only present when not using TTY)
	// * The subcontainer's overlay filestore files (optional: only present when
	//   host file backed overlay is configured)
	// * Gofer files.
	payload := urpc.FilePayload{}
	payload.Files = append(payload.Files, stdios...)
	payload.Files = append(payload.Files, overlayFilestoreFiles...)
	payload.Files = append(payload.Files, goferFiles...)

	// Start running the container.
	args := boot.StartArgs{
		Spec:                   spec,
		Conf:                   conf,
		CID:                    cid,
		NumOverlayFilestoreFDs: len(overlayFilestoreFiles),
		OverlayMediums:         overlayMediums,
		FilePayload:            payload,
	}
	if err := s.call(boot.ContMgrStartSubcontainer, &args, nil); err != nil {
		return fmt.Errorf("starting sub-container %v: %w", spec.Process.Args, err)
	}
	return nil
}

// Restore sends the restore call for a container in the sandbox.
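// A typical flow (sketch; assumes a statefile previously produced by the
// Checkpoint method below):
//
//	err := s.Restore(conf, cid, "/path/to/statefile")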
func (s *Sandbox) Restore(conf *config.Config, cid string, filename string) error {
	log.Debugf("Restore sandbox %q", s.ID)

	rf, err := os.Open(filename)
	if err != nil {
		return fmt.Errorf("opening restore file %q failed: %v", filename, err)
	}
	defer rf.Close()

	opt := boot.RestoreOpts{
		FilePayload: urpc.FilePayload{
			Files: []*os.File{rf},
		},
		SandboxID: s.ID,
	}

	// If the platform needs a device FD we must pass it in.
	if deviceFile, err := deviceFileForPlatform(conf.Platform, conf.PlatformDevicePath); err != nil {
		return err
	} else if deviceFile != nil {
		defer deviceFile.Close()
		opt.FilePayload.Files = append(opt.FilePayload.Files, deviceFile)
	}

	conn, err := s.sandboxConnect()
	if err != nil {
		return err
	}
	defer conn.Close()

	// Configure the network.
	if err := setupNetwork(conn, s.Pid.load(), conf); err != nil {
		return fmt.Errorf("setting up network: %v", err)
	}

	// Restore the container and start the root container.
	if err := conn.Call(boot.ContMgrRestore, &opt, nil); err != nil {
		return fmt.Errorf("restoring container %q: %v", cid, err)
	}

	return nil
}

// Processes retrieves the list of processes and associated metadata for a
// given container in this sandbox.
func (s *Sandbox) Processes(cid string) ([]*control.Process, error) {
	log.Debugf("Getting processes for container %q in sandbox %q", cid, s.ID)
	var pl []*control.Process
	if err := s.call(boot.ContMgrProcesses, &cid, &pl); err != nil {
		return nil, fmt.Errorf("retrieving process data from sandbox: %v", err)
	}
	return pl, nil
}

// CreateTraceSession creates a new trace session.
func (s *Sandbox) CreateTraceSession(config *seccheck.SessionConfig, force bool) error {
	log.Debugf("Creating trace session in sandbox %q", s.ID)

	sinkFiles, err := seccheck.SetupSinks(config.Sinks)
	if err != nil {
		return err
	}
	defer func() {
		for _, f := range sinkFiles {
			_ = f.Close()
		}
	}()

	arg := boot.CreateTraceSessionArgs{
		Config: *config,
		Force:  force,
		FilePayload: urpc.FilePayload{
			Files: sinkFiles,
		},
	}
	if err := s.call(boot.ContMgrCreateTraceSession, &arg, nil); err != nil {
		return fmt.Errorf("creating trace session: %w", err)
	}
	return nil
}

// DeleteTraceSession deletes an existing trace session.
func (s *Sandbox) DeleteTraceSession(name string) error {
	log.Debugf("Deleting trace session %q in sandbox %q", name, s.ID)
	if err := s.call(boot.ContMgrDeleteTraceSession, name, nil); err != nil {
		return fmt.Errorf("deleting trace session: %w", err)
	}
	return nil
}

// ListTraceSessions lists all trace sessions.
func (s *Sandbox) ListTraceSessions() ([]seccheck.SessionConfig, error) {
	log.Debugf("Listing trace sessions in sandbox %q", s.ID)
	var sessions []seccheck.SessionConfig
	if err := s.call(boot.ContMgrListTraceSessions, nil, &sessions); err != nil {
		return nil, fmt.Errorf("listing trace sessions: %w", err)
	}
	return sessions, nil
}

// ProcfsDump collects and returns a procfs dump for the sandbox.
func (s *Sandbox) ProcfsDump() ([]procfs.ProcessProcfsDump, error) {
	log.Debugf("Procfs dump %q", s.ID)
	var procfsDump []procfs.ProcessProcfsDump
	if err := s.call(boot.ContMgrProcfsDump, nil, &procfsDump); err != nil {
		return nil, fmt.Errorf("getting sandbox %q procfs dump: %w", s.ID, err)
	}
	return procfsDump, nil
}

// NewCGroup returns the sandbox's Cgroup, or an error if it does not have one.
func (s *Sandbox) NewCGroup() (cgroup.Cgroup, error) {
	return cgroup.NewFromPid(s.Pid.load(), false /* useSystemd */)
}

// Execute runs the specified command in the container. It returns the PID of
// the newly created process.
func (s *Sandbox) Execute(conf *config.Config, args *control.ExecArgs) (int32, error) {
	log.Debugf("Executing new process in container %q in sandbox %q", args.ContainerID, s.ID)

	// Stdios are those files which have an FD <= 2 in the process. We do not
	// want the ownership of other files to be changed by configureStdios.
	var stdios []*os.File
	for i, fd := range args.GuestFDs {
		if fd > 2 || i >= len(args.Files) {
			continue
		}
		stdios = append(stdios, args.Files[i])
	}

	if err := s.configureStdios(conf, stdios); err != nil {
		return 0, err
	}

	// Send a message to the sandbox control server to start the container.
	var pid int32
	if err := s.call(boot.ContMgrExecuteAsync, args, &pid); err != nil {
		return 0, fmt.Errorf("executing command %q in sandbox: %w", args, err)
	}
	return pid, nil
}

// Event retrieves stats about the sandbox such as memory and CPU utilization.
func (s *Sandbox) Event(cid string) (*boot.EventOut, error) {
	log.Debugf("Getting events for container %q in sandbox %q", cid, s.ID)
	var e boot.EventOut
	if err := s.call(boot.ContMgrEvent, &cid, &e); err != nil {
		return nil, fmt.Errorf("retrieving event data from sandbox: %w", err)
	}
	return &e, nil
}

// PortForward starts port forwarding to the sandbox.
func (s *Sandbox) PortForward(opts *boot.PortForwardOpts) error {
	log.Debugf("Requesting port forward for container %q in sandbox %q: %+v", opts.ContainerID, s.ID, opts)
	conn, err := s.sandboxConnect()
	if err != nil {
		return err
	}
	defer conn.Close()

	if err := conn.Call(boot.ContMgrPortForward, opts, nil); err != nil {
		return fmt.Errorf("port forwarding to sandbox: %v", err)
	}

	return nil
}

func (s *Sandbox) sandboxConnect() (*urpc.Client, error) {
	log.Debugf("Connecting to sandbox %q", s.ID)
	path := s.ControlSocketPath
	if len(path) >= linux.UnixPathMax {
		// This is not an abstract socket path. It is a filesystem path.
		// UDS connect fails when len(socket path) >= UNIX_PATH_MAX. Instead,
		// open the socket using open(2) and use /proc to refer to the open FD.
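		// For example, the dialed path then becomes "/proc/self/fd/7"
		// (illustrative FD number), which is always short enough.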
		sockFD, err := unix.Open(path, unix.O_PATH, 0)
		if err != nil {
			return nil, fmt.Errorf("failed to open socket at %q", path)
		}
		defer unix.Close(sockFD)
		path = filepath.Join("/proc/self/fd", fmt.Sprintf("%d", sockFD))
	}
	conn, err := client.ConnectTo(path)
	if err != nil {
		return nil, s.connError(err)
	}
	return conn, nil
}

func (s *Sandbox) call(method string, arg, result any) error {
	conn, err := s.sandboxConnect()
	if err != nil {
		return err
	}
	defer conn.Close()

	return conn.Call(method, arg, result)
}

func (s *Sandbox) connError(err error) error {
	return fmt.Errorf("connecting to control server at PID %d: %v", s.Pid.load(), err)
}

// createSandboxProcess starts the sandbox as a subprocess by running the "boot"
// command, passing in the bundle dir.
func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyncFile *os.File) error {
	donations := donation.Agency{}
	defer donations.Close()

	// pgalloc.MemoryFile (which provides application memory) sometimes briefly
	// mlock(2)s ranges of memory in order to fault in a large number of pages at
	// a time. Try to make RLIMIT_MEMLOCK unlimited so that it can do so. runsc
	// expects to run in a memory cgroup that limits its memory usage as
	// required.
	// This needs to be done before exec'ing `runsc boot`, as that subcommand
	// runs as an unprivileged user that will not be able to call `setrlimit`
	// by itself. Calling `setrlimit` here will have the side effect of setting
	// the limit on the currently-running `runsc` process as well, but that
	// should be OK too.
	var rlim unix.Rlimit
	if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil {
		log.Warningf("Failed to get RLIMIT_MEMLOCK: %v", err)
	} else if rlim.Cur != unix.RLIM_INFINITY || rlim.Max != unix.RLIM_INFINITY {
		rlim.Cur = unix.RLIM_INFINITY
		rlim.Max = unix.RLIM_INFINITY
		if err := unix.Setrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil {
			// We may not have CAP_SYS_RESOURCE, so this failure may be expected.
			log.Infof("Failed to set RLIMIT_MEMLOCK: %v", err)
		}
	}

	//
	// These flags must come BEFORE the "boot" command in cmd.Args.
	//

	// Open the log files to pass to the sandbox as FDs.
	if err := donations.OpenAndDonate("log-fd", conf.LogFilename, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil {
		return err
	}

	test := ""
	if len(conf.TestOnlyTestNameEnv) != 0 {
		// Fetch the test name if one is provided and the test-only flag was set.
		if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok {
			test = t
		}
	}
	if specutils.IsDebugCommand(conf, "boot") {
		if err := donations.DonateDebugLogFile("debug-log-fd", conf.DebugLog, "boot", test); err != nil {
			return err
		}
	}
	if err := donations.DonateDebugLogFile("panic-log-fd", conf.PanicLog, "panic", test); err != nil {
		return err
	}
	covFilename := conf.CoverageReport
	if covFilename == "" {
		covFilename = os.Getenv("GO_COVERAGE_FILE")
	}
	if covFilename != "" && coverage.Available() {
		if err := donations.DonateDebugLogFile("coverage-fd", covFilename, "cov", test); err != nil {
			return err
		}
	}
	if err := donations.DonateDebugLogFile("profiling-metrics-fd", conf.ProfilingMetricsLog, "metrics", test); err != nil {
		return err
	}

	// Relay all the config flags to the sandbox process.
	cmd := exec.Command(specutils.ExePath, conf.ToFlags()...)
	cmd.SysProcAttr = &unix.SysProcAttr{
		// Detach from this session, otherwise cmd will get SIGHUP and SIGCONT
		// when re-parented.
		Setsid: true,
	}

	// Set Args[0] to make it easier to spot the sandbox process. Otherwise it's
	// shown as `exe`.
	cmd.Args[0] = "runsc-sandbox"

	// Transfer FDs that need to be present before the "boot" command.
	// Start at 3 because 0, 1, and 2 are taken by stdin/out/err.
	nextFD := donations.Transfer(cmd, 3)

	// Add the "boot" command to the args.
	//
	// All flags after this must be for the boot command.
	cmd.Args = append(cmd.Args, "boot", "--bundle="+args.BundleDir)

	// Clear environment variables, unless --TESTONLY-unsafe-nonroot is set.
	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
		// Setting cmd.Env = nil causes cmd to inherit the current process's env.
		cmd.Env = []string{}
	}

	// If there is a gofer, send all socket ends to the sandbox.
	donations.DonateAndClose("io-fds", args.IOFiles...)
	donations.DonateAndClose("overlay-filestore-fds", args.OverlayFilestoreFiles...)
	donations.DonateAndClose("mounts-fd", args.MountsFile)
	donations.Donate("start-sync-fd", startSyncFile)
	if err := donations.OpenAndDonate("user-log-fd", args.UserLog, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil {
		return err
	}
	const profFlags = os.O_CREATE | os.O_WRONLY | os.O_TRUNC
	if err := donations.OpenAndDonate("profile-block-fd", conf.ProfileBlock, profFlags); err != nil {
		return err
	}
	if err := donations.OpenAndDonate("profile-cpu-fd", conf.ProfileCPU, profFlags); err != nil {
		return err
	}
	if err := donations.OpenAndDonate("profile-heap-fd", conf.ProfileHeap, profFlags); err != nil {
		return err
	}
	if err := donations.OpenAndDonate("profile-mutex-fd", conf.ProfileMutex, profFlags); err != nil {
		return err
	}
	if err := donations.OpenAndDonate("trace-fd", conf.TraceFile, profFlags); err != nil {
		return err
	}

	// Pass Nvidia device minors.
	if len(args.NvidiaDevMinors) > 0 {
		cmd.Args = append(cmd.Args, "--nvidia-dev-minors="+args.NvidiaDevMinors.String())
	}

	// Pass overlay mediums.
	cmd.Args = append(cmd.Args, "--overlay-mediums="+args.OverlayMediums.String())

	// Create a socket for the control server and donate it to the sandbox.
	controlSocketPath, sockFD, err := createControlSocket(conf.RootDir, s.ID)
	if err != nil {
		return fmt.Errorf("failed to create control socket: %v", err)
	}
	s.ControlSocketPath = controlSocketPath
	log.Infof("Control socket path: %q", s.ControlSocketPath)
	donations.DonateAndClose("controller-fd", os.NewFile(uintptr(sockFD), "control_server_socket"))

	specFile, err := specutils.OpenSpec(args.BundleDir)
	if err != nil {
		return fmt.Errorf("cannot open spec file in bundle dir %v: %w", args.BundleDir, err)
	}
	donations.DonateAndClose("spec-fd", specFile)

	if err := donations.OpenAndDonate("pod-init-config-fd", conf.PodInitConfig, os.O_RDONLY); err != nil {
		return err
	}
	donations.DonateAndClose("sink-fds", args.SinkFiles...)

	gPlatform, err := platform.Lookup(conf.Platform)
	if err != nil {
		return fmt.Errorf("cannot look up platform: %w", err)
	}
	if deviceFile, err := gPlatform.OpenDevice(conf.PlatformDevicePath); err != nil {
		return fmt.Errorf("opening device file for platform %q: %v", conf.Platform, err)
	} else if deviceFile != nil {
		donations.DonateAndClose("device-fd", deviceFile)
	}

	// TODO(b/151157106): syscall tests fail by timeout if asyncpreemptoff
	// isn't set.
	if conf.Platform == "kvm" {
		cmd.Env = append(cmd.Env, "GODEBUG=asyncpreemptoff=1")
	}

	// nss is the set of namespaces to join or create before starting the sandbox
	// process. Mount, IPC and UTS namespaces from the host are not used as they
	// are virtualized inside the sandbox. Be paranoid and run inside an empty
	// namespace for these. Don't unshare the cgroup namespace because the
	// sandbox is added to a cgroup in the caller's namespace.
	log.Infof("Sandbox will be started in new mount, IPC and UTS namespaces")
	nss := []specs.LinuxNamespace{
		{Type: specs.IPCNamespace},
		{Type: specs.MountNamespace},
		{Type: specs.UTSNamespace},
	}

	if gPlatform.Requirements().RequiresCurrentPIDNS {
		// TODO(b/75837838): Also set a new PID namespace so that we limit
		// access to other host processes.
		log.Infof("Sandbox will be started in the current PID namespace")
	} else {
		log.Infof("Sandbox will be started in a new PID namespace")
		nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace})
		cmd.Args = append(cmd.Args, "--pidns=true")
	}

	// Join the network namespace if network is enabled. The sandbox talks
	// directly to the host network, which may have been configured in the
	// namespace.
	if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != config.NetworkNone {
		log.Infof("Sandbox will be started in the container's network namespace: %+v", ns)
		nss = append(nss, ns)
	} else if conf.Network == config.NetworkHost {
		log.Infof("Sandbox will be started in the host network namespace")
	} else {
		log.Infof("Sandbox will be started in new network namespace")
		nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace})
	}

	// These are set to the uid/gid that the sandbox process will use. May be
	// overridden below.
	s.UID = os.Getuid()
	s.GID = os.Getgid()

	// The user namespace setup depends on the network type and on whether
	// access to the host filesystem is required. These features require running
	// inside the user namespace specified in the spec, or in the current
	// namespace if none is configured.
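	// Informal summary of the cases handled below (derived from the branches
	// that follow):
	//   - Host networking or DirectFS with a userns in the spec: join that
	//     userns; rootless mode additionally synchronizes ID mappings via
	//     newuidmap/newgidmap (see SetUserMappings).
	//   - Host networking or DirectFS without a userns: requires a non-rootless
	//     (root) invocation.
	//   - Otherwise: create a new userns and run as user/group nobody, which
	//     requires CAP_SETUID/CAP_SETGID (or rootless mode).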
	rootlessEUID := unix.Geteuid() != 0
	setUserMappings := false
	if conf.Network == config.NetworkHost || conf.DirectFS {
		if userns, ok := specutils.GetNS(specs.UserNamespace, args.Spec); ok {
			log.Infof("Sandbox will be started in container's user namespace: %+v", userns)
			nss = append(nss, userns)
			if rootlessEUID {
				syncFile, err := ConfigureCmdForRootless(cmd, &donations)
				if err != nil {
					return err
				}
				defer syncFile.Close()
				setUserMappings = true
			} else {
				specutils.SetUIDGIDMappings(cmd, args.Spec)
				// We need to set UID and GID to have capabilities in a new user namespace.
				cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0}
			}
		} else {
			if rootlessEUID {
				return fmt.Errorf("unable to run a rootless container without userns")
			}
			log.Infof("Sandbox will be started in the current user namespace")
		}
		// When running in the caller's defined user namespace, apply the same
		// capabilities to the sandbox process to ensure it abides by the same
		// rules.
		cmd.Args = append(cmd.Args, "--apply-caps=true")

		// If we have CAP_SYS_ADMIN, we can create an empty chroot and
		// bind-mount the executable inside it.
		if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
			log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!")
		} else if specutils.HasCapabilities(capability.CAP_SYS_ADMIN) || rootlessEUID {
			log.Infof("Sandbox will be started in minimal chroot")
			cmd.Args = append(cmd.Args, "--setup-root")
		} else {
			return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN")
		}
	} else {
		// If we have CAP_SETUID and CAP_SETGID, then we can also run
		// as user nobody.
		if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
			log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid())
			log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!")
		} else if rootlessEUID || specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) {
			log.Infof("Sandbox will be started in new user namespace")
			nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace})
			cmd.Args = append(cmd.Args, "--setup-root")

			const nobody = 65534
			if rootlessEUID || conf.Rootless {
				log.Infof("Rootless mode: sandbox will run as nobody inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid())
			} else {
				// Map nobody in the new namespace to nobody in the parent namespace.
				s.UID = nobody
				s.GID = nobody
			}

			// Set credentials to run as user and group nobody.
			cmd.SysProcAttr.Credential = &syscall.Credential{Uid: nobody, Gid: nobody}
			cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{
				{
					ContainerID: nobody,
					HostID:      s.UID,
					Size:        1,
				},
			}
			cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
				{
					ContainerID: nobody,
					HostID:      s.GID,
					Size:        1,
				},
			}

			// The sandbox process will construct an empty root for itself, so it
			// has to have the CAP_SYS_ADMIN and CAP_SYS_CHROOT capabilities.
			cmd.SysProcAttr.AmbientCaps = append(cmd.SysProcAttr.AmbientCaps,
				uintptr(capability.CAP_SYS_ADMIN),
				uintptr(capability.CAP_SYS_CHROOT),
				// CAP_SETPCAP is required to clear the bounding set.
				uintptr(capability.CAP_SETPCAP),
			)
		} else {
			return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID")
		}
	}

	// The current process' stdio must be passed to the application via the
	// --stdio-fds flag. The stdio of the sandbox process itself must not
	// be connected to the same FDs, otherwise we risk leaking sandbox
	// errors to the application, so we set the sandbox stdio to nil,
	// causing them to read/write from the null device.
	cmd.Stdin = nil
	cmd.Stdout = nil
	cmd.Stderr = nil
	var stdios [3]*os.File

	// If the console control socket file is provided, then create a new
	// pty master/replica pair and set the TTY on the sandbox process.
	if args.Spec.Process.Terminal && args.ConsoleSocket != "" {
		// console.NewWithSocket will send the master on the given
		// socket, and return the replica.
		tty, err := console.NewWithSocket(args.ConsoleSocket)
		if err != nil {
			return fmt.Errorf("setting up console with socket %q: %v", args.ConsoleSocket, err)
		}
		defer tty.Close()

		// Set the TTY as a controlling TTY on the sandbox process.
		cmd.SysProcAttr.Setctty = true

		// Inconveniently, the Ctty must be the FD in the *child* process's FD
		// table. So transfer all files we have so far and make sure the next file
		// added to donations is stdin.
		//
		// See https://github.com/golang/go/issues/29458.
		nextFD = donations.Transfer(cmd, nextFD)
		cmd.SysProcAttr.Ctty = nextFD

		// Pass the tty as all stdio fds to the sandbox.
		stdios[0] = tty
		stdios[1] = tty
		stdios[2] = tty

		if conf.Debug {
			// If debugging, send the boot process stdio to the
			// TTY, so that it is easier to find.
			cmd.Stdin = tty
			cmd.Stdout = tty
			cmd.Stderr = tty
		}
	} else {
		// If not using a console, pass our current stdio as the
		// container stdio via flags.
		stdios[0] = os.Stdin
		stdios[1] = os.Stdout
		stdios[2] = os.Stderr

		if conf.Debug {
			// If debugging, send the boot process stdio to this
			// process' stdio, so that it is easier to find.
			cmd.Stdin = os.Stdin
			cmd.Stdout = os.Stdout
			cmd.Stderr = os.Stderr
		}
	}
	if err := s.configureStdios(conf, stdios[:]); err != nil {
		return fmt.Errorf("configuring stdios: %w", err)
	}
	// Note: this must be done right after "cmd.SysProcAttr.Ctty" is set above
	// because it relies on stdin being the next FD donated.
	donations.Donate("stdio-fds", stdios[:]...)

	totalSysMem, err := totalSystemMemory()
	if err != nil {
		return err
	}
	cmd.Args = append(cmd.Args, "--total-host-memory", strconv.FormatUint(totalSysMem, 10))

	mem := totalSysMem
	if s.CgroupJSON.Cgroup != nil {
		cpuNum, err := s.CgroupJSON.Cgroup.NumCPU()
		if err != nil {
			return fmt.Errorf("getting cpu count from cgroups: %v", err)
		}
		if conf.CPUNumFromQuota {
			// Dropping below 2 CPUs can trigger the application to disable
			// locks, which can lead to hard-to-debug errors, so just leave
			// two cores as a reasonable default.
			const minCPUs = 2

			quota, err := s.CgroupJSON.Cgroup.CPUQuota()
			if err != nil {
				return fmt.Errorf("getting cpu quota from cgroups: %v", err)
			}
			if n := int(math.Ceil(quota)); n > 0 {
				if n < minCPUs {
					n = minCPUs
				}
				if n < cpuNum {
					// Only lower the cpu number.
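					// Illustrative arithmetic: a quota of 2.5 CPUs rounds up
					// to 3, while a quota of 0.5 rounds up to 1 and is then
					// raised to minCPUs (2); the result never raises cpuNum.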
					cpuNum = n
				}
			}
		}
		cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum))

		memLimit, err := s.CgroupJSON.Cgroup.MemoryLimit()
		if err != nil {
			return fmt.Errorf("getting memory limit from cgroups: %v", err)
		}
		if memLimit < mem {
			mem = memLimit
		}
	}
	cmd.Args = append(cmd.Args, "--total-memory", strconv.FormatUint(mem, 10))

	if args.Attached {
		// Kill the sandbox if the parent process exits in attached mode.
		cmd.SysProcAttr.Pdeathsig = unix.SIGKILL
		// Tells boot that any process it creates must have pdeathsig set.
		cmd.Args = append(cmd.Args, "--attached")
	}

	if args.ExecFile != nil {
		donations.Donate("exec-fd", args.ExecFile)
	}

	nextFD = donations.Transfer(cmd, nextFD)

	_ = donation.DonateAndTransferCustomFiles(cmd, nextFD, args.PassFiles)

	// Add the container ID as the last argument.
	cmd.Args = append(cmd.Args, s.ID)

	donation.LogDonations(cmd)
	log.Debugf("Starting sandbox: %s %v", cmd.Path, cmd.Args)
	log.Debugf("SysProcAttr: %+v", cmd.SysProcAttr)
	if err := specutils.StartInNS(cmd, nss); err != nil {
		err := fmt.Errorf("starting sandbox: %v", err)
		// If the sandbox failed to start, it may be because the binary
		// permissions were incorrect. Check the bits and return a more helpful
		// error message.
		//
		// NOTE: The error message is checked because error types are lost over
		// rpc calls.
		if strings.Contains(err.Error(), unix.EACCES.Error()) {
			if permsErr := checkBinaryPermissions(conf); permsErr != nil {
				return fmt.Errorf("%v: %v", err, permsErr)
			}
		}
		return err
	}
	s.OriginalOOMScoreAdj, err = specutils.GetOOMScoreAdj(cmd.Process.Pid)
	if err != nil {
		return err
	}
	if setUserMappings {
		if err := SetUserMappings(args.Spec, cmd.Process.Pid); err != nil {
			return err
		}
	}

	s.child = true
	s.Pid.store(cmd.Process.Pid)
	log.Infof("Sandbox started, PID: %d", cmd.Process.Pid)

	return nil
}

// Wait waits for the containerized process to exit, and returns its WaitStatus.
func (s *Sandbox) Wait(cid string) (unix.WaitStatus, error) {
	log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID)

	if conn, err := s.sandboxConnect(); err != nil {
		// The sandbox may have exited before we had a chance to wait on it.
		// There is nothing we can do for subcontainers. For the init container,
		// we can try to get the sandbox exit code.
		if !s.IsRootContainer(cid) {
			return unix.WaitStatus(0), err
		}
		log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
	} else {
		defer conn.Close()

		// Try the Wait RPC to the sandbox.
		var ws unix.WaitStatus
		err = conn.Call(boot.ContMgrWait, &cid, &ws)
		conn.Close()
		if err == nil {
			if s.IsRootContainer(cid) {
				if err := s.waitForStopped(); err != nil {
					return unix.WaitStatus(0), err
				}
			}
			// It worked!
			return ws, nil
		}
		// See comment above.
		if !s.IsRootContainer(cid) {
			return unix.WaitStatus(0), err
		}

		// The sandbox may have exited after we connected, but before
		// or during the Wait RPC.
		log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err)
	}

	// The sandbox may have already exited, or exited while handling the Wait
	// RPC. The best we can do is ask Linux what the sandbox exit status was,
	// since in most cases that will be the same as the container exit status.
	if err := s.waitForStopped(); err != nil {
		return unix.WaitStatus(0), err
	}
	if !s.child {
		return unix.WaitStatus(0), fmt.Errorf("sandbox no longer running and its exit status is unavailable")
	}

	s.statusMu.Lock()
	defer s.statusMu.Unlock()
	return s.status, nil
}

// WaitPID waits for process 'pid' in the container's sandbox and returns its
// WaitStatus.
func (s *Sandbox) WaitPID(cid string, pid int32) (unix.WaitStatus, error) {
	log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID)
	var ws unix.WaitStatus
	args := &boot.WaitPIDArgs{
		PID: pid,
		CID: cid,
	}
	if err := s.call(boot.ContMgrWaitPID, args, &ws); err != nil {
		return ws, fmt.Errorf("waiting on PID %d in sandbox %q: %w", pid, s.ID, err)
	}
	return ws, nil
}

// IsRootContainer returns true if the specified container ID belongs to the
// root container.
func (s *Sandbox) IsRootContainer(cid string) bool {
	return s.ID == cid
}

// destroy frees all resources associated with the sandbox. It fails fast and
// is idempotent.
func (s *Sandbox) destroy() error {
	log.Debugf("Destroying sandbox %q", s.ID)
	// Only delete the control file if it exists.
	if len(s.ControlSocketPath) > 0 {
		if err := os.Remove(s.ControlSocketPath); err != nil {
			log.Warningf("failed to delete control socket file %q: %v", s.ControlSocketPath, err)
		}
	}
	pid := s.Pid.load()
	if pid != 0 {
		log.Debugf("Killing sandbox %q", s.ID)
		if err := unix.Kill(pid, unix.SIGKILL); err != nil && err != unix.ESRCH {
			return fmt.Errorf("killing sandbox %q PID %d: %w", s.ID, pid, err)
		}
		if err := s.waitForStopped(); err != nil {
			return fmt.Errorf("waiting for sandbox %q to stop: %w", s.ID, err)
		}
	}

	return nil
}

// SignalContainer sends the signal to a container in the sandbox. If all is
// true and the signal is SIGKILL, then it waits for all processes to exit
// before returning.
func (s *Sandbox) SignalContainer(cid string, sig unix.Signal, all bool) error {
	log.Debugf("Signal sandbox %q", s.ID)
	mode := boot.DeliverToProcess
	if all {
		mode = boot.DeliverToAllProcesses
	}

	args := boot.SignalArgs{
		CID:   cid,
		Signo: int32(sig),
		Mode:  mode,
	}
	if err := s.call(boot.ContMgrSignal, &args, nil); err != nil {
		return fmt.Errorf("signaling container %q: %w", cid, err)
	}
	return nil
}

// SignalProcess sends the signal to a particular process in the container. If
// fgProcess is true, then the signal is sent to the foreground process group
// in the same session that PID belongs to. This is only valid if the process
// is attached to a host TTY.
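//
// For example (sketch): SignalProcess(cid, 123, unix.SIGTERM, false) delivers
// SIGTERM only to PID 123, while fgProcess=true would instead target the
// foreground process group of the TTY session that PID 123 belongs to.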
func (s *Sandbox) SignalProcess(cid string, pid int32, sig unix.Signal, fgProcess bool) error {
	log.Debugf("Signal sandbox %q", s.ID)

	mode := boot.DeliverToProcess
	if fgProcess {
		mode = boot.DeliverToForegroundProcessGroup
	}

	args := boot.SignalArgs{
		CID:   cid,
		Signo: int32(sig),
		PID:   pid,
		Mode:  mode,
	}
	if err := s.call(boot.ContMgrSignal, &args, nil); err != nil {
		return fmt.Errorf("signaling container %q PID %d: %v", cid, pid, err)
	}
	return nil
}

// Checkpoint sends the checkpoint call for a container in the sandbox.
// The statefile will be written to f.
func (s *Sandbox) Checkpoint(cid string, f *os.File, options statefile.Options) error {
	log.Debugf("Checkpoint sandbox %q, options %+v", s.ID, options)
	opt := control.SaveOpts{
		Metadata: options.WriteToMetadata(map[string]string{}),
		FilePayload: urpc.FilePayload{
			Files: []*os.File{f},
		},
	}

	if err := s.call(boot.ContMgrCheckpoint, &opt, nil); err != nil {
		return fmt.Errorf("checkpointing container %q: %w", cid, err)
	}
	return nil
}

// Pause sends the pause call for a container in the sandbox.
func (s *Sandbox) Pause(cid string) error {
	log.Debugf("Pause sandbox %q", s.ID)
	if err := s.call(boot.LifecyclePause, nil, nil); err != nil {
		return fmt.Errorf("pausing container %q: %w", cid, err)
	}
	return nil
}

// Resume sends the resume call for a container in the sandbox.
func (s *Sandbox) Resume(cid string) error {
	log.Debugf("Resume sandbox %q", s.ID)
	if err := s.call(boot.LifecycleResume, nil, nil); err != nil {
		return fmt.Errorf("resuming container %q: %w", cid, err)
	}
	return nil
}

// Usage sends the collect call for a container in the sandbox.
func (s *Sandbox) Usage(Full bool) (control.MemoryUsage, error) {
	log.Debugf("Usage sandbox %q", s.ID)
	opts := control.MemoryUsageOpts{Full: Full}
	var m control.MemoryUsage
	if err := s.call(boot.UsageCollect, &opts, &m); err != nil {
		return control.MemoryUsage{}, fmt.Errorf("collecting usage: %w", err)
	}
	return m, nil
}

// UsageFD sends the usagefd call for a container in the sandbox.
func (s *Sandbox) UsageFD() (*control.MemoryUsageRecord, error) {
	log.Debugf("Usage sandbox %q", s.ID)
	opts := control.MemoryUsageFileOpts{Version: 1}
	var m control.MemoryUsageFile
	if err := s.call(boot.UsageUsageFD, &opts, &m); err != nil {
		return nil, fmt.Errorf("collecting usage FD: %w", err)
	}

	if len(m.FilePayload.Files) != 2 {
		return nil, fmt.Errorf("wants exactly two fds")
	}
	return control.NewMemoryUsageRecord(*m.FilePayload.Files[0], *m.FilePayload.Files[1])
}

// GetRegisteredMetrics returns metric registration data from the sandbox.
// This data is meant to be used as a way to sanity-check any exported metrics
// data during the lifetime of the sandbox in order to prevent a compromised
// sandbox from being able to produce bogus metrics.
// This returns an error if the sandbox did not request instrumentation at
// creation time.
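// The returned registration is typically compared against snapshots returned
// by ExportMetrics (below) so that tampered-with metric data can be detected.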
func (s *Sandbox) GetRegisteredMetrics() (*metricpb.MetricRegistration, error) {
	if s.RegisteredMetrics == nil {
		return nil, errors.New("sandbox did not request instrumentation when it was created")
	}
	return s.RegisteredMetrics, nil
}

// ExportMetrics returns a snapshot of metric values from the sandbox in Prometheus format.
func (s *Sandbox) ExportMetrics(opts control.MetricsExportOpts) (*prometheus.Snapshot, error) {
	log.Debugf("Metrics export sandbox %q", s.ID)
	var data control.MetricsExportData
	if err := s.call(boot.MetricsExport, &opts, &data); err != nil {
		return nil, err
	}
	// Since we do not trust the output of the sandbox as-is, double-check that
	// the options were respected.
	if err := opts.Verify(&data); err != nil {
		return nil, err
	}
	return data.Snapshot, nil
}

// IsRunning returns true if the sandbox or gofer process is running.
func (s *Sandbox) IsRunning() bool {
	pid := s.Pid.load()
	if pid != 0 {
		// Send a signal 0 to the sandbox process.
		if err := unix.Kill(pid, 0); err == nil {
			// Succeeded, process is running.
			return true
		}
	}
	return false
}

// Stacks collects and returns all stacks for the sandbox.
func (s *Sandbox) Stacks() (string, error) {
	log.Debugf("Stacks sandbox %q", s.ID)
	var stacks string
	if err := s.call(boot.DebugStacks, nil, &stacks); err != nil {
		return "", fmt.Errorf("getting sandbox %q stacks: %w", s.ID, err)
	}
	return stacks, nil
}

// HeapProfile writes a heap profile to the given file.
func (s *Sandbox) HeapProfile(f *os.File, delay time.Duration) error {
	log.Debugf("Heap profile %q", s.ID)
	opts := control.HeapProfileOpts{
		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
		Delay:       delay,
	}
	return s.call(boot.ProfileHeap, &opts, nil)
}

// CPUProfile collects a CPU profile.
func (s *Sandbox) CPUProfile(f *os.File, duration time.Duration) error {
	log.Debugf("CPU profile %q", s.ID)
	opts := control.CPUProfileOpts{
		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
		Duration:    duration,
	}
	return s.call(boot.ProfileCPU, &opts, nil)
}

// BlockProfile writes a block profile to the given file.
func (s *Sandbox) BlockProfile(f *os.File, duration time.Duration) error {
	log.Debugf("Block profile %q", s.ID)
	opts := control.BlockProfileOpts{
		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
		Duration:    duration,
	}
	return s.call(boot.ProfileBlock, &opts, nil)
}

// MutexProfile writes a mutex profile to the given file.
func (s *Sandbox) MutexProfile(f *os.File, duration time.Duration) error {
	log.Debugf("Mutex profile %q", s.ID)
	opts := control.MutexProfileOpts{
		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
		Duration:    duration,
	}
	return s.call(boot.ProfileMutex, &opts, nil)
}

// Trace collects an execution trace.
func (s *Sandbox) Trace(f *os.File, duration time.Duration) error {
	log.Debugf("Trace %q", s.ID)
	opts := control.TraceProfileOpts{
		FilePayload: urpc.FilePayload{Files: []*os.File{f}},
		Duration:    duration,
	}
	return s.call(boot.ProfileTrace, &opts, nil)
}

// ChangeLogging changes logging options.
func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error {
	log.Debugf("Change logging start %q", s.ID)
	if err := s.call(boot.LoggingChange, &args, nil); err != nil {
		return fmt.Errorf("changing sandbox %q logging: %w", s.ID, err)
	}
	return nil
}

// DestroyContainer destroys the given container. If it is the root container,
// then the entire sandbox is destroyed.
func (s *Sandbox) DestroyContainer(cid string) error {
	if err := s.destroyContainer(cid); err != nil {
		// If the sandbox isn't running, the container has already been
		// destroyed; ignore the error in this case.
		if s.IsRunning() {
			return err
		}
	}
	return nil
}

func (s *Sandbox) destroyContainer(cid string) error {
	if s.IsRootContainer(cid) {
		log.Debugf("Destroying root container by destroying sandbox, cid: %s", cid)
		return s.destroy()
	}

	log.Debugf("Destroying container, cid: %s, sandbox: %s", cid, s.ID)
	if err := s.call(boot.ContMgrDestroySubcontainer, &cid, nil); err != nil {
		return fmt.Errorf("destroying container %q: %w", cid, err)
	}
	return nil
}

func (s *Sandbox) waitForStopped() error {
	if s.child {
		s.statusMu.Lock()
		defer s.statusMu.Unlock()
		pid := s.Pid.load()
		if pid == 0 {
			return nil
		}
		// The sandbox process is a child of the current process,
		// so we can wait on it to terminate and collect its zombie.
		if _, err := unix.Wait4(int(pid), &s.status, 0, nil); err != nil {
			return fmt.Errorf("error waiting on the sandbox process: %v", err)
		}
		s.Pid.store(0)
		return nil
	}

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
	op := func() error {
		if s.IsRunning() {
			return fmt.Errorf("sandbox is still running")
		}
		return nil
	}
	return backoff.Retry(op, b)
}

// configureStdios changes the ownership of stdios to give access to the
// sandbox process. This may be skipped depending on the configuration.
func (s *Sandbox) configureStdios(conf *config.Config, stdios []*os.File) error {
	if conf.Rootless || conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
		// Cannot change ownership without CAP_CHOWN.
		return nil
	}

	if s.UID < 0 || s.GID < 0 {
		panic(fmt.Sprintf("sandbox UID/GID is not set: %d/%d", s.UID, s.GID))
	}
	for _, file := range stdios {
		log.Debugf("Changing %q ownership to %d/%d", file.Name(), s.UID, s.GID)
		if err := file.Chown(s.UID, s.GID); err != nil {
			if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) {
				log.Warningf("can't change the owner of %s: %s", file.Name(), err)
				continue
			}
			return err
		}
	}
	return nil
}

// deviceFileForPlatform opens the device file for the given platform. If the
// platform does not need a device file, then nil is returned.
// devicePath may be empty to use a sane platform-specific default.
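// For example, the KVM platform opens /dev/kvm when devicePath is empty
// (illustrative; the actual default is platform-specific).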
func deviceFileForPlatform(name, devicePath string) (*os.File, error) {
	p, err := platform.Lookup(name)
	if err != nil {
		return nil, err
	}

	f, err := p.OpenDevice(devicePath)
	if err != nil {
		return nil, fmt.Errorf("opening device file for platform %q: %w", name, err)
	}
	return f, nil
}

// checkBinaryPermissions verifies that the required binary bits are set on
// the runsc executable.
func checkBinaryPermissions(conf *config.Config) error {
	// All platforms need the other-execute bit.
	neededBits := os.FileMode(0001)
	if conf.Platform == "ptrace" {
		// Ptrace also needs the other-read bit.
		neededBits |= os.FileMode(0004)
	}

	exePath, err := os.Executable()
	if err != nil {
		return fmt.Errorf("getting exe path: %v", err)
	}

	// Check the permissions of the runsc binary and print an error if they
	// don't match expectations.
	info, err := os.Stat(exePath)
	if err != nil {
		return fmt.Errorf("stat file: %v", err)
	}

	if info.Mode().Perm()&neededBits != neededBits {
		return fmt.Errorf(specutils.FaqErrorMsg("runsc-perms", fmt.Sprintf("%s does not have the correct permissions", exePath)))
	}
	return nil
}

// CgroupsReadControlFile reads a single cgroupfs control file in the sandbox.
func (s *Sandbox) CgroupsReadControlFile(file control.CgroupControlFile) (string, error) {
	log.Debugf("CgroupsReadControlFiles sandbox %q", s.ID)
	args := control.CgroupsReadArgs{
		Args: []control.CgroupsReadArg{
			{
				File: file,
			},
		},
	}
	var out control.CgroupsResults
	if err := s.call(boot.CgroupsReadControlFiles, &args, &out); err != nil {
		return "", err
	}
	if len(out.Results) != 1 {
		return "", fmt.Errorf("expected 1 result, got %d, raw: %+v", len(out.Results), out)
	}
	return out.Results[0].Unpack()
}

// CgroupsWriteControlFile writes a single cgroupfs control file in the sandbox.
func (s *Sandbox) CgroupsWriteControlFile(file control.CgroupControlFile, value string) error {
	log.Debugf("CgroupsWriteControlFiles sandbox %q", s.ID)
	args := control.CgroupsWriteArgs{
		Args: []control.CgroupsWriteArg{
			{
				File:  file,
				Value: value,
			},
		},
	}
	var out control.CgroupsResults
	if err := s.call(boot.CgroupsWriteControlFiles, &args, &out); err != nil {
		return err
	}
	if len(out.Results) != 1 {
		return fmt.Errorf("expected 1 result, got %d, raw: %+v", len(out.Results), out)
	}
	return out.Results[0].AsError()
}

// fixPidns looks at the PID namespace path. If that path corresponds to the
// sandbox process PID namespace, then change the spec so that the container
// joins the sandbox root namespace.
func (s *Sandbox) fixPidns(spec *specs.Spec) {
	pidns, ok := specutils.GetNS(specs.PIDNamespace, spec)
	if !ok {
		// pidns was not set, nothing to fix.
		return
	}
	if pidns.Path != fmt.Sprintf("/proc/%d/ns/pid", s.Pid.load()) {
		// Fix only if the PID namespace corresponds to the sandbox's.
		return
	}

	for i := range spec.Linux.Namespaces {
		if spec.Linux.Namespaces[i].Type == specs.PIDNamespace {
			// Removing the namespace makes the container join the sandbox root
			// namespace.
			log.Infof("Fixing PID namespace in spec from %q to make the container join the sandbox root namespace", pidns.Path)
			spec.Linux.Namespaces = append(spec.Linux.Namespaces[:i], spec.Linux.Namespaces[i+1:]...)
			return
		}
	}
	panic("unreachable")
}

// ConfigureCmdForRootless configures cmd to donate a socket FD that can be
// used to synchronize userns configuration.
func ConfigureCmdForRootless(cmd *exec.Cmd, donations *donation.Agency) (*os.File, error) {
	fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
	if err != nil {
		return nil, err
	}
	f := os.NewFile(uintptr(fds[1]), "userns sync other FD")
	donations.DonateAndClose("sync-userns-fd", f)
	if cmd.SysProcAttr == nil {
		cmd.SysProcAttr = &unix.SysProcAttr{}
	}
	cmd.SysProcAttr.AmbientCaps = []uintptr{
		// Same as `cap` in cmd/gofer.go.
		unix.CAP_CHOWN,
		unix.CAP_DAC_OVERRIDE,
		unix.CAP_DAC_READ_SEARCH,
		unix.CAP_FOWNER,
		unix.CAP_FSETID,
		unix.CAP_SYS_CHROOT,
		// Needed for setuid(2)/setgid(2).
		unix.CAP_SETUID,
		unix.CAP_SETGID,
		// Needed for chroot.
		unix.CAP_SYS_ADMIN,
		// Needed to be able to clear the bounding set (PR_CAPBSET_DROP).
		unix.CAP_SETPCAP,
	}
	return os.NewFile(uintptr(fds[0]), "userns sync FD"), nil
}

// SetUserMappings uses the newuidmap/newgidmap programs to set up user ID
// mappings for process pid.
func SetUserMappings(spec *specs.Spec, pid int) error {
	log.Debugf("Setting user mappings")
	args := []string{strconv.Itoa(pid)}
	for _, idMap := range spec.Linux.UIDMappings {
		log.Infof("Mapping host uid %d to container uid %d (size=%d)",
			idMap.HostID, idMap.ContainerID, idMap.Size)
		args = append(args,
			strconv.Itoa(int(idMap.ContainerID)),
			strconv.Itoa(int(idMap.HostID)),
			strconv.Itoa(int(idMap.Size)),
		)
	}

	out, err := exec.Command("newuidmap", args...).CombinedOutput()
	log.Debugf("newuidmap: %#v\n%s", args, out)
	if err != nil {
		return fmt.Errorf("newuidmap failed: %w", err)
	}

	args = []string{strconv.Itoa(pid)}
	for _, idMap := range spec.Linux.GIDMappings {
		log.Infof("Mapping host gid %d to container gid %d (size=%d)",
			idMap.HostID, idMap.ContainerID, idMap.Size)
		args = append(args,
			strconv.Itoa(int(idMap.ContainerID)),
			strconv.Itoa(int(idMap.HostID)),
			strconv.Itoa(int(idMap.Size)),
		)
	}
	out, err = exec.Command("newgidmap", args...).CombinedOutput()
	log.Debugf("newgidmap: %#v\n%s", args, out)
	if err != nil {
		return fmt.Errorf("newgidmap failed: %w", err)
	}
	return nil
}