gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/sandbox/sandbox.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package sandbox creates and manipulates sandboxes. 16 package sandbox 17 18 import ( 19 "context" 20 "encoding/json" 21 "errors" 22 "fmt" 23 "io" 24 "math" 25 "os" 26 "os/exec" 27 "path" 28 "path/filepath" 29 "strconv" 30 "strings" 31 "syscall" 32 "time" 33 34 "github.com/cenkalti/backoff" 35 specs "github.com/opencontainers/runtime-spec/specs-go" 36 "github.com/syndtr/gocapability/capability" 37 "golang.org/x/sys/unix" 38 "gvisor.dev/gvisor/pkg/abi/linux" 39 "gvisor.dev/gvisor/pkg/atomicbitops" 40 "gvisor.dev/gvisor/pkg/cleanup" 41 "gvisor.dev/gvisor/pkg/control/client" 42 "gvisor.dev/gvisor/pkg/control/server" 43 "gvisor.dev/gvisor/pkg/coverage" 44 "gvisor.dev/gvisor/pkg/fd" 45 "gvisor.dev/gvisor/pkg/log" 46 metricpb "gvisor.dev/gvisor/pkg/metric/metric_go_proto" 47 "gvisor.dev/gvisor/pkg/prometheus" 48 "gvisor.dev/gvisor/pkg/sentry/control" 49 "gvisor.dev/gvisor/pkg/sentry/devices/nvproxy" 50 "gvisor.dev/gvisor/pkg/sentry/fsimpl/erofs" 51 "gvisor.dev/gvisor/pkg/sentry/pgalloc" 52 "gvisor.dev/gvisor/pkg/sentry/platform" 53 "gvisor.dev/gvisor/pkg/sentry/seccheck" 54 "gvisor.dev/gvisor/pkg/state/statefile" 55 "gvisor.dev/gvisor/pkg/sync" 56 "gvisor.dev/gvisor/pkg/urpc" 57 "gvisor.dev/gvisor/runsc/boot" 58 "gvisor.dev/gvisor/runsc/boot/procfs" 59 "gvisor.dev/gvisor/runsc/cgroup" 60 "gvisor.dev/gvisor/runsc/config" 61 "gvisor.dev/gvisor/runsc/console" 62 "gvisor.dev/gvisor/runsc/donation" 63 "gvisor.dev/gvisor/runsc/specutils" 64 ) 65 66 const ( 67 // namespaceAnnotation is a pod annotation populated by containerd. 68 // It contains the name of the pod that a sandbox is in when running in Kubernetes. 69 podNameAnnotation = "io.kubernetes.cri.sandbox-name" 70 71 // namespaceAnnotation is a pod annotation populated by containerd. 72 // It contains the namespace of the pod that a sandbox is in when running in Kubernetes. 73 namespaceAnnotation = "io.kubernetes.cri.sandbox-namespace" 74 ) 75 76 // createControlSocket finds a location and creates the socket used to 77 // communicate with the sandbox. The socket is a UDS on the host filesystem. 78 // 79 // Note that abstract sockets are *not* used, because any user can connect to 80 // them. There is no file mode protecting abstract sockets. 81 func createControlSocket(rootDir, id string) (string, int, error) { 82 name := fmt.Sprintf("runsc-%s.sock", id) 83 84 // Only use absolute paths to guarantee resolution from anywhere. 
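// For illustration, assuming a hypothetical sandbox ID "abcd", the loop below
// tries, in order, locations such as:
//
//	<rootDir>/runsc-abcd.sock
//	/var/run/runsc-abcd.sock
//	/run/runsc-abcd.sock
//	/tmp/runsc-abcd.sock
//
// and returns the path and FD of the first socket that server.CreateSocket is
// able to create.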
85 for _, dir := range []string{rootDir, "/var/run", "/run", "/tmp"} { 86 path := filepath.Join(dir, name) 87 log.Debugf("Attempting to create socket file %q", path) 88 fd, err := server.CreateSocket(path) 89 if err == nil { 90 log.Debugf("Using socket file %q", path) 91 return path, fd, nil 92 } 93 log.Debugf("Failed to create socket file %q: %v", path, err) 94 } 95 return "", -1, fmt.Errorf("unable to find location to write socket file") 96 } 97 98 // pid is an atomic type that implements JSON marshal/unmarshal interfaces. 99 type pid struct { 100 val atomicbitops.Int64 101 } 102 103 func (p *pid) store(pid int) { 104 p.val.Store(int64(pid)) 105 } 106 107 func (p *pid) load() int { 108 return int(p.val.Load()) 109 } 110 111 // UnmarshalJSON implements json.Unmarshaler.UnmarshalJSON. 112 func (p *pid) UnmarshalJSON(b []byte) error { 113 var pid int 114 115 if err := json.Unmarshal(b, &pid); err != nil { 116 return err 117 } 118 p.store(pid) 119 return nil 120 } 121 122 // MarshalJSON implements json.Marshaler.MarshalJSON 123 func (p *pid) MarshalJSON() ([]byte, error) { 124 return json.Marshal(p.load()) 125 } 126 127 // Sandbox wraps a sandbox process. 128 // 129 // It is used to start/stop sandbox process (and associated processes like 130 // gofers), as well as for running and manipulating containers inside a running 131 // sandbox. 132 // 133 // Note: Sandbox must be immutable because a copy of it is saved for each 134 // container and changes would not be synchronized to all of them. 135 type Sandbox struct { 136 // ID is the id of the sandbox (immutable). By convention, this is the same 137 // ID as the first container run in the sandbox. 138 ID string `json:"id"` 139 140 // PodName is the name of the Kubernetes Pod (if any) that this sandbox 141 // represents. Unset if not running under containerd or Kubernetes. 142 PodName string `json:"podName"` 143 144 // Namespace is the Kubernetes namespace (if any) of the pod that this 145 // sandbox represents. Unset if not running under containerd or Kubernetes. 146 Namespace string `json:"namespace"` 147 148 // Pid is the pid of the running sandbox. May be 0 if the sandbox 149 // is not running. 150 Pid pid `json:"pid"` 151 152 // UID is the user ID in the parent namespace that the sandbox is running as. 153 UID int `json:"uid"` 154 // GID is the group ID in the parent namespace that the sandbox is running as. 155 GID int `json:"gid"` 156 157 // CgroupJSON contains the cgroup configuration that the sandbox is part of 158 // and allow serialization of the configuration into json 159 CgroupJSON cgroup.CgroupJSON `json:"cgroup"` 160 161 // OriginalOOMScoreAdj stores the value of oom_score_adj when the sandbox 162 // started, before it may be modified. 163 OriginalOOMScoreAdj int `json:"originalOomScoreAdj"` 164 165 // RegisteredMetrics is the set of metrics registered in the sandbox. 166 // Used for verifying metric data integrity after containers are started. 167 // Only populated if exporting metrics was requested when the sandbox was 168 // created. 169 RegisteredMetrics *metricpb.MetricRegistration `json:"registeredMetrics"` 170 171 // MetricMetadata are key-value pairs that are useful to export about this 172 // sandbox, but not part of the set of labels that uniquely identify it. 173 // They are static once initialized, and typically contain high-level 174 // configuration information about the sandbox. 
175 MetricMetadata map[string]string `json:"metricMetadata"`
176
177 // MetricServerAddress is the address of the metric server that this sandbox
178 // intends to export metrics for.
179 // Only populated if exporting metrics was requested when the sandbox was
180 // created.
181 MetricServerAddress string `json:"metricServerAddress"`
182
183 // ControlSocketPath is the path to the sandbox's uRPC server socket.
184 // Connections to the sandbox are made through this.
185 ControlSocketPath string `json:"controlSocketPath"`
186
187 // MountHints provides extra information about container mounts that apply
188 // to the entire pod.
189 MountHints *boot.PodMountHints `json:"mountHints"`
190
191 // child is set if a sandbox process is a child of the current process.
192 //
193 // This field isn't saved to JSON, because only the creator of the
194 // sandbox will have it as a child process.
195 child bool `nojson:"true"`
196
197 // statusMu protects status.
198 statusMu sync.Mutex `nojson:"true"`
199
200 // status is the exit status of a sandbox process. It's only set if the
201 // child==true and the sandbox was waited on. This field allows for multiple
202 // threads to wait on sandbox and get the exit code, since Linux will return
203 // WaitStatus to one of the waiters only.
204 status unix.WaitStatus `nojson:"true"`
205 }
206
207 // Getpid returns the process ID of the sandbox process.
208 func (s *Sandbox) Getpid() int {
209 return s.Pid.load()
210 }
211
212 // Args is used to configure a new sandbox.
213 type Args struct {
214 // ID is the sandbox unique identifier.
215 ID string
216
217 // Spec is the OCI spec that describes the container.
218 Spec *specs.Spec
219
220 // BundleDir is the directory containing the container bundle.
221 BundleDir string
222
223 // ConsoleSocket is the path to a unix domain socket that will receive
224 // the console FD. It may be empty.
225 ConsoleSocket string
226
227 // UserLog is the filename to send user-visible logs to. It may be empty.
228 UserLog string
229
230 // IOFiles is the list of image files and/or socket files that connect to
231 // a gofer endpoint for the mount points using Gofers. They must be in the
232 // same order as mounts appear in the spec.
233 IOFiles []*os.File
234
235 // File that connects to a gofer endpoint for a device mount point at /dev.
236 DevIOFile *os.File
237
238 // GoferFilestoreFiles are the regular files that will back the overlayfs or
239 // tmpfs mount if a gofer mount is to be overlaid.
240 GoferFilestoreFiles []*os.File
241
242 // GoferMountConfs contains information about how the gofer mounts have been
243 // configured. The first entry is for rootfs and the following entries are
244 // for bind mounts in Spec.Mounts (in the same order).
245 GoferMountConfs boot.GoferMountConfFlags
246
247 // MountHints provides extra information about container mounts that apply
248 // to the entire pod.
249 MountHints *boot.PodMountHints
250
251 // MountsFile is a file containing mount information from the spec. It's
252 // equivalent to the mounts from the spec, except that all paths have been
253 // resolved to their final absolute location.
254 MountsFile *os.File
255
256 // Cgroup is the cgroup that the sandbox is part of.
257 Cgroup cgroup.Cgroup
258
259 // Attached indicates that the sandbox lifecycle is tied to the caller.
260 // If the caller exits, the sandbox should exit too.
261 Attached bool 262 263 // SinkFiles is the an ordered array of files to be used by seccheck sinks 264 // configured from the --pod-init-config file. 265 SinkFiles []*os.File 266 267 // PassFiles are user-supplied files from the host to be exposed to the 268 // sandboxed app. 269 PassFiles map[int]*os.File 270 271 // ExecFile is the file from the host used for program execution. 272 ExecFile *os.File 273 } 274 275 // New creates the sandbox process. The caller must call Destroy() on the 276 // sandbox. 277 func New(conf *config.Config, args *Args) (*Sandbox, error) { 278 s := &Sandbox{ 279 ID: args.ID, 280 CgroupJSON: cgroup.CgroupJSON{ 281 Cgroup: args.Cgroup, 282 }, 283 UID: -1, // prevent usage before it's set. 284 GID: -1, // prevent usage before it's set. 285 MetricMetadata: conf.MetricMetadata(), 286 MetricServerAddress: conf.MetricServer, 287 MountHints: args.MountHints, 288 } 289 if args.Spec != nil && args.Spec.Annotations != nil { 290 s.PodName = args.Spec.Annotations[podNameAnnotation] 291 s.Namespace = args.Spec.Annotations[namespaceAnnotation] 292 } 293 294 // The Cleanup object cleans up partially created sandboxes when an error 295 // occurs. Any errors occurring during cleanup itself are ignored. 296 c := cleanup.Make(func() { 297 if err := s.destroy(); err != nil { 298 log.Warningf("error destroying sandbox: %v", err) 299 } 300 }) 301 defer c.Clean() 302 303 if len(conf.PodInitConfig) > 0 { 304 initConf, err := boot.LoadInitConfig(conf.PodInitConfig) 305 if err != nil { 306 return nil, fmt.Errorf("loading init config file: %w", err) 307 } 308 args.SinkFiles, err = initConf.Setup() 309 if err != nil { 310 return nil, fmt.Errorf("cannot init config: %w", err) 311 } 312 } 313 314 // Create pipe to synchronize when sandbox process has been booted. 315 clientSyncFile, sandboxSyncFile, err := os.Pipe() 316 if err != nil { 317 return nil, fmt.Errorf("creating pipe for sandbox %q: %v", s.ID, err) 318 } 319 defer clientSyncFile.Close() 320 321 // Create the sandbox process. 322 err = s.createSandboxProcess(conf, args, sandboxSyncFile) 323 // sandboxSyncFile has to be closed to be able to detect when the sandbox 324 // process exits unexpectedly. 325 sandboxSyncFile.Close() 326 if err != nil { 327 return nil, fmt.Errorf("cannot create sandbox process: %w", err) 328 } 329 330 // Wait until the sandbox has booted. 331 b := make([]byte, 1) 332 if l, err := clientSyncFile.Read(b); err != nil || l != 1 { 333 err := fmt.Errorf("waiting for sandbox to start: %v", err) 334 // If the sandbox failed to start, it may be because the binary 335 // permissions were incorrect. Check the bits and return a more helpful 336 // error message. 337 // 338 // NOTE: The error message is checked because error types are lost over 339 // rpc calls. 340 if strings.Contains(err.Error(), io.EOF.Error()) { 341 if permsErr := checkBinaryPermissions(conf); permsErr != nil { 342 return nil, fmt.Errorf("%v: %v", err, permsErr) 343 } 344 } 345 return nil, fmt.Errorf("cannot read client sync file: %w", err) 346 } 347 348 if conf.MetricServer != "" { 349 // The control server is up and the sandbox was configured to export metrics. 350 // We must gather data about registered metrics prior to any process starting in the sandbox. 
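// As a minimal sketch of how this registration is consumed later (assuming a
// caller that still holds this *Sandbox as `sb`; both methods are defined
// further down in this file):
//
//	reg, err := sb.GetRegisteredMetrics()                      // data captured below
//	snap, err := sb.ExportMetrics(control.MetricsExportOpts{}) // snapshot, verified against opts
//
// A metric server can compare snap against reg to reject metric data that a
// compromised sandbox never registered.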
351 log.Debugf("Getting metric registration information from sandbox %q", s.ID)
352 var registeredMetrics control.MetricsRegistrationResponse
353 if err := s.call(boot.MetricsGetRegistered, nil, &registeredMetrics); err != nil {
354 return nil, fmt.Errorf("cannot get registered metrics: %v", err)
355 }
356 s.RegisteredMetrics = registeredMetrics.RegisteredMetrics
357 }
358
359 c.Release()
360 return s, nil
361 }
362
363 // CreateSubcontainer creates a container inside the sandbox.
364 func (s *Sandbox) CreateSubcontainer(conf *config.Config, cid string, tty *os.File) error {
365 log.Debugf("Create sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid.load())
366
367 var files []*os.File
368 if tty != nil {
369 files = []*os.File{tty}
370 }
371 if err := s.configureStdios(conf, files); err != nil {
372 return err
373 }
374
375 args := boot.CreateArgs{
376 CID: cid,
377 FilePayload: urpc.FilePayload{Files: files},
378 }
379 if err := s.call(boot.ContMgrCreateSubcontainer, &args, nil); err != nil {
380 return fmt.Errorf("creating sub-container %q: %w", cid, err)
381 }
382 return nil
383 }
384
385 // StartRoot starts running the root container process inside the sandbox.
386 func (s *Sandbox) StartRoot(conf *config.Config) error {
387 pid := s.Pid.load()
388 log.Debugf("Start root sandbox %q, PID: %d", s.ID, pid)
389 conn, err := s.sandboxConnect()
390 if err != nil {
391 return err
392 }
393 defer conn.Close()
394
395 // Configure the network.
396 if err := setupNetwork(conn, pid, conf); err != nil {
397 return fmt.Errorf("setting up network: %w", err)
398 }
399
400 // Send a message to the sandbox control server to start the root container.
401 if err := conn.Call(boot.ContMgrRootContainerStart, &s.ID, nil); err != nil {
402 return fmt.Errorf("starting root container: %w", err)
403 }
404
405 return nil
406 }
407
408 // StartSubcontainer starts running a sub-container inside the sandbox.
409 func (s *Sandbox) StartSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles, goferFilestores []*os.File, devIOFile *os.File, goferConfs []boot.GoferMountConf) error {
410 log.Debugf("Start sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid.load())
411
412 if err := s.configureStdios(conf, stdios); err != nil {
413 return err
414 }
415 s.fixPidns(spec)
416
417 // The payload contains (in this specific order):
418 // * stdin/stdout/stderr (optional: only present when not using TTY)
419 // * The subcontainer's gofer filestore files (optional)
420 // * The subcontainer's dev gofer file (optional)
421 // * Gofer files.
422 payload := urpc.FilePayload{}
423 payload.Files = append(payload.Files, stdios...)
424 payload.Files = append(payload.Files, goferFilestores...)
425 if devIOFile != nil {
426 payload.Files = append(payload.Files, devIOFile)
427 }
428 payload.Files = append(payload.Files, goferFiles...)
429
430 // Start running the container.
431 args := boot.StartArgs{
432 Spec: spec,
433 Conf: conf,
434 CID: cid,
435 NumGoferFilestoreFDs: len(goferFilestores),
436 IsDevIoFilePresent: devIOFile != nil,
437 GoferMountConfs: goferConfs,
438 FilePayload: payload,
439 }
440 if err := s.call(boot.ContMgrStartSubcontainer, &args, nil); err != nil {
441 return fmt.Errorf("starting sub-container %v: %w", spec.Process.Args, err)
442 }
443 return nil
444 }
445
446 // Restore sends the restore call for a container in the sandbox.
447 func (s *Sandbox) Restore(conf *config.Config, cid string, imagePath string, direct bool) error { 448 log.Debugf("Restore sandbox %q from path %q", s.ID, imagePath) 449 450 stateFileName := path.Join(imagePath, boot.CheckpointStateFileName) 451 sf, err := os.Open(stateFileName) 452 if err != nil { 453 return fmt.Errorf("opening state file %q failed: %v", stateFileName, err) 454 } 455 defer sf.Close() 456 457 opt := boot.RestoreOpts{ 458 FilePayload: urpc.FilePayload{ 459 Files: []*os.File{sf}, 460 }, 461 } 462 463 // If the pages file exists, we must pass it in. 464 pagesFileName := path.Join(imagePath, boot.CheckpointPagesFileName) 465 pagesReadFlags := os.O_RDONLY 466 if direct { 467 // The contents are page-aligned, so it can be opened with O_DIRECT. 468 pagesReadFlags |= syscall.O_DIRECT 469 } 470 if pf, err := os.OpenFile(pagesFileName, pagesReadFlags, 0); err == nil { 471 defer pf.Close() 472 pagesMetadataFileName := path.Join(imagePath, boot.CheckpointPagesMetadataFileName) 473 pmf, err := os.Open(pagesMetadataFileName) 474 if err != nil { 475 return fmt.Errorf("opening restore image file %q failed: %v", pagesMetadataFileName, err) 476 } 477 defer pmf.Close() 478 opt.HavePagesFile = true 479 opt.FilePayload.Files = append(opt.FilePayload.Files, pmf, pf) 480 } else if !os.IsNotExist(err) { 481 return fmt.Errorf("opening restore image file %q failed: %v", pagesFileName, err) 482 } 483 484 // If the platform needs a device FD we must pass it in. 485 if deviceFile, err := deviceFileForPlatform(conf.Platform, conf.PlatformDevicePath); err != nil { 486 return err 487 } else if deviceFile != nil { 488 defer deviceFile.Close() 489 opt.HaveDeviceFile = true 490 opt.FilePayload.Files = append(opt.FilePayload.Files, deviceFile.ReleaseToFile("device file")) 491 } 492 493 conn, err := s.sandboxConnect() 494 if err != nil { 495 return err 496 } 497 defer conn.Close() 498 499 // Configure the network. 500 if err := setupNetwork(conn, s.Pid.load(), conf); err != nil { 501 return fmt.Errorf("setting up network: %v", err) 502 } 503 504 // Restore the container and start the root container. 505 if err := conn.Call(boot.ContMgrRestore, &opt, nil); err != nil { 506 return fmt.Errorf("restoring container %q: %v", cid, err) 507 } 508 509 return nil 510 } 511 512 // RestoreSubcontainer sends the restore call for a sub-container in the sandbox. 513 func (s *Sandbox) RestoreSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles, goferFilestoreFiles []*os.File, devIOFile *os.File, goferMountConf []boot.GoferMountConf) error { 514 log.Debugf("Restore sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid.load()) 515 516 if err := s.configureStdios(conf, stdios); err != nil { 517 return err 518 } 519 s.fixPidns(spec) 520 521 // The payload contains (in this specific order): 522 // * stdin/stdout/stderr (optional: only present when not using TTY) 523 // * The subcontainer's overlay filestore files (optional: only present when 524 // host file backed overlay is configured) 525 // * Gofer files. 526 payload := urpc.FilePayload{} 527 payload.Files = append(payload.Files, stdios...) 528 payload.Files = append(payload.Files, goferFilestoreFiles...) 529 if devIOFile != nil { 530 payload.Files = append(payload.Files, devIOFile) 531 } 532 payload.Files = append(payload.Files, goferFiles...) 533 534 // Start running the container. 
535 args := boot.StartArgs{ 536 Spec: spec, 537 Conf: conf, 538 CID: cid, 539 NumGoferFilestoreFDs: len(goferFilestoreFiles), 540 IsDevIoFilePresent: devIOFile != nil, 541 GoferMountConfs: goferMountConf, 542 FilePayload: payload, 543 } 544 if err := s.call(boot.ContMgrRestoreSubcontainer, &args, nil); err != nil { 545 return fmt.Errorf("starting sub-container %v: %w", spec.Process.Args, err) 546 } 547 return nil 548 } 549 550 // Processes retrieves the list of processes and associated metadata for a 551 // given container in this sandbox. 552 func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { 553 log.Debugf("Getting processes for container %q in sandbox %q", cid, s.ID) 554 var pl []*control.Process 555 if err := s.call(boot.ContMgrProcesses, &cid, &pl); err != nil { 556 return nil, fmt.Errorf("retrieving process data from sandbox: %v", err) 557 } 558 return pl, nil 559 } 560 561 // CreateTraceSession creates a new trace session. 562 func (s *Sandbox) CreateTraceSession(config *seccheck.SessionConfig, force bool) error { 563 log.Debugf("Creating trace session in sandbox %q", s.ID) 564 565 sinkFiles, err := seccheck.SetupSinks(config.Sinks) 566 if err != nil { 567 return err 568 } 569 defer func() { 570 for _, f := range sinkFiles { 571 _ = f.Close() 572 } 573 }() 574 575 arg := boot.CreateTraceSessionArgs{ 576 Config: *config, 577 Force: force, 578 FilePayload: urpc.FilePayload{ 579 Files: sinkFiles, 580 }, 581 } 582 if err := s.call(boot.ContMgrCreateTraceSession, &arg, nil); err != nil { 583 return fmt.Errorf("creating trace session: %w", err) 584 } 585 return nil 586 } 587 588 // DeleteTraceSession deletes an existing trace session. 589 func (s *Sandbox) DeleteTraceSession(name string) error { 590 log.Debugf("Deleting trace session %q in sandbox %q", name, s.ID) 591 if err := s.call(boot.ContMgrDeleteTraceSession, name, nil); err != nil { 592 return fmt.Errorf("deleting trace session: %w", err) 593 } 594 return nil 595 } 596 597 // ListTraceSessions lists all trace sessions. 598 func (s *Sandbox) ListTraceSessions() ([]seccheck.SessionConfig, error) { 599 log.Debugf("Listing trace sessions in sandbox %q", s.ID) 600 var sessions []seccheck.SessionConfig 601 if err := s.call(boot.ContMgrListTraceSessions, nil, &sessions); err != nil { 602 return nil, fmt.Errorf("listing trace session: %w", err) 603 } 604 return sessions, nil 605 } 606 607 // ProcfsDump collects and returns a procfs dump for the sandbox. 608 func (s *Sandbox) ProcfsDump() ([]procfs.ProcessProcfsDump, error) { 609 log.Debugf("Procfs dump %q", s.ID) 610 var procfsDump []procfs.ProcessProcfsDump 611 if err := s.call(boot.ContMgrProcfsDump, nil, &procfsDump); err != nil { 612 return nil, fmt.Errorf("getting sandbox %q stacks: %w", s.ID, err) 613 } 614 return procfsDump, nil 615 } 616 617 // NewCGroup returns the sandbox's Cgroup, or an error if it does not have one. 618 func (s *Sandbox) NewCGroup() (cgroup.Cgroup, error) { 619 return cgroup.NewFromPid(s.Pid.load(), false /* useSystemd */) 620 } 621 622 // Execute runs the specified command in the container. It returns the PID of 623 // the newly created process. 624 func (s *Sandbox) Execute(conf *config.Config, args *control.ExecArgs) (int32, error) { 625 log.Debugf("Executing new process in container %q in sandbox %q", args.ContainerID, s.ID) 626 627 // Stdios are those files which have an FD <= 2 in the process. We do not 628 // want the ownership of other files to be changed by configureStdios. 
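// For example (illustrative values only): with args.GuestFDs = []int{0, 1, 2}
// and three entries in args.Files, all three files are collected below and
// re-owned by configureStdios; with args.GuestFDs = []int{5}, the loop skips
// the file and its ownership is left untouched.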
629 var stdios []*os.File 630 for i, fd := range args.GuestFDs { 631 if fd > 2 || i >= len(args.Files) { 632 continue 633 } 634 stdios = append(stdios, args.Files[i]) 635 } 636 637 if err := s.configureStdios(conf, stdios); err != nil { 638 return 0, err 639 } 640 641 // Send a message to the sandbox control server to start the container. 642 var pid int32 643 if err := s.call(boot.ContMgrExecuteAsync, args, &pid); err != nil { 644 return 0, fmt.Errorf("executing command %q in sandbox: %w", args, err) 645 } 646 return pid, nil 647 } 648 649 // Event retrieves stats about the sandbox such as memory and CPU utilization. 650 func (s *Sandbox) Event(cid string) (*boot.EventOut, error) { 651 log.Debugf("Getting events for container %q in sandbox %q", cid, s.ID) 652 var e boot.EventOut 653 if err := s.call(boot.ContMgrEvent, &cid, &e); err != nil { 654 return nil, fmt.Errorf("retrieving event data from sandbox: %w", err) 655 } 656 return &e, nil 657 } 658 659 // PortForward starts port forwarding to the sandbox. 660 func (s *Sandbox) PortForward(opts *boot.PortForwardOpts) error { 661 log.Debugf("Requesting port forward for container %q in sandbox %q: %+v", opts.ContainerID, s.ID, opts) 662 conn, err := s.sandboxConnect() 663 if err != nil { 664 return err 665 } 666 defer conn.Close() 667 668 if err := conn.Call(boot.ContMgrPortForward, opts, nil); err != nil { 669 return fmt.Errorf("port forwarding to sandbox: %v", err) 670 } 671 672 return nil 673 } 674 675 func (s *Sandbox) sandboxConnect() (*urpc.Client, error) { 676 log.Debugf("Connecting to sandbox %q", s.ID) 677 path := s.ControlSocketPath 678 if len(path) >= linux.UnixPathMax { 679 // This is not an abstract socket path. It is a filesystem path. 680 // UDS connect fails when the len(socket path) >= UNIX_PATH_MAX. Instead 681 // open the socket using open(2) and use /proc to refer to the open FD. 682 sockFD, err := unix.Open(path, unix.O_PATH, 0) 683 if err != nil { 684 return nil, fmt.Errorf("failed to open socket at %q", path) 685 } 686 defer unix.Close(sockFD) 687 path = filepath.Join("/proc/self/fd", fmt.Sprintf("%d", sockFD)) 688 } 689 conn, err := client.ConnectTo(path) 690 if err != nil { 691 return nil, s.connError(err) 692 } 693 return conn, nil 694 } 695 696 func (s *Sandbox) call(method string, arg, result any) error { 697 conn, err := s.sandboxConnect() 698 if err != nil { 699 return err 700 } 701 defer conn.Close() 702 703 return conn.Call(method, arg, result) 704 } 705 706 func (s *Sandbox) connError(err error) error { 707 return fmt.Errorf("connecting to control server at PID %d: %v", s.Pid.load(), err) 708 } 709 710 // createSandboxProcess starts the sandbox as a subprocess by running the "boot" 711 // command, passing in the bundle dir. 712 func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyncFile *os.File) error { 713 // Ensure we don't leak FDs to the sandbox process. 714 if err := SetCloExeOnAllFDs(); err != nil { 715 return fmt.Errorf("setting CLOEXEC on all FDs: %w", err) 716 } 717 718 donations := donation.Agency{} 719 defer donations.Close() 720 721 // pgalloc.MemoryFile (which provides application memory) sometimes briefly 722 // mlock(2)s ranges of memory in order to fault in a large number of pages at 723 // a time. Try to make RLIMIT_MEMLOCK unlimited so that it can do so. runsc 724 // expects to run in a memory cgroup that limits its memory usage as 725 // required. 
726 // This needs to be done before exec'ing `runsc boot`, as that subcommand 727 // runs as an unprivileged user that will not be able to call `setrlimit` 728 // by itself. Calling `setrlimit` here will have the side-effect of setting 729 // the limit on the currently-running `runsc` process as well, but that 730 // should be OK too. 731 var rlim unix.Rlimit 732 if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil { 733 log.Warningf("Failed to get RLIMIT_MEMLOCK: %v", err) 734 } else if rlim.Cur != unix.RLIM_INFINITY || rlim.Max != unix.RLIM_INFINITY { 735 rlim.Cur = unix.RLIM_INFINITY 736 rlim.Max = unix.RLIM_INFINITY 737 if err := unix.Setrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil { 738 // We may not have CAP_SYS_RESOURCE, so this failure may be expected. 739 log.Infof("Failed to set RLIMIT_MEMLOCK: %v", err) 740 } 741 } 742 743 // 744 // These flags must come BEFORE the "boot" command in cmd.Args. 745 // 746 747 // Open the log files to pass to the sandbox as FDs. 748 if err := donations.OpenAndDonate("log-fd", conf.LogFilename, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil { 749 return err 750 } 751 752 test := "" 753 if len(conf.TestOnlyTestNameEnv) != 0 { 754 // Fetch test name if one is provided and the test only flag was set. 755 if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok { 756 test = t 757 } 758 } 759 if specutils.IsDebugCommand(conf, "boot") { 760 if err := donations.DonateDebugLogFile("debug-log-fd", conf.DebugLog, "boot", test); err != nil { 761 return err 762 } 763 } 764 if err := donations.DonateDebugLogFile("panic-log-fd", conf.PanicLog, "panic", test); err != nil { 765 return err 766 } 767 covFilename := conf.CoverageReport 768 if covFilename == "" { 769 covFilename = os.Getenv("GO_COVERAGE_FILE") 770 } 771 if covFilename != "" && coverage.Available() { 772 if err := donations.DonateDebugLogFile("coverage-fd", covFilename, "cov", test); err != nil { 773 return err 774 } 775 } 776 777 // Relay all the config flags to the sandbox process. 778 cmd := exec.Command(specutils.ExePath, conf.ToFlags()...) 779 cmd.SysProcAttr = &unix.SysProcAttr{ 780 // Detach from this session, otherwise cmd will get SIGHUP and SIGCONT 781 // when re-parented. 782 Setsid: true, 783 } 784 785 // Set Args[0] to make easier to spot the sandbox process. Otherwise it's 786 // shown as `exe`. 787 cmd.Args[0] = "runsc-sandbox" 788 789 // Tranfer FDs that need to be present before the "boot" command. 790 // Start at 3 because 0, 1, and 2 are taken by stdin/out/err. 791 nextFD := donations.Transfer(cmd, 3) 792 793 // Add the "boot" command to the args. 794 // 795 // All flags after this must be for the boot command 796 cmd.Args = append(cmd.Args, "boot", "--bundle="+args.BundleDir) 797 798 // Clear environment variables, unless --TESTONLY-unsafe-nonroot is set. 799 if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { 800 // Setting cmd.Env = nil causes cmd to inherit the current process's env. 801 cmd.Env = []string{} 802 } 803 804 // If there is a gofer, sends all socket ends to the sandbox. 805 donations.DonateAndClose("io-fds", args.IOFiles...) 806 donations.DonateAndClose("dev-io-fd", args.DevIOFile) 807 donations.DonateAndClose("gofer-filestore-fds", args.GoferFilestoreFiles...) 
808 donations.DonateAndClose("mounts-fd", args.MountsFile) 809 donations.Donate("start-sync-fd", startSyncFile) 810 if err := donations.OpenAndDonate("user-log-fd", args.UserLog, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil { 811 return err 812 } 813 const profFlags = os.O_CREATE | os.O_WRONLY | os.O_TRUNC 814 if err := donations.OpenAndDonate("profile-block-fd", conf.ProfileBlock, profFlags); err != nil { 815 return err 816 } 817 if err := donations.OpenAndDonate("profile-cpu-fd", conf.ProfileCPU, profFlags); err != nil { 818 return err 819 } 820 if err := donations.OpenAndDonate("profile-heap-fd", conf.ProfileHeap, profFlags); err != nil { 821 return err 822 } 823 if err := donations.OpenAndDonate("profile-mutex-fd", conf.ProfileMutex, profFlags); err != nil { 824 return err 825 } 826 if err := donations.OpenAndDonate("trace-fd", conf.TraceFile, profFlags); err != nil { 827 return err 828 } 829 830 // Pass gofer mount configs. 831 cmd.Args = append(cmd.Args, "--gofer-mount-confs="+args.GoferMountConfs.String()) 832 833 // Create a socket for the control server and donate it to the sandbox. 834 controlSocketPath, sockFD, err := createControlSocket(conf.RootDir, s.ID) 835 if err != nil { 836 return fmt.Errorf("failed to create control socket: %v", err) 837 } 838 s.ControlSocketPath = controlSocketPath 839 log.Infof("Control socket path: %q", s.ControlSocketPath) 840 donations.DonateAndClose("controller-fd", os.NewFile(uintptr(sockFD), "control_server_socket")) 841 842 specFile, err := specutils.OpenSpec(args.BundleDir) 843 if err != nil { 844 return fmt.Errorf("cannot open spec file in bundle dir %v: %w", args.BundleDir, err) 845 } 846 donations.DonateAndClose("spec-fd", specFile) 847 848 if err := donations.OpenAndDonate("pod-init-config-fd", conf.PodInitConfig, os.O_RDONLY); err != nil { 849 return err 850 } 851 donations.DonateAndClose("sink-fds", args.SinkFiles...) 852 853 gPlatform, err := platform.Lookup(conf.Platform) 854 if err != nil { 855 return fmt.Errorf("cannot look up platform: %w", err) 856 } 857 if deviceFile, err := gPlatform.OpenDevice(conf.PlatformDevicePath); err != nil { 858 return fmt.Errorf("opening device file for platform %q: %v", conf.Platform, err) 859 } else if deviceFile != nil { 860 donations.DonateAndClose("device-fd", deviceFile.ReleaseToFile("device file")) 861 } 862 863 // TODO(b/151157106): syscall tests fail by timeout if asyncpreemptoff 864 // isn't set. 865 if conf.Platform == "kvm" { 866 cmd.Env = append(cmd.Env, "GODEBUG=asyncpreemptoff=1") 867 } 868 869 // nss is the set of namespaces to join or create before starting the sandbox 870 // process. Mount, IPC and UTS namespaces from the host are not used as they 871 // are virtualized inside the sandbox. Be paranoid and run inside an empty 872 // namespace for these. Don't unshare cgroup because sandbox is added to a 873 // cgroup in the caller's namespace. 874 log.Infof("Sandbox will be started in new mount, IPC and UTS namespaces") 875 nss := []specs.LinuxNamespace{ 876 {Type: specs.IPCNamespace}, 877 {Type: specs.MountNamespace}, 878 {Type: specs.UTSNamespace}, 879 } 880 881 if gPlatform.Requirements().RequiresCurrentPIDNS { 882 // TODO(b/75837838): Also set a new PID namespace so that we limit 883 // access to other host processes. 
884 log.Infof("Sandbox will be started in the current PID namespace") 885 } else { 886 log.Infof("Sandbox will be started in a new PID namespace") 887 nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace}) 888 cmd.Args = append(cmd.Args, "--pidns=true") 889 } 890 891 if specutils.NVProxyEnabled(args.Spec, conf) { 892 version, err := getNvproxyDriverVersion(conf) 893 if err != nil { 894 return fmt.Errorf("failed to get Nvidia driver version: %w", err) 895 } 896 cmd.Args = append(cmd.Args, "--nvidia-driver-version="+version) 897 } 898 899 // Joins the network namespace if network is enabled. the sandbox talks 900 // directly to the host network, which may have been configured in the 901 // namespace. 902 if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != config.NetworkNone { 903 log.Infof("Sandbox will be started in the container's network namespace: %+v", ns) 904 nss = append(nss, ns) 905 } else if conf.Network == config.NetworkHost { 906 log.Infof("Sandbox will be started in the host network namespace") 907 } else { 908 log.Infof("Sandbox will be started in new network namespace") 909 nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace}) 910 } 911 912 // These are set to the uid/gid that the sandbox process will use. May be 913 // overriden below. 914 s.UID = os.Getuid() 915 s.GID = os.Getgid() 916 917 // User namespace depends on the network type or whether access to the host 918 // filesystem is required. These features require to run inside the user 919 // namespace specified in the spec or the current namespace if none is 920 // configured. 921 rootlessEUID := unix.Geteuid() != 0 922 setUserMappings := false 923 if conf.Network == config.NetworkHost || conf.DirectFS { 924 if userns, ok := specutils.GetNS(specs.UserNamespace, args.Spec); ok { 925 log.Infof("Sandbox will be started in container's user namespace: %+v", userns) 926 nss = append(nss, userns) 927 if rootlessEUID { 928 syncFile, err := ConfigureCmdForRootless(cmd, &donations) 929 if err != nil { 930 return err 931 } 932 defer syncFile.Close() 933 setUserMappings = true 934 } else { 935 specutils.SetUIDGIDMappings(cmd, args.Spec) 936 // We need to set UID and GID to have capabilities in a new user namespace. 937 cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0} 938 } 939 } else { 940 if rootlessEUID { 941 return fmt.Errorf("unable to run a rootless container without userns") 942 } 943 log.Infof("Sandbox will be started in the current user namespace") 944 } 945 // When running in the caller's defined user namespace, apply the same 946 // capabilities to the sandbox process to ensure it abides to the same 947 // rules. 948 cmd.Args = append(cmd.Args, "--apply-caps=true") 949 950 // If we have CAP_SYS_ADMIN, we can create an empty chroot and 951 // bind-mount the executable inside it. 952 if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { 953 log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!") 954 } else if specutils.HasCapabilities(capability.CAP_SYS_ADMIN) || rootlessEUID { 955 log.Infof("Sandbox will be started in minimal chroot") 956 cmd.Args = append(cmd.Args, "--setup-root") 957 } else { 958 return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN") 959 } 960 } else { 961 // If we have CAP_SETUID and CAP_SETGID, then we can also run 962 // as user nobody. 
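// Rough sketch of the resulting user-namespace mapping (placeholders, not
// literal values), roughly equivalent to writing from the parent namespace:
//
//	echo "65534 <host-uid> 1" > /proc/<sandbox-pid>/uid_map
//	echo "65534 <host-gid> 1" > /proc/<sandbox-pid>/gid_map
//
// i.e. only uid/gid 65534 ("nobody") inside the sandbox's user namespace maps
// to s.UID/s.GID outside, and no other IDs are mapped. The actual mapping is
// applied via cmd.SysProcAttr below.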
963 if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { 964 log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid()) 965 log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!") 966 } else if rootlessEUID || specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) { 967 log.Infof("Sandbox will be started in new user namespace") 968 nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) 969 cmd.Args = append(cmd.Args, "--setup-root") 970 971 const nobody = 65534 972 if rootlessEUID || conf.Rootless { 973 log.Infof("Rootless mode: sandbox will run as nobody inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid()) 974 } else { 975 // Map nobody in the new namespace to nobody in the parent namespace. 976 s.UID = nobody 977 s.GID = nobody 978 } 979 980 // Set credentials to run as user and group nobody. 981 cmd.SysProcAttr.Credential = &syscall.Credential{Uid: nobody, Gid: nobody} 982 cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{ 983 { 984 ContainerID: nobody, 985 HostID: s.UID, 986 Size: 1, 987 }, 988 } 989 cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{ 990 { 991 ContainerID: nobody, 992 HostID: s.GID, 993 Size: 1, 994 }, 995 } 996 997 // A sandbox process will construct an empty root for itself, so it has 998 // to have CAP_SYS_ADMIN and CAP_SYS_CHROOT capabilities. 999 cmd.SysProcAttr.AmbientCaps = append(cmd.SysProcAttr.AmbientCaps, 1000 uintptr(capability.CAP_SYS_ADMIN), 1001 uintptr(capability.CAP_SYS_CHROOT), 1002 // CAP_SETPCAP is required to clear the bounding set. 1003 uintptr(capability.CAP_SETPCAP), 1004 ) 1005 1006 } else { 1007 return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID") 1008 } 1009 } 1010 1011 // The current process' stdio must be passed to the application via the 1012 // --stdio-fds flag. The stdio of the sandbox process itself must not 1013 // be connected to the same FDs, otherwise we risk leaking sandbox 1014 // errors to the application, so we set the sandbox stdio to nil, 1015 // causing them to read/write from the null device. 1016 cmd.Stdin = nil 1017 cmd.Stdout = nil 1018 cmd.Stderr = nil 1019 var stdios [3]*os.File 1020 1021 // If the console control socket file is provided, then create a new 1022 // pty master/replica pair and set the TTY on the sandbox process. 1023 if args.Spec.Process.Terminal && args.ConsoleSocket != "" { 1024 // console.NewWithSocket will send the master on the given 1025 // socket, and return the replica. 1026 tty, err := console.NewWithSocket(args.ConsoleSocket) 1027 if err != nil { 1028 return fmt.Errorf("setting up console with socket %q: %v", args.ConsoleSocket, err) 1029 } 1030 defer tty.Close() 1031 1032 // Set the TTY as a controlling TTY on the sandbox process. 1033 cmd.SysProcAttr.Setctty = true 1034 1035 // Inconveniently, the Ctty must be the FD in the *child* process's FD 1036 // table. So transfer all files we have so far and make sure the next file 1037 // added to donations is stdin. 1038 // 1039 // See https://github.com/golang/go/issues/29458. 1040 nextFD = donations.Transfer(cmd, nextFD) 1041 cmd.SysProcAttr.Ctty = nextFD 1042 1043 // Pass the tty as all stdio fds to sandbox. 1044 stdios[0] = tty 1045 stdios[1] = tty 1046 stdios[2] = tty 1047 1048 if conf.Debug { 1049 // If debugging, send the boot process stdio to the 1050 // TTY, so that it is easier to find. 
1051 cmd.Stdin = tty 1052 cmd.Stdout = tty 1053 cmd.Stderr = tty 1054 } 1055 } else { 1056 // If not using a console, pass our current stdio as the 1057 // container stdio via flags. 1058 stdios[0] = os.Stdin 1059 stdios[1] = os.Stdout 1060 stdios[2] = os.Stderr 1061 1062 if conf.Debug { 1063 // If debugging, send the boot process stdio to the 1064 // this process' stdio, so that is is easier to find. 1065 cmd.Stdin = os.Stdin 1066 cmd.Stdout = os.Stdout 1067 cmd.Stderr = os.Stderr 1068 } 1069 } 1070 if err := s.configureStdios(conf, stdios[:]); err != nil { 1071 return fmt.Errorf("configuring stdios: %w", err) 1072 } 1073 // Note: this must be done right after "cmd.SysProcAttr.Ctty" is set above 1074 // because it relies on stdin being the next FD donated. 1075 donations.Donate("stdio-fds", stdios[:]...) 1076 if conf.ProfilingMetricsLog == "-" { 1077 donations.Donate("profiling-metrics-fd", stdios[1]) 1078 cmd.Args = append(cmd.Args, "--profiling-metrics-fd-lossy=true") 1079 } else if conf.ProfilingMetricsLog != "" { 1080 if err := donations.DonateDebugLogFile("profiling-metrics-fd", conf.ProfilingMetricsLog, "metrics", test); err != nil { 1081 return err 1082 } 1083 cmd.Args = append(cmd.Args, "--profiling-metrics-fd-lossy=false") 1084 } 1085 1086 totalSysMem, err := totalSystemMemory() 1087 if err != nil { 1088 return err 1089 } 1090 cmd.Args = append(cmd.Args, "--total-host-memory", strconv.FormatUint(totalSysMem, 10)) 1091 1092 mem := totalSysMem 1093 if s.CgroupJSON.Cgroup != nil { 1094 cpuNum, err := s.CgroupJSON.Cgroup.NumCPU() 1095 if err != nil { 1096 return fmt.Errorf("getting cpu count from cgroups: %v", err) 1097 } 1098 if conf.CPUNumFromQuota { 1099 // Dropping below 2 CPUs can trigger application to disable 1100 // locks that can lead do hard to debug errors, so just 1101 // leaving two cores as reasonable default. 1102 const minCPUs = 2 1103 1104 quota, err := s.CgroupJSON.Cgroup.CPUQuota() 1105 if err != nil { 1106 return fmt.Errorf("getting cpu quota from cgroups: %v", err) 1107 } 1108 if n := int(math.Ceil(quota)); n > 0 { 1109 if n < minCPUs { 1110 n = minCPUs 1111 } 1112 if n < cpuNum { 1113 // Only lower the cpu number. 1114 cpuNum = n 1115 } 1116 } 1117 } 1118 cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum)) 1119 1120 memLimit, err := s.CgroupJSON.Cgroup.MemoryLimit() 1121 if err != nil { 1122 return fmt.Errorf("getting memory limit from cgroups: %v", err) 1123 } 1124 if memLimit < mem { 1125 mem = memLimit 1126 } 1127 } 1128 cmd.Args = append(cmd.Args, "--total-memory", strconv.FormatUint(mem, 10)) 1129 1130 if args.Attached { 1131 // Kill sandbox if parent process exits in attached mode. 1132 cmd.SysProcAttr.Pdeathsig = unix.SIGKILL 1133 // Tells boot that any process it creates must have pdeathsig set. 1134 cmd.Args = append(cmd.Args, "--attached") 1135 } 1136 1137 if args.ExecFile != nil { 1138 donations.Donate("exec-fd", args.ExecFile) 1139 } 1140 1141 nextFD = donations.Transfer(cmd, nextFD) 1142 1143 _ = donation.DonateAndTransferCustomFiles(cmd, nextFD, args.PassFiles) 1144 1145 // Add container ID as the last argument. 1146 cmd.Args = append(cmd.Args, s.ID) 1147 1148 donation.LogDonations(cmd) 1149 log.Debugf("Starting sandbox: %s %v", cmd.Path, cmd.Args) 1150 log.Debugf("SysProcAttr: %+v", cmd.SysProcAttr) 1151 if err := specutils.StartInNS(cmd, nss); err != nil { 1152 err := fmt.Errorf("starting sandbox: %v", err) 1153 // If the sandbox failed to start, it may be because the binary 1154 // permissions were incorrect. 
Check the bits and return a more helpful 1155 // error message. 1156 // 1157 // NOTE: The error message is checked because error types are lost over 1158 // rpc calls. 1159 if strings.Contains(err.Error(), unix.EACCES.Error()) { 1160 if permsErr := checkBinaryPermissions(conf); permsErr != nil { 1161 return fmt.Errorf("%v: %v", err, permsErr) 1162 } 1163 } 1164 return err 1165 } 1166 s.OriginalOOMScoreAdj, err = specutils.GetOOMScoreAdj(cmd.Process.Pid) 1167 if err != nil { 1168 return err 1169 } 1170 if setUserMappings { 1171 if err := SetUserMappings(args.Spec, cmd.Process.Pid); err != nil { 1172 return err 1173 } 1174 } 1175 1176 s.child = true 1177 s.Pid.store(cmd.Process.Pid) 1178 log.Infof("Sandbox started, PID: %d", cmd.Process.Pid) 1179 1180 return nil 1181 } 1182 1183 // Wait waits for the containerized process to exit, and returns its WaitStatus. 1184 func (s *Sandbox) Wait(cid string) (unix.WaitStatus, error) { 1185 log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID) 1186 1187 if conn, err := s.sandboxConnect(); err != nil { 1188 // The sandbox may have exited while before we had a chance to wait on it. 1189 // There is nothing we can do for subcontainers. For the init container, we 1190 // can try to get the sandbox exit code. 1191 if !s.IsRootContainer(cid) { 1192 return unix.WaitStatus(0), err 1193 } 1194 log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err) 1195 } else { 1196 defer conn.Close() 1197 1198 // Try the Wait RPC to the sandbox. 1199 var ws unix.WaitStatus 1200 err = conn.Call(boot.ContMgrWait, &cid, &ws) 1201 conn.Close() 1202 if err == nil { 1203 if s.IsRootContainer(cid) { 1204 if err := s.waitForStopped(); err != nil { 1205 return unix.WaitStatus(0), err 1206 } 1207 } 1208 // It worked! 1209 return ws, nil 1210 } 1211 // See comment above. 1212 if !s.IsRootContainer(cid) { 1213 return unix.WaitStatus(0), err 1214 } 1215 1216 // The sandbox may have exited after we connected, but before 1217 // or during the Wait RPC. 1218 log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err) 1219 } 1220 1221 // The sandbox may have already exited, or exited while handling the Wait RPC. 1222 // The best we can do is ask Linux what the sandbox exit status was, since in 1223 // most cases that will be the same as the container exit status. 1224 if err := s.waitForStopped(); err != nil { 1225 return unix.WaitStatus(0), err 1226 } 1227 if !s.child { 1228 return unix.WaitStatus(0), fmt.Errorf("sandbox no longer running and its exit status is unavailable") 1229 } 1230 1231 s.statusMu.Lock() 1232 defer s.statusMu.Unlock() 1233 return s.status, nil 1234 } 1235 1236 // WaitPID waits for process 'pid' in the container's sandbox and returns its 1237 // WaitStatus. 1238 func (s *Sandbox) WaitPID(cid string, pid int32) (unix.WaitStatus, error) { 1239 log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID) 1240 var ws unix.WaitStatus 1241 args := &boot.WaitPIDArgs{ 1242 PID: pid, 1243 CID: cid, 1244 } 1245 if err := s.call(boot.ContMgrWaitPID, args, &ws); err != nil { 1246 return ws, fmt.Errorf("waiting on PID %d in sandbox %q: %w", pid, s.ID, err) 1247 } 1248 return ws, nil 1249 } 1250 1251 // IsRootContainer returns true if the specified container ID belongs to the 1252 // root container. 1253 func (s *Sandbox) IsRootContainer(cid string) bool { 1254 return s.ID == cid 1255 } 1256 1257 // Destroy frees all resources associated with the sandbox. 
It fails fast and 1258 // is idempotent. 1259 func (s *Sandbox) destroy() error { 1260 log.Debugf("Destroying sandbox %q", s.ID) 1261 // Only delete the control file if it exists. 1262 if len(s.ControlSocketPath) > 0 { 1263 if err := os.Remove(s.ControlSocketPath); err != nil { 1264 log.Warningf("failed to delete control socket file %q: %v", s.ControlSocketPath, err) 1265 } 1266 } 1267 pid := s.Pid.load() 1268 if pid != 0 { 1269 log.Debugf("Killing sandbox %q", s.ID) 1270 if err := unix.Kill(pid, unix.SIGKILL); err != nil && err != unix.ESRCH { 1271 return fmt.Errorf("killing sandbox %q PID %q: %w", s.ID, pid, err) 1272 } 1273 if err := s.waitForStopped(); err != nil { 1274 return fmt.Errorf("waiting sandbox %q stop: %w", s.ID, err) 1275 } 1276 } 1277 1278 return nil 1279 } 1280 1281 // SignalContainer sends the signal to a container in the sandbox. If all is 1282 // true and signal is SIGKILL, then waits for all processes to exit before 1283 // returning. 1284 func (s *Sandbox) SignalContainer(cid string, sig unix.Signal, all bool) error { 1285 log.Debugf("Signal sandbox %q", s.ID) 1286 mode := boot.DeliverToProcess 1287 if all { 1288 mode = boot.DeliverToAllProcesses 1289 } 1290 1291 args := boot.SignalArgs{ 1292 CID: cid, 1293 Signo: int32(sig), 1294 Mode: mode, 1295 } 1296 if err := s.call(boot.ContMgrSignal, &args, nil); err != nil { 1297 return fmt.Errorf("signaling container %q: %w", cid, err) 1298 } 1299 return nil 1300 } 1301 1302 // SignalProcess sends the signal to a particular process in the container. If 1303 // fgProcess is true, then the signal is sent to the foreground process group 1304 // in the same session that PID belongs to. This is only valid if the process 1305 // is attached to a host TTY. 1306 func (s *Sandbox) SignalProcess(cid string, pid int32, sig unix.Signal, fgProcess bool) error { 1307 log.Debugf("Signal sandbox %q", s.ID) 1308 1309 mode := boot.DeliverToProcess 1310 if fgProcess { 1311 mode = boot.DeliverToForegroundProcessGroup 1312 } 1313 1314 args := boot.SignalArgs{ 1315 CID: cid, 1316 Signo: int32(sig), 1317 PID: pid, 1318 Mode: mode, 1319 } 1320 if err := s.call(boot.ContMgrSignal, &args, nil); err != nil { 1321 return fmt.Errorf("signaling container %q PID %d: %v", cid, pid, err) 1322 } 1323 return nil 1324 } 1325 1326 // Checkpoint sends the checkpoint call for a container in the sandbox. 1327 // The statefile will be written to f. 1328 func (s *Sandbox) Checkpoint(cid string, imagePath string, direct bool, sfOpts statefile.Options, mfOpts pgalloc.SaveOpts) error { 1329 log.Debugf("Checkpoint sandbox %q, statefile options %+v, MemoryFile options %+v", s.ID, sfOpts, mfOpts) 1330 1331 stateFilePath := filepath.Join(imagePath, boot.CheckpointStateFileName) 1332 sf, err := os.OpenFile(stateFilePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) 1333 if err != nil { 1334 return fmt.Errorf("creating checkpoint state file %q: %w", stateFilePath, err) 1335 } 1336 defer sf.Close() 1337 1338 opt := control.SaveOpts{ 1339 Metadata: sfOpts.WriteToMetadata(map[string]string{}), 1340 MemoryFileSaveOpts: mfOpts, 1341 FilePayload: urpc.FilePayload{ 1342 Files: []*os.File{sf}, 1343 }, 1344 Resume: sfOpts.Resume, 1345 } 1346 1347 // When there is no compression, MemoryFile contents are page-aligned. 1348 // It is beneficial to store them separately so certain optimizations can be 1349 // applied during restore. See Restore(). 
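// (General note on O_DIRECT, not specific to this file.) Direct I/O expects
// file offsets and buffers to be aligned to the underlying device/filesystem
// block size; keeping the pages file uncompressed preserves page alignment,
// which is what allows the O_DIRECT open flags used below and in Restore().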
1350 if sfOpts.Compression == statefile.CompressionLevelNone { 1351 pagesFilePath := filepath.Join(imagePath, boot.CheckpointPagesFileName) 1352 pagesWriteFlags := os.O_CREATE | os.O_EXCL | os.O_RDWR 1353 if direct { 1354 // The writes will be page-aligned, so it can be opened with O_DIRECT. 1355 pagesWriteFlags |= syscall.O_DIRECT 1356 } 1357 pf, err := os.OpenFile(pagesFilePath, pagesWriteFlags, 0644) 1358 if err != nil { 1359 return fmt.Errorf("creating checkpoint pages file %q: %w", pagesFilePath, err) 1360 } 1361 defer pf.Close() 1362 pagesMetadataFilePath := filepath.Join(imagePath, boot.CheckpointPagesMetadataFileName) 1363 pmf, err := os.OpenFile(pagesMetadataFilePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) 1364 if err != nil { 1365 return fmt.Errorf("creating checkpoint pages metadata file %q: %w", pagesMetadataFilePath, err) 1366 } 1367 defer pmf.Close() 1368 opt.FilePayload.Files = append(opt.FilePayload.Files, pmf, pf) 1369 opt.HavePagesFile = true 1370 } 1371 1372 if err := s.call(boot.ContMgrCheckpoint, &opt, nil); err != nil { 1373 return fmt.Errorf("checkpointing container %q: %w", cid, err) 1374 } 1375 return nil 1376 } 1377 1378 // Pause sends the pause call for a container in the sandbox. 1379 func (s *Sandbox) Pause(cid string) error { 1380 log.Debugf("Pause sandbox %q", s.ID) 1381 if err := s.call(boot.LifecyclePause, nil, nil); err != nil { 1382 return fmt.Errorf("pausing container %q: %w", cid, err) 1383 } 1384 return nil 1385 } 1386 1387 // Resume sends the resume call for a container in the sandbox. 1388 func (s *Sandbox) Resume(cid string) error { 1389 log.Debugf("Resume sandbox %q", s.ID) 1390 if err := s.call(boot.LifecycleResume, nil, nil); err != nil { 1391 return fmt.Errorf("resuming container %q: %w", cid, err) 1392 } 1393 return nil 1394 } 1395 1396 // Usage sends the collect call for a container in the sandbox. 1397 func (s *Sandbox) Usage(Full bool) (control.MemoryUsage, error) { 1398 log.Debugf("Usage sandbox %q", s.ID) 1399 opts := control.MemoryUsageOpts{Full: Full} 1400 var m control.MemoryUsage 1401 if err := s.call(boot.UsageCollect, &opts, &m); err != nil { 1402 return control.MemoryUsage{}, fmt.Errorf("collecting usage: %w", err) 1403 } 1404 return m, nil 1405 } 1406 1407 // UsageFD sends the usagefd call for a container in the sandbox. 1408 func (s *Sandbox) UsageFD() (*control.MemoryUsageRecord, error) { 1409 log.Debugf("Usage sandbox %q", s.ID) 1410 opts := control.MemoryUsageFileOpts{Version: 1} 1411 var m control.MemoryUsageFile 1412 if err := s.call(boot.UsageUsageFD, &opts, &m); err != nil { 1413 return nil, fmt.Errorf("collecting usage FD: %w", err) 1414 } 1415 1416 if len(m.FilePayload.Files) != 2 { 1417 return nil, fmt.Errorf("wants exactly two fds") 1418 } 1419 return control.NewMemoryUsageRecord(*m.FilePayload.Files[0], *m.FilePayload.Files[1]) 1420 } 1421 1422 // GetRegisteredMetrics returns metric registration data from the sandbox. 1423 // This data is meant to be used as a way to sanity-check any exported metrics data during the 1424 // lifetime of the sandbox in order to avoid a compromised sandbox from being able to produce 1425 // bogus metrics. 1426 // This returns an error if the sandbox has not requested instrumentation during creation time. 
1427 func (s *Sandbox) GetRegisteredMetrics() (*metricpb.MetricRegistration, error) { 1428 if s.RegisteredMetrics == nil { 1429 return nil, errors.New("sandbox did not request instrumentation when it was created") 1430 } 1431 return s.RegisteredMetrics, nil 1432 } 1433 1434 // ExportMetrics returns a snapshot of metric values from the sandbox in Prometheus format. 1435 func (s *Sandbox) ExportMetrics(opts control.MetricsExportOpts) (*prometheus.Snapshot, error) { 1436 log.Debugf("Metrics export sandbox %q", s.ID) 1437 var data control.MetricsExportData 1438 if err := s.call(boot.MetricsExport, &opts, &data); err != nil { 1439 return nil, err 1440 } 1441 // Since we do not trust the output of the sandbox as-is, double-check that the options were 1442 // respected. 1443 if err := opts.Verify(&data); err != nil { 1444 return nil, err 1445 } 1446 return data.Snapshot, nil 1447 } 1448 1449 // IsRunning returns true if the sandbox or gofer process is running. 1450 func (s *Sandbox) IsRunning() bool { 1451 pid := s.Pid.load() 1452 if pid == 0 { 1453 return false 1454 } 1455 // Send a signal 0 to the sandbox process. If it succeeds, the sandbox 1456 // process is running. 1457 return unix.Kill(pid, 0) == nil 1458 } 1459 1460 // Stacks collects and returns all stacks for the sandbox. 1461 func (s *Sandbox) Stacks() (string, error) { 1462 log.Debugf("Stacks sandbox %q", s.ID) 1463 var stacks string 1464 if err := s.call(boot.DebugStacks, nil, &stacks); err != nil { 1465 return "", fmt.Errorf("getting sandbox %q stacks: %w", s.ID, err) 1466 } 1467 return stacks, nil 1468 } 1469 1470 // HeapProfile writes a heap profile to the given file. 1471 func (s *Sandbox) HeapProfile(f *os.File, delay time.Duration) error { 1472 log.Debugf("Heap profile %q", s.ID) 1473 opts := control.HeapProfileOpts{ 1474 FilePayload: urpc.FilePayload{Files: []*os.File{f}}, 1475 Delay: delay, 1476 } 1477 return s.call(boot.ProfileHeap, &opts, nil) 1478 } 1479 1480 // CPUProfile collects a CPU profile. 1481 func (s *Sandbox) CPUProfile(f *os.File, duration time.Duration) error { 1482 log.Debugf("CPU profile %q", s.ID) 1483 opts := control.CPUProfileOpts{ 1484 FilePayload: urpc.FilePayload{Files: []*os.File{f}}, 1485 Duration: duration, 1486 } 1487 return s.call(boot.ProfileCPU, &opts, nil) 1488 } 1489 1490 // BlockProfile writes a block profile to the given file. 1491 func (s *Sandbox) BlockProfile(f *os.File, duration time.Duration) error { 1492 log.Debugf("Block profile %q", s.ID) 1493 opts := control.BlockProfileOpts{ 1494 FilePayload: urpc.FilePayload{Files: []*os.File{f}}, 1495 Duration: duration, 1496 } 1497 return s.call(boot.ProfileBlock, &opts, nil) 1498 } 1499 1500 // MutexProfile writes a mutex profile to the given file. 1501 func (s *Sandbox) MutexProfile(f *os.File, duration time.Duration) error { 1502 log.Debugf("Mutex profile %q", s.ID) 1503 opts := control.MutexProfileOpts{ 1504 FilePayload: urpc.FilePayload{Files: []*os.File{f}}, 1505 Duration: duration, 1506 } 1507 return s.call(boot.ProfileMutex, &opts, nil) 1508 } 1509 1510 // Trace collects an execution trace. 1511 func (s *Sandbox) Trace(f *os.File, duration time.Duration) error { 1512 log.Debugf("Trace %q", s.ID) 1513 opts := control.TraceProfileOpts{ 1514 FilePayload: urpc.FilePayload{Files: []*os.File{f}}, 1515 Duration: duration, 1516 } 1517 return s.call(boot.ProfileTrace, &opts, nil) 1518 } 1519 1520 // ChangeLogging changes logging options. 
// ChangeLogging changes logging options.
func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error {
	log.Debugf("Change logging start %q", s.ID)
	if err := s.call(boot.LoggingChange, &args, nil); err != nil {
		return fmt.Errorf("changing sandbox %q logging: %w", s.ID, err)
	}
	return nil
}

// DestroyContainer destroys the given container. If it is the root container,
// then the entire sandbox is destroyed.
func (s *Sandbox) DestroyContainer(cid string) error {
	if err := s.destroyContainer(cid); err != nil {
		// If the sandbox isn't running, the container has already been destroyed;
		// ignore the error in this case.
		if s.IsRunning() {
			return err
		}
	}
	return nil
}

func (s *Sandbox) destroyContainer(cid string) error {
	if s.IsRootContainer(cid) {
		log.Debugf("Destroying root container by destroying sandbox, cid: %s", cid)
		return s.destroy()
	}

	log.Debugf("Destroying container, cid: %s, sandbox: %s", cid, s.ID)
	if err := s.call(boot.ContMgrDestroySubcontainer, &cid, nil); err != nil {
		return fmt.Errorf("destroying container %q: %w", cid, err)
	}
	return nil
}

// waitForStopped waits for the sandbox to actually stop.
// This should only be called when the sandbox is known to be shutting down.
func (s *Sandbox) waitForStopped() error {
	const waitTimeout = 2 * time.Minute
	if s.child {
		s.statusMu.Lock()
		defer s.statusMu.Unlock()
		pid := s.Pid.load()
		if pid == 0 {
			return nil
		}
		// The sandbox process is a child of the current process,
		// so we can wait on it to terminate and collect its zombie.
		if _, err := unix.Wait4(int(pid), &s.status, 0, nil); err != nil {
			return fmt.Errorf("error waiting for the sandbox process: %w", err)
		}
		s.Pid.store(0)
		return nil
	}
	ctx, cancel := context.WithTimeout(context.Background(), waitTimeout)
	defer cancel()
	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
	op := func() error {
		if s.IsRunning() {
			return fmt.Errorf("sandbox is still running")
		}
		return nil
	}
	return backoff.Retry(op, b)
}

// configureStdios changes stdios ownership to give access to the sandbox
// process. This may be skipped depending on the configuration.
func (s *Sandbox) configureStdios(conf *config.Config, stdios []*os.File) error {
	if conf.Rootless || conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
		// Cannot change ownership without CAP_CHOWN.
		return nil
	}

	if s.UID < 0 || s.GID < 0 {
		panic(fmt.Sprintf("sandbox UID/GID is not set: %d/%d", s.UID, s.GID))
	}
	for _, file := range stdios {
		log.Debugf("Changing %q ownership to %d/%d", file.Name(), s.UID, s.GID)
		if err := file.Chown(s.UID, s.GID); err != nil {
			if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) {
				log.Warningf("cannot change ownership of %s: %v", file.Name(), err)
				continue
			}
			return err
		}
	}
	return nil
}
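waitForStopped's non-child path is a simple poll-with-deadline built on github.com/cenkalti/backoff: retry a condition every 100ms until it succeeds or a 2-minute context deadline expires. A minimal sketch of that pattern in isolation; the package name, helper name (waitUntil), and predicate (done) are hypothetical:

package sandboxexample

import (
	"context"
	"fmt"
	"time"

	"github.com/cenkalti/backoff"
)

// waitUntil polls done() at a fixed 100ms interval and gives up once the
// 2-minute context deadline is reached, mirroring waitForStopped above.
func waitUntil(done func() bool) error {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
	defer cancel()
	b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
	op := func() error {
		if !done() {
			return fmt.Errorf("condition not yet met")
		}
		return nil
	}
	return backoff.Retry(op, b)
}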
// deviceFileForPlatform opens the device file for the given platform. If the
// platform does not need a device file, then nil is returned.
// devicePath may be empty to use a sane platform-specific default.
func deviceFileForPlatform(name, devicePath string) (*fd.FD, error) {
	p, err := platform.Lookup(name)
	if err != nil {
		return nil, err
	}

	f, err := p.OpenDevice(devicePath)
	if err != nil {
		return nil, fmt.Errorf("opening device file for platform %q: %w", name, err)
	}
	return f, nil
}

// getNvproxyDriverVersion returns the NVIDIA driver ABI version to use by
// nvproxy.
func getNvproxyDriverVersion(conf *config.Config) (string, error) {
	switch conf.NVProxyDriverVersion {
	case "":
		return nvproxy.HostDriverVersion()
	case "latest":
		nvproxy.Init()
		return nvproxy.LatestDriver().String(), nil
	default:
		version, err := nvproxy.DriverVersionFrom(conf.NVProxyDriverVersion)
		return version.String(), err
	}
}

// checkBinaryPermissions verifies that the required binary bits are set on
// the runsc executable.
func checkBinaryPermissions(conf *config.Config) error {
	// All platforms need the others-executable bit.
	neededBits := os.FileMode(0001)
	if conf.Platform == "ptrace" {
		// Ptrace also needs the others-readable bit.
		neededBits |= os.FileMode(0004)
	}

	exePath, err := os.Executable()
	if err != nil {
		return fmt.Errorf("getting exe path: %v", err)
	}

	// Check the permissions of the runsc binary and return an error if they
	// don't match expectations.
	info, err := os.Stat(exePath)
	if err != nil {
		return fmt.Errorf("stat file: %v", err)
	}

	if info.Mode().Perm()&neededBits != neededBits {
		return fmt.Errorf(specutils.FaqErrorMsg("runsc-perms", fmt.Sprintf("%s does not have the correct permissions", exePath)))
	}
	return nil
}

// CgroupsReadControlFile reads a single cgroupfs control file in the sandbox.
func (s *Sandbox) CgroupsReadControlFile(file control.CgroupControlFile) (string, error) {
	log.Debugf("CgroupsReadControlFile sandbox %q", s.ID)
	args := control.CgroupsReadArgs{
		Args: []control.CgroupsReadArg{
			{
				File: file,
			},
		},
	}
	var out control.CgroupsResults
	if err := s.call(boot.CgroupsReadControlFiles, &args, &out); err != nil {
		return "", err
	}
	if len(out.Results) != 1 {
		return "", fmt.Errorf("expected 1 result, got %d, raw: %+v", len(out.Results), out)
	}
	return out.Results[0].Unpack()
}

// CgroupsWriteControlFile writes a single cgroupfs control file in the sandbox.
func (s *Sandbox) CgroupsWriteControlFile(file control.CgroupControlFile, value string) error {
	log.Debugf("CgroupsWriteControlFile sandbox %q", s.ID)
	args := control.CgroupsWriteArgs{
		Args: []control.CgroupsWriteArg{
			{
				File:  file,
				Value: value,
			},
		},
	}
	var out control.CgroupsResults
	if err := s.call(boot.CgroupsWriteControlFiles, &args, &out); err != nil {
		return err
	}
	if len(out.Results) != 1 {
		return fmt.Errorf("expected 1 result, got %d, raw: %+v", len(out.Results), out)
	}
	return out.Results[0].AsError()
}
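checkBinaryPermissions above boils down to a single mode-bit test on the runsc executable. A minimal sketch of the same check on an arbitrary path; the package and helper name (hasOtherExecBit) are hypothetical:

package sandboxexample

import (
	"fmt"
	"os"
)

// hasOtherExecBit stats a binary and reports whether the "others" execute
// bit (0001) is set, which runsc requires on all platforms.
func hasOtherExecBit(path string) (bool, error) {
	info, err := os.Stat(path)
	if err != nil {
		return false, fmt.Errorf("stat %q: %w", path, err)
	}
	const neededBits = os.FileMode(0001)
	return info.Mode().Perm()&neededBits == neededBits, nil
}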
// fixPidns looks at the PID namespace path. If that path corresponds to the
// sandbox process PID namespace, then change the spec so that the container
// joins the sandbox root namespace.
func (s *Sandbox) fixPidns(spec *specs.Spec) {
	pidns, ok := specutils.GetNS(specs.PIDNamespace, spec)
	if !ok {
		// pidns was not set, nothing to fix.
		return
	}
	if pidns.Path != fmt.Sprintf("/proc/%d/ns/pid", s.Pid.load()) {
		// Fix only if the PID namespace corresponds to the sandbox's.
		return
	}

	for i := range spec.Linux.Namespaces {
		if spec.Linux.Namespaces[i].Type == specs.PIDNamespace {
			// Removing the namespace makes the container join the sandbox root
			// namespace.
			log.Infof("Fixing PID namespace in spec from %q to make the container join the sandbox root namespace", pidns.Path)
			spec.Linux.Namespaces = append(spec.Linux.Namespaces[:i], spec.Linux.Namespaces[i+1:]...)
			return
		}
	}
	panic("unreachable")
}

// ConfigureCmdForRootless configures cmd to donate a socket FD that can be
// used to synchronize userns configuration.
func ConfigureCmdForRootless(cmd *exec.Cmd, donations *donation.Agency) (*os.File, error) {
	fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
	if err != nil {
		return nil, err
	}
	f := os.NewFile(uintptr(fds[1]), "userns sync other FD")
	donations.DonateAndClose("sync-userns-fd", f)
	if cmd.SysProcAttr == nil {
		cmd.SysProcAttr = &unix.SysProcAttr{}
	}
	cmd.SysProcAttr.AmbientCaps = []uintptr{
		// Same as `cap` in cmd/gofer.go.
		unix.CAP_CHOWN,
		unix.CAP_DAC_OVERRIDE,
		unix.CAP_DAC_READ_SEARCH,
		unix.CAP_FOWNER,
		unix.CAP_FSETID,
		unix.CAP_SYS_CHROOT,
		// Needed for setuid(2)/setgid(2).
		unix.CAP_SETUID,
		unix.CAP_SETGID,
		// Needed for chroot.
		unix.CAP_SYS_ADMIN,
		// Needed to be able to clear the bounding set (PR_CAPBSET_DROP).
		unix.CAP_SETPCAP,
	}
	return os.NewFile(uintptr(fds[0]), "userns sync FD"), nil
}

// SetUserMappings uses the newuidmap/newgidmap programs to set up user ID
// mappings for process pid.
func SetUserMappings(spec *specs.Spec, pid int) error {
	log.Debugf("Setting user mappings")
	args := []string{strconv.Itoa(pid)}
	for _, idMap := range spec.Linux.UIDMappings {
		log.Infof("Mapping host uid %d to container uid %d (size=%d)",
			idMap.HostID, idMap.ContainerID, idMap.Size)
		args = append(args,
			strconv.Itoa(int(idMap.ContainerID)),
			strconv.Itoa(int(idMap.HostID)),
			strconv.Itoa(int(idMap.Size)),
		)
	}

	out, err := exec.Command("newuidmap", args...).CombinedOutput()
	log.Debugf("newuidmap: %#v\n%s", args, out)
	if err != nil {
		return fmt.Errorf("newuidmap failed: %w", err)
	}

	args = []string{strconv.Itoa(pid)}
	for _, idMap := range spec.Linux.GIDMappings {
		log.Infof("Mapping host gid %d to container gid %d (size=%d)",
			idMap.HostID, idMap.ContainerID, idMap.Size)
		args = append(args,
			strconv.Itoa(int(idMap.ContainerID)),
			strconv.Itoa(int(idMap.HostID)),
			strconv.Itoa(int(idMap.Size)),
		)
	}
	out, err = exec.Command("newgidmap", args...).CombinedOutput()
	log.Debugf("newgidmap: %#v\n%s", args, out)
	if err != nil {
		return fmt.Errorf("newgidmap failed: %w", err)
	}
	return nil
}
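SetUserMappings builds the newuidmap and newgidmap command lines as the target pid followed by one (containerID, hostID, size) triple per mapping. A minimal sketch of that argument layout; the package and helper name (newuidmapArgs) are hypothetical:

package sandboxexample

import (
	"strconv"

	specs "github.com/opencontainers/runtime-spec/specs-go"
)

// newuidmapArgs lays out the newuidmap arguments: the target pid first, then
// a (containerID, hostID, size) triple for each mapping in the spec.
func newuidmapArgs(mappings []specs.LinuxIDMapping, pid int) []string {
	args := []string{strconv.Itoa(pid)}
	for _, m := range mappings {
		args = append(args,
			strconv.Itoa(int(m.ContainerID)),
			strconv.Itoa(int(m.HostID)),
			strconv.Itoa(int(m.Size)),
		)
	}
	return args
}

The same layout is reused for newgidmap with spec.Linux.GIDMappings.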
// Mount mounts a filesystem in a container.
func (s *Sandbox) Mount(cid, fstype, src, dest string) error {
	var files []*os.File
	switch fstype {
	case erofs.Name:
		imageFile, err := os.Open(src)
		if err != nil {
			return fmt.Errorf("opening %s: %w", src, err)
		}
		files = append(files, imageFile)

	default:
		return fmt.Errorf("unsupported filesystem type: %v", fstype)
	}

	args := boot.MountArgs{
		ContainerID: cid,
		Source:      src,
		Destination: dest,
		FsType:      fstype,
		FilePayload: urpc.FilePayload{Files: files},
	}
	return s.call(boot.ContMgrMount, &args, nil)
}

// ContainerRuntimeState returns the runtime state of a container.
func (s *Sandbox) ContainerRuntimeState(cid string) (boot.ContainerRuntimeState, error) {
	log.Debugf("ContainerRuntimeState, sandbox: %q, cid: %q", s.ID, cid)
	var state boot.ContainerRuntimeState
	if err := s.call(boot.ContMgrContainerRuntimeState, &cid, &state); err != nil {
		return boot.RuntimeStateInvalid, fmt.Errorf("getting container state (CID: %q): %w", cid, err)
	}
	log.Debugf("ContainerRuntimeState, sandbox: %q, cid: %q, state: %v", s.ID, cid, state)
	return state, nil
}

func setCloExeOnAllFDs() error {
	f, err := os.Open("/proc/self/fd")
	if err != nil {
		return fmt.Errorf("failed to open /proc/self/fd: %w", err)
	}
	defer f.Close()
	for {
		dents, err := f.Readdirnames(256)
		if err == io.EOF {
			break
		} else if err != nil {
			return fmt.Errorf("failed to read /proc/self/fd: %w", err)
		}
		for _, dent := range dents {
			fd, err := strconv.Atoi(dent)
			if err != nil {
				return fmt.Errorf("failed to convert /proc/self/fd entry %q to int: %w", dent, err)
			}
			if fd == int(f.Fd()) {
				continue
			}
			flags, _, errno := unix.RawSyscall(unix.SYS_FCNTL, uintptr(fd), unix.F_GETFD, 0)
			if errno != 0 {
				return fmt.Errorf("error getting descriptor flags: %w", errno)
			}
			if flags&unix.FD_CLOEXEC != 0 {
				continue
			}
			flags |= unix.FD_CLOEXEC
			if _, _, errno := unix.RawSyscall(unix.SYS_FCNTL, uintptr(fd), unix.F_SETFD, flags); errno != 0 {
				return fmt.Errorf("error setting CLOEXEC: %w", errno)
			}
		}
	}
	return nil
}

var setCloseExecOnce sync.Once

// SetCloExeOnAllFDs sets CLOEXEC on all FDs in /proc/self/fd. This avoids
// leaking inherited FDs from the parent (caller) into any subprocesses
// created later.
func SetCloExeOnAllFDs() (retErr error) {
	// It is sufficient to do this only once per runsc invocation, so avoid
	// repeating the work.
	setCloseExecOnce.Do(func() { retErr = setCloExeOnAllFDs() })
	return
}
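setCloExeOnAllFDs applies the usual F_GETFD/F_SETFD sequence to every descriptor it finds. A minimal per-descriptor sketch using the higher-level unix.FcntlInt wrapper instead of RawSyscall; the package and helper name (setCloExec) are hypothetical:

package sandboxexample

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// setCloExec reads the descriptor flags with F_GETFD and, if FD_CLOEXEC is
// not already present, adds it with F_SETFD.
func setCloExec(fd int) error {
	flags, err := unix.FcntlInt(uintptr(fd), unix.F_GETFD, 0)
	if err != nil {
		return fmt.Errorf("F_GETFD on fd %d: %w", fd, err)
	}
	if flags&unix.FD_CLOEXEC != 0 {
		return nil
	}
	if _, err := unix.FcntlInt(uintptr(fd), unix.F_SETFD, flags|unix.FD_CLOEXEC); err != nil {
		return fmt.Errorf("F_SETFD on fd %d: %w", fd, err)
	}
	return nil
}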