github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/runsc/sandbox/sandbox.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package sandbox creates and manipulates sandboxes. 16 package sandbox 17 18 import ( 19 "context" 20 "encoding/json" 21 "errors" 22 "fmt" 23 "io" 24 "math" 25 "os" 26 "os/exec" 27 "path/filepath" 28 "strconv" 29 "strings" 30 "syscall" 31 "time" 32 33 "github.com/cenkalti/backoff" 34 specs "github.com/opencontainers/runtime-spec/specs-go" 35 "github.com/syndtr/gocapability/capability" 36 "golang.org/x/sys/unix" 37 "github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops" 38 "github.com/nicocha30/gvisor-ligolo/pkg/cleanup" 39 "github.com/nicocha30/gvisor-ligolo/pkg/control/client" 40 "github.com/nicocha30/gvisor-ligolo/pkg/control/server" 41 "github.com/nicocha30/gvisor-ligolo/pkg/coverage" 42 "github.com/nicocha30/gvisor-ligolo/pkg/log" 43 metricpb "github.com/nicocha30/gvisor-ligolo/pkg/metric/metric_go_proto" 44 "github.com/nicocha30/gvisor-ligolo/pkg/prometheus" 45 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/control" 46 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/platform" 47 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/seccheck" 48 "github.com/nicocha30/gvisor-ligolo/pkg/sync" 49 "github.com/nicocha30/gvisor-ligolo/pkg/urpc" 50 "github.com/nicocha30/gvisor-ligolo/runsc/boot" 51 "github.com/nicocha30/gvisor-ligolo/runsc/boot/procfs" 52 "github.com/nicocha30/gvisor-ligolo/runsc/cgroup" 53 "github.com/nicocha30/gvisor-ligolo/runsc/config" 54 "github.com/nicocha30/gvisor-ligolo/runsc/console" 55 "github.com/nicocha30/gvisor-ligolo/runsc/donation" 56 "github.com/nicocha30/gvisor-ligolo/runsc/specutils" 57 ) 58 59 const ( 60 // podNameAnnotation is a pod annotation populated by containerd. 61 // It contains the name of the pod that a sandbox is in when running in Kubernetes. 62 podNameAnnotation = "io.kubernetes.cri.sandbox-name" 63 64 // namespaceAnnotation is a pod annotation populated by containerd. 65 // It contains the namespace of the pod that a sandbox is in when running in Kubernetes. 66 namespaceAnnotation = "io.kubernetes.cri.sandbox-namespace" 67 ) 68 69 // createControlSocket finds a location and creates the socket used to 70 // communicate with the sandbox. 71 func createControlSocket(rootDir, id string) (string, int, error) { 72 name := fmt.Sprintf("runsc-%s.sock", id) 73 74 // Only use absolute paths to guarantee resolution from anywhere. 75 var paths []string 76 for _, dir := range []string{rootDir, "/var/run", "/run", "/tmp"} { 77 paths = append(paths, filepath.Join(dir, name)) 78 } 79 // If nothing else worked, use the abstract namespace. 
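// The loop that follows probes each candidate path in order and keeps the
// first socket that can be created. A minimal, self-contained sketch of the
// same fallback idea using only the standard library; the helper name
// listenFirstAvailable and the use of net.Listen are assumptions for the
// example (the real code goes through server.CreateSocket so that the raw FD
// can be donated to the sandbox process):
//
//	package main
//
//	import (
//		"fmt"
//		"net"
//		"path/filepath"
//	)
//
//	func listenFirstAvailable(rootDir, id string) (net.Listener, string, error) {
//		name := fmt.Sprintf("runsc-%s.sock", id)
//		candidates := []string{
//			filepath.Join(rootDir, name),
//			filepath.Join("/var/run", name),
//			filepath.Join("/run", name),
//			filepath.Join("/tmp", name),
//			"@runsc-sandbox." + id, // "@" asks net.Listen for an abstract-namespace socket
//		}
//		for _, path := range candidates {
//			if l, err := net.Listen("unix", path); err == nil {
//				return l, path, nil
//			}
//		}
//		return nil, "", fmt.Errorf("no usable socket location")
//	}
//
//	func main() {
//		l, path, err := listenFirstAvailable("/tmp", "example-id")
//		if err != nil {
//			panic(err)
//		}
//		defer l.Close()
//		fmt.Println("listening on", path)
//	}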
80 paths = append(paths, fmt.Sprintf("\x00runsc-sandbox.%s", id)) 81 82 for _, path := range paths { 83 log.Debugf("Attempting to create socket file %q", path) 84 fd, err := server.CreateSocket(path) 85 if err == nil { 86 log.Debugf("Using socket file %q", path) 87 return path, fd, nil 88 } 89 } 90 return "", -1, fmt.Errorf("unable to find location to write socket file") 91 } 92 93 // pid is an atomic type that implements JSON marshal/unmarshal interfaces. 94 type pid struct { 95 val atomicbitops.Int64 96 } 97 98 func (p *pid) store(pid int) { 99 p.val.Store(int64(pid)) 100 } 101 102 func (p *pid) load() int { 103 return int(p.val.Load()) 104 } 105 106 // UnmarshalJSON implements json.Unmarshaler.UnmarshalJSON. 107 func (p *pid) UnmarshalJSON(b []byte) error { 108 var pid int 109 110 if err := json.Unmarshal(b, &pid); err != nil { 111 return err 112 } 113 p.store(pid) 114 return nil 115 } 116 117 // MarshalJSON implements json.Marshaler.MarshalJSON 118 func (p *pid) MarshalJSON() ([]byte, error) { 119 return json.Marshal(p.load()) 120 } 121 122 // Sandbox wraps a sandbox process. 123 // 124 // It is used to start/stop sandbox process (and associated processes like 125 // gofers), as well as for running and manipulating containers inside a running 126 // sandbox. 127 // 128 // Note: Sandbox must be immutable because a copy of it is saved for each 129 // container and changes would not be synchronized to all of them. 130 type Sandbox struct { 131 // ID is the id of the sandbox (immutable). By convention, this is the same 132 // ID as the first container run in the sandbox. 133 ID string `json:"id"` 134 135 // PodName is the name of the Kubernetes Pod (if any) that this sandbox 136 // represents. Unset if not running under containerd or Kubernetes. 137 PodName string `json:"podName"` 138 139 // Namespace is the Kubernetes namespace (if any) of the pod that this 140 // sandbox represents. Unset if not running under containerd or Kubernetes. 141 Namespace string `json:"namespace"` 142 143 // Pid is the pid of the running sandbox. May be 0 if the sandbox 144 // is not running. 145 Pid pid `json:"pid"` 146 147 // UID is the user ID in the parent namespace that the sandbox is running as. 148 UID int `json:"uid"` 149 // GID is the group ID in the parent namespace that the sandbox is running as. 150 GID int `json:"gid"` 151 152 // CgroupJSON contains the cgroup configuration that the sandbox is part of 153 // and allow serialization of the configuration into json 154 CgroupJSON cgroup.CgroupJSON `json:"cgroup"` 155 156 // OriginalOOMScoreAdj stores the value of oom_score_adj when the sandbox 157 // started, before it may be modified. 158 OriginalOOMScoreAdj int `json:"originalOomScoreAdj"` 159 160 // RegisteredMetrics is the set of metrics registered in the sandbox. 161 // Used for verifying metric data integrity after containers are started. 162 // Only populated if exporting metrics was requested when the sandbox was 163 // created. 164 RegisteredMetrics *metricpb.MetricRegistration `json:"registeredMetrics"` 165 166 // MetricMetadata are key-value pairs that are useful to export about this 167 // sandbox, but not part of the set of labels that uniquely identify it. 168 // They are static once initialized, and typically contain high-level 169 // configuration information about the sandbox. 170 MetricMetadata map[string]string `json:"metricMetadata"` 171 172 // MetricServerAddress is the address of the metric server that this sandbox 173 // intends to export metrics for. 
174 // Only populated if exporting metrics was requested when the sandbox was 175 // created. 176 MetricServerAddress string `json:"metricServerAddress"` 177 178 // ControlAddress is the uRPC address used to connect to the sandbox. 179 ControlAddress string `json:"control_address"` 180 181 // MountHints provides extra information about container mounts that apply 182 // to the entire pod. 183 MountHints *boot.PodMountHints `json:"mountHints"` 184 185 // child is set if a sandbox process is a child of the current process. 186 // 187 // This field isn't saved to json, because only the creator of the sandbox 188 // will have it as a child process. 189 child bool 190 191 // statusMu protects status. 192 statusMu sync.Mutex 193 194 // status is the exit status of a sandbox process. It's only set if 195 // child==true and the sandbox was waited on. This field allows for multiple 196 // threads to wait on the sandbox and get the exit code, since Linux will return 197 // WaitStatus to one of the waiters only. 198 status unix.WaitStatus 199 } 200 201 // Getpid returns the process ID of the sandbox process. 202 func (s *Sandbox) Getpid() int { 203 return s.Pid.load() 204 } 205 206 // Args is used to configure a new sandbox. 207 type Args struct { 208 // ID is the sandbox unique identifier. 209 ID string 210 211 // Spec is the OCI spec that describes the container. 212 Spec *specs.Spec 213 214 // BundleDir is the directory containing the container bundle. 215 BundleDir string 216 217 // ConsoleSocket is the path to a unix domain socket that will receive 218 // the console FD. It may be empty. 219 ConsoleSocket string 220 221 // UserLog is the filename to send user-visible logs to. It may be empty. 222 UserLog string 223 224 // IOFiles is the list of files that connect to a gofer endpoint for the 225 // mount points using Gofers. They must be in the same order as mounts 226 // appear in the spec. 227 IOFiles []*os.File 228 229 // OverlayFilestoreFiles are the regular files that will back the tmpfs upper 230 // mount in the overlay mounts. 231 OverlayFilestoreFiles []*os.File 232 233 // OverlayMediums contains information about how the gofer mounts have been 234 // overlaid. The first entry is for rootfs and the following entries are for 235 // bind mounts in Spec.Mounts (in the same order). 236 OverlayMediums []boot.OverlayMedium 237 238 // MountHints provides extra information about container mounts that apply 239 // to the entire pod. 240 MountHints *boot.PodMountHints 241 242 // MountsFile is a file containing mount information from the spec. It's 243 // equivalent to the mounts from the spec, except that all paths have been 244 // resolved to their final absolute location. 245 MountsFile *os.File 246 247 // Cgroup is the cgroup that the sandbox is part of. 248 Cgroup cgroup.Cgroup 249 250 // Attached indicates that the sandbox lifecycle is attached to the caller. 251 // If the caller exits, the sandbox should exit too. 252 Attached bool 253 254 // SinkFiles is an ordered array of files to be used by seccheck sinks 255 // configured from the --pod-init-config file. 256 SinkFiles []*os.File 257 258 // PassFiles are user-supplied files from the host to be exposed to the 259 // sandboxed app. 260 PassFiles map[int]*os.File 261 262 // ExecFile is the file from the host used for program execution. 263 ExecFile *os.File 264 } 265 266 // New creates the sandbox process. The caller must call Destroy() on the 267 // sandbox. 
268 func New(conf *config.Config, args *Args) (*Sandbox, error) { 269 s := &Sandbox{ 270 ID: args.ID, 271 CgroupJSON: cgroup.CgroupJSON{ 272 Cgroup: args.Cgroup, 273 }, 274 UID: -1, // prevent usage before it's set. 275 GID: -1, // prevent usage before it's set. 276 MetricMetadata: conf.MetricMetadata(), 277 MetricServerAddress: conf.MetricServer, 278 MountHints: args.MountHints, 279 } 280 if args.Spec != nil && args.Spec.Annotations != nil { 281 s.PodName = args.Spec.Annotations[podNameAnnotation] 282 s.Namespace = args.Spec.Annotations[namespaceAnnotation] 283 } 284 285 // The Cleanup object cleans up partially created sandboxes when an error 286 // occurs. Any errors occurring during cleanup itself are ignored. 287 c := cleanup.Make(func() { 288 if err := s.destroy(); err != nil { 289 log.Warningf("error destroying sandbox: %v", err) 290 } 291 }) 292 defer c.Clean() 293 294 if len(conf.PodInitConfig) > 0 { 295 initConf, err := boot.LoadInitConfig(conf.PodInitConfig) 296 if err != nil { 297 return nil, fmt.Errorf("loading init config file: %w", err) 298 } 299 args.SinkFiles, err = initConf.Setup() 300 if err != nil { 301 return nil, fmt.Errorf("cannot init config: %w", err) 302 } 303 } 304 305 // Create pipe to synchronize when sandbox process has been booted. 306 clientSyncFile, sandboxSyncFile, err := os.Pipe() 307 if err != nil { 308 return nil, fmt.Errorf("creating pipe for sandbox %q: %v", s.ID, err) 309 } 310 defer clientSyncFile.Close() 311 312 // Create the sandbox process. 313 err = s.createSandboxProcess(conf, args, sandboxSyncFile) 314 // sandboxSyncFile has to be closed to be able to detect when the sandbox 315 // process exits unexpectedly. 316 sandboxSyncFile.Close() 317 if err != nil { 318 return nil, fmt.Errorf("cannot create sandbox process: %w", err) 319 } 320 321 // Wait until the sandbox has booted. 322 b := make([]byte, 1) 323 if l, err := clientSyncFile.Read(b); err != nil || l != 1 { 324 err := fmt.Errorf("waiting for sandbox to start: %v", err) 325 // If the sandbox failed to start, it may be because the binary 326 // permissions were incorrect. Check the bits and return a more helpful 327 // error message. 328 // 329 // NOTE: The error message is checked because error types are lost over 330 // rpc calls. 331 if strings.Contains(err.Error(), io.EOF.Error()) { 332 if permsErr := checkBinaryPermissions(conf); permsErr != nil { 333 return nil, fmt.Errorf("%v: %v", err, permsErr) 334 } 335 } 336 return nil, fmt.Errorf("cannot read client sync file: %w", err) 337 } 338 339 if conf.MetricServer != "" { 340 // The control server is up and the sandbox was configured to export metrics. 341 // We must gather data about registered metrics prior to any process starting in the sandbox. 342 log.Debugf("Getting metric registration information from sandbox %q", s.ID) 343 var registeredMetrics control.MetricsRegistrationResponse 344 if err := s.call(boot.MetricsGetRegistered, nil, &registeredMetrics); err != nil { 345 return nil, fmt.Errorf("cannot get registered metrics: %v", err) 346 } 347 s.RegisteredMetrics = registeredMetrics.RegisteredMetrics 348 } 349 350 c.Release() 351 return s, nil 352 } 353 354 // CreateSubcontainer creates a container inside the sandbox. 
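// New above blocks on a one-byte read from the pipe whose write end was handed
// to the sandbox process ("start-sync-fd"): the boot process writes a byte once
// it is up, and if it dies first the closed write end turns the read into EOF.
// A self-contained sketch of that handshake with the child simulated by a
// goroutine (the byte value and names are assumptions for the example):
//
//	package main
//
//	import (
//		"fmt"
//		"os"
//	)
//
//	func main() {
//		r, w, err := os.Pipe()
//		if err != nil {
//			panic(err)
//		}
//
//		// Stand-in for the sandbox process: signal "booted" with one byte,
//		// then close the write end.
//		go func() {
//			w.Write([]byte{0})
//			w.Close()
//		}()
//
//		buf := make([]byte, 1)
//		if n, err := r.Read(buf); err != nil || n != 1 {
//			fmt.Println("sandbox failed to boot:", err) // EOF if the writer closed without writing
//			return
//		}
//		fmt.Println("sandbox booted")
//	}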
355 func (s *Sandbox) CreateSubcontainer(conf *config.Config, cid string, tty *os.File) error { 356 log.Debugf("Create sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid.load()) 357 358 var files []*os.File 359 if tty != nil { 360 files = []*os.File{tty} 361 } 362 if err := s.configureStdios(conf, files); err != nil { 363 return err 364 } 365 366 args := boot.CreateArgs{ 367 CID: cid, 368 FilePayload: urpc.FilePayload{Files: files}, 369 } 370 if err := s.call(boot.ContMgrCreateSubcontainer, &args, nil); err != nil { 371 return fmt.Errorf("creating sub-container %q: %w", cid, err) 372 } 373 return nil 374 } 375 376 // StartRoot starts running the root container process inside the sandbox. 377 func (s *Sandbox) StartRoot(conf *config.Config) error { 378 pid := s.Pid.load() 379 log.Debugf("Start root sandbox %q, PID: %d", s.ID, pid) 380 conn, err := s.sandboxConnect() 381 if err != nil { 382 return err 383 } 384 defer conn.Close() 385 386 // Configure the network. 387 if err := setupNetwork(conn, pid, conf); err != nil { 388 return fmt.Errorf("setting up network: %w", err) 389 } 390 391 // Send a message to the sandbox control server to start the root container. 392 if err := conn.Call(boot.ContMgrRootContainerStart, &s.ID, nil); err != nil { 393 return fmt.Errorf("starting root container: %w", err) 394 } 395 396 return nil 397 } 398 399 // StartSubcontainer starts running a sub-container inside the sandbox. 400 func (s *Sandbox) StartSubcontainer(spec *specs.Spec, conf *config.Config, cid string, stdios, goferFiles, overlayFilestoreFiles []*os.File, overlayMediums []boot.OverlayMedium) error { 401 log.Debugf("Start sub-container %q in sandbox %q, PID: %d", cid, s.ID, s.Pid.load()) 402 403 if err := s.configureStdios(conf, stdios); err != nil { 404 return err 405 } 406 s.fixPidns(spec) 407 408 // The payload contains (in this specific order): 409 // * stdin/stdout/stderr (optional: only present when not using TTY) 410 // * The subcontainer's overlay filestore files (optional: only present when 411 // host file backed overlay is configured) 412 // * Gofer files. 413 payload := urpc.FilePayload{} 414 payload.Files = append(payload.Files, stdios...) 415 payload.Files = append(payload.Files, overlayFilestoreFiles...) 416 payload.Files = append(payload.Files, goferFiles...) 417 418 // Start running the container. 419 args := boot.StartArgs{ 420 Spec: spec, 421 Conf: conf, 422 CID: cid, 423 NumOverlayFilestoreFDs: len(overlayFilestoreFiles), 424 OverlayMediums: overlayMediums, 425 FilePayload: payload, 426 } 427 if err := s.call(boot.ContMgrStartSubcontainer, &args, nil); err != nil { 428 return fmt.Errorf("starting sub-container %v: %w", spec.Process.Args, err) 429 } 430 return nil 431 } 432 433 // Restore sends the restore call for a container in the sandbox. 434 func (s *Sandbox) Restore(conf *config.Config, cid string, filename string) error { 435 log.Debugf("Restore sandbox %q", s.ID) 436 437 rf, err := os.Open(filename) 438 if err != nil { 439 return fmt.Errorf("opening restore file %q failed: %v", filename, err) 440 } 441 defer rf.Close() 442 443 opt := boot.RestoreOpts{ 444 FilePayload: urpc.FilePayload{ 445 Files: []*os.File{rf}, 446 }, 447 SandboxID: s.ID, 448 } 449 450 // If the platform needs a device FD we must pass it in. 
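// StartSubcontainer above packs stdio, overlay filestore, and gofer files into
// a single urpc.FilePayload in a fixed order, and passes the counts (for
// example NumOverlayFilestoreFDs) that let the receiver slice them back apart.
// A small sketch of that kind of count-based split over plain slices; the
// function name and the way the boot process actually consumes the payload are
// assumptions for the example:
//
//	package main
//
//	import "fmt"
//
//	func splitPayload(files []string, numStdios, numFilestores int) (stdios, filestores, gofers []string) {
//		stdios = files[:numStdios]
//		filestores = files[numStdios : numStdios+numFilestores]
//		gofers = files[numStdios+numFilestores:]
//		return stdios, filestores, gofers
//	}
//
//	func main() {
//		files := []string{"stdin", "stdout", "stderr", "filestore0", "gofer0", "gofer1"}
//		s, f, g := splitPayload(files, 3, 1)
//		fmt.Println(s, f, g) // [stdin stdout stderr] [filestore0] [gofer0 gofer1]
//	}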
451 if deviceFile, err := deviceFileForPlatform(conf.Platform, conf.PlatformDevicePath); err != nil { 452 return err 453 } else if deviceFile != nil { 454 defer deviceFile.Close() 455 opt.FilePayload.Files = append(opt.FilePayload.Files, deviceFile) 456 } 457 458 conn, err := s.sandboxConnect() 459 if err != nil { 460 return err 461 } 462 defer conn.Close() 463 464 // Configure the network. 465 if err := setupNetwork(conn, s.Pid.load(), conf); err != nil { 466 return fmt.Errorf("setting up network: %v", err) 467 } 468 469 // Restore the container and start the root container. 470 if err := conn.Call(boot.ContMgrRestore, &opt, nil); err != nil { 471 return fmt.Errorf("restoring container %q: %v", cid, err) 472 } 473 474 return nil 475 } 476 477 // Processes retrieves the list of processes and associated metadata for a 478 // given container in this sandbox. 479 func (s *Sandbox) Processes(cid string) ([]*control.Process, error) { 480 log.Debugf("Getting processes for container %q in sandbox %q", cid, s.ID) 481 var pl []*control.Process 482 if err := s.call(boot.ContMgrProcesses, &cid, &pl); err != nil { 483 return nil, fmt.Errorf("retrieving process data from sandbox: %v", err) 484 } 485 return pl, nil 486 } 487 488 // CreateTraceSession creates a new trace session. 489 func (s *Sandbox) CreateTraceSession(config *seccheck.SessionConfig, force bool) error { 490 log.Debugf("Creating trace session in sandbox %q", s.ID) 491 492 sinkFiles, err := seccheck.SetupSinks(config.Sinks) 493 if err != nil { 494 return err 495 } 496 defer func() { 497 for _, f := range sinkFiles { 498 _ = f.Close() 499 } 500 }() 501 502 arg := boot.CreateTraceSessionArgs{ 503 Config: *config, 504 Force: force, 505 FilePayload: urpc.FilePayload{ 506 Files: sinkFiles, 507 }, 508 } 509 if err := s.call(boot.ContMgrCreateTraceSession, &arg, nil); err != nil { 510 return fmt.Errorf("creating trace session: %w", err) 511 } 512 return nil 513 } 514 515 // DeleteTraceSession deletes an existing trace session. 516 func (s *Sandbox) DeleteTraceSession(name string) error { 517 log.Debugf("Deleting trace session %q in sandbox %q", name, s.ID) 518 if err := s.call(boot.ContMgrDeleteTraceSession, name, nil); err != nil { 519 return fmt.Errorf("deleting trace session: %w", err) 520 } 521 return nil 522 } 523 524 // ListTraceSessions lists all trace sessions. 525 func (s *Sandbox) ListTraceSessions() ([]seccheck.SessionConfig, error) { 526 log.Debugf("Listing trace sessions in sandbox %q", s.ID) 527 var sessions []seccheck.SessionConfig 528 if err := s.call(boot.ContMgrListTraceSessions, nil, &sessions); err != nil { 529 return nil, fmt.Errorf("listing trace session: %w", err) 530 } 531 return sessions, nil 532 } 533 534 // ProcfsDump collects and returns a procfs dump for the sandbox. 535 func (s *Sandbox) ProcfsDump() ([]procfs.ProcessProcfsDump, error) { 536 log.Debugf("Procfs dump %q", s.ID) 537 var procfsDump []procfs.ProcessProcfsDump 538 if err := s.call(boot.ContMgrProcfsDump, nil, &procfsDump); err != nil { 539 return nil, fmt.Errorf("getting sandbox %q stacks: %w", s.ID, err) 540 } 541 return procfsDump, nil 542 } 543 544 // NewCGroup returns the sandbox's Cgroup, or an error if it does not have one. 545 func (s *Sandbox) NewCGroup() (cgroup.Cgroup, error) { 546 return cgroup.NewFromPid(s.Pid.load(), false /* useSystemd */) 547 } 548 549 // Execute runs the specified command in the container. It returns the PID of 550 // the newly created process. 
551 func (s *Sandbox) Execute(conf *config.Config, args *control.ExecArgs) (int32, error) { 552 log.Debugf("Executing new process in container %q in sandbox %q", args.ContainerID, s.ID) 553 554 // Stdios are those files which have an FD <= 2 in the process. We do not 555 // want the ownership of other files to be changed by configureStdios. 556 var stdios []*os.File 557 for i, fd := range args.GuestFDs { 558 if fd > 2 || i >= len(args.Files) { 559 continue 560 } 561 stdios = append(stdios, args.Files[i]) 562 } 563 564 if err := s.configureStdios(conf, stdios); err != nil { 565 return 0, err 566 } 567 568 // Send a message to the sandbox control server to start the container. 569 var pid int32 570 if err := s.call(boot.ContMgrExecuteAsync, args, &pid); err != nil { 571 return 0, fmt.Errorf("executing command %q in sandbox: %w", args, err) 572 } 573 return pid, nil 574 } 575 576 // Event retrieves stats about the sandbox such as memory and CPU utilization. 577 func (s *Sandbox) Event(cid string) (*boot.EventOut, error) { 578 log.Debugf("Getting events for container %q in sandbox %q", cid, s.ID) 579 var e boot.EventOut 580 if err := s.call(boot.ContMgrEvent, &cid, &e); err != nil { 581 return nil, fmt.Errorf("retrieving event data from sandbox: %w", err) 582 } 583 return &e, nil 584 } 585 586 // PortForward starts port forwarding to the sandbox. 587 func (s *Sandbox) PortForward(opts *boot.PortForwardOpts) error { 588 log.Debugf("Requesting port forward for container %q in sandbox %q: %+v", opts.ContainerID, s.ID, opts) 589 conn, err := s.sandboxConnect() 590 if err != nil { 591 return err 592 } 593 defer conn.Close() 594 595 if err := conn.Call(boot.ContMgrPortForward, opts, nil); err != nil { 596 return fmt.Errorf("port forwarding to sandbox: %v", err) 597 } 598 599 return nil 600 } 601 602 func (s *Sandbox) sandboxConnect() (*urpc.Client, error) { 603 log.Debugf("Connecting to sandbox %q", s.ID) 604 conn, err := client.ConnectTo(s.ControlAddress) 605 if err != nil { 606 return nil, s.connError(err) 607 } 608 return conn, nil 609 } 610 611 func (s *Sandbox) call(method string, arg, result any) error { 612 conn, err := s.sandboxConnect() 613 if err != nil { 614 return err 615 } 616 defer conn.Close() 617 618 return conn.Call(method, arg, result) 619 } 620 621 func (s *Sandbox) connError(err error) error { 622 return fmt.Errorf("connecting to control server at PID %d: %v", s.Pid.load(), err) 623 } 624 625 // createSandboxProcess starts the sandbox as a subprocess by running the "boot" 626 // command, passing in the bundle dir. 627 func (s *Sandbox) createSandboxProcess(conf *config.Config, args *Args, startSyncFile *os.File) error { 628 donations := donation.Agency{} 629 defer donations.Close() 630 631 // pgalloc.MemoryFile (which provides application memory) sometimes briefly 632 // mlock(2)s ranges of memory in order to fault in a large number of pages at 633 // a time. Try to make RLIMIT_MEMLOCK unlimited so that it can do so. runsc 634 // expects to run in a memory cgroup that limits its memory usage as 635 // required. 636 // This needs to be done before exec'ing `runsc boot`, as that subcommand 637 // runs as an unprivileged user that will not be able to call `setrlimit` 638 // by itself. Calling `setrlimit` here will have the side-effect of setting 639 // the limit on the currently-running `runsc` process as well, but that 640 // should be OK too. 
641 var rlim unix.Rlimit 642 if err := unix.Getrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil { 643 log.Warningf("Failed to get RLIMIT_MEMLOCK: %v", err) 644 } else if rlim.Cur != unix.RLIM_INFINITY || rlim.Max != unix.RLIM_INFINITY { 645 rlim.Cur = unix.RLIM_INFINITY 646 rlim.Max = unix.RLIM_INFINITY 647 if err := unix.Setrlimit(unix.RLIMIT_MEMLOCK, &rlim); err != nil { 648 // We may not have CAP_SYS_RESOURCE, so this failure may be expected. 649 log.Infof("Failed to set RLIMIT_MEMLOCK: %v", err) 650 } 651 } 652 653 // 654 // These flags must come BEFORE the "boot" command in cmd.Args. 655 // 656 657 // Open the log files to pass to the sandbox as FDs. 658 if err := donations.OpenAndDonate("log-fd", conf.LogFilename, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil { 659 return err 660 } 661 662 test := "" 663 if len(conf.TestOnlyTestNameEnv) != 0 { 664 // Fetch test name if one is provided and the test only flag was set. 665 if t, ok := specutils.EnvVar(args.Spec.Process.Env, conf.TestOnlyTestNameEnv); ok { 666 test = t 667 } 668 } 669 if specutils.IsDebugCommand(conf, "boot") { 670 if err := donations.DonateDebugLogFile("debug-log-fd", conf.DebugLog, "boot", test); err != nil { 671 return err 672 } 673 } 674 if err := donations.DonateDebugLogFile("panic-log-fd", conf.PanicLog, "panic", test); err != nil { 675 return err 676 } 677 covFilename := conf.CoverageReport 678 if covFilename == "" { 679 covFilename = os.Getenv("GO_COVERAGE_FILE") 680 } 681 if covFilename != "" && coverage.Available() { 682 if err := donations.DonateDebugLogFile("coverage-fd", covFilename, "cov", test); err != nil { 683 return err 684 } 685 } 686 687 // Relay all the config flags to the sandbox process. 688 cmd := exec.Command(specutils.ExePath, conf.ToFlags()...) 689 cmd.SysProcAttr = &unix.SysProcAttr{ 690 // Detach from this session, otherwise cmd will get SIGHUP and SIGCONT 691 // when re-parented. 692 Setsid: true, 693 } 694 695 // Set Args[0] to make easier to spot the sandbox process. Otherwise it's 696 // shown as `exe`. 697 cmd.Args[0] = "runsc-sandbox" 698 699 // Tranfer FDs that need to be present before the "boot" command. 700 // Start at 3 because 0, 1, and 2 are taken by stdin/out/err. 701 nextFD := donations.Transfer(cmd, 3) 702 703 // Add the "boot" command to the args. 704 // 705 // All flags after this must be for the boot command 706 cmd.Args = append(cmd.Args, "boot", "--bundle="+args.BundleDir) 707 708 // Clear environment variables, unless --TESTONLY-unsafe-nonroot is set. 709 if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { 710 // Setting cmd.Env = nil causes cmd to inherit the current process's env. 711 cmd.Env = []string{} 712 } 713 714 // If there is a gofer, sends all socket ends to the sandbox. 715 donations.DonateAndClose("io-fds", args.IOFiles...) 716 donations.DonateAndClose("overlay-filestore-fds", args.OverlayFilestoreFiles...) 
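// donation.Transfer above is the runsc wrapper around the same mechanism the
// standard library exposes as exec.Cmd.ExtraFiles: entry i of ExtraFiles
// becomes descriptor 3+i in the child, after stdin/stdout/stderr. A minimal
// sketch of donating one file that way (the re-exec target and the idea of
// telling the child its FD number via a flag mirror this function; the helper
// name is an assumption, and it relies on the os and os/exec imports already
// present in this file):
//
//	func commandWithDonatedFD(f *os.File) *exec.Cmd {
//		cmd := exec.Command("/proc/self/exe", "boot")
//		cmd.ExtraFiles = []*os.File{f} // becomes FD 3 in the child
//		// The child would then be told "your start-sync FD is 3" through a flag,
//		// which is what the flag/FD pairs recorded by the donation agency express.
//		return cmd
//	}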
717 donations.DonateAndClose("mounts-fd", args.MountsFile) 718 donations.Donate("start-sync-fd", startSyncFile) 719 if err := donations.OpenAndDonate("user-log-fd", args.UserLog, os.O_CREATE|os.O_WRONLY|os.O_APPEND); err != nil { 720 return err 721 } 722 const profFlags = os.O_CREATE | os.O_WRONLY | os.O_TRUNC 723 if err := donations.OpenAndDonate("profile-block-fd", conf.ProfileBlock, profFlags); err != nil { 724 return err 725 } 726 if err := donations.OpenAndDonate("profile-cpu-fd", conf.ProfileCPU, profFlags); err != nil { 727 return err 728 } 729 if err := donations.OpenAndDonate("profile-heap-fd", conf.ProfileHeap, profFlags); err != nil { 730 return err 731 } 732 if err := donations.OpenAndDonate("profile-mutex-fd", conf.ProfileMutex, profFlags); err != nil { 733 return err 734 } 735 if err := donations.OpenAndDonate("trace-fd", conf.TraceFile, profFlags); err != nil { 736 return err 737 } 738 739 // Pass overlay mediums. 740 cmd.Args = append(cmd.Args, "--overlay-mediums="+boot.ToOverlayMediumFlags(args.OverlayMediums)) 741 742 // Create a socket for the control server and donate it to the sandbox. 743 controlAddress, sockFD, err := createControlSocket(conf.RootDir, s.ID) 744 if err != nil { 745 return fmt.Errorf("creating control socket %q: %v", s.ControlAddress, err) 746 } 747 log.Infof("Control socket: %q", s.ControlAddress) 748 s.ControlAddress = controlAddress 749 donations.DonateAndClose("controller-fd", os.NewFile(uintptr(sockFD), "control_server_socket")) 750 751 specFile, err := specutils.OpenSpec(args.BundleDir) 752 if err != nil { 753 return fmt.Errorf("cannot open spec file in bundle dir %v: %w", args.BundleDir, err) 754 } 755 donations.DonateAndClose("spec-fd", specFile) 756 757 if err := donations.OpenAndDonate("pod-init-config-fd", conf.PodInitConfig, os.O_RDONLY); err != nil { 758 return err 759 } 760 donations.DonateAndClose("sink-fds", args.SinkFiles...) 761 762 gPlatform, err := platform.Lookup(conf.Platform) 763 if err != nil { 764 return fmt.Errorf("cannot look up platform: %w", err) 765 } 766 if deviceFile, err := gPlatform.OpenDevice(conf.PlatformDevicePath); err != nil { 767 return fmt.Errorf("opening device file for platform %q: %v", conf.Platform, err) 768 } else if deviceFile != nil { 769 donations.DonateAndClose("device-fd", deviceFile) 770 } 771 772 // TODO(b/151157106): syscall tests fail by timeout if asyncpreemptoff 773 // isn't set. 774 if conf.Platform == "kvm" { 775 cmd.Env = append(cmd.Env, "GODEBUG=asyncpreemptoff=1") 776 } 777 778 // nss is the set of namespaces to join or create before starting the sandbox 779 // process. Mount, IPC and UTS namespaces from the host are not used as they 780 // are virtualized inside the sandbox. Be paranoid and run inside an empty 781 // namespace for these. Don't unshare cgroup because sandbox is added to a 782 // cgroup in the caller's namespace. 783 log.Infof("Sandbox will be started in new mount, IPC and UTS namespaces") 784 nss := []specs.LinuxNamespace{ 785 {Type: specs.IPCNamespace}, 786 {Type: specs.MountNamespace}, 787 {Type: specs.UTSNamespace}, 788 } 789 790 if gPlatform.Requirements().RequiresCurrentPIDNS { 791 // TODO(b/75837838): Also set a new PID namespace so that we limit 792 // access to other host processes. 
793 log.Infof("Sandbox will be started in the current PID namespace") 794 } else { 795 log.Infof("Sandbox will be started in a new PID namespace") 796 nss = append(nss, specs.LinuxNamespace{Type: specs.PIDNamespace}) 797 cmd.Args = append(cmd.Args, "--pidns=true") 798 } 799 800 // Joins the network namespace if network is enabled. the sandbox talks 801 // directly to the host network, which may have been configured in the 802 // namespace. 803 if ns, ok := specutils.GetNS(specs.NetworkNamespace, args.Spec); ok && conf.Network != config.NetworkNone { 804 log.Infof("Sandbox will be started in the container's network namespace: %+v", ns) 805 nss = append(nss, ns) 806 } else if conf.Network == config.NetworkHost { 807 log.Infof("Sandbox will be started in the host network namespace") 808 } else { 809 log.Infof("Sandbox will be started in new network namespace") 810 nss = append(nss, specs.LinuxNamespace{Type: specs.NetworkNamespace}) 811 } 812 813 // These are set to the uid/gid that the sandbox process will use. May be 814 // overriden below. 815 s.UID = os.Getuid() 816 s.GID = os.Getgid() 817 818 // User namespace depends on the network type or whether access to the host 819 // filesystem is required. These features require to run inside the user 820 // namespace specified in the spec or the current namespace if none is 821 // configured. 822 rootlessEUID := unix.Geteuid() != 0 823 setUserMappings := false 824 if conf.Network == config.NetworkHost || conf.DirectFS { 825 if userns, ok := specutils.GetNS(specs.UserNamespace, args.Spec); ok { 826 log.Infof("Sandbox will be started in container's user namespace: %+v", userns) 827 nss = append(nss, userns) 828 if rootlessEUID { 829 syncFile, err := ConfigureCmdForRootless(cmd, &donations) 830 if err != nil { 831 return err 832 } 833 defer syncFile.Close() 834 setUserMappings = true 835 } else { 836 specutils.SetUIDGIDMappings(cmd, args.Spec) 837 // We need to set UID and GID to have capabilities in a new user namespace. 838 cmd.SysProcAttr.Credential = &syscall.Credential{Uid: 0, Gid: 0} 839 } 840 } else { 841 if rootlessEUID { 842 return fmt.Errorf("unable to run a rootless container without userns") 843 } 844 log.Infof("Sandbox will be started in the current user namespace") 845 } 846 // When running in the caller's defined user namespace, apply the same 847 // capabilities to the sandbox process to ensure it abides to the same 848 // rules. 849 cmd.Args = append(cmd.Args, "--apply-caps=true") 850 851 // If we have CAP_SYS_ADMIN, we can create an empty chroot and 852 // bind-mount the executable inside it. 853 if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { 854 log.Warningf("Running sandbox in test mode without chroot. This is only safe in tests!") 855 } else if specutils.HasCapabilities(capability.CAP_SYS_ADMIN) || rootlessEUID { 856 log.Infof("Sandbox will be started in minimal chroot") 857 cmd.Args = append(cmd.Args, "--setup-root") 858 } else { 859 return fmt.Errorf("can't run sandbox process in minimal chroot since we don't have CAP_SYS_ADMIN") 860 } 861 } else { 862 // If we have CAP_SETUID and CAP_SETGID, then we can also run 863 // as user nobody. 864 if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { 865 log.Warningf("Running sandbox in test mode as current user (uid=%d gid=%d). This is only safe in tests!", os.Getuid(), os.Getgid()) 866 log.Warningf("Running sandbox in test mode without chroot. 
This is only safe in tests!") 867 } else if rootlessEUID || specutils.HasCapabilities(capability.CAP_SETUID, capability.CAP_SETGID) { 868 log.Infof("Sandbox will be started in new user namespace") 869 nss = append(nss, specs.LinuxNamespace{Type: specs.UserNamespace}) 870 cmd.Args = append(cmd.Args, "--setup-root") 871 872 const nobody = 65534 873 if rootlessEUID || conf.Rootless { 874 log.Infof("Rootless mode: sandbox will run as nobody inside user namespace, mapped to the current user, uid: %d, gid: %d", os.Getuid(), os.Getgid()) 875 } else { 876 // Map nobody in the new namespace to nobody in the parent namespace. 877 s.UID = nobody 878 s.GID = nobody 879 } 880 881 // Set credentials to run as user and group nobody. 882 cmd.SysProcAttr.Credential = &syscall.Credential{Uid: nobody, Gid: nobody} 883 cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{ 884 { 885 ContainerID: nobody, 886 HostID: s.UID, 887 Size: 1, 888 }, 889 } 890 cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{ 891 { 892 ContainerID: nobody, 893 HostID: s.GID, 894 Size: 1, 895 }, 896 } 897 898 // A sandbox process will construct an empty root for itself, so it has 899 // to have CAP_SYS_ADMIN and CAP_SYS_CHROOT capabilities. 900 cmd.SysProcAttr.AmbientCaps = append(cmd.SysProcAttr.AmbientCaps, 901 uintptr(capability.CAP_SYS_ADMIN), 902 uintptr(capability.CAP_SYS_CHROOT), 903 // CAP_SETPCAP is required to clear the bounding set. 904 uintptr(capability.CAP_SETPCAP), 905 ) 906 907 } else { 908 return fmt.Errorf("can't run sandbox process as user nobody since we don't have CAP_SETUID or CAP_SETGID") 909 } 910 } 911 912 // The current process' stdio must be passed to the application via the 913 // --stdio-fds flag. The stdio of the sandbox process itself must not 914 // be connected to the same FDs, otherwise we risk leaking sandbox 915 // errors to the application, so we set the sandbox stdio to nil, 916 // causing them to read/write from the null device. 917 cmd.Stdin = nil 918 cmd.Stdout = nil 919 cmd.Stderr = nil 920 var stdios [3]*os.File 921 922 // If the console control socket file is provided, then create a new 923 // pty master/replica pair and set the TTY on the sandbox process. 924 if args.Spec.Process.Terminal && args.ConsoleSocket != "" { 925 // console.NewWithSocket will send the master on the given 926 // socket, and return the replica. 927 tty, err := console.NewWithSocket(args.ConsoleSocket) 928 if err != nil { 929 return fmt.Errorf("setting up console with socket %q: %v", args.ConsoleSocket, err) 930 } 931 defer tty.Close() 932 933 // Set the TTY as a controlling TTY on the sandbox process. 934 cmd.SysProcAttr.Setctty = true 935 936 // Inconveniently, the Ctty must be the FD in the *child* process's FD 937 // table. So transfer all files we have so far and make sure the next file 938 // added to donations is stdin. 939 // 940 // See https://github.com/golang/go/issues/29458. 941 nextFD = donations.Transfer(cmd, nextFD) 942 cmd.SysProcAttr.Ctty = nextFD 943 944 // Pass the tty as all stdio fds to sandbox. 945 stdios[0] = tty 946 stdios[1] = tty 947 stdios[2] = tty 948 949 if conf.Debug { 950 // If debugging, send the boot process stdio to the 951 // TTY, so that it is easier to find. 952 cmd.Stdin = tty 953 cmd.Stdout = tty 954 cmd.Stderr = tty 955 } 956 } else { 957 // If not using a console, pass our current stdio as the 958 // container stdio via flags. 
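// The console branch above has to name the controlling TTY by its descriptor
// number in the child, not in this process (see the golang.org/issue/29458
// reference above), which is why all pending files are transferred before Ctty
// is set. A reduced sketch of that wiring; openPty is a made-up helper standing
// in for console.NewWithSocket, and the example leans on the os, os/exec, and
// syscall imports already present in this file:
//
//	func commandWithTTY() (*exec.Cmd, *os.File, error) {
//		master, replica, err := openPty() // assumption: returns master and replica *os.File
//		if err != nil {
//			return nil, nil, err
//		}
//		cmd := exec.Command("/proc/self/exe", "boot")
//		cmd.ExtraFiles = []*os.File{replica} // replica becomes FD 3 in the child
//		cmd.SysProcAttr = &syscall.SysProcAttr{
//			Setsid:  true,
//			Setctty: true,
//			Ctty:    3, // child-side descriptor number of the replica
//		}
//		return cmd, master, nil // master stays with the caller, e.g. sent over the console socket
//	}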
959 stdios[0] = os.Stdin 960 stdios[1] = os.Stdout 961 stdios[2] = os.Stderr 962 963 if conf.Debug { 964 // If debugging, send the boot process stdio to the 965 // this process' stdio, so that is is easier to find. 966 cmd.Stdin = os.Stdin 967 cmd.Stdout = os.Stdout 968 cmd.Stderr = os.Stderr 969 } 970 } 971 if err := s.configureStdios(conf, stdios[:]); err != nil { 972 return fmt.Errorf("configuring stdios: %w", err) 973 } 974 // Note: this must be done right after "cmd.SysProcAttr.Ctty" is set above 975 // because it relies on stdin being the next FD donated. 976 donations.Donate("stdio-fds", stdios[:]...) 977 978 totalSysMem, err := totalSystemMemory() 979 if err != nil { 980 return err 981 } 982 cmd.Args = append(cmd.Args, "--total-host-memory", strconv.FormatUint(totalSysMem, 10)) 983 984 mem := totalSysMem 985 if s.CgroupJSON.Cgroup != nil { 986 cpuNum, err := s.CgroupJSON.Cgroup.NumCPU() 987 if err != nil { 988 return fmt.Errorf("getting cpu count from cgroups: %v", err) 989 } 990 if conf.CPUNumFromQuota { 991 // Dropping below 2 CPUs can trigger application to disable 992 // locks that can lead do hard to debug errors, so just 993 // leaving two cores as reasonable default. 994 const minCPUs = 2 995 996 quota, err := s.CgroupJSON.Cgroup.CPUQuota() 997 if err != nil { 998 return fmt.Errorf("getting cpu quota from cgroups: %v", err) 999 } 1000 if n := int(math.Ceil(quota)); n > 0 { 1001 if n < minCPUs { 1002 n = minCPUs 1003 } 1004 if n < cpuNum { 1005 // Only lower the cpu number. 1006 cpuNum = n 1007 } 1008 } 1009 } 1010 cmd.Args = append(cmd.Args, "--cpu-num", strconv.Itoa(cpuNum)) 1011 1012 memLimit, err := s.CgroupJSON.Cgroup.MemoryLimit() 1013 if err != nil { 1014 return fmt.Errorf("getting memory limit from cgroups: %v", err) 1015 } 1016 if memLimit < mem { 1017 mem = memLimit 1018 } 1019 } 1020 cmd.Args = append(cmd.Args, "--total-memory", strconv.FormatUint(mem, 10)) 1021 1022 if args.Attached { 1023 // Kill sandbox if parent process exits in attached mode. 1024 cmd.SysProcAttr.Pdeathsig = unix.SIGKILL 1025 // Tells boot that any process it creates must have pdeathsig set. 1026 cmd.Args = append(cmd.Args, "--attached") 1027 } 1028 1029 if args.ExecFile != nil { 1030 donations.Donate("exec-fd", args.ExecFile) 1031 } 1032 1033 nextFD = donations.Transfer(cmd, nextFD) 1034 1035 _ = donation.DonateAndTransferCustomFiles(cmd, nextFD, args.PassFiles) 1036 1037 // Add container ID as the last argument. 1038 cmd.Args = append(cmd.Args, s.ID) 1039 1040 donation.LogDonations(cmd) 1041 log.Debugf("Starting sandbox: %s %v", cmd.Path, cmd.Args) 1042 log.Debugf("SysProcAttr: %+v", cmd.SysProcAttr) 1043 if err := specutils.StartInNS(cmd, nss); err != nil { 1044 err := fmt.Errorf("starting sandbox: %v", err) 1045 // If the sandbox failed to start, it may be because the binary 1046 // permissions were incorrect. Check the bits and return a more helpful 1047 // error message. 1048 // 1049 // NOTE: The error message is checked because error types are lost over 1050 // rpc calls. 
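// The quota handling above turns a fractional CPU quota into a core count for
// the sentry: round up, never go below two cores, and only ever lower the
// number the cgroup already reports. A standalone restatement of that clamping
// (the function name is an assumption; it uses the math import already present
// in this file):
//
//	func cpusFromQuota(quota float64, cgroupCPUs int) int {
//		const minCPUs = 2
//		n := int(math.Ceil(quota))
//		if n <= 0 {
//			return cgroupCPUs // no quota configured; keep the cgroup's CPU count
//		}
//		if n < minCPUs {
//			n = minCPUs
//		}
//		if n > cgroupCPUs {
//			n = cgroupCPUs // only ever lower the count
//		}
//		return n
//	}
//
//	// cpusFromQuota(0.5, 8) == 2, cpusFromQuota(3.2, 8) == 4, cpusFromQuota(12, 8) == 8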
1051 if strings.Contains(err.Error(), unix.EACCES.Error()) { 1052 if permsErr := checkBinaryPermissions(conf); permsErr != nil { 1053 return fmt.Errorf("%v: %v", err, permsErr) 1054 } 1055 } 1056 return err 1057 } 1058 s.OriginalOOMScoreAdj, err = specutils.GetOOMScoreAdj(cmd.Process.Pid) 1059 if err != nil { 1060 return err 1061 } 1062 if setUserMappings { 1063 if err := SetUserMappings(args.Spec, cmd.Process.Pid); err != nil { 1064 return err 1065 } 1066 } 1067 1068 s.child = true 1069 s.Pid.store(cmd.Process.Pid) 1070 log.Infof("Sandbox started, PID: %d", cmd.Process.Pid) 1071 1072 return nil 1073 } 1074 1075 // Wait waits for the containerized process to exit, and returns its WaitStatus. 1076 func (s *Sandbox) Wait(cid string) (unix.WaitStatus, error) { 1077 log.Debugf("Waiting for container %q in sandbox %q", cid, s.ID) 1078 1079 if conn, err := s.sandboxConnect(); err != nil { 1080 // The sandbox may have exited while before we had a chance to wait on it. 1081 // There is nothing we can do for subcontainers. For the init container, we 1082 // can try to get the sandbox exit code. 1083 if !s.IsRootContainer(cid) { 1084 return unix.WaitStatus(0), err 1085 } 1086 log.Warningf("Wait on container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err) 1087 } else { 1088 defer conn.Close() 1089 1090 // Try the Wait RPC to the sandbox. 1091 var ws unix.WaitStatus 1092 err = conn.Call(boot.ContMgrWait, &cid, &ws) 1093 conn.Close() 1094 if err == nil { 1095 if s.IsRootContainer(cid) { 1096 if err := s.waitForStopped(); err != nil { 1097 return unix.WaitStatus(0), err 1098 } 1099 } 1100 // It worked! 1101 return ws, nil 1102 } 1103 // See comment above. 1104 if !s.IsRootContainer(cid) { 1105 return unix.WaitStatus(0), err 1106 } 1107 1108 // The sandbox may have exited after we connected, but before 1109 // or during the Wait RPC. 1110 log.Warningf("Wait RPC to container %q failed: %v. Will try waiting on the sandbox process instead.", cid, err) 1111 } 1112 1113 // The sandbox may have already exited, or exited while handling the Wait RPC. 1114 // The best we can do is ask Linux what the sandbox exit status was, since in 1115 // most cases that will be the same as the container exit status. 1116 if err := s.waitForStopped(); err != nil { 1117 return unix.WaitStatus(0), err 1118 } 1119 if !s.child { 1120 return unix.WaitStatus(0), fmt.Errorf("sandbox no longer running and its exit status is unavailable") 1121 } 1122 1123 s.statusMu.Lock() 1124 defer s.statusMu.Unlock() 1125 return s.status, nil 1126 } 1127 1128 // WaitPID waits for process 'pid' in the container's sandbox and returns its 1129 // WaitStatus. 1130 func (s *Sandbox) WaitPID(cid string, pid int32) (unix.WaitStatus, error) { 1131 log.Debugf("Waiting for PID %d in sandbox %q", pid, s.ID) 1132 var ws unix.WaitStatus 1133 args := &boot.WaitPIDArgs{ 1134 PID: pid, 1135 CID: cid, 1136 } 1137 if err := s.call(boot.ContMgrWaitPID, args, &ws); err != nil { 1138 return ws, fmt.Errorf("waiting on PID %d in sandbox %q: %w", pid, s.ID, err) 1139 } 1140 return ws, nil 1141 } 1142 1143 // IsRootContainer returns true if the specified container ID belongs to the 1144 // root container. 1145 func (s *Sandbox) IsRootContainer(cid string) bool { 1146 return s.ID == cid 1147 } 1148 1149 // Destroy frees all resources associated with the sandbox. It fails fast and 1150 // is idempotent. 
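// Wait above ultimately reports a unix.WaitStatus for the sandbox, either from
// the ContMgrWait RPC or from waiting on the child process directly. A
// self-contained illustration of what such a wait status carries, using a
// plain child process and the standard library rather than the sandbox RPC
// path:
//
//	package main
//
//	import (
//		"fmt"
//		"os/exec"
//		"syscall"
//	)
//
//	func main() {
//		cmd := exec.Command("/bin/sh", "-c", "exit 3")
//		_ = cmd.Run() // the non-zero exit comes back as an *exec.ExitError
//		ws := cmd.ProcessState.Sys().(syscall.WaitStatus)
//		fmt.Println("exited:", ws.Exited(), "status:", ws.ExitStatus()) // exited: true status: 3
//	}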
1151 func (s *Sandbox) destroy() error { 1152 log.Debugf("Destroying sandbox %q", s.ID) 1153 // Only delete the control file if it exists and is not an abstract UDS. 1154 if len(s.ControlAddress) > 0 && s.ControlAddress[0] != 0 { 1155 if err := os.Remove(s.ControlAddress); err != nil { 1156 log.Warningf("failed to delete control socket file %q: %v", s.ControlAddress, err) 1157 } 1158 } 1159 pid := s.Pid.load() 1160 if pid != 0 { 1161 log.Debugf("Killing sandbox %q", s.ID) 1162 if err := unix.Kill(pid, unix.SIGKILL); err != nil && err != unix.ESRCH { 1163 return fmt.Errorf("killing sandbox %q PID %q: %w", s.ID, pid, err) 1164 } 1165 if err := s.waitForStopped(); err != nil { 1166 return fmt.Errorf("waiting sandbox %q stop: %w", s.ID, err) 1167 } 1168 } 1169 1170 return nil 1171 } 1172 1173 // SignalContainer sends the signal to a container in the sandbox. If all is 1174 // true and signal is SIGKILL, then waits for all processes to exit before 1175 // returning. 1176 func (s *Sandbox) SignalContainer(cid string, sig unix.Signal, all bool) error { 1177 log.Debugf("Signal sandbox %q", s.ID) 1178 mode := boot.DeliverToProcess 1179 if all { 1180 mode = boot.DeliverToAllProcesses 1181 } 1182 1183 args := boot.SignalArgs{ 1184 CID: cid, 1185 Signo: int32(sig), 1186 Mode: mode, 1187 } 1188 if err := s.call(boot.ContMgrSignal, &args, nil); err != nil { 1189 return fmt.Errorf("signaling container %q: %w", cid, err) 1190 } 1191 return nil 1192 } 1193 1194 // SignalProcess sends the signal to a particular process in the container. If 1195 // fgProcess is true, then the signal is sent to the foreground process group 1196 // in the same session that PID belongs to. This is only valid if the process 1197 // is attached to a host TTY. 1198 func (s *Sandbox) SignalProcess(cid string, pid int32, sig unix.Signal, fgProcess bool) error { 1199 log.Debugf("Signal sandbox %q", s.ID) 1200 1201 mode := boot.DeliverToProcess 1202 if fgProcess { 1203 mode = boot.DeliverToForegroundProcessGroup 1204 } 1205 1206 args := boot.SignalArgs{ 1207 CID: cid, 1208 Signo: int32(sig), 1209 PID: pid, 1210 Mode: mode, 1211 } 1212 if err := s.call(boot.ContMgrSignal, &args, nil); err != nil { 1213 return fmt.Errorf("signaling container %q PID %d: %v", cid, pid, err) 1214 } 1215 return nil 1216 } 1217 1218 // Checkpoint sends the checkpoint call for a container in the sandbox. 1219 // The statefile will be written to f. 1220 func (s *Sandbox) Checkpoint(cid string, f *os.File) error { 1221 log.Debugf("Checkpoint sandbox %q", s.ID) 1222 opt := control.SaveOpts{ 1223 FilePayload: urpc.FilePayload{ 1224 Files: []*os.File{f}, 1225 }, 1226 } 1227 1228 if err := s.call(boot.ContMgrCheckpoint, &opt, nil); err != nil { 1229 return fmt.Errorf("checkpointing container %q: %w", cid, err) 1230 } 1231 return nil 1232 } 1233 1234 // Pause sends the pause call for a container in the sandbox. 1235 func (s *Sandbox) Pause(cid string) error { 1236 log.Debugf("Pause sandbox %q", s.ID) 1237 if err := s.call(boot.LifecyclePause, nil, nil); err != nil { 1238 return fmt.Errorf("pausing container %q: %w", cid, err) 1239 } 1240 return nil 1241 } 1242 1243 // Resume sends the resume call for a container in the sandbox. 1244 func (s *Sandbox) Resume(cid string) error { 1245 log.Debugf("Resume sandbox %q", s.ID) 1246 if err := s.call(boot.LifecycleResume, nil, nil); err != nil { 1247 return fmt.Errorf("resuming container %q: %w", cid, err) 1248 } 1249 return nil 1250 } 1251 1252 // Usage sends the collect call for a container in the sandbox. 
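// Checkpoint above writes the state file into a caller-provided *os.File, and
// Pause/Resume stop and restart execution around it. A short usage sketch of
// how a caller might combine the three; the pause-before-checkpoint ordering,
// variable names, and path are assumptions for the example, not a requirement
// imposed by this package:
//
//	func checkpointContainer(s *sandbox.Sandbox, cid, path string) error {
//		f, err := os.Create(path)
//		if err != nil {
//			return err
//		}
//		defer f.Close()
//		if err := s.Pause(cid); err != nil {
//			return err
//		}
//		if err := s.Checkpoint(cid, f); err != nil {
//			return err
//		}
//		return s.Resume(cid)
//	}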
1253 func (s *Sandbox) Usage(Full bool) (control.MemoryUsage, error) { 1254 log.Debugf("Usage sandbox %q", s.ID) 1255 opts := control.MemoryUsageOpts{Full: Full} 1256 var m control.MemoryUsage 1257 if err := s.call(boot.UsageCollect, &opts, &m); err != nil { 1258 return control.MemoryUsage{}, fmt.Errorf("collecting usage: %w", err) 1259 } 1260 return m, nil 1261 } 1262 1263 // UsageFD sends the usagefd call for a container in the sandbox. 1264 func (s *Sandbox) UsageFD() (*control.MemoryUsageRecord, error) { 1265 log.Debugf("Usage sandbox %q", s.ID) 1266 opts := control.MemoryUsageFileOpts{Version: 1} 1267 var m control.MemoryUsageFile 1268 if err := s.call(boot.UsageUsageFD, &opts, &m); err != nil { 1269 return nil, fmt.Errorf("collecting usage FD: %w", err) 1270 } 1271 1272 if len(m.FilePayload.Files) != 2 { 1273 return nil, fmt.Errorf("wants exactly two fds") 1274 } 1275 return control.NewMemoryUsageRecord(*m.FilePayload.Files[0], *m.FilePayload.Files[1]) 1276 } 1277 1278 // GetRegisteredMetrics returns metric registration data from the sandbox. 1279 // This data is meant to be used as a way to sanity-check any exported metrics data during the 1280 // lifetime of the sandbox in order to avoid a compromised sandbox from being able to produce 1281 // bogus metrics. 1282 // This returns an error if the sandbox has not requested instrumentation during creation time. 1283 func (s *Sandbox) GetRegisteredMetrics() (*metricpb.MetricRegistration, error) { 1284 if s.RegisteredMetrics == nil { 1285 return nil, errors.New("sandbox did not request instrumentation when it was created") 1286 } 1287 return s.RegisteredMetrics, nil 1288 } 1289 1290 // ExportMetrics returns a snapshot of metric values from the sandbox in Prometheus format. 1291 func (s *Sandbox) ExportMetrics(opts control.MetricsExportOpts) (*prometheus.Snapshot, error) { 1292 log.Debugf("Metrics export sandbox %q", s.ID) 1293 var data control.MetricsExportData 1294 if err := s.call(boot.MetricsExport, &opts, &data); err != nil { 1295 return nil, err 1296 } 1297 // Since we do not trust the output of the sandbox as-is, double-check that the options were 1298 // respected. 1299 if err := opts.Verify(&data); err != nil { 1300 return nil, err 1301 } 1302 return data.Snapshot, nil 1303 } 1304 1305 // IsRunning returns true if the sandbox or gofer process is running. 1306 func (s *Sandbox) IsRunning() bool { 1307 pid := s.Pid.load() 1308 if pid != 0 { 1309 // Send a signal 0 to the sandbox process. 1310 if err := unix.Kill(pid, 0); err == nil { 1311 // Succeeded, process is running. 1312 return true 1313 } 1314 } 1315 return false 1316 } 1317 1318 // Stacks collects and returns all stacks for the sandbox. 1319 func (s *Sandbox) Stacks() (string, error) { 1320 log.Debugf("Stacks sandbox %q", s.ID) 1321 var stacks string 1322 if err := s.call(boot.DebugStacks, nil, &stacks); err != nil { 1323 return "", fmt.Errorf("getting sandbox %q stacks: %w", s.ID, err) 1324 } 1325 return stacks, nil 1326 } 1327 1328 // HeapProfile writes a heap profile to the given file. 1329 func (s *Sandbox) HeapProfile(f *os.File, delay time.Duration) error { 1330 log.Debugf("Heap profile %q", s.ID) 1331 opts := control.HeapProfileOpts{ 1332 FilePayload: urpc.FilePayload{Files: []*os.File{f}}, 1333 Delay: delay, 1334 } 1335 return s.call(boot.ProfileHeap, &opts, nil) 1336 } 1337 1338 // CPUProfile collects a CPU profile. 
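// IsRunning above probes liveness with kill(pid, 0): signal number zero makes
// the kernel perform the existence and permission checks without delivering a
// signal. A standalone version of the same probe (the helper name is an
// assumption for the example):
//
//	package main
//
//	import (
//		"fmt"
//
//		"golang.org/x/sys/unix"
//	)
//
//	func processAlive(pid int) bool {
//		// Matches the check above: any error (ESRCH, EPERM, ...) counts as not running.
//		return unix.Kill(pid, 0) == nil
//	}
//
//	func main() {
//		fmt.Println(processAlive(unix.Getpid())) // true: the current process exists
//	}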
1339 func (s *Sandbox) CPUProfile(f *os.File, duration time.Duration) error { 1340 log.Debugf("CPU profile %q", s.ID) 1341 opts := control.CPUProfileOpts{ 1342 FilePayload: urpc.FilePayload{Files: []*os.File{f}}, 1343 Duration: duration, 1344 } 1345 return s.call(boot.ProfileCPU, &opts, nil) 1346 } 1347 1348 // BlockProfile writes a block profile to the given file. 1349 func (s *Sandbox) BlockProfile(f *os.File, duration time.Duration) error { 1350 log.Debugf("Block profile %q", s.ID) 1351 opts := control.BlockProfileOpts{ 1352 FilePayload: urpc.FilePayload{Files: []*os.File{f}}, 1353 Duration: duration, 1354 } 1355 return s.call(boot.ProfileBlock, &opts, nil) 1356 } 1357 1358 // MutexProfile writes a mutex profile to the given file. 1359 func (s *Sandbox) MutexProfile(f *os.File, duration time.Duration) error { 1360 log.Debugf("Mutex profile %q", s.ID) 1361 opts := control.MutexProfileOpts{ 1362 FilePayload: urpc.FilePayload{Files: []*os.File{f}}, 1363 Duration: duration, 1364 } 1365 return s.call(boot.ProfileMutex, &opts, nil) 1366 } 1367 1368 // Trace collects an execution trace. 1369 func (s *Sandbox) Trace(f *os.File, duration time.Duration) error { 1370 log.Debugf("Trace %q", s.ID) 1371 opts := control.TraceProfileOpts{ 1372 FilePayload: urpc.FilePayload{Files: []*os.File{f}}, 1373 Duration: duration, 1374 } 1375 return s.call(boot.ProfileTrace, &opts, nil) 1376 } 1377 1378 // ChangeLogging changes logging options. 1379 func (s *Sandbox) ChangeLogging(args control.LoggingArgs) error { 1380 log.Debugf("Change logging start %q", s.ID) 1381 if err := s.call(boot.LoggingChange, &args, nil); err != nil { 1382 return fmt.Errorf("changing sandbox %q logging: %w", s.ID, err) 1383 } 1384 return nil 1385 } 1386 1387 // DestroyContainer destroys the given container. If it is the root container, 1388 // then the entire sandbox is destroyed. 1389 func (s *Sandbox) DestroyContainer(cid string) error { 1390 if err := s.destroyContainer(cid); err != nil { 1391 // If the sandbox isn't running, the container has already been destroyed, 1392 // ignore the error in this case. 1393 if s.IsRunning() { 1394 return err 1395 } 1396 } 1397 return nil 1398 } 1399 1400 func (s *Sandbox) destroyContainer(cid string) error { 1401 if s.IsRootContainer(cid) { 1402 log.Debugf("Destroying root container by destroying sandbox, cid: %s", cid) 1403 return s.destroy() 1404 } 1405 1406 log.Debugf("Destroying container, cid: %s, sandbox: %s", cid, s.ID) 1407 if err := s.call(boot.ContMgrDestroySubcontainer, &cid, nil); err != nil { 1408 return fmt.Errorf("destroying container %q: %w", cid, err) 1409 } 1410 return nil 1411 } 1412 1413 func (s *Sandbox) waitForStopped() error { 1414 if s.child { 1415 s.statusMu.Lock() 1416 defer s.statusMu.Unlock() 1417 pid := s.Pid.load() 1418 if pid == 0 { 1419 return nil 1420 } 1421 // The sandbox process is a child of the current process, 1422 // so we can wait on it to terminate and collect its zombie. 
1423 if _, err := unix.Wait4(int(pid), &s.status, 0, nil); err != nil { 1424 return fmt.Errorf("error waiting the sandbox process: %v", err) 1425 } 1426 s.Pid.store(0) 1427 return nil 1428 } 1429 1430 ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) 1431 defer cancel() 1432 b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) 1433 op := func() error { 1434 if s.IsRunning() { 1435 return fmt.Errorf("sandbox is still running") 1436 } 1437 return nil 1438 } 1439 return backoff.Retry(op, b) 1440 } 1441 1442 // configureStdios change stdios ownership to give access to the sandbox 1443 // process. This may be skipped depending on the configuration. 1444 func (s *Sandbox) configureStdios(conf *config.Config, stdios []*os.File) error { 1445 if conf.Rootless || conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { 1446 // Cannot change ownership without CAP_CHOWN. 1447 return nil 1448 } 1449 1450 if s.UID < 0 || s.GID < 0 { 1451 panic(fmt.Sprintf("sandbox UID/GID is not set: %d/%d", s.UID, s.GID)) 1452 } 1453 for _, file := range stdios { 1454 log.Debugf("Changing %q ownership to %d/%d", file.Name(), s.UID, s.GID) 1455 if err := file.Chown(s.UID, s.GID); err != nil { 1456 if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) { 1457 log.Warningf("can't change an owner of %s: %s", file.Name(), err) 1458 continue 1459 } 1460 return err 1461 } 1462 } 1463 return nil 1464 } 1465 1466 // deviceFileForPlatform opens the device file for the given platform. If the 1467 // platform does not need a device file, then nil is returned. 1468 // devicePath may be empty to use a sane platform-specific default. 1469 func deviceFileForPlatform(name, devicePath string) (*os.File, error) { 1470 p, err := platform.Lookup(name) 1471 if err != nil { 1472 return nil, err 1473 } 1474 1475 f, err := p.OpenDevice(devicePath) 1476 if err != nil { 1477 return nil, fmt.Errorf("opening device file for platform %q: %w", name, err) 1478 } 1479 return f, nil 1480 } 1481 1482 // checkBinaryPermissions verifies that the required binary bits are set on 1483 // the runsc executable. 1484 func checkBinaryPermissions(conf *config.Config) error { 1485 // All platforms need the other exe bit 1486 neededBits := os.FileMode(0001) 1487 if conf.Platform == "ptrace" { 1488 // Ptrace needs the other read bit 1489 neededBits |= os.FileMode(0004) 1490 } 1491 1492 exePath, err := os.Executable() 1493 if err != nil { 1494 return fmt.Errorf("getting exe path: %v", err) 1495 } 1496 1497 // Check the permissions of the runsc binary and print an error if it 1498 // doesn't match expectations. 1499 info, err := os.Stat(exePath) 1500 if err != nil { 1501 return fmt.Errorf("stat file: %v", err) 1502 } 1503 1504 if info.Mode().Perm()&neededBits != neededBits { 1505 return fmt.Errorf(specutils.FaqErrorMsg("runsc-perms", fmt.Sprintf("%s does not have the correct permissions", exePath))) 1506 } 1507 return nil 1508 } 1509 1510 // CgroupsReadControlFile reads a single cgroupfs control file in the sandbox. 
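// waitForStopped above polls IsRunning under a constant 100ms backoff with a
// five-second timeout when the sandbox is not a direct child that can simply
// be reaped. A condensed sketch of that polling pattern with the same backoff
// package (the condition callback is an assumption for the example; it uses
// the context, time, fmt, and backoff imports already present in this file):
//
//	func waitGone(alive func() bool) error {
//		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
//		defer cancel()
//		b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx)
//		return backoff.Retry(func() error {
//			if alive() {
//				return fmt.Errorf("still running")
//			}
//			return nil
//		}, b)
//	}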
1511 func (s *Sandbox) CgroupsReadControlFile(file control.CgroupControlFile) (string, error) { 1512 log.Debugf("CgroupsReadControlFiles sandbox %q", s.ID) 1513 args := control.CgroupsReadArgs{ 1514 Args: []control.CgroupsReadArg{ 1515 { 1516 File: file, 1517 }, 1518 }, 1519 } 1520 var out control.CgroupsResults 1521 if err := s.call(boot.CgroupsReadControlFiles, &args, &out); err != nil { 1522 return "", err 1523 } 1524 if len(out.Results) != 1 { 1525 return "", fmt.Errorf("expected 1 result, got %d, raw: %+v", len(out.Results), out) 1526 } 1527 return out.Results[0].Unpack() 1528 } 1529 1530 // CgroupsWriteControlFile writes a single cgroupfs control file in the sandbox. 1531 func (s *Sandbox) CgroupsWriteControlFile(file control.CgroupControlFile, value string) error { 1532 log.Debugf("CgroupsReadControlFiles sandbox %q", s.ID) 1533 args := control.CgroupsWriteArgs{ 1534 Args: []control.CgroupsWriteArg{ 1535 { 1536 File: file, 1537 Value: value, 1538 }, 1539 }, 1540 } 1541 var out control.CgroupsResults 1542 if err := s.call(boot.CgroupsWriteControlFiles, &args, &out); err != nil { 1543 return err 1544 } 1545 if len(out.Results) != 1 { 1546 return fmt.Errorf("expected 1 result, got %d, raw: %+v", len(out.Results), out) 1547 } 1548 return out.Results[0].AsError() 1549 } 1550 1551 // fixPidns looks at the PID namespace path. If that path corresponds to the 1552 // sandbox process PID namespace, then change the spec so that the container 1553 // joins the sandbox root namespace. 1554 func (s *Sandbox) fixPidns(spec *specs.Spec) { 1555 pidns, ok := specutils.GetNS(specs.PIDNamespace, spec) 1556 if !ok { 1557 // pidns was not set, nothing to fix. 1558 return 1559 } 1560 if pidns.Path != fmt.Sprintf("/proc/%d/ns/pid", s.Pid.load()) { 1561 // Fix only if the PID namespace corresponds to the sandbox's. 1562 return 1563 } 1564 1565 for i := range spec.Linux.Namespaces { 1566 if spec.Linux.Namespaces[i].Type == specs.PIDNamespace { 1567 // Removing the namespace makes the container join the sandbox root 1568 // namespace. 1569 log.Infof("Fixing PID namespace in spec from %q to make the container join the sandbox root namespace", pidns.Path) 1570 spec.Linux.Namespaces = append(spec.Linux.Namespaces[:i], spec.Linux.Namespaces[i+1:]...) 1571 return 1572 } 1573 } 1574 panic("unreachable") 1575 } 1576 1577 // ConfigureCmdForRootless configures cmd to donate a socket FD that can be 1578 // used to synchronize userns configuration. 1579 func ConfigureCmdForRootless(cmd *exec.Cmd, donations *donation.Agency) (*os.File, error) { 1580 fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) 1581 if err != nil { 1582 return nil, err 1583 } 1584 f := os.NewFile(uintptr(fds[1]), "userns sync other FD") 1585 donations.DonateAndClose("sync-userns-fd", f) 1586 if cmd.SysProcAttr == nil { 1587 cmd.SysProcAttr = &unix.SysProcAttr{} 1588 } 1589 cmd.SysProcAttr.AmbientCaps = []uintptr{ 1590 // Same as `cap` in cmd/gofer.go. 1591 unix.CAP_CHOWN, 1592 unix.CAP_DAC_OVERRIDE, 1593 unix.CAP_DAC_READ_SEARCH, 1594 unix.CAP_FOWNER, 1595 unix.CAP_FSETID, 1596 unix.CAP_SYS_CHROOT, 1597 // Needed for setuid(2)/setgid(2). 1598 unix.CAP_SETUID, 1599 unix.CAP_SETGID, 1600 // Needed for chroot. 1601 unix.CAP_SYS_ADMIN, 1602 // Needed to be able to clear bounding set (PR_CAPBSET_DROP). 1603 unix.CAP_SETPCAP, 1604 } 1605 return os.NewFile(uintptr(fds[0]), "userns sync FD"), nil 1606 } 1607 1608 // SetUserMappings uses newuidmap/newgidmap programs to set up user ID mappings 1609 // for process pid. 
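// ConfigureCmdForRootless above donates one end of a socketpair so the parent
// can tell the child when its uid/gid mappings have been written and it is
// safe to continue booting. A minimal sketch of that rendezvous with the two
// processes played by goroutines (direction, byte value, and names are
// assumptions for the example):
//
//	package main
//
//	import (
//		"fmt"
//		"os"
//
//		"golang.org/x/sys/unix"
//	)
//
//	func main() {
//		fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
//		if err != nil {
//			panic(err)
//		}
//		parentEnd := os.NewFile(uintptr(fds[0]), "userns sync FD")
//		childEnd := os.NewFile(uintptr(fds[1]), "userns sync other FD")
//
//		done := make(chan struct{})
//		go func() { // stand-in for the sandbox child waiting for its mappings
//			buf := make([]byte, 1)
//			childEnd.Read(buf)
//			fmt.Println("child: mappings in place, continuing")
//			close(done)
//		}()
//
//		// ... the parent would run newuidmap/newgidmap here (see SetUserMappings below) ...
//		parentEnd.Write([]byte{0}) // release the child
//		<-done
//	}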
1610 func SetUserMappings(spec *specs.Spec, pid int) error { 1611 log.Debugf("Setting user mappings") 1612 args := []string{strconv.Itoa(pid)} 1613 for _, idMap := range spec.Linux.UIDMappings { 1614 log.Infof("Mapping host uid %d to container uid %d (size=%d)", 1615 idMap.HostID, idMap.ContainerID, idMap.Size) 1616 args = append(args, 1617 strconv.Itoa(int(idMap.ContainerID)), 1618 strconv.Itoa(int(idMap.HostID)), 1619 strconv.Itoa(int(idMap.Size)), 1620 ) 1621 } 1622 1623 out, err := exec.Command("newuidmap", args...).CombinedOutput() 1624 log.Debugf("newuidmap: %#v\n%s", args, out) 1625 if err != nil { 1626 return fmt.Errorf("newuidmap failed: %w", err) 1627 } 1628 1629 args = []string{strconv.Itoa(pid)} 1630 for _, idMap := range spec.Linux.GIDMappings { 1631 log.Infof("Mapping host gid %d to container gid %d (size=%d)", 1632 idMap.HostID, idMap.ContainerID, idMap.Size) 1633 args = append(args, 1634 strconv.Itoa(int(idMap.ContainerID)), 1635 strconv.Itoa(int(idMap.HostID)), 1636 strconv.Itoa(int(idMap.Size)), 1637 ) 1638 } 1639 out, err = exec.Command("newgidmap", args...).CombinedOutput() 1640 log.Debugf("newgidmap: %#v\n%s", args, out) 1641 if err != nil { 1642 return fmt.Errorf("newgidmap failed: %w", err) 1643 } 1644 return nil 1645 }
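// SetUserMappings above shells out to newuidmap and newgidmap, whose argument
// list is the target pid followed by (containerID, hostID, size) triples taken
// from the spec. A small sketch of just the argument construction, which can
// be checked without invoking the setuid helpers (the function name is an
// assumption for the example; it uses the specs and strconv imports already
// present in this file):
//
//	func idMapArgs(pid int, mappings []specs.LinuxIDMapping) []string {
//		args := []string{strconv.Itoa(pid)}
//		for _, m := range mappings {
//			args = append(args,
//				strconv.Itoa(int(m.ContainerID)),
//				strconv.Itoa(int(m.HostID)),
//				strconv.Itoa(int(m.Size)),
//			)
//		}
//		return args
//	}
//
//	// idMapArgs(1234, []specs.LinuxIDMapping{{ContainerID: 0, HostID: 1000, Size: 1}})
//	// returns []string{"1234", "0", "1000", "1"}.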