github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/runsc/cmd/boot.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package cmd 16 17 import ( 18 "context" 19 "fmt" 20 "io/ioutil" 21 "os" 22 "os/exec" 23 "path/filepath" 24 "runtime" 25 "runtime/debug" 26 "strconv" 27 "strings" 28 29 "github.com/google/subcommands" 30 specs "github.com/opencontainers/runtime-spec/specs-go" 31 "github.com/ttpreport/gvisor-ligolo/pkg/coretag" 32 "github.com/ttpreport/gvisor-ligolo/pkg/cpuid" 33 "github.com/ttpreport/gvisor-ligolo/pkg/log" 34 "github.com/ttpreport/gvisor-ligolo/pkg/metric" 35 "github.com/ttpreport/gvisor-ligolo/pkg/ring0" 36 "github.com/ttpreport/gvisor-ligolo/pkg/sentry/platform" 37 "github.com/ttpreport/gvisor-ligolo/runsc/boot" 38 "github.com/ttpreport/gvisor-ligolo/runsc/cmd/util" 39 "github.com/ttpreport/gvisor-ligolo/runsc/config" 40 "github.com/ttpreport/gvisor-ligolo/runsc/flag" 41 "github.com/ttpreport/gvisor-ligolo/runsc/profile" 42 "github.com/ttpreport/gvisor-ligolo/runsc/specutils" 43 "golang.org/x/sys/unix" 44 ) 45 46 // Note that directfsSandboxCaps is the same as caps defined in gofer.go 47 // except CAP_SYS_CHROOT because we don't need to chroot in directfs mode. 48 var directfsSandboxCaps = []string{ 49 "CAP_CHOWN", 50 "CAP_DAC_OVERRIDE", 51 "CAP_DAC_READ_SEARCH", 52 "CAP_FOWNER", 53 "CAP_FSETID", 54 } 55 56 // directfsSandboxLinuxCaps is the minimal set of capabilities needed by the 57 // sandbox to operate on files in directfs mode. 58 var directfsSandboxLinuxCaps = &specs.LinuxCapabilities{ 59 Bounding: directfsSandboxCaps, 60 Effective: directfsSandboxCaps, 61 Permitted: directfsSandboxCaps, 62 } 63 64 // Boot implements subcommands.Command for the "boot" command which starts a 65 // new sandbox. It should not be called directly. 66 type Boot struct { 67 // bundleDir is the directory containing the OCI spec. 68 bundleDir string 69 70 // specFD is the file descriptor that the spec will be read from. 71 specFD int 72 73 // controllerFD is the file descriptor of a stream socket for the 74 // control server that is donated to this process. 75 controllerFD int 76 77 // deviceFD is the file descriptor for the platform device file. 78 deviceFD int 79 80 // ioFDs is the list of FDs used to connect to FS gofers. 81 ioFDs intFlags 82 83 // overlayFilestoreFDs are FDs to the regular files that will back the tmpfs 84 // upper mount in the overlay mounts. 85 overlayFilestoreFDs intFlags 86 87 // overlayMediums contains information about how the gofer mounts have been 88 // overlaid. The first entry is for rootfs and the following entries are for 89 // bind mounts in Spec.Mounts (in the same order). 90 overlayMediums boot.OverlayMediumFlags 91 92 // stdioFDs are the fds for stdin, stdout, and stderr. They must be 93 // provided in that order. 94 stdioFDs intFlags 95 96 // passFDs are mappings of user-supplied host to guest file descriptors. 97 passFDs fdMappings 98 99 // execFD is the host file descriptor used for program execution. 100 execFD int 101 102 // applyCaps determines if capabilities defined in the spec should be applied 103 // to the process. 104 applyCaps bool 105 106 // setUpChroot is set to true if the sandbox is started in an empty root. 107 setUpRoot bool 108 109 // cpuNum number of CPUs to create inside the sandbox. 110 cpuNum int 111 112 // totalMem sets the initial amount of total memory to report back to the 113 // container. 114 totalMem uint64 115 116 // totalHostMem is the total memory reported by host /proc/meminfo. 117 totalHostMem uint64 118 119 // userLogFD is the file descriptor to write user logs to. 120 userLogFD int 121 122 // startSyncFD is the file descriptor to synchronize runsc and sandbox. 123 startSyncFD int 124 125 // mountsFD is the file descriptor to read list of mounts after they have 126 // been resolved (direct paths, no symlinks). They are resolved outside the 127 // sandbox (e.g. gofer) and sent through this FD. 128 mountsFD int 129 130 podInitConfigFD int 131 132 sinkFDs intFlags 133 134 // pidns is set if the sandbox is in its own pid namespace. 135 pidns bool 136 137 // attached is set to true to kill the sandbox process when the parent process 138 // terminates. This flag is set when the command execve's itself because 139 // parent death signal doesn't propagate through execve when uid/gid changes. 140 attached bool 141 142 // productName is the value to show in 143 // /sys/devices/virtual/dmi/id/product_name. 144 productName string 145 146 // FDs for profile data. 147 profileFDs profile.FDArgs 148 149 // procMountSyncFD is a file descriptor that has to be closed when the 150 // procfs mount isn't needed anymore. 151 procMountSyncFD int 152 153 // syncUsernsFD is the file descriptor that has to be closed when the 154 // boot process should invoke setuid/setgid for root user. This is mainly 155 // used to synchronize rootless user namespace initialization. 156 syncUsernsFD int 157 } 158 159 // Name implements subcommands.Command.Name. 160 func (*Boot) Name() string { 161 return "boot" 162 } 163 164 // Synopsis implements subcommands.Command.Synopsis. 165 func (*Boot) Synopsis() string { 166 return "launch a sandbox process" 167 } 168 169 // Usage implements subcommands.Command.Usage. 170 func (*Boot) Usage() string { 171 return `boot [flags] <container id>` 172 } 173 174 // SetFlags implements subcommands.Command.SetFlags. 175 func (b *Boot) SetFlags(f *flag.FlagSet) { 176 f.StringVar(&b.bundleDir, "bundle", "", "required path to the root of the bundle directory") 177 f.BoolVar(&b.applyCaps, "apply-caps", false, "if true, apply capabilities defined in the spec to the process") 178 f.BoolVar(&b.setUpRoot, "setup-root", false, "if true, set up an empty root for the process") 179 f.BoolVar(&b.pidns, "pidns", false, "if true, the sandbox is in its own PID namespace") 180 f.IntVar(&b.cpuNum, "cpu-num", 0, "number of CPUs to create inside the sandbox") 181 f.IntVar(&b.procMountSyncFD, "proc-mount-sync-fd", -1, "file descriptor that has to be written to when /proc isn't needed anymore and can be unmounted") 182 f.IntVar(&b.syncUsernsFD, "sync-userns-fd", -1, "file descriptor used to synchronize rootless user namespace initialization.") 183 f.Uint64Var(&b.totalMem, "total-memory", 0, "sets the initial amount of total memory to report back to the container") 184 f.Uint64Var(&b.totalHostMem, "total-host-memory", 0, "total memory reported by host /proc/meminfo") 185 f.BoolVar(&b.attached, "attached", false, "if attached is true, kills the sandbox process when the parent process terminates") 186 f.StringVar(&b.productName, "product-name", "", "value to show in /sys/devices/virtual/dmi/id/product_name") 187 188 // Open FDs that are donated to the sandbox. 189 f.IntVar(&b.specFD, "spec-fd", -1, "required fd with the container spec") 190 f.IntVar(&b.controllerFD, "controller-fd", -1, "required FD of a stream socket for the control server that must be donated to this process") 191 f.IntVar(&b.deviceFD, "device-fd", -1, "FD for the platform device file") 192 f.Var(&b.ioFDs, "io-fds", "list of FDs to connect gofer clients. They must follow this order: root first, then mounts as defined in the spec") 193 f.Var(&b.stdioFDs, "stdio-fds", "list of FDs containing sandbox stdin, stdout, and stderr in that order") 194 f.Var(&b.passFDs, "pass-fd", "mapping of host to guest FDs. They must be in M:N format. M is the host and N the guest descriptor.") 195 f.IntVar(&b.execFD, "exec-fd", -1, "host file descriptor used for program execution.") 196 f.Var(&b.overlayFilestoreFDs, "overlay-filestore-fds", "FDs to the regular files that will back the tmpfs upper mount in the overlay mounts.") 197 f.Var(&b.overlayMediums, "overlay-mediums", "information about how the gofer mounts have been overlaid.") 198 f.IntVar(&b.userLogFD, "user-log-fd", 0, "file descriptor to write user logs to. 0 means no logging.") 199 f.IntVar(&b.startSyncFD, "start-sync-fd", -1, "required FD to used to synchronize sandbox startup") 200 f.IntVar(&b.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to read list of mounts after they have been resolved (direct paths, no symlinks).") 201 f.IntVar(&b.podInitConfigFD, "pod-init-config-fd", -1, "file descriptor to the pod init configuration file.") 202 f.Var(&b.sinkFDs, "sink-fds", "ordered list of file descriptors to be used by the sinks defined in --pod-init-config.") 203 204 // Profiling flags. 205 b.profileFDs.SetFromFlags(f) 206 } 207 208 // Execute implements subcommands.Command.Execute. It starts a sandbox in a 209 // waiting state. 210 func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus { 211 if b.specFD == -1 || b.controllerFD == -1 || b.startSyncFD == -1 || f.NArg() != 1 { 212 f.Usage() 213 return subcommands.ExitUsageError 214 } 215 216 conf := args[0].(*config.Config) 217 218 // Set traceback level 219 debug.SetTraceback(conf.Traceback) 220 221 // Initialize CPUID information. 222 cpuid.Initialize() 223 224 // Initialize ring0 library. 225 ring0.InitDefault() 226 227 if len(b.productName) == 0 { 228 // Do this before chroot takes effect, otherwise we can't read /sys. 229 if product, err := ioutil.ReadFile("/sys/devices/virtual/dmi/id/product_name"); err != nil { 230 log.Warningf("Not setting product_name: %v", err) 231 } else { 232 b.productName = strings.TrimSpace(string(product)) 233 log.Infof("Setting product_name: %q", b.productName) 234 } 235 } 236 237 if b.attached { 238 // Ensure this process is killed after parent process terminates when 239 // attached mode is enabled. In the unfortunate event that the parent 240 // terminates before this point, this process leaks. 241 if err := unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0); err != nil { 242 util.Fatalf("error setting parent death signal: %v", err) 243 } 244 } 245 246 syncUsernsForRootless(b.syncUsernsFD) 247 248 // Get the spec from the specFD. We *must* keep this os.File alive past 249 // the call setCapsAndCallSelf, otherwise the FD will be closed and the 250 // child process cannot read it 251 specFile := os.NewFile(uintptr(b.specFD), "spec file") 252 spec, err := specutils.ReadSpecFromFile(b.bundleDir, specFile, conf) 253 if err != nil { 254 util.Fatalf("reading spec: %v", err) 255 } 256 257 if b.setUpRoot { 258 if err := setUpChroot(b.pidns, spec, conf); err != nil { 259 util.Fatalf("error setting up chroot: %v", err) 260 } 261 262 if !conf.Rootless { 263 // /proc is umounted from a forked process, because the 264 // current one is going to re-execute itself without 265 // capabilities. 266 cmd, w := execProcUmounter() 267 defer cmd.Wait() 268 defer w.Close() 269 if b.procMountSyncFD != -1 { 270 panic("procMountSyncFD is set") 271 } 272 b.procMountSyncFD = int(w.Fd()) 273 274 // Clear FD_CLOEXEC. Regardless of b.applyCaps, this process will be 275 // re-executed. procMountSyncFD should remain open. 276 if _, _, errno := unix.RawSyscall(unix.SYS_FCNTL, w.Fd(), unix.F_SETFD, 0); errno != 0 { 277 util.Fatalf("error clearing CLOEXEC: %v", errno) 278 } 279 280 if !b.applyCaps { 281 // Remove the args that have already been done before calling self. 282 args := b.prepareArgs("setup-root", "sync-userns-fd") 283 284 // Note that we've already read the spec from the spec FD, and 285 // we will read it again after the exec call. This works 286 // because the ReadSpecFromFile function seeks to the beginning 287 // of the file before reading. 288 util.Fatalf("callSelfAsNobody(%v): %v", args, callSelfAsNobody(args)) 289 panic("unreachable") 290 } 291 } 292 } 293 294 specutils.LogSpecDebug(spec, conf.OCISeccomp) 295 296 if b.applyCaps { 297 caps := spec.Process.Capabilities 298 if caps == nil { 299 caps = &specs.LinuxCapabilities{} 300 } 301 302 gPlatform, err := platform.Lookup(conf.Platform) 303 if err != nil { 304 util.Fatalf("loading platform: %v", err) 305 } 306 if gPlatform.Requirements().RequiresCapSysPtrace { 307 // Ptrace platform requires extra capabilities. 308 const c = "CAP_SYS_PTRACE" 309 caps.Bounding = append(caps.Bounding, c) 310 caps.Effective = append(caps.Effective, c) 311 caps.Permitted = append(caps.Permitted, c) 312 } 313 314 if conf.DirectFS { 315 caps = specutils.MergeCapabilities(caps, directfsSandboxLinuxCaps) 316 } 317 318 // Remove the args that have already been done before calling self. 319 args := b.prepareArgs("setup-root", "sync-userns-fd", "apply-caps") 320 321 // Note that we've already read the spec from the spec FD, and 322 // we will read it again after the exec call. This works 323 // because the ReadSpecFromFile function seeks to the beginning 324 // of the file before reading. 325 util.Fatalf("setCapsAndCallSelf(%v, %v): %v", args, caps, setCapsAndCallSelf(args, caps)) 326 327 // This prevents the specFile finalizer from running and closed 328 // the specFD, which we have passed to ourselves when 329 // re-execing. 330 runtime.KeepAlive(specFile) 331 panic("unreachable") 332 } 333 334 if b.syncUsernsFD >= 0 { 335 // syncUsernsFD is set, but runsc hasn't been re-executed with a new UID and GID. 336 // We expect that setCapsAndCallSelf has to be called in this case. 337 panic("unreachable") 338 } 339 340 // Close specFile to avoid exposing it to the sandbox. 341 if err := specFile.Close(); err != nil { 342 util.Fatalf("closing specFile: %v", err) 343 } 344 345 // At this point we won't re-execute, so it's safe to limit via rlimits. Any 346 // limit >= 0 works. If the limit is lower than the current number of open 347 // files, then Setrlimit will succeed, and the next open will fail. 348 if conf.FDLimit > -1 { 349 rlimit := unix.Rlimit{ 350 Cur: uint64(conf.FDLimit), 351 Max: uint64(conf.FDLimit), 352 } 353 switch err := unix.Setrlimit(unix.RLIMIT_NOFILE, &rlimit); err { 354 case nil: 355 case unix.EPERM: 356 log.Warningf("FD limit %d is higher than the current hard limit or system-wide maximum", conf.FDLimit) 357 default: 358 util.Fatalf("Failed to set RLIMIT_NOFILE: %v", err) 359 } 360 } 361 362 // Read resolved mount list and replace the original one from the spec. 363 mountsFile := os.NewFile(uintptr(b.mountsFD), "mounts file") 364 cleanMounts, err := specutils.ReadMounts(mountsFile) 365 if err != nil { 366 mountsFile.Close() 367 util.Fatalf("Error reading mounts file: %v", err) 368 } 369 mountsFile.Close() 370 spec.Mounts = cleanMounts 371 372 if conf.DirectFS { 373 // sandbox should run with a umask of 0, because we want to preserve file 374 // modes exactly as sent by the sentry, which would have already applied 375 // the application umask. 376 unix.Umask(0) 377 } 378 379 if conf.EnableCoreTags { 380 if err := coretag.Enable(); err != nil { 381 util.Fatalf("Failed to core tag sentry: %v", err) 382 } 383 384 // Verify that all sentry threads are properly core tagged, and log 385 // current core tag. 386 coreTags, err := coretag.GetAllCoreTags(os.Getpid()) 387 if err != nil { 388 util.Fatalf("Failed read current core tags: %v", err) 389 } 390 if len(coreTags) != 1 { 391 util.Fatalf("Not all child threads were core tagged the same. Tags=%v", coreTags) 392 } 393 log.Infof("Core tag enabled (core tag=%d)", coreTags[0]) 394 } 395 396 // Create the loader. 397 bootArgs := boot.Args{ 398 ID: f.Arg(0), 399 Spec: spec, 400 Conf: conf, 401 ControllerFD: b.controllerFD, 402 Device: os.NewFile(uintptr(b.deviceFD), "platform device"), 403 GoferFDs: b.ioFDs.GetArray(), 404 StdioFDs: b.stdioFDs.GetArray(), 405 PassFDs: b.passFDs.GetArray(), 406 ExecFD: b.execFD, 407 OverlayFilestoreFDs: b.overlayFilestoreFDs.GetArray(), 408 OverlayMediums: b.overlayMediums.GetArray(), 409 NumCPU: b.cpuNum, 410 TotalMem: b.totalMem, 411 TotalHostMem: b.totalHostMem, 412 UserLogFD: b.userLogFD, 413 ProductName: b.productName, 414 PodInitConfigFD: b.podInitConfigFD, 415 SinkFDs: b.sinkFDs.GetArray(), 416 ProfileOpts: b.profileFDs.ToOpts(), 417 } 418 l, err := boot.New(bootArgs) 419 if err != nil { 420 util.Fatalf("creating loader: %v", err) 421 } 422 423 // Fatalf exits the process and doesn't run defers. 424 // 'l' must be destroyed explicitly after this point! 425 426 if b.procMountSyncFD != -1 { 427 l.PreSeccompCallback = func() { 428 // Call validateOpenFDs() before umounting /proc. 429 validateOpenFDs(bootArgs.PassFDs) 430 // Umount /proc right before installing seccomp filters. 431 umountProc(b.procMountSyncFD) 432 } 433 } 434 435 // Prepare metrics. 436 // This needs to happen after the kernel is initialized (such that all metrics are registered) 437 // but before the start-sync file is notified, as the parent process needs to query for 438 // registered metrics prior to sending the start signal. 439 metric.Initialize() 440 441 // Notify the parent process the sandbox has booted (and that the controller 442 // is up). 443 startSyncFile := os.NewFile(uintptr(b.startSyncFD), "start-sync file") 444 buf := make([]byte, 1) 445 if w, err := startSyncFile.Write(buf); err != nil || w != 1 { 446 l.Destroy() 447 util.Fatalf("unable to write into the start-sync descriptor: %v", err) 448 } 449 // Closes startSyncFile because 'l.Run()' only returns when the sandbox exits. 450 startSyncFile.Close() 451 452 // Wait for the start signal from runsc. 453 l.WaitForStartSignal() 454 455 // Run the application and wait for it to finish. 456 if err := l.Run(); err != nil { 457 l.Destroy() 458 util.Fatalf("running sandbox: %v", err) 459 } 460 461 ws := l.WaitExit() 462 log.Infof("application exiting with %+v", ws) 463 waitStatus := args[1].(*unix.WaitStatus) 464 *waitStatus = unix.WaitStatus(ws) 465 l.Destroy() 466 return subcommands.ExitSuccess 467 } 468 469 func (b *Boot) prepareArgs(exclude ...string) []string { 470 var args []string 471 for _, arg := range os.Args { 472 for _, excl := range exclude { 473 if strings.Contains(arg, excl) { 474 goto skip 475 } 476 } 477 args = append(args, arg) 478 // Some parameters are not already part of os.Args because they are 479 // solely configured by Boot.Execute(). Strategically add these parameters 480 // after the command and before the container ID at the end. 481 if arg == "boot" { 482 if b.procMountSyncFD != -1 { 483 args = append(args, fmt.Sprintf("--proc-mount-sync-fd=%d", b.procMountSyncFD)) 484 } 485 if len(b.productName) > 0 { 486 args = append(args, "--product-name", b.productName) 487 } 488 } 489 skip: 490 } 491 return args 492 } 493 494 // execProcUmounter execute a child process that umounts /proc when the 495 // returned pipe is closed. 496 func execProcUmounter() (*exec.Cmd, *os.File) { 497 r, w, err := os.Pipe() 498 if err != nil { 499 util.Fatalf("error creating a pipe: %v", err) 500 } 501 defer r.Close() 502 503 cmd := exec.Command(specutils.ExePath) 504 cmd.Args = append(cmd.Args, "umount", "--sync-fd=3", "/proc") 505 cmd.ExtraFiles = append(cmd.ExtraFiles, r) 506 cmd.Stdin = os.Stdin 507 cmd.Stdout = os.Stdout 508 cmd.Stderr = os.Stderr 509 if err := cmd.Start(); err != nil { 510 util.Fatalf("error executing umounter: %v", err) 511 } 512 return cmd, w 513 } 514 515 // umountProc writes to syncFD signalling the process started by 516 // execProcUmounter() to umount /proc. 517 func umountProc(syncFD int) { 518 syncFile := os.NewFile(uintptr(syncFD), "procfs umount sync FD") 519 buf := make([]byte, 1) 520 if w, err := syncFile.Write(buf); err != nil || w != 1 { 521 util.Fatalf("unable to write into the proc umounter descriptor: %v", err) 522 } 523 syncFile.Close() 524 525 var waitStatus unix.WaitStatus 526 if _, err := unix.Wait4(0, &waitStatus, 0, nil); err != nil { 527 util.Fatalf("error waiting for the proc umounter process: %v", err) 528 } 529 if !waitStatus.Exited() || waitStatus.ExitStatus() != 0 { 530 util.Fatalf("the proc umounter process failed: %v", waitStatus) 531 } 532 if err := unix.Access("/proc/self", unix.F_OK); err != unix.ENOENT { 533 util.Fatalf("/proc is still accessible") 534 } 535 } 536 537 // validateOpenFDs checks that the sandbox process does not have any open 538 // directory FDs. 539 func validateOpenFDs(passFDs []boot.FDMapping) { 540 passHostFDs := make(map[int]struct{}) 541 for _, passFD := range passFDs { 542 passHostFDs[passFD.Host] = struct{}{} 543 } 544 const selfFDDir = "/proc/self/fd" 545 if err := filepath.WalkDir(selfFDDir, func(path string, d os.DirEntry, err error) error { 546 if err != nil { 547 return err 548 } 549 if d.Type() != os.ModeSymlink { 550 // All entries are symlinks. Ignore the callback for fd directory itself. 551 return nil 552 } 553 if fdInfo, err := os.Stat(path); err != nil { 554 if os.IsNotExist(err) { 555 // Ignore FDs that are now closed. For example, the FD to selfFDDir that 556 // was opened by filepath.WalkDir() to read dirents. 557 return nil 558 } 559 return fmt.Errorf("os.Stat(%s) failed: %v", path, err) 560 } else if !fdInfo.IsDir() { 561 return nil 562 } 563 // Uh-oh. This is a directory FD. 564 fdNo, err := strconv.Atoi(d.Name()) 565 if err != nil { 566 return fmt.Errorf("strconv.Atoi(%s) failed: %v", d.Name(), err) 567 } 568 dirLink, err := os.Readlink(path) 569 if err != nil { 570 return fmt.Errorf("os.Readlink(%s) failed: %v", path, err) 571 } 572 if _, ok := passHostFDs[fdNo]; ok { 573 // Passed FDs are allowed to be directories. The user must be knowing 574 // what they are doing. Log a warning regardless. 575 log.Warningf("Sandbox has access to FD %d, which is a directory for %s", fdNo, dirLink) 576 return nil 577 } 578 return fmt.Errorf("FD %d is a directory for %s", fdNo, dirLink) 579 }); err != nil { 580 util.Fatalf("WalkDir(%s) failed: %v", selfFDDir, err) 581 } 582 }