github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/runsc/cmd/gofer.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cmd

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"runtime"
	"runtime/debug"
	"strings"

	"github.com/MerlinKodo/gvisor/pkg/log"
	"github.com/MerlinKodo/gvisor/pkg/unet"
	"github.com/MerlinKodo/gvisor/runsc/boot"
	"github.com/MerlinKodo/gvisor/runsc/cmd/util"
	"github.com/MerlinKodo/gvisor/runsc/config"
	"github.com/MerlinKodo/gvisor/runsc/flag"
	"github.com/MerlinKodo/gvisor/runsc/fsgofer"
	"github.com/MerlinKodo/gvisor/runsc/fsgofer/filter"
	"github.com/MerlinKodo/gvisor/runsc/profile"
	"github.com/MerlinKodo/gvisor/runsc/specutils"
	"github.com/google/subcommands"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"golang.org/x/sys/unix"
)

var caps = []string{
	"CAP_CHOWN",
	"CAP_DAC_OVERRIDE",
	"CAP_DAC_READ_SEARCH",
	"CAP_FOWNER",
	"CAP_FSETID",
	"CAP_SYS_CHROOT",
}

// goferCaps is the minimal set of capabilities needed by the Gofer to operate
// on files.
var goferCaps = &specs.LinuxCapabilities{
	Bounding:  caps,
	Effective: caps,
	Permitted: caps,
}

// goferSyncFDs contains file descriptors that are used for synchronization
// of the Gofer startup process against other processes.
type goferSyncFDs struct {
	// nvproxyFD is a file descriptor that is used to wait until
	// nvproxy-related setup is done. This setup involves creating mounts in the
	// Gofer process's mount namespace.
	// If this is set, this FD is the first that the Gofer waits for.
	nvproxyFD int
	// usernsFD is a file descriptor that is used to wait until
	// user namespace ID mappings are established in the Gofer's userns.
	// If this is set, this FD is the second that the Gofer waits for.
	usernsFD int
	// procMountFD is a file descriptor that has to be closed when the
	// procfs mount isn't needed anymore. It is read by the procfs unmounter
	// process.
	// If this is set, this FD is the last that the Gofer interacts with and
	// closes.
	procMountFD int
}

// Gofer implements subcommands.Command for the "gofer" command, which starts a
// filesystem gofer. This command should not be called directly.
type Gofer struct {
	bundleDir      string
	ioFDs          intFlags
	applyCaps      bool
	setUpRoot      bool
	overlayMediums boot.OverlayMediumFlags

	specFD        int
	mountsFD      int
	profileFDs    profile.FDArgs
	syncFDs       goferSyncFDs
	stopProfiling func()
}

// Name implements subcommands.Command.
func (*Gofer) Name() string {
	return "gofer"
}

// Synopsis implements subcommands.Command.
func (g *Gofer) Synopsis() string {
	return fmt.Sprintf("launch a gofer process that proxies access to container files")
}

// Usage implements subcommands.Command.
func (*Gofer) Usage() string {
	return `gofer [flags]`
}

// SetFlags implements subcommands.Command.
func (g *Gofer) SetFlags(f *flag.FlagSet) {
	f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory")
	f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do")
	f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process")

	// Open FDs that are donated to the gofer.
	f.Var(&g.ioFDs, "io-fds", "list of FDs to connect gofer servers. They must follow this order: root first, then mounts as defined in the spec")
	f.Var(&g.overlayMediums, "overlay-mediums", "information about how the gofer mounts have been overlaid.")
	f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec")
	f.IntVar(&g.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to write list of mounts after they have been resolved (direct paths, no symlinks).")

	// Add synchronization FD flags.
	g.syncFDs.setFlags(f)

	// Profiling flags.
	g.profileFDs.SetFromFlags(f)
}

// Execute implements subcommands.Command.
func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus {
	if g.bundleDir == "" || len(g.ioFDs) < 1 || g.specFD < 0 {
		f.Usage()
		return subcommands.ExitUsageError
	}

	conf := args[0].(*config.Config)

	// Set traceback level.
	debug.SetTraceback(conf.Traceback)

	specFile := os.NewFile(uintptr(g.specFD), "spec file")
	defer specFile.Close()
	spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile, conf)
	if err != nil {
		util.Fatalf("reading spec: %v", err)
	}

	g.syncFDs.syncNVProxy()
	g.syncFDs.syncUsernsForRootless()

	if g.setUpRoot {
		if err := g.setupRootFS(spec, conf); err != nil {
			util.Fatalf("Error setting up root FS: %v", err)
		}
		if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
			cleanupUnmounter := g.syncFDs.spawnProcUnmounter()
			defer cleanupUnmounter()
		}
	}
	if g.applyCaps {
		overrides := g.syncFDs.flags()
		overrides["apply-caps"] = "false"
		overrides["setup-root"] = "false"
		args := prepareArgs(g.Name(), f, overrides)
		util.Fatalf("setCapsAndCallSelf(%v, %v): %v", args, goferCaps, setCapsAndCallSelf(args, goferCaps))
		panic("unreachable")
	}

	// Start profiling. This will be a noop if no profiling arguments were passed.
	profileOpts := g.profileFDs.ToOpts()
	g.stopProfiling = profile.Start(profileOpts)

	// At this point we won't re-execute, so it's safe to limit via rlimits. Any
	// limit >= 0 works. If the limit is lower than the current number of open
	// files, then Setrlimit will succeed, and the next open will fail.
	if conf.FDLimit > -1 {
		rlimit := unix.Rlimit{
			Cur: uint64(conf.FDLimit),
			Max: uint64(conf.FDLimit),
		}
		switch err := unix.Setrlimit(unix.RLIMIT_NOFILE, &rlimit); err {
		case nil:
		case unix.EPERM:
			log.Warningf("FD limit %d is higher than the current hard limit or system-wide maximum", conf.FDLimit)
		default:
			util.Fatalf("Failed to set RLIMIT_NOFILE: %v", err)
		}
	}

	// Find what path is going to be served by this gofer.
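	// Editor's note (not in the original source): in the non-test path,
	// setupRootFS() above pivot_root(2)s into the scratch tmpfs it built over
	// /proc, in which the container's rootfs was bind-mounted at ./root, which
	// appears to be why the served path becomes /root below.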
	root := spec.Root.Path
	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
		root = "/root"
	}

	// Resolve mount point paths, then replace mounts from our spec and send the
	// mount list over to the sandbox, so they are both in sync.
	//
	// Note that all mount points have been mounted in the proper location in
	// setupRootFS().
	cleanMounts, err := resolveMounts(conf, spec.Mounts, root)
	if err != nil {
		util.Fatalf("Failure to resolve mounts: %v", err)
	}
	spec.Mounts = cleanMounts
	go func() {
		if err := g.writeMounts(cleanMounts); err != nil {
			panic(fmt.Sprintf("Failed to write mounts: %v", err))
		}
	}()

	specutils.LogSpecDebug(spec, conf.OCISeccomp)

	// fsgofer should run with a umask of 0, because we want to preserve file
	// modes exactly as sent by the sandbox, which will have applied its own umask.
	unix.Umask(0)

	if err := fsgofer.OpenProcSelfFD(); err != nil {
		util.Fatalf("failed to open /proc/self/fd: %v", err)
	}

	// procfs isn't needed anymore.
	g.syncFDs.unmountProcfs()

	if err := unix.Chroot(root); err != nil {
		util.Fatalf("failed to chroot to %q: %v", root, err)
	}
	if err := unix.Chdir("/"); err != nil {
		util.Fatalf("changing working dir: %v", err)
	}
	log.Infof("Process chroot'd to %q", root)

	// Initialize filters.
	opts := filter.Options{
		UDSOpenEnabled:   conf.GetHostUDS().AllowOpen(),
		UDSCreateEnabled: conf.GetHostUDS().AllowCreate(),
		ProfileEnabled:   len(profileOpts) > 0,
	}
	if err := filter.Install(opts); err != nil {
		util.Fatalf("installing seccomp filters: %v", err)
	}

	return g.serve(spec, conf, root)
}

func newSocket(ioFD int) *unet.Socket {
	socket, err := unet.NewSocket(ioFD)
	if err != nil {
		util.Fatalf("creating server on FD %d: %v", ioFD, err)
	}
	return socket
}

func (g *Gofer) serve(spec *specs.Spec, conf *config.Config, root string) subcommands.ExitStatus {
	type connectionConfig struct {
		sock      *unet.Socket
		mountPath string
		readonly  bool
	}
	cfgs := make([]connectionConfig, 0, len(spec.Mounts)+1)
	server := fsgofer.NewLisafsServer(fsgofer.Config{
		// These are global options. Ignore readonly configuration, that is set on
		// a per connection basis.
		HostUDS:            conf.GetHostUDS(),
		HostFifo:           conf.HostFifo,
		DonateMountPointFD: conf.DirectFS,
	})

	// Start with root mount, then add any other additional mount as needed.
	cfgs = append(cfgs, connectionConfig{
		sock:      newSocket(g.ioFDs[0]),
		mountPath: "/", // fsgofer process is always chroot()ed. So serve root.
		readonly:  spec.Root.Readonly || g.overlayMediums[0].IsEnabled(),
	})
	log.Infof("Serving %q mapped to %q on FD %d (ro: %t)", "/", root, g.ioFDs[0], cfgs[0].readonly)

	mountIdx := 1 // first one is the root
	for _, m := range spec.Mounts {
		if !specutils.IsGoferMount(m) {
			continue
		}

		if !filepath.IsAbs(m.Destination) {
			util.Fatalf("mount destination must be absolute: %q", m.Destination)
		}
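		// Each gofer-backed mount in the spec needs its own connection FD from
		// --io-fds, donated in the same order as the mounts appear in the spec
		// (root connection first), so running out of FDs here means too few
		// were passed.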
		if mountIdx >= len(g.ioFDs) {
			util.Fatalf("no FD found for mount. Did you forget --io-fd? FDs: %d, Mount: %+v", len(g.ioFDs), m)
		}

		cfgs = append(cfgs, connectionConfig{
			sock:      newSocket(g.ioFDs[mountIdx]),
			mountPath: m.Destination,
			readonly:  specutils.IsReadonlyMount(m.Options) || g.overlayMediums[mountIdx].IsEnabled(),
		})

		log.Infof("Serving %q mapped on FD %d (ro: %t)", m.Destination, g.ioFDs[mountIdx], cfgs[mountIdx].readonly)
		mountIdx++
	}

	if mountIdx != len(g.ioFDs) {
		util.Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs))
	}
	cfgs = cfgs[:mountIdx]

	for _, cfg := range cfgs {
		conn, err := server.CreateConnection(cfg.sock, cfg.mountPath, cfg.readonly)
		if err != nil {
			util.Fatalf("starting connection on FD %d for gofer mount failed: %v", cfg.sock.FD(), err)
		}
		server.StartConnection(conn)
	}
	server.Wait()
	server.Destroy()
	log.Infof("All lisafs servers exited.")
	if g.stopProfiling != nil {
		g.stopProfiling()
	}
	return subcommands.ExitSuccess
}

func (g *Gofer) writeMounts(mounts []specs.Mount) error {
	bytes, err := json.Marshal(mounts)
	if err != nil {
		return err
	}

	f := os.NewFile(uintptr(g.mountsFD), "mounts file")
	defer f.Close()

	for written := 0; written < len(bytes); {
		w, err := f.Write(bytes[written:])
		if err != nil {
			return err
		}
		written += w
	}
	return nil
}

func (g *Gofer) setupRootFS(spec *specs.Spec, conf *config.Config) error {
	// Convert all shared mounts into slaves to be sure that nothing will be
	// propagated outside of our namespace.
	procPath := "/proc"
	if err := specutils.SafeMount("", "/", "", unix.MS_SLAVE|unix.MS_REC, "", procPath); err != nil {
		util.Fatalf("error converting mounts: %v", err)
	}

	root := spec.Root.Path
	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
		// runsc can't be re-executed without /proc, so we create a tmpfs mount,
		// mount ./proc and ./root there, then move this mount to the root and after
		// setCapsAndCallSelf, runsc will chroot into /root.
		//
		// We need a directory to construct a new root and we know that
		// runsc can't start without /proc, so we can use it for this.
		flags := uintptr(unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC)
		if err := specutils.SafeMount("runsc-root", "/proc", "tmpfs", flags, "", procPath); err != nil {
			util.Fatalf("error mounting tmpfs: %v", err)
		}

		// Prepare tree structure for pivot_root(2).
		if err := os.Mkdir("/proc/proc", 0755); err != nil {
			util.Fatalf("error creating /proc/proc: %v", err)
		}
		if err := os.Mkdir("/proc/root", 0755); err != nil {
			util.Fatalf("error creating /proc/root: %v", err)
		}
		if err := os.Mkdir("/proc/etc", 0755); err != nil {
			util.Fatalf("error creating /proc/etc: %v", err)
		}
		// This cannot use SafeMount because there's no available procfs. But we
		// know that /proc is an empty tmpfs mount, so this is safe.
		if err := unix.Mount("runsc-proc", "/proc/proc", "proc", flags|unix.MS_RDONLY, ""); err != nil {
			util.Fatalf("error mounting proc: %v", err)
		}
		// self/fd is bind-mounted, so that the FD returned by
		// OpenProcSelfFD() does not allow escapes by walking "..".
		if err := unix.Mount("/proc/proc/self/fd", "/proc/proc/self/fd",
			"", unix.MS_RDONLY|unix.MS_BIND|unix.MS_NOEXEC, ""); err != nil {
			util.Fatalf("error mounting proc/self/fd: %v", err)
		}
		if err := copyFile("/proc/etc/localtime", "/etc/localtime"); err != nil {
			log.Warningf("Failed to copy /etc/localtime: %v. UTC timezone will be used.", err)
		}
		root = "/proc/root"
		procPath = "/proc/proc"
	}

	// Mount root path followed by submounts.
	if err := specutils.SafeMount(spec.Root.Path, root, "bind", unix.MS_BIND|unix.MS_REC, "", procPath); err != nil {
		return fmt.Errorf("mounting root on root (%q) err: %v", root, err)
	}

	flags := uint32(unix.MS_SLAVE | unix.MS_REC)
	if spec.Linux != nil && spec.Linux.RootfsPropagation != "" {
		flags = specutils.PropOptionsToFlags([]string{spec.Linux.RootfsPropagation})
	}
	if err := specutils.SafeMount("", root, "", uintptr(flags), "", procPath); err != nil {
		return fmt.Errorf("mounting root (%q) with flags: %#x, err: %v", root, flags, err)
	}

	// Replace the current spec with the clean spec, which has symlinks resolved.
	if err := g.setupMounts(conf, spec.Mounts, root, procPath); err != nil {
		util.Fatalf("error setting up FS: %v", err)
	}

	// Create working directory if needed.
	if spec.Process.Cwd != "" {
		dst, err := resolveSymlinks(root, spec.Process.Cwd)
		if err != nil {
			return fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err)
		}
		log.Infof("Create working directory %q if needed", spec.Process.Cwd)
		if err := os.MkdirAll(dst, 0755); err != nil {
			return fmt.Errorf("creating working directory %q: %v", spec.Process.Cwd, err)
		}
	}

	// Check if root needs to be remounted as readonly.
	if spec.Root.Readonly || g.overlayMediums[0].IsEnabled() {
		// If root is a mount point but not read-only, we can change mount options
		// to make it read-only for extra safety.
		log.Infof("Remounting root as readonly: %q", root)
		flags := uintptr(unix.MS_BIND | unix.MS_REMOUNT | unix.MS_RDONLY | unix.MS_REC)
		if err := specutils.SafeMount(root, root, "bind", flags, "", procPath); err != nil {
			return fmt.Errorf("remounting root as read-only with source: %q, target: %q, flags: %#x, err: %v", root, root, flags, err)
		}
	}

	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
		if err := pivotRoot("/proc"); err != nil {
			util.Fatalf("failed to change the root file system: %v", err)
		}
		if err := os.Chdir("/"); err != nil {
			util.Fatalf("failed to change working directory")
		}
	}
	return nil
}

// setupMounts bind mounts all mounts specified in the spec in their correct
// location inside root. It will resolve relative paths and symlinks. It also
// creates directories as needed.
func (g *Gofer) setupMounts(conf *config.Config, mounts []specs.Mount, root, procPath string) error {
	goferMntIdx := 1 // First index is for rootfs.
	for _, m := range mounts {
		if !specutils.IsGoferMount(m) {
			continue
		}

		dst, err := resolveSymlinks(root, m.Destination)
		if err != nil {
			return fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err)
		}

		flags := specutils.OptionsToFlags(m.Options) | unix.MS_BIND
		if g.overlayMediums[goferMntIdx].IsEnabled() {
			// Force mount read-only if writes are not going to be sent to it.
			flags |= unix.MS_RDONLY
		}

		log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags)
		if err := specutils.SafeSetupAndMount(m.Source, dst, m.Type, flags, procPath); err != nil {
			return fmt.Errorf("mounting %+v: %v", m, err)
		}

		// Set propagation options that cannot be set together with other options.
		flags = specutils.PropOptionsToFlags(m.Options)
		if flags != 0 {
			if err := specutils.SafeMount("", dst, "", uintptr(flags), "", procPath); err != nil {
				return fmt.Errorf("mount dst: %q, flags: %#x, err: %v", dst, flags, err)
			}
		}
		goferMntIdx++
	}
	return nil
}

// resolveMounts resolves relative paths and symlinks to mount points.
//
// Note: mount points must already be in place for resolution to work.
// Otherwise, it may follow symlinks to locations that would be overwritten
// with another mount point and return the wrong location. In short, make sure
// setupMounts() has been called before.
func resolveMounts(conf *config.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
	cleanMounts := make([]specs.Mount, 0, len(mounts))
	for _, m := range mounts {
		if !specutils.IsGoferMount(m) {
			cleanMounts = append(cleanMounts, m)
			continue
		}
		dst, err := resolveSymlinks(root, m.Destination)
		if err != nil {
			return nil, fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err)
		}
		relDst, err := filepath.Rel(root, dst)
		if err != nil {
			panic(fmt.Sprintf("%q could not be made relative to %q: %v", dst, root, err))
		}

		opts, err := adjustMountOptions(conf, filepath.Join(root, relDst), m.Options)
		if err != nil {
			return nil, err
		}

		cpy := m
		cpy.Destination = filepath.Join("/", relDst)
		cpy.Options = opts
		cleanMounts = append(cleanMounts, cpy)
	}
	return cleanMounts, nil
}

// resolveSymlinks walks 'rel' having 'root' as the root directory. If there are
// symlinks, they are evaluated relative to 'root' to ensure the end result is
// the same as if the process was running inside the container.
func resolveSymlinks(root, rel string) (string, error) {
	return resolveSymlinksImpl(root, root, rel, 255)
}
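// resolveSymlinksImpl walks 'rel' one path component at a time, keeping the
// intermediate result ('base') inside 'root': ".." cannot escape root, and
// absolute symlink targets are re-rooted at 'root' rather than the host root.
// followCount bounds the total number of symlinks followed so that symlink
// loops terminate with an error. (Comment added editorially; behavior as
// implemented below.)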
func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, error) {
	if followCount == 0 {
		return "", fmt.Errorf("too many symlinks to follow, path: %q", filepath.Join(base, rel))
	}

	rel = filepath.Clean(rel)
	for _, name := range strings.Split(rel, string(filepath.Separator)) {
		if name == "" {
			continue
		}
		// Note that Join() resolves things like ".." and returns a clean path.
		path := filepath.Join(base, name)
		if !strings.HasPrefix(path, root) {
			// One cannot '..' their way out of root.
			base = root
			continue
		}
		fi, err := os.Lstat(path)
		if err != nil {
			if !os.IsNotExist(err) {
				return "", err
			}
			// Not found means there is no symlink to check. Just keep walking dirs.
			base = path
			continue
		}
		if fi.Mode()&os.ModeSymlink != 0 {
			link, err := os.Readlink(path)
			if err != nil {
				return "", err
			}
			if filepath.IsAbs(link) {
				base = root
			}
			base, err = resolveSymlinksImpl(root, base, link, followCount-1)
			if err != nil {
				return "", err
			}
			continue
		}
		base = path
	}
	return base, nil
}

// adjustMountOptions adds 'overlayfs_stale_read' if mounting over overlayfs.
func adjustMountOptions(conf *config.Config, path string, opts []string) ([]string, error) {
	rv := make([]string, len(opts))
	copy(rv, opts)

	statfs := unix.Statfs_t{}
	if err := unix.Statfs(path, &statfs); err != nil {
		return nil, err
	}
	if statfs.Type == unix.OVERLAYFS_SUPER_MAGIC {
		rv = append(rv, "overlayfs_stale_read")
	}
	return rv, nil
}

// setFlags sets sync FD flags on the given FlagSet.
func (g *goferSyncFDs) setFlags(f *flag.FlagSet) {
	f.IntVar(&g.nvproxyFD, "sync-nvproxy-fd", -1, "file descriptor that the gofer waits on until nvproxy setup is done")
	f.IntVar(&g.usernsFD, "sync-userns-fd", -1, "file descriptor that the gofer waits on until userns mappings are set up")
	f.IntVar(&g.procMountFD, "proc-mount-sync-fd", -1, "file descriptor that the gofer writes to when /proc isn't needed anymore and can be unmounted")
}

// flags returns the flags necessary to pass along the current sync FD values
// to a re-executed version of this process.
func (g *goferSyncFDs) flags() map[string]string {
	return map[string]string{
		"sync-nvproxy-fd":    fmt.Sprintf("%d", g.nvproxyFD),
		"sync-userns-fd":     fmt.Sprintf("%d", g.usernsFD),
		"proc-mount-sync-fd": fmt.Sprintf("%d", g.procMountFD),
	}
}

// waitForFD waits for the other end of a given FD to be closed.
// `fd` is closed unconditionally after that.
// This should only be called for actual FDs (i.e. `fd` >= 0).
func waitForFD(fd int, fdName string) error {
	log.Debugf("Waiting on %s %d...", fdName, fd)
	f := os.NewFile(uintptr(fd), fdName)
	defer f.Close()
	var b [1]byte
	if n, err := f.Read(b[:]); n != 0 || err != io.EOF {
		return fmt.Errorf("failed to sync on %s: %v: %v", fdName, n, err)
	}
	log.Debugf("Synced on %s %d.", fdName, fd)
	return nil
}

// spawnProcUnmounter executes the /proc unmounter process.
// It returns a function to wait on the proc unmounter process, which
// should be called (via defer) in case of errors in order to clean up the
// unmounter process properly.
// When procfs is no longer needed, `unmountProcfs` should be called.
func (g *goferSyncFDs) spawnProcUnmounter() func() {
	if g.procMountFD != -1 {
		util.Fatalf("procMountFD is set")
	}
	// /proc is umounted from a forked process, because the
	// current one may re-execute itself without capabilities.
	cmd, w := execProcUmounter()
	// Clear FD_CLOEXEC. This process may be re-executed. procMountFD
	// should remain open.
	if _, _, errno := unix.RawSyscall(unix.SYS_FCNTL, w.Fd(), unix.F_SETFD, 0); errno != 0 {
		util.Fatalf("error clearing CLOEXEC: %v", errno)
	}
	g.procMountFD = int(w.Fd())
	return func() {
		g.procMountFD = -1
		w.Close()
		cmd.Wait()
	}
}

// unmountProcfs signals the proc unmounter process that procfs is no longer
// needed.
func (g *goferSyncFDs) unmountProcfs() {
	if g.procMountFD < 0 {
		return
	}
	umountProc(g.procMountFD)
	g.procMountFD = -1
}

// syncUsernsForRootless waits on usernsFD to be closed and then sets
// UID/GID to 0. Note that this function calls runtime.LockOSThread().
// This function is a no-op if usernsFD is -1.
//
// Postcondition: All callers must re-exec themselves after this returns,
// unless usernsFD was -1.
func (g *goferSyncFDs) syncUsernsForRootless() {
	if g.usernsFD < 0 {
		return
	}
	syncUsernsForRootless(g.usernsFD)
	g.usernsFD = -1
}

// syncUsernsForRootless waits on usernsFD to be closed and then sets
// UID/GID to 0. Note that this function calls runtime.LockOSThread().
//
// Postcondition: All callers must re-exec themselves after this returns.
func syncUsernsForRootless(fd int) {
	if err := waitForFD(fd, "userns sync FD"); err != nil {
		util.Fatalf("failed to sync on userns FD: %v", err)
	}

	// SETUID changes UID on the current system thread, so we have
	// to re-execute current binary.
	runtime.LockOSThread()
	if _, _, errno := unix.RawSyscall(unix.SYS_SETUID, 0, 0, 0); errno != 0 {
		util.Fatalf("failed to set UID: %v", errno)
	}
	if _, _, errno := unix.RawSyscall(unix.SYS_SETGID, 0, 0, 0); errno != 0 {
		util.Fatalf("failed to set GID: %v", errno)
	}
}

// syncNVProxy waits on nvproxyFD to be closed.
// Used for synchronization during nvproxy setup which is done from the
// non-gofer process.
// This function is a no-op if nvproxyFD is -1.
func (g *goferSyncFDs) syncNVProxy() {
	if g.nvproxyFD < 0 {
		return
	}
	if err := waitForFD(g.nvproxyFD, "nvproxy sync FD"); err != nil {
		util.Fatalf("failed to sync on NVProxy FD: %v", err)
	}
	g.nvproxyFD = -1
}