// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cmd

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"regexp"
	"runtime"
	"runtime/debug"
	"strings"

	"github.com/google/subcommands"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"golang.org/x/sys/unix"
	"github.com/metacubex/gvisor/pkg/log"
	"github.com/metacubex/gvisor/pkg/sentry/devices/tpuproxy"
	"github.com/metacubex/gvisor/pkg/unet"
	"github.com/metacubex/gvisor/runsc/boot"
	"github.com/metacubex/gvisor/runsc/cmd/util"
	"github.com/metacubex/gvisor/runsc/config"
	"github.com/metacubex/gvisor/runsc/flag"
	"github.com/metacubex/gvisor/runsc/fsgofer"
	"github.com/metacubex/gvisor/runsc/fsgofer/filter"
	"github.com/metacubex/gvisor/runsc/profile"
	"github.com/metacubex/gvisor/runsc/specutils"
)

var caps = []string{
	"CAP_CHOWN",
	"CAP_DAC_OVERRIDE",
	"CAP_DAC_READ_SEARCH",
	"CAP_FOWNER",
	"CAP_FSETID",
	"CAP_SYS_CHROOT",
}

// goferCaps is the minimal set of capabilities needed by the Gofer to operate
// on files.
var goferCaps = &specs.LinuxCapabilities{
	Bounding:  caps,
	Effective: caps,
	Permitted: caps,
}

// goferSyncFDs contains file descriptors that are used for synchronization
// of the Gofer startup process against other processes.
type goferSyncFDs struct {
	// nvproxyFD is a file descriptor that is used to wait until
	// nvproxy-related setup is done. This setup involves creating mounts in the
	// Gofer process's mount namespace.
	// If this is set, this FD is the first that the Gofer waits for.
	nvproxyFD int
	// usernsFD is a file descriptor that is used to wait until
	// user namespace ID mappings are established in the Gofer's userns.
	// If this is set, this FD is the second that the Gofer waits for.
	usernsFD int
	// procMountFD is a file descriptor that has to be closed when the
	// procfs mount isn't needed anymore. It is read by the procfs unmounter
	// process.
	// If this is set, this FD is the last that the Gofer interacts with and
	// closes.
	procMountFD int
}

// Gofer implements subcommands.Command for the "gofer" command, which starts a
// filesystem gofer. This command should not be called directly.
type Gofer struct {
	bundleDir  string
	ioFDs      intFlags
	devIoFD    int
	applyCaps  bool
	setUpRoot  bool
	mountConfs boot.GoferMountConfFlags

	specFD        int
	mountsFD      int
	profileFDs    profile.FDArgs
	syncFDs       goferSyncFDs
	stopProfiling func()
}

// Name implements subcommands.Command.
func (*Gofer) Name() string {
	return "gofer"
}

// Synopsis implements subcommands.Command.
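//
// For illustration only (the FD numbers and paths below are hypothetical),
// runsc spawns the gofer by re-executing itself with this subcommand and the
// flags defined in SetFlags, roughly:
//
//	runsc gofer --bundle=/path/to/bundle --spec-fd=3 --mounts-fd=4 --io-fds=5,6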
func (g *Gofer) Synopsis() string {
	return "launch a gofer process that proxies access to container files"
}

// Usage implements subcommands.Command.
func (*Gofer) Usage() string {
	return `gofer [flags]`
}

// SetFlags implements subcommands.Command.
func (g *Gofer) SetFlags(f *flag.FlagSet) {
	f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory")
	f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do")
	f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process")

	// Open FDs that are donated to the gofer.
	f.Var(&g.ioFDs, "io-fds", "list of FDs to connect gofer servers. Follows the same order as --gofer-mount-confs. FDs are only donated if the mount is backed by lisafs.")
	f.Var(&g.mountConfs, "gofer-mount-confs", "information about how the gofer mounts have been configured. They must follow this order: root first, then mounts as defined in the spec.")
	f.IntVar(&g.devIoFD, "dev-io-fd", -1, "optional FD to connect /dev gofer server")
	f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec")
	f.IntVar(&g.mountsFD, "mounts-fd", -1, "file descriptor to write the list of mounts after they have been resolved (direct paths, no symlinks)")

	// Add synchronization FD flags.
	g.syncFDs.setFlags(f)

	// Profiling flags.
	g.profileFDs.SetFromFlags(f)
}

// Execute implements subcommands.Command.
func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus {
	if g.bundleDir == "" || len(g.ioFDs) < 1 || g.specFD < 0 {
		f.Usage()
		return subcommands.ExitUsageError
	}

	conf := args[0].(*config.Config)

	// Set traceback level
	debug.SetTraceback(conf.Traceback)

	specFile := os.NewFile(uintptr(g.specFD), "spec file")
	defer specFile.Close()
	spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile, conf)
	if err != nil {
		util.Fatalf("reading spec: %v", err)
	}

	g.syncFDs.syncNVProxy()
	g.syncFDs.syncUsernsForRootless()

	if g.setUpRoot {
		if err := g.setupRootFS(spec, conf); err != nil {
			util.Fatalf("Error setting up root FS: %v", err)
		}
		if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
			cleanupUnmounter := g.syncFDs.spawnProcUnmounter()
			defer cleanupUnmounter()
		}
	}
	if g.applyCaps {
		overrides := g.syncFDs.flags()
		overrides["apply-caps"] = "false"
		overrides["setup-root"] = "false"
		args := prepareArgs(g.Name(), f, overrides)
		util.Fatalf("setCapsAndCallSelf(%v, %v): %v", args, goferCaps, setCapsAndCallSelf(args, goferCaps))
		panic("unreachable")
	}

	// Start profiling. This will be a noop if no profiling arguments were passed.
	profileOpts := g.profileFDs.ToOpts()
	g.stopProfiling = profile.Start(profileOpts)

	// At this point we won't re-execute, so it's safe to limit via rlimits. Any
	// limit >= 0 works. If the limit is lower than the current number of open
	// files, then Setrlimit will succeed, and the next open will fail.
	if conf.FDLimit > -1 {
		rlimit := unix.Rlimit{
			Cur: uint64(conf.FDLimit),
			Max: uint64(conf.FDLimit),
		}
		switch err := unix.Setrlimit(unix.RLIMIT_NOFILE, &rlimit); err {
		case nil:
		case unix.EPERM:
			log.Warningf("FD limit %d is higher than the current hard limit or system-wide maximum", conf.FDLimit)
		default:
			util.Fatalf("Failed to set RLIMIT_NOFILE: %v", err)
		}
	}

	// Find what path is going to be served by this gofer.
	root := spec.Root.Path
	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
		root = "/root"
	}

	// Resolve mount point paths, then replace mounts from our spec and send the
	// mount list over to the sandbox, so they are both in sync.
	//
	// Note that all mount points have been mounted in the proper location in
	// setupRootFS().
	cleanMounts, err := g.resolveMounts(conf, spec.Mounts, root)
	if err != nil {
		util.Fatalf("Failure to resolve mounts: %v", err)
	}
	spec.Mounts = cleanMounts
	go func() {
		if err := g.writeMounts(cleanMounts); err != nil {
			panic(fmt.Sprintf("Failed to write mounts: %v", err))
		}
	}()

	specutils.LogSpecDebug(spec, conf.OCISeccomp)

	// fsgofer should run with a umask of 0, because we want to preserve file
	// modes exactly as sent by the sandbox, which will have applied its own umask.
	unix.Umask(0)

	procFDPath := procFDBindMount
	if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
		procFDPath = "/proc/self/fd"
	}
	if err := fsgofer.OpenProcSelfFD(procFDPath); err != nil {
		util.Fatalf("failed to open /proc/self/fd: %v", err)
	}

	// procfs isn't needed anymore.
	g.syncFDs.unmountProcfs()

	if err := unix.Chroot(root); err != nil {
		util.Fatalf("failed to chroot to %q: %v", root, err)
	}
	if err := unix.Chdir("/"); err != nil {
		util.Fatalf("changing working dir: %v", err)
	}
	log.Infof("Process chroot'd to %q", root)

	// Initialize filters.
	opts := filter.Options{
		UDSOpenEnabled:   conf.GetHostUDS().AllowOpen(),
		UDSCreateEnabled: conf.GetHostUDS().AllowCreate(),
		ProfileEnabled:   len(profileOpts) > 0,
		DirectFS:         conf.DirectFS,
	}
	if err := filter.Install(opts); err != nil {
		util.Fatalf("installing seccomp filters: %v", err)
	}

	return g.serve(spec, conf, root)
}

func newSocket(ioFD int) *unet.Socket {
	socket, err := unet.NewSocket(ioFD)
	if err != nil {
		util.Fatalf("creating server on FD %d: %v", ioFD, err)
	}
	return socket
}

func (g *Gofer) serve(spec *specs.Spec, conf *config.Config, root string) subcommands.ExitStatus {
	type connectionConfig struct {
		sock      *unet.Socket
		mountPath string
		readonly  bool
	}
	cfgs := make([]connectionConfig, 0, len(spec.Mounts)+1)
	server := fsgofer.NewLisafsServer(fsgofer.Config{
		// These are global options. Ignore readonly configuration, which is set
		// on a per-connection basis.
		HostUDS:            conf.GetHostUDS(),
		HostFifo:           conf.HostFifo,
		DonateMountPointFD: conf.DirectFS,
	})

	ioFDs := g.ioFDs
	rootfsConf := g.mountConfs[0]
	if rootfsConf.ShouldUseLisafs() {
		// Start with root mount, then add any other additional mount as needed.
		cfgs = append(cfgs, connectionConfig{
			sock:      newSocket(ioFDs[0]),
			mountPath: "/", // fsgofer process is always chroot()ed. So serve root.
			readonly:  spec.Root.Readonly || rootfsConf.ShouldUseOverlayfs(),
		})
		log.Infof("Serving %q mapped to %q on FD %d (ro: %t)", "/", root, ioFDs[0], cfgs[0].readonly)
		ioFDs = ioFDs[1:]
	}

	mountIdx := 1 // first one is the root
	for _, m := range spec.Mounts {
		if !specutils.IsGoferMount(m) {
			continue
		}
		mountConf := g.mountConfs[mountIdx]
		mountIdx++
		if !mountConf.ShouldUseLisafs() {
			continue
		}
		if !filepath.IsAbs(m.Destination) {
			util.Fatalf("mount destination must be absolute: %q", m.Destination)
		}

		if len(ioFDs) == 0 {
			util.Fatalf("no FD found for mount. Did you forget --io-fd? FDs: %d, Mount: %+v", len(g.ioFDs), m)
		}
		ioFD := ioFDs[0]
		ioFDs = ioFDs[1:]
		readonly := specutils.IsReadonlyMount(m.Options) || mountConf.ShouldUseOverlayfs()
		cfgs = append(cfgs, connectionConfig{
			sock:      newSocket(ioFD),
			mountPath: m.Destination,
			readonly:  readonly,
		})
		log.Infof("Serving %q mapped on FD %d (ro: %t)", m.Destination, ioFD, readonly)
	}

	if len(ioFDs) > 0 {
		util.Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", len(cfgs), len(g.ioFDs))
	}

	if g.devIoFD >= 0 {
		cfgs = append(cfgs, connectionConfig{
			sock:      newSocket(g.devIoFD),
			mountPath: "/dev",
		})
		log.Infof("Serving /dev mapped on FD %d (ro: false)", g.devIoFD)
	}

	for _, cfg := range cfgs {
		conn, err := server.CreateConnection(cfg.sock, cfg.mountPath, cfg.readonly)
		if err != nil {
			util.Fatalf("starting connection on FD %d for gofer mount failed: %v", cfg.sock.FD(), err)
		}
		server.StartConnection(conn)
	}
	server.Wait()
	server.Destroy()
	log.Infof("All lisafs servers exited.")
	if g.stopProfiling != nil {
		g.stopProfiling()
	}
	return subcommands.ExitSuccess
}

func (g *Gofer) writeMounts(mounts []specs.Mount) error {
	bytes, err := json.Marshal(mounts)
	if err != nil {
		return err
	}

	f := os.NewFile(uintptr(g.mountsFD), "mounts file")
	defer f.Close()

	for written := 0; written < len(bytes); {
		w, err := f.Write(bytes[written:])
		if err != nil {
			return err
		}
		written += w
	}
	return nil
}

// Red Hat distros don't allow creating bind mounts inside /proc/self
// directories; this is prevented by SELinux rules.
const procFDBindMount = "/proc/fs"

func (g *Gofer) setupRootFS(spec *specs.Spec, conf *config.Config) error {
	// Convert all shared mounts into slaves to be sure that nothing will be
	// propagated outside of our namespace.
	procPath := "/proc"
	if err := specutils.SafeMount("", "/", "", unix.MS_SLAVE|unix.MS_REC, "", procPath); err != nil {
		util.Fatalf("error converting mounts: %v", err)
	}

	root := spec.Root.Path
	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
		// runsc can't be re-executed without /proc, so we create a tmpfs mount,
		// mount ./proc and ./root there, then move this mount to the root and after
		// setCapsAndCallSelf, runsc will chroot into /root.
		//
		// We need a directory to construct a new root and we know that
		// runsc can't start without /proc, so we can use it for this.
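		//
		// For reference, the tree constructed below (before pivot_root) looks
		// roughly like this:
		//
		//	/proc/fs          tmpfs that becomes the new root
		//	/proc/fs/proc     read-only recursive bind of the host /proc
		//	/proc/fs/proc/fs  bind of /proc/self/fd (see procFDBindMount)
		//	/proc/fs/root     target for the container rootfs bind mount
		//	/proc/fs/etc      holds a copy of /etc/localtime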
		flags := uintptr(unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC)
		if err := specutils.SafeMount("runsc-root", "/proc/fs", "tmpfs", flags, "", procPath); err != nil {
			util.Fatalf("error mounting tmpfs: %v", err)
		}
		if err := unix.Mount("", "/proc/fs", "", unix.MS_UNBINDABLE, ""); err != nil {
			util.Fatalf("error setting MS_UNBINDABLE: %v", err)
		}
		// Prepare tree structure for pivot_root(2).
		if err := os.Mkdir("/proc/fs/proc", 0755); err != nil {
			util.Fatalf("error creating /proc/fs/proc: %v", err)
		}
		if err := os.Mkdir("/proc/fs/root", 0755); err != nil {
			util.Fatalf("error creating /proc/fs/root: %v", err)
		}
		if err := os.Mkdir("/proc/fs/etc", 0755); err != nil {
			util.Fatalf("error creating /proc/fs/etc: %v", err)
		}
		// This cannot use SafeMount because there's no available procfs. But we
		// know that /proc/fs is an empty tmpfs mount, so this is safe.
		if err := unix.Mount("/proc", "/proc/fs/proc", "", flags|unix.MS_RDONLY|unix.MS_BIND|unix.MS_REC, ""); err != nil {
			util.Fatalf("error mounting /proc/fs/proc: %v", err)
		}
		// self/fd is bind-mounted so that the FD returned by OpenProcSelfFD()
		// does not allow escapes by walking "..".
		if err := unix.Mount("/proc/fs/proc/self/fd", "/proc/fs/"+procFDBindMount,
			"", unix.MS_RDONLY|unix.MS_BIND|flags, ""); err != nil {
			util.Fatalf("error mounting proc/self/fd: %v", err)
		}
		if err := copyFile("/proc/fs/etc/localtime", "/etc/localtime"); err != nil {
			log.Warningf("Failed to copy /etc/localtime: %v. UTC timezone will be used.", err)
		}
		root = "/proc/fs/root"
		procPath = "/proc/fs/proc"
	}

	rootfsConf := g.mountConfs[0]
	if rootfsConf.ShouldUseLisafs() {
		// Mount root path followed by submounts.
		if err := specutils.SafeMount(spec.Root.Path, root, "bind", unix.MS_BIND|unix.MS_REC, "", procPath); err != nil {
			return fmt.Errorf("mounting root on root (%q) err: %v", root, err)
		}

		flags := uint32(unix.MS_SLAVE | unix.MS_REC)
		if spec.Linux != nil && spec.Linux.RootfsPropagation != "" {
			flags = specutils.PropOptionsToFlags([]string{spec.Linux.RootfsPropagation})
		}
		if err := specutils.SafeMount("", root, "", uintptr(flags), "", procPath); err != nil {
			return fmt.Errorf("mounting root (%q) with flags: %#x, err: %v", root, flags, err)
		}
	}

	// Replace the current spec with the clean spec, with symlinks resolved.
	if err := g.setupMounts(conf, spec.Mounts, root, procPath); err != nil {
		util.Fatalf("error setting up FS: %v", err)
	}

	// Set up the /dev directory if needed.
	if g.devIoFD >= 0 {
		if err := g.setupDev(spec, conf, root, procPath); err != nil {
			return fmt.Errorf("setting up /dev: %v", err)
		}
	}

	// Create working directory if needed.
	if spec.Process.Cwd != "" {
		dst, err := resolveSymlinks(root, spec.Process.Cwd)
		if err != nil {
			return fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err)
		}
		log.Infof("Create working directory %q if needed", spec.Process.Cwd)
		if err := os.MkdirAll(dst, 0755); err != nil {
			return fmt.Errorf("creating working directory %q: %v", spec.Process.Cwd, err)
		}
	}

	// Check if root needs to be remounted as readonly.
	if rootfsConf.ShouldUseLisafs() && (spec.Root.Readonly || rootfsConf.ShouldUseOverlayfs()) {
		// If root is a mount point but not read-only, we can change mount options
		// to make it read-only for extra safety.
		// unix.MS_NOSUID and unix.MS_NODEV are included here not only
		// for safety reasons but also because they can be locked and
		// any attempts to unset them will fail. See
		// mount_namespaces(7) for more details.
		log.Infof("Remounting root as readonly: %q", root)
		flags := uintptr(unix.MS_BIND | unix.MS_REMOUNT | unix.MS_RDONLY | unix.MS_NOSUID | unix.MS_NODEV)
		if err := specutils.SafeMount(root, root, "bind", flags, "", procPath); err != nil {
			return fmt.Errorf("remounting root as read-only with source: %q, target: %q, flags: %#x, err: %v", root, root, flags, err)
		}
	}

	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
		if err := pivotRoot("/proc/fs"); err != nil {
			util.Fatalf("failed to change the root file system: %v", err)
		}
		if err := os.Chdir("/"); err != nil {
			util.Fatalf("failed to change working directory")
		}
	}
	return nil
}

// setupMounts bind mounts all mounts specified in the spec in their correct
// location inside root. It will resolve relative paths and symlinks. It also
// creates directories as needed.
func (g *Gofer) setupMounts(conf *config.Config, mounts []specs.Mount, root, procPath string) error {
	mountIdx := 1 // First index is for rootfs.
	for _, m := range mounts {
		if !specutils.IsGoferMount(m) {
			continue
		}
		mountConf := g.mountConfs[mountIdx]
		mountIdx++
		if !mountConf.ShouldUseLisafs() {
			continue
		}

		dst, err := resolveSymlinks(root, m.Destination)
		if err != nil {
			return fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err)
		}

		flags := specutils.OptionsToFlags(m.Options) | unix.MS_BIND
		if mountConf.ShouldUseOverlayfs() {
			// Force mount read-only if writes are not going to be sent to it.
			flags |= unix.MS_RDONLY
		}

		log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags)
		if err := specutils.SafeSetupAndMount(m.Source, dst, m.Type, flags, procPath); err != nil {
			return fmt.Errorf("mounting %+v: %v", m, err)
		}

		// Set propagation options that cannot be set together with other options.
		flags = specutils.PropOptionsToFlags(m.Options)
		if flags != 0 {
			if err := specutils.SafeMount("", dst, "", uintptr(flags), "", procPath); err != nil {
				return fmt.Errorf("mount dst: %q, flags: %#x, err: %v", dst, flags, err)
			}
		}
	}
	return nil
}

// shouldExposeNvidiaDevice returns true if path refers to an Nvidia device
// which should be exposed to the container.
//
// Precondition: nvproxy is enabled.
func shouldExposeNvidiaDevice(path string) bool {
	if !strings.HasPrefix(path, "/dev/nvidia") {
		return false
	}
	if path == "/dev/nvidiactl" || path == "/dev/nvidia-uvm" {
		return true
	}
	nvidiaDevPathReg := regexp.MustCompile(`^/dev/nvidia(\d+)$`)
	return nvidiaDevPathReg.MatchString(path)
}

// shouldExposeVFIODevice returns true if path refers to a VFIO device
// which should be exposed to the container.
func shouldExposeVFIODevice(path string) bool {
	return strings.HasPrefix(path, filepath.Dir(tpuproxy.VFIOPath))
}

// shouldExposeTpuDevice returns true if path refers to a TPU device which
// should be exposed to the container.
//
// Precondition: tpuproxy is enabled.
func shouldExposeTpuDevice(path string) bool {
	_, valid, _ := util.ExtractTpuDeviceMinor(path)
	return valid || shouldExposeVFIODevice(path)
}

func (g *Gofer) setupDev(spec *specs.Spec, conf *config.Config, root, procPath string) error {
	if err := os.MkdirAll(filepath.Join(root, "dev"), 0777); err != nil {
		return fmt.Errorf("creating dev directory: %v", err)
	}
	// Mount any devices specified in the spec.
	if spec.Linux == nil {
		return nil
	}
	nvproxyEnabled := specutils.NVProxyEnabled(spec, conf)
	tpuproxyEnabled := specutils.TPUProxyIsEnabled(spec, conf)
	for _, dev := range spec.Linux.Devices {
		shouldMount := (nvproxyEnabled && shouldExposeNvidiaDevice(dev.Path)) ||
			(tpuproxyEnabled && shouldExposeTpuDevice(dev.Path))
		if !shouldMount {
			continue
		}
		dst := filepath.Join(root, dev.Path)
		log.Infof("Mounting device %q as bind mount at %q", dev.Path, dst)
		if err := specutils.SafeSetupAndMount(dev.Path, dst, "bind", unix.MS_BIND, procPath); err != nil {
			return fmt.Errorf("mounting %q: %v", dev.Path, err)
		}
	}
	return nil
}

// resolveMounts resolves relative paths and symlinks to mount points.
//
// Note: mount points must already be in place for resolution to work.
// Otherwise, it may follow symlinks to locations that would be overwritten
// with another mount point and return the wrong location. In short, make sure
// setupMounts() has been called before.
func (g *Gofer) resolveMounts(conf *config.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
	mountIdx := 1 // First index is for rootfs.
	cleanMounts := make([]specs.Mount, 0, len(mounts))
	for _, m := range mounts {
		if !specutils.IsGoferMount(m) {
			cleanMounts = append(cleanMounts, m)
			continue
		}
		mountConf := g.mountConfs[mountIdx]
		mountIdx++
		if !mountConf.ShouldUseLisafs() {
			cleanMounts = append(cleanMounts, m)
			continue
		}
		dst, err := resolveSymlinks(root, m.Destination)
		if err != nil {
			return nil, fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err)
		}
		relDst, err := filepath.Rel(root, dst)
		if err != nil {
			panic(fmt.Sprintf("%q could not be made relative to %q: %v", dst, root, err))
		}

		opts, err := adjustMountOptions(conf, filepath.Join(root, relDst), m.Options)
		if err != nil {
			return nil, err
		}

		cpy := m
		cpy.Destination = filepath.Join("/", relDst)
		cpy.Options = opts
		cleanMounts = append(cleanMounts, cpy)
	}
	return cleanMounts, nil
}

// resolveSymlinks walks 'rel' having 'root' as the root directory. If there are
// symlinks, they are evaluated relative to 'root' to ensure the end result is
// the same as if the process was running inside the container.
func resolveSymlinks(root, rel string) (string, error) {
	return resolveSymlinksImpl(root, root, rel, 255)
}

func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, error) {
	if followCount == 0 {
		return "", fmt.Errorf("too many symlinks to follow, path: %q", filepath.Join(base, rel))
	}

	rel = filepath.Clean(rel)
	for _, name := range strings.Split(rel, string(filepath.Separator)) {
		if name == "" {
			continue
		}
		// Note that Join() resolves things like ".." and returns a clean path.
		path := filepath.Join(base, name)
		if !strings.HasPrefix(path, root) {
			// One cannot '..' their way out of root.
			base = root
			continue
		}
		fi, err := os.Lstat(path)
		if err != nil {
			if !os.IsNotExist(err) {
				return "", err
			}
			// Not found means there is no symlink to check. Just keep walking dirs.
			base = path
			continue
		}
		if fi.Mode()&os.ModeSymlink != 0 {
			link, err := os.Readlink(path)
			if err != nil {
				return "", err
			}
			if filepath.IsAbs(link) {
				base = root
			}
			base, err = resolveSymlinksImpl(root, base, link, followCount-1)
			if err != nil {
				return "", err
			}
			continue
		}
		base = path
	}
	return base, nil
}

// adjustMountOptions adds filesystem-specific gofer mount options.
func adjustMountOptions(conf *config.Config, path string, opts []string) ([]string, error) {
	rv := make([]string, len(opts))
	copy(rv, opts)

	statfs := unix.Statfs_t{}
	if err := unix.Statfs(path, &statfs); err != nil {
		return nil, err
	}
	switch statfs.Type {
	case unix.OVERLAYFS_SUPER_MAGIC:
		rv = append(rv, "overlayfs_stale_read")
	case unix.NFS_SUPER_MAGIC:
		// The gofer client implements remote file handle sharing for performance.
		// However, remote filesystems like NFS rely on the close(2) syscall for
		// flushing file data to the server. Such handle sharing prevents the
		// application's close(2) syscall from being propagated to the host. Hence
		// disable file handle sharing, so NFS files are flushed correctly.
		rv = append(rv, "disable_file_handle_sharing")
	}
	return rv, nil
}

// setFlags sets sync FD flags on the given FlagSet.
func (g *goferSyncFDs) setFlags(f *flag.FlagSet) {
	f.IntVar(&g.nvproxyFD, "sync-nvproxy-fd", -1, "file descriptor that the gofer waits on until nvproxy setup is done")
	f.IntVar(&g.usernsFD, "sync-userns-fd", -1, "file descriptor that the gofer waits on until userns mappings are set up")
	f.IntVar(&g.procMountFD, "proc-mount-sync-fd", -1, "file descriptor that the gofer writes to when /proc isn't needed anymore and can be unmounted")
}

// flags returns the flags necessary to pass along the current sync FD values
// to a re-executed version of this process.
func (g *goferSyncFDs) flags() map[string]string {
	return map[string]string{
		"sync-nvproxy-fd":    fmt.Sprintf("%d", g.nvproxyFD),
		"sync-userns-fd":     fmt.Sprintf("%d", g.usernsFD),
		"proc-mount-sync-fd": fmt.Sprintf("%d", g.procMountFD),
	}
}

// waitForFD waits for the other end of a given FD to be closed.
// `fd` is closed unconditionally after that.
// This should only be called for actual FDs (i.e. `fd` >= 0).
func waitForFD(fd int, fdName string) error {
	log.Debugf("Waiting on %s %d...", fdName, fd)
	f := os.NewFile(uintptr(fd), fdName)
	defer f.Close()
	var b [1]byte
	if n, err := f.Read(b[:]); n != 0 || err != io.EOF {
		return fmt.Errorf("failed to sync on %s: %v: %v", fdName, n, err)
	}
	log.Debugf("Synced on %s %d.", fdName, fd)
	return nil
}

// spawnProcUnmounter executes the /proc unmounter process.
// It returns a function to wait on the proc unmounter process, which
// should be called (via defer) in case of errors in order to clean up the
// unmounter process properly.
// When procfs is no longer needed, `unmountProcfs` should be called.
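//
// Typical usage, mirroring Execute above:
//
//	cleanupUnmounter := g.syncFDs.spawnProcUnmounter()
//	defer cleanupUnmounter()
//	// ... finish all work that still needs procfs ...
//	g.syncFDs.unmountProcfs()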
func (g *goferSyncFDs) spawnProcUnmounter() func() {
	if g.procMountFD != -1 {
		util.Fatalf("procMountFD is set")
	}
	// /proc is umounted from a forked process, because the
	// current one may re-execute itself without capabilities.
	cmd, w := execProcUmounter()
	// Clear FD_CLOEXEC. This process may be re-executed. procMountFD
	// should remain open.
	if _, _, errno := unix.RawSyscall(unix.SYS_FCNTL, w.Fd(), unix.F_SETFD, 0); errno != 0 {
		util.Fatalf("error clearing CLOEXEC: %v", errno)
	}
	g.procMountFD = int(w.Fd())
	return func() {
		g.procMountFD = -1
		w.Close()
		cmd.Wait()
	}
}

// unmountProcfs signals the proc unmounter process that procfs is no longer
// needed.
func (g *goferSyncFDs) unmountProcfs() {
	if g.procMountFD < 0 {
		return
	}
	umountProc(g.procMountFD)
	g.procMountFD = -1
}

// syncUsernsForRootless waits on usernsFD to be closed and then sets
// UID/GID to 0. Note that this function calls runtime.LockOSThread().
// This function is a no-op if usernsFD is -1.
//
// Postcondition: All callers must re-exec themselves after this returns,
// unless usernsFD was -1.
func (g *goferSyncFDs) syncUsernsForRootless() {
	if g.usernsFD < 0 {
		return
	}
	syncUsernsForRootless(g.usernsFD)
	g.usernsFD = -1
}

// syncUsernsForRootless waits on fd to be closed and then sets
// UID/GID to 0. Note that this function calls runtime.LockOSThread().
//
// Postcondition: All callers must re-exec themselves after this returns.
func syncUsernsForRootless(fd int) {
	if err := waitForFD(fd, "userns sync FD"); err != nil {
		util.Fatalf("failed to sync on userns FD: %v", err)
	}

	// SETUID changes UID on the current system thread, so we have
	// to re-execute current binary.
	runtime.LockOSThread()
	if _, _, errno := unix.RawSyscall(unix.SYS_SETUID, 0, 0, 0); errno != 0 {
		util.Fatalf("failed to set UID: %v", errno)
	}
	if _, _, errno := unix.RawSyscall(unix.SYS_SETGID, 0, 0, 0); errno != 0 {
		util.Fatalf("failed to set GID: %v", errno)
	}
}

// syncNVProxy waits on nvproxyFD to be closed.
// Used for synchronization during nvproxy setup which is done from the
// non-gofer process.
// This function is a no-op if nvproxyFD is -1.
func (g *goferSyncFDs) syncNVProxy() {
	if g.nvproxyFD < 0 {
		return
	}
	if err := waitForFD(g.nvproxyFD, "nvproxy sync FD"); err != nil {
		util.Fatalf("failed to sync on NVProxy FD: %v", err)
	}
	g.nvproxyFD = -1
}
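// Note on the sync-FD protocol used by waitForFD above: the peer process holds
// the other end of a pipe (or socket pair) and simply closes it once its setup
// step is done, which makes the gofer's read return io.EOF. A minimal sketch of
// the parent side, assuming an os.Pipe pair (illustrative only, not part of
// this package):
//
//	r, w, _ := os.Pipe()
//	// Donate r to the gofer, e.g. via --sync-userns-fd pointing at r's FD.
//	// ... establish the userns ID mappings ...
//	w.Close() // unblocks waitForFD() in the gofer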