gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/cmd/gofer.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package cmd
    16  
    17  import (
    18  	"context"
    19  	"encoding/json"
    20  	"fmt"
    21  	"io"
    22  	"os"
    23  	"path/filepath"
    24  	"regexp"
    25  	"runtime"
    26  	"runtime/debug"
    27  	"strings"
    28  
    29  	"github.com/google/subcommands"
    30  	specs "github.com/opencontainers/runtime-spec/specs-go"
    31  	"golang.org/x/sys/unix"
    32  	"gvisor.dev/gvisor/pkg/log"
    33  	"gvisor.dev/gvisor/pkg/sentry/devices/tpuproxy"
    34  	"gvisor.dev/gvisor/pkg/unet"
    35  	"gvisor.dev/gvisor/runsc/boot"
    36  	"gvisor.dev/gvisor/runsc/cmd/util"
    37  	"gvisor.dev/gvisor/runsc/config"
    38  	"gvisor.dev/gvisor/runsc/flag"
    39  	"gvisor.dev/gvisor/runsc/fsgofer"
    40  	"gvisor.dev/gvisor/runsc/fsgofer/filter"
    41  	"gvisor.dev/gvisor/runsc/profile"
    42  	"gvisor.dev/gvisor/runsc/specutils"
    43  )
    44  
// caps lists the Linux capability names that goferCaps uses for its bounding,
// effective and permitted sets.
var caps = []string{
	"CAP_CHOWN",
	"CAP_DAC_OVERRIDE",
	"CAP_DAC_READ_SEARCH",
	"CAP_FOWNER",
	"CAP_FSETID",
	"CAP_SYS_CHROOT",
}
    53  
// goferCaps is the minimal set of capabilities needed by the Gofer to operate
// on files. The same capability list (caps) is used for the bounding,
// effective and permitted sets. Execute passes it to setCapsAndCallSelf when
// the gofer re-executes itself with restricted capabilities.
var goferCaps = &specs.LinuxCapabilities{
	Bounding:  caps,
	Effective: caps,
	Permitted: caps,
}
    61  
// goferSyncFDs contains file descriptors that are used for synchronization
// of the Gofer startup process against other processes.
//
// Each field defaults to -1 when registered through setFlags; the methods of
// this type treat a negative value as "not set" and reset a field to -1 once
// the corresponding FD has been consumed.
type goferSyncFDs struct {
	// nvproxyFD is a file descriptor that is used to wait until
	// nvproxy-related setup is done. This setup involves creating mounts in the
	// Gofer process's mount namespace.
	// If this is set, this FD is the first that the Gofer waits for.
	nvproxyFD int
	// usernsFD is a file descriptor that is used to wait until
	// user namespace ID mappings are established in the Gofer's userns.
	// If this is set, this FD is the second that the Gofer waits for.
	usernsFD int
	// procMountFD is a file descriptor that has to be closed when the
	// procfs mount isn't needed anymore. It is read by the procfs unmounter
	// process.
	// If this is set, this FD is the last that the Gofer interacts with and
	// closes.
	procMountFD int
}
    81  
// Gofer implements subcommands.Command for the "gofer" command, which starts a
// filesystem gofer. This command should not be called directly.
type Gofer struct {
	// Settings populated from command line flags (see SetFlags):
	//   - bundleDir: path to the root of the bundle directory.
	//   - ioFDs: FDs used to connect gofer servers, in the same order as
	//     mountConfs; only present for mounts backed by lisafs.
	//   - devIoFD: optional FD to connect the /dev gofer server (-1 if unset).
	//   - applyCaps: whether to restrict the process to goferCaps.
	//   - setUpRoot: whether to set up an empty root for the process.
	//   - mountConfs: how each gofer mount is configured, rootfs first and
	//     then the mounts as defined in the spec.
	bundleDir  string
	ioFDs      intFlags
	devIoFD    int
	applyCaps  bool
	setUpRoot  bool
	mountConfs boot.GoferMountConfFlags

	// FDs and runtime state:
	//   - specFD: required FD from which the container spec is read.
	//   - mountsFD: FD to which the resolved mount list is written.
	//   - profileFDs: profiling flags; converted to options in Execute.
	//   - syncFDs: startup synchronization FDs.
	//   - stopProfiling: set by Execute from profile.Start and invoked by
	//     serve once all servers have exited.
	specFD        int
	mountsFD      int
	profileFDs    profile.FDArgs
	syncFDs       goferSyncFDs
	stopProfiling func()
}
    98  
    99  // Name implements subcommands.Command.
   100  func (*Gofer) Name() string {
   101  	return "gofer"
   102  }
   103  
   104  // Synopsis implements subcommands.Command.
   105  func (g *Gofer) Synopsis() string {
   106  	return fmt.Sprintf("launch a gofer process that proxies access to container files")
   107  }
   108  
   109  // Usage implements subcommands.Command.
   110  func (*Gofer) Usage() string {
   111  	return `gofer [flags]`
   112  }
   113  
// SetFlags implements subcommands.Command. It registers every gofer flag on f:
// the bundle and setup toggles, the donated FDs, the synchronization FDs and
// the profiling flags.
func (g *Gofer) SetFlags(f *flag.FlagSet) {
	f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory")
	f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do")
	f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process")

	// Open FDs that are donated to the gofer. FD-valued flags default to -1,
	// which the rest of this file treats as "not provided".
	f.Var(&g.ioFDs, "io-fds", "list of FDs to connect gofer servers. Follows the same order as --gofer-mount-confs. FDs are only donated if the mount is backed by lisafs.")
	f.Var(&g.mountConfs, "gofer-mount-confs", "information about how the gofer mounts have been configured. They must follow this order: root first, then mounts as defined in the spec.")
	f.IntVar(&g.devIoFD, "dev-io-fd", -1, "optional FD to connect /dev gofer server")
	f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec")
	f.IntVar(&g.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to write list of mounts after they have been resolved (direct paths, no symlinks).")

	// Add synchronization FD flags.
	g.syncFDs.setFlags(f)

	// Profiling flags.
	g.profileFDs.SetFromFlags(f)
}
   133  
// Execute implements subcommands.Command.
//
// It validates the required flags, reads the container spec from specFD and
// waits on the nvproxy and userns synchronization FDs. Depending on the flags
// it then sets up the root filesystem and/or re-executes itself with
// restricted capabilities (with "apply-caps" and "setup-root" overridden to
// false). On the final pass it applies the FD rlimit, resolves the mounts and
// sends them through mountsFD, chroots into the served root, installs the
// seccomp filters and serves until all connections have exited.
//
// args[0] must be a *config.Config. Most failures terminate the process via
// util.Fatalf rather than being returned.
func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus {
	// A bundle directory, at least one IO FD and the spec FD are mandatory.
	if g.bundleDir == "" || len(g.ioFDs) < 1 || g.specFD < 0 {
		f.Usage()
		return subcommands.ExitUsageError
	}

	conf := args[0].(*config.Config)

	// Set traceback level
	debug.SetTraceback(conf.Traceback)

	specFile := os.NewFile(uintptr(g.specFD), "spec file")
	defer specFile.Close()
	spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile, conf)
	if err != nil {
		util.Fatalf("reading spec: %v", err)
	}

	// Both calls are no-ops when the corresponding FD is not set.
	g.syncFDs.syncNVProxy()
	g.syncFDs.syncUsernsForRootless()

	if g.setUpRoot {
		if err := g.setupRootFS(spec, conf); err != nil {
			util.Fatalf("Error setting up root FS: %v", err)
		}
		if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
			cleanupUnmounter := g.syncFDs.spawnProcUnmounter()
			defer cleanupUnmounter()
		}
	}
	if g.applyCaps {
		// Re-execute with the same flags, but with capability application and
		// root setup turned off so that the next pass skips both steps. The
		// current sync FD values are passed along as well.
		overrides := g.syncFDs.flags()
		overrides["apply-caps"] = "false"
		overrides["setup-root"] = "false"
		args := prepareArgs(g.Name(), f, overrides)
		// setCapsAndCallSelf is not expected to return on success; whatever it
		// returns is reported as a fatal error.
		util.Fatalf("setCapsAndCallSelf(%v, %v): %v", args, goferCaps, setCapsAndCallSelf(args, goferCaps))
		panic("unreachable")
	}

	// Start profiling. This will be a noop if no profiling arguments were passed.
	profileOpts := g.profileFDs.ToOpts()
	g.stopProfiling = profile.Start(profileOpts)

	// At this point we won't re-execute, so it's safe to limit via rlimits. Any
	// limit >= 0 works. If the limit is lower than the current number of open
	// files, then Setrlimit will succeed, and the next open will fail.
	if conf.FDLimit > -1 {
		rlimit := unix.Rlimit{
			Cur: uint64(conf.FDLimit),
			Max: uint64(conf.FDLimit),
		}
		// EPERM is tolerated with a warning; any other error is fatal.
		switch err := unix.Setrlimit(unix.RLIMIT_NOFILE, &rlimit); err {
		case nil:
		case unix.EPERM:
			log.Warningf("FD limit %d is higher than the current hard limit or system-wide maximum", conf.FDLimit)
		default:
			util.Fatalf("Failed to set RLIMIT_NOFILE: %v", err)
		}
	}

	// Find what path is going to be served by this gofer.
	root := spec.Root.Path
	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
		root = "/root"
	}

	// Resolve mount points paths, then replace mounts from our spec and send the
	// mount list over to the sandbox, so they are both in sync.
	//
	// Note that all mount points have been mounted in the proper location in
	// setupRootFS().
	cleanMounts, err := g.resolveMounts(conf, spec.Mounts, root)
	if err != nil {
		util.Fatalf("Failure to resolve mounts: %v", err)
	}
	spec.Mounts = cleanMounts
	// The mount list is written from a separate goroutine; a write failure
	// panics that goroutine.
	go func() {
		if err := g.writeMounts(cleanMounts); err != nil {
			panic(fmt.Sprintf("Failed to write mounts: %v", err))
		}
	}()

	specutils.LogSpecDebug(spec, conf.OCISeccomp)

	// fsgofer should run with a umask of 0, because we want to preserve file
	// modes exactly as sent by the sandbox, which will have applied its own umask.
	unix.Umask(0)

	// Use the bind-mounted FD directory unless running in the test-only
	// no-chroot mode, where the regular /proc/self/fd is used.
	procFDPath := procFDBindMount
	if conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
		procFDPath = "/proc/self/fd"
	}
	if err := fsgofer.OpenProcSelfFD(procFDPath); err != nil {
		util.Fatalf("failed to open /proc/self/fd: %v", err)
	}

	// procfs isn't needed anymore.
	g.syncFDs.unmountProcfs()

	if err := unix.Chroot(root); err != nil {
		util.Fatalf("failed to chroot to %q: %v", root, err)
	}
	if err := unix.Chdir("/"); err != nil {
		util.Fatalf("changing working dir: %v", err)
	}
	log.Infof("Process chroot'd to %q", root)

	// Initialize filters.
	opts := filter.Options{
		UDSOpenEnabled:   conf.GetHostUDS().AllowOpen(),
		UDSCreateEnabled: conf.GetHostUDS().AllowCreate(),
		ProfileEnabled:   len(profileOpts) > 0,
		DirectFS:         conf.DirectFS,
	}
	if err := filter.Install(opts); err != nil {
		util.Fatalf("installing seccomp filters: %v", err)
	}

	return g.serve(spec, conf, root)
}
   255  
   256  func newSocket(ioFD int) *unet.Socket {
   257  	socket, err := unet.NewSocket(ioFD)
   258  	if err != nil {
   259  		util.Fatalf("creating server on FD %d: %v", ioFD, err)
   260  	}
   261  	return socket
   262  }
   263  
   264  func (g *Gofer) serve(spec *specs.Spec, conf *config.Config, root string) subcommands.ExitStatus {
   265  	type connectionConfig struct {
   266  		sock      *unet.Socket
   267  		mountPath string
   268  		readonly  bool
   269  	}
   270  	cfgs := make([]connectionConfig, 0, len(spec.Mounts)+1)
   271  	server := fsgofer.NewLisafsServer(fsgofer.Config{
   272  		// These are global options. Ignore readonly configuration, that is set on
   273  		// a per connection basis.
   274  		HostUDS:            conf.GetHostUDS(),
   275  		HostFifo:           conf.HostFifo,
   276  		DonateMountPointFD: conf.DirectFS,
   277  	})
   278  
   279  	ioFDs := g.ioFDs
   280  	rootfsConf := g.mountConfs[0]
   281  	if rootfsConf.ShouldUseLisafs() {
   282  		// Start with root mount, then add any other additional mount as needed.
   283  		cfgs = append(cfgs, connectionConfig{
   284  			sock:      newSocket(ioFDs[0]),
   285  			mountPath: "/", // fsgofer process is always chroot()ed. So serve root.
   286  			readonly:  spec.Root.Readonly || rootfsConf.ShouldUseOverlayfs(),
   287  		})
   288  		log.Infof("Serving %q mapped to %q on FD %d (ro: %t)", "/", root, ioFDs[0], cfgs[0].readonly)
   289  		ioFDs = ioFDs[1:]
   290  	}
   291  
   292  	mountIdx := 1 // first one is the root
   293  	for _, m := range spec.Mounts {
   294  		if !specutils.IsGoferMount(m) {
   295  			continue
   296  		}
   297  		mountConf := g.mountConfs[mountIdx]
   298  		mountIdx++
   299  		if !mountConf.ShouldUseLisafs() {
   300  			continue
   301  		}
   302  		if !filepath.IsAbs(m.Destination) {
   303  			util.Fatalf("mount destination must be absolute: %q", m.Destination)
   304  		}
   305  
   306  		if len(ioFDs) == 0 {
   307  			util.Fatalf("no FD found for mount. Did you forget --io-fd? FDs: %d, Mount: %+v", len(g.ioFDs), m)
   308  		}
   309  		ioFD := ioFDs[0]
   310  		ioFDs = ioFDs[1:]
   311  		readonly := specutils.IsReadonlyMount(m.Options) || mountConf.ShouldUseOverlayfs()
   312  		cfgs = append(cfgs, connectionConfig{
   313  			sock:      newSocket(ioFD),
   314  			mountPath: m.Destination,
   315  			readonly:  readonly,
   316  		})
   317  		log.Infof("Serving %q mapped on FD %d (ro: %t)", m.Destination, ioFD, readonly)
   318  	}
   319  
   320  	if len(ioFDs) > 0 {
   321  		util.Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", len(cfgs), len(g.ioFDs))
   322  	}
   323  
   324  	if g.devIoFD >= 0 {
   325  		cfgs = append(cfgs, connectionConfig{
   326  			sock:      newSocket(g.devIoFD),
   327  			mountPath: "/dev",
   328  		})
   329  		log.Infof("Serving /dev mapped on FD %d (ro: false)", g.devIoFD)
   330  	}
   331  
   332  	for _, cfg := range cfgs {
   333  		conn, err := server.CreateConnection(cfg.sock, cfg.mountPath, cfg.readonly)
   334  		if err != nil {
   335  			util.Fatalf("starting connection on FD %d for gofer mount failed: %v", cfg.sock.FD(), err)
   336  		}
   337  		server.StartConnection(conn)
   338  	}
   339  	server.Wait()
   340  	server.Destroy()
   341  	log.Infof("All lisafs servers exited.")
   342  	if g.stopProfiling != nil {
   343  		g.stopProfiling()
   344  	}
   345  	return subcommands.ExitSuccess
   346  }
   347  
   348  func (g *Gofer) writeMounts(mounts []specs.Mount) error {
   349  	bytes, err := json.Marshal(mounts)
   350  	if err != nil {
   351  		return err
   352  	}
   353  
   354  	f := os.NewFile(uintptr(g.mountsFD), "mounts file")
   355  	defer f.Close()
   356  
   357  	for written := 0; written < len(bytes); {
   358  		w, err := f.Write(bytes[written:])
   359  		if err != nil {
   360  			return err
   361  		}
   362  		written += w
   363  	}
   364  	return nil
   365  }
   366  
// procFDBindMount is the location where setupRootFS bind-mounts
// /proc/self/fd, and the path that Execute passes to fsgofer.OpenProcSelfFD
// when the gofer runs inside the root set up by setupRootFS.
//
// Red Hat distros don't allow creating bind mounts inside /proc/self
// directories (this is protected by SELinux rules), so a different location
// is used.
const procFDBindMount = "/proc/fs"
   370  
   371  func (g *Gofer) setupRootFS(spec *specs.Spec, conf *config.Config) error {
   372  	// Convert all shared mounts into slaves to be sure that nothing will be
   373  	// propagated outside of our namespace.
   374  	procPath := "/proc"
   375  	if err := specutils.SafeMount("", "/", "", unix.MS_SLAVE|unix.MS_REC, "", procPath); err != nil {
   376  		util.Fatalf("error converting mounts: %v", err)
   377  	}
   378  
   379  	root := spec.Root.Path
   380  	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   381  		// runsc can't be re-executed without /proc, so we create a tmpfs mount,
   382  		// mount ./proc and ./root there, then move this mount to the root and after
   383  		// setCapsAndCallSelf, runsc will chroot into /root.
   384  		//
   385  		// We need a directory to construct a new root and we know that
   386  		// runsc can't start without /proc, so we can use it for this.
   387  		flags := uintptr(unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC)
   388  		if err := specutils.SafeMount("runsc-root", "/proc/fs", "tmpfs", flags, "", procPath); err != nil {
   389  			util.Fatalf("error mounting tmpfs: %v", err)
   390  		}
   391  		if err := unix.Mount("", "/proc/fs", "", unix.MS_UNBINDABLE, ""); err != nil {
   392  			util.Fatalf("error setting MS_UNBINDABLE")
   393  		}
   394  		// Prepare tree structure for pivot_root(2).
   395  		if err := os.Mkdir("/proc/fs/proc", 0755); err != nil {
   396  			util.Fatalf("error creating /proc/fs/proc: %v", err)
   397  		}
   398  		if err := os.Mkdir("/proc/fs/root", 0755); err != nil {
   399  			util.Fatalf("error creating /proc/fs/root: %v", err)
   400  		}
   401  		if err := os.Mkdir("/proc/fs/etc", 0755); err != nil {
   402  			util.Fatalf("error creating /proc/fs/etc: %v", err)
   403  		}
   404  		// This cannot use SafeMount because there's no available procfs. But we
   405  		// know that /proc/fs is an empty tmpfs mount, so this is safe.
   406  		if err := unix.Mount("/proc", "/proc/fs/proc", "", flags|unix.MS_RDONLY|unix.MS_BIND|unix.MS_REC, ""); err != nil {
   407  			util.Fatalf("error mounting /proc/fs/proc: %v", err)
   408  		}
   409  		// self/fd is bind-mounted, so that the FD return by
   410  		// OpenProcSelfFD() does not allow escapes with walking ".." .
   411  		if err := unix.Mount("/proc/fs/proc/self/fd", "/proc/fs/"+procFDBindMount,
   412  			"", unix.MS_RDONLY|unix.MS_BIND|flags, ""); err != nil {
   413  			util.Fatalf("error mounting proc/self/fd: %v", err)
   414  		}
   415  		if err := copyFile("/proc/fs/etc/localtime", "/etc/localtime"); err != nil {
   416  			log.Warningf("Failed to copy /etc/localtime: %v. UTC timezone will be used.", err)
   417  		}
   418  		root = "/proc/fs/root"
   419  		procPath = "/proc/fs/proc"
   420  	}
   421  
   422  	rootfsConf := g.mountConfs[0]
   423  	if rootfsConf.ShouldUseLisafs() {
   424  		// Mount root path followed by submounts.
   425  		if err := specutils.SafeMount(spec.Root.Path, root, "bind", unix.MS_BIND|unix.MS_REC, "", procPath); err != nil {
   426  			return fmt.Errorf("mounting root on root (%q) err: %v", root, err)
   427  		}
   428  
   429  		flags := uint32(unix.MS_SLAVE | unix.MS_REC)
   430  		if spec.Linux != nil && spec.Linux.RootfsPropagation != "" {
   431  			flags = specutils.PropOptionsToFlags([]string{spec.Linux.RootfsPropagation})
   432  		}
   433  		if err := specutils.SafeMount("", root, "", uintptr(flags), "", procPath); err != nil {
   434  			return fmt.Errorf("mounting root (%q) with flags: %#x, err: %v", root, flags, err)
   435  		}
   436  	}
   437  
   438  	// Replace the current spec, with the clean spec with symlinks resolved.
   439  	if err := g.setupMounts(conf, spec.Mounts, root, procPath); err != nil {
   440  		util.Fatalf("error setting up FS: %v", err)
   441  	}
   442  
   443  	// Set up /dev directory is needed.
   444  	if g.devIoFD >= 0 {
   445  		g.setupDev(spec, conf, root, procPath)
   446  	}
   447  
   448  	// Create working directory if needed.
   449  	if spec.Process.Cwd != "" {
   450  		dst, err := resolveSymlinks(root, spec.Process.Cwd)
   451  		if err != nil {
   452  			return fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err)
   453  		}
   454  		log.Infof("Create working directory %q if needed", spec.Process.Cwd)
   455  		if err := os.MkdirAll(dst, 0755); err != nil {
   456  			return fmt.Errorf("creating working directory %q: %v", spec.Process.Cwd, err)
   457  		}
   458  	}
   459  
   460  	// Check if root needs to be remounted as readonly.
   461  	if rootfsConf.ShouldUseLisafs() && (spec.Root.Readonly || rootfsConf.ShouldUseOverlayfs()) {
   462  		// If root is a mount point but not read-only, we can change mount options
   463  		// to make it read-only for extra safety.
   464  		// unix.MS_NOSUID and unix.MS_NODEV are included here not only
   465  		// for safety reasons but also because they can be locked and
   466  		// any attempts to unset them will fail.  See
   467  		// mount_namespaces(7) for more details.
   468  		log.Infof("Remounting root as readonly: %q", root)
   469  		flags := uintptr(unix.MS_BIND | unix.MS_REMOUNT | unix.MS_RDONLY | unix.MS_NOSUID | unix.MS_NODEV)
   470  		if err := specutils.SafeMount(root, root, "bind", flags, "", procPath); err != nil {
   471  			return fmt.Errorf("remounting root as read-only with source: %q, target: %q, flags: %#x, err: %v", root, root, flags, err)
   472  		}
   473  	}
   474  
   475  	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   476  		if err := pivotRoot("/proc/fs"); err != nil {
   477  			util.Fatalf("failed to change the root file system: %v", err)
   478  		}
   479  		if err := os.Chdir("/"); err != nil {
   480  			util.Fatalf("failed to change working directory")
   481  		}
   482  	}
   483  	return nil
   484  }
   485  
   486  // setupMounts bind mounts all mounts specified in the spec in their correct
   487  // location inside root. It will resolve relative paths and symlinks. It also
   488  // creates directories as needed.
   489  func (g *Gofer) setupMounts(conf *config.Config, mounts []specs.Mount, root, procPath string) error {
   490  	mountIdx := 1 // First index is for rootfs.
   491  	for _, m := range mounts {
   492  		if !specutils.IsGoferMount(m) {
   493  			continue
   494  		}
   495  		mountConf := g.mountConfs[mountIdx]
   496  		mountIdx++
   497  		if !mountConf.ShouldUseLisafs() {
   498  			continue
   499  		}
   500  
   501  		dst, err := resolveSymlinks(root, m.Destination)
   502  		if err != nil {
   503  			return fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err)
   504  		}
   505  
   506  		flags := specutils.OptionsToFlags(m.Options) | unix.MS_BIND
   507  		if mountConf.ShouldUseOverlayfs() {
   508  			// Force mount read-only if writes are not going to be sent to it.
   509  			flags |= unix.MS_RDONLY
   510  		}
   511  
   512  		log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags)
   513  		if err := specutils.SafeSetupAndMount(m.Source, dst, m.Type, flags, procPath); err != nil {
   514  			return fmt.Errorf("mounting %+v: %v", m, err)
   515  		}
   516  
   517  		// Set propagation options that cannot be set together with other options.
   518  		flags = specutils.PropOptionsToFlags(m.Options)
   519  		if flags != 0 {
   520  			if err := specutils.SafeMount("", dst, "", uintptr(flags), "", procPath); err != nil {
   521  				return fmt.Errorf("mount dst: %q, flags: %#x, err: %v", dst, flags, err)
   522  			}
   523  		}
   524  	}
   525  	return nil
   526  }
   527  
   528  // shouldExposeNvidiaDevice returns true if path refers to an Nvidia device
   529  // which should be exposed to the container.
   530  //
   531  // Precondition: nvproxy is enabled.
   532  func shouldExposeNvidiaDevice(path string) bool {
   533  	if !strings.HasPrefix(path, "/dev/nvidia") {
   534  		return false
   535  	}
   536  	if path == "/dev/nvidiactl" || path == "/dev/nvidia-uvm" {
   537  		return true
   538  	}
   539  	nvidiaDevPathReg := regexp.MustCompile(`^/dev/nvidia(\d+)$`)
   540  	return nvidiaDevPathReg.MatchString(path)
   541  }
   542  
   543  // shouldExposeVfioDevice returns true if path refers to an VFIO device
   544  // which shuold be exposed to the container.
   545  func shouldExposeVFIODevice(path string) bool {
   546  	return strings.HasPrefix(path, filepath.Dir(tpuproxy.VFIOPath))
   547  }
   548  
   549  // shouldExposeTpuDevice returns true if path refers to a TPU device which
   550  // should be exposed to the container.
   551  //
   552  // Precondition: tpuproxy is enabled.
   553  func shouldExposeTpuDevice(path string) bool {
   554  	_, valid, _ := util.ExtractTpuDeviceMinor(path)
   555  	return valid || shouldExposeVFIODevice(path)
   556  }
   557  
   558  func (g *Gofer) setupDev(spec *specs.Spec, conf *config.Config, root, procPath string) error {
   559  	if err := os.MkdirAll(filepath.Join(root, "dev"), 0777); err != nil {
   560  		return fmt.Errorf("creating dev directory: %v", err)
   561  	}
   562  	// Mount any devices specified in the spec.
   563  	if spec.Linux == nil {
   564  		return nil
   565  	}
   566  	nvproxyEnabled := specutils.NVProxyEnabled(spec, conf)
   567  	tpuproxyEnabled := specutils.TPUProxyIsEnabled(spec, conf)
   568  	for _, dev := range spec.Linux.Devices {
   569  		shouldMount := (nvproxyEnabled && shouldExposeNvidiaDevice(dev.Path)) ||
   570  			(tpuproxyEnabled && shouldExposeTpuDevice(dev.Path))
   571  		if !shouldMount {
   572  			continue
   573  		}
   574  		dst := filepath.Join(root, dev.Path)
   575  		log.Infof("Mounting device %q as bind mount at %q", dev.Path, dst)
   576  		if err := specutils.SafeSetupAndMount(dev.Path, dst, "bind", unix.MS_BIND, procPath); err != nil {
   577  			return fmt.Errorf("mounting %q: %v", dev.Path, err)
   578  		}
   579  	}
   580  	return nil
   581  }
   582  
   583  // resolveMounts resolved relative paths and symlinks to mount points.
   584  //
   585  // Note: mount points must already be in place for resolution to work.
   586  // Otherwise, it may follow symlinks to locations that would be overwritten
   587  // with another mount point and return the wrong location. In short, make sure
   588  // setupMounts() has been called before.
   589  func (g *Gofer) resolveMounts(conf *config.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
   590  	mountIdx := 1 // First index is for rootfs.
   591  	cleanMounts := make([]specs.Mount, 0, len(mounts))
   592  	for _, m := range mounts {
   593  		if !specutils.IsGoferMount(m) {
   594  			cleanMounts = append(cleanMounts, m)
   595  			continue
   596  		}
   597  		mountConf := g.mountConfs[mountIdx]
   598  		mountIdx++
   599  		if !mountConf.ShouldUseLisafs() {
   600  			cleanMounts = append(cleanMounts, m)
   601  			continue
   602  		}
   603  		dst, err := resolveSymlinks(root, m.Destination)
   604  		if err != nil {
   605  			return nil, fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err)
   606  		}
   607  		relDst, err := filepath.Rel(root, dst)
   608  		if err != nil {
   609  			panic(fmt.Sprintf("%q could not be made relative to %q: %v", dst, root, err))
   610  		}
   611  
   612  		opts, err := adjustMountOptions(conf, filepath.Join(root, relDst), m.Options)
   613  		if err != nil {
   614  			return nil, err
   615  		}
   616  
   617  		cpy := m
   618  		cpy.Destination = filepath.Join("/", relDst)
   619  		cpy.Options = opts
   620  		cleanMounts = append(cleanMounts, cpy)
   621  	}
   622  	return cleanMounts, nil
   623  }
   624  
   625  // ResolveSymlinks walks 'rel' having 'root' as the root directory. If there are
   626  // symlinks, they are evaluated relative to 'root' to ensure the end result is
   627  // the same as if the process was running inside the container.
   628  func resolveSymlinks(root, rel string) (string, error) {
   629  	return resolveSymlinksImpl(root, root, rel, 255)
   630  }
   631  
   632  func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, error) {
   633  	if followCount == 0 {
   634  		return "", fmt.Errorf("too many symlinks to follow, path: %q", filepath.Join(base, rel))
   635  	}
   636  
   637  	rel = filepath.Clean(rel)
   638  	for _, name := range strings.Split(rel, string(filepath.Separator)) {
   639  		if name == "" {
   640  			continue
   641  		}
   642  		// Note that Join() resolves things like ".." and returns a clean path.
   643  		path := filepath.Join(base, name)
   644  		if !strings.HasPrefix(path, root) {
   645  			// One cannot '..' their way out of root.
   646  			base = root
   647  			continue
   648  		}
   649  		fi, err := os.Lstat(path)
   650  		if err != nil {
   651  			if !os.IsNotExist(err) {
   652  				return "", err
   653  			}
   654  			// Not found means there is no symlink to check. Just keep walking dirs.
   655  			base = path
   656  			continue
   657  		}
   658  		if fi.Mode()&os.ModeSymlink != 0 {
   659  			link, err := os.Readlink(path)
   660  			if err != nil {
   661  				return "", err
   662  			}
   663  			if filepath.IsAbs(link) {
   664  				base = root
   665  			}
   666  			base, err = resolveSymlinksImpl(root, base, link, followCount-1)
   667  			if err != nil {
   668  				return "", err
   669  			}
   670  			continue
   671  		}
   672  		base = path
   673  	}
   674  	return base, nil
   675  }
   676  
   677  // adjustMountOptions adds filesystem-specific gofer mount options.
   678  func adjustMountOptions(conf *config.Config, path string, opts []string) ([]string, error) {
   679  	rv := make([]string, len(opts))
   680  	copy(rv, opts)
   681  
   682  	statfs := unix.Statfs_t{}
   683  	if err := unix.Statfs(path, &statfs); err != nil {
   684  		return nil, err
   685  	}
   686  	switch statfs.Type {
   687  	case unix.OVERLAYFS_SUPER_MAGIC:
   688  		rv = append(rv, "overlayfs_stale_read")
   689  	case unix.NFS_SUPER_MAGIC:
   690  		// The gofer client implements remote file handle sharing for performance.
   691  		// However, remote filesystems like NFS rely on close(2) syscall for
   692  		// flushing file data to the server. Such handle sharing prevents the
   693  		// application's close(2) syscall from being propagated to the host. Hence
   694  		// disable file handle sharing, so NFS files are flushed correctly.
   695  		rv = append(rv, "disable_file_handle_sharing")
   696  	}
   697  	return rv, nil
   698  }
   699  
// setFlags sets sync FD flags on the given FlagSet. Every flag defaults to
// -1, which the methods of goferSyncFDs treat as "no FD provided".
func (g *goferSyncFDs) setFlags(f *flag.FlagSet) {
	f.IntVar(&g.nvproxyFD, "sync-nvproxy-fd", -1, "file descriptor that the gofer waits on until nvproxy setup is done")
	f.IntVar(&g.usernsFD, "sync-userns-fd", -1, "file descriptor the gofer waits on until userns mappings are set up")
	f.IntVar(&g.procMountFD, "proc-mount-sync-fd", -1, "file descriptor that the gofer writes to when /proc isn't needed anymore and can be unmounted")
}
   706  
   707  // flags returns the flags necessary to pass along the current sync FD values
   708  // to a re-executed version of this process.
   709  func (g *goferSyncFDs) flags() map[string]string {
   710  	return map[string]string{
   711  		"sync-nvproxy-fd":    fmt.Sprintf("%d", g.nvproxyFD),
   712  		"sync-userns-fd":     fmt.Sprintf("%d", g.usernsFD),
   713  		"proc-mount-sync-fd": fmt.Sprintf("%d", g.procMountFD),
   714  	}
   715  }
   716  
   717  // waitForFD waits for the other end of a given FD to be closed.
   718  // `fd` is closed unconditionally after that.
   719  // This should only be called for actual FDs (i.e. `fd` >= 0).
   720  func waitForFD(fd int, fdName string) error {
   721  	log.Debugf("Waiting on %s %d...", fdName, fd)
   722  	f := os.NewFile(uintptr(fd), fdName)
   723  	defer f.Close()
   724  	var b [1]byte
   725  	if n, err := f.Read(b[:]); n != 0 || err != io.EOF {
   726  		return fmt.Errorf("failed to sync on %s: %v: %v", fdName, n, err)
   727  	}
   728  	log.Debugf("Synced on %s %d.", fdName, fd)
   729  	return nil
   730  }
   731  
// spawnProcUnmounter executes the /proc unmounter process and stores the
// write end of its synchronization FD in procMountFD. It is fatal to call
// this when procMountFD is already set.
// It returns a function to wait on the proc unmounter process, which
// should be called (via defer) in case of errors in order to clean up the
// unmounter process properly. That function resets procMountFD to -1, closes
// the synchronization FD and waits for the unmounter process.
// When procfs is no longer needed, `unmountProcfs` should be called.
func (g *goferSyncFDs) spawnProcUnmounter() func() {
	if g.procMountFD != -1 {
		util.Fatalf("procMountFD is set")
	}
	// /proc is umounted from a forked process, because the
	// current one may re-execute itself without capabilities.
	cmd, w := execProcUmounter()
	// Clear FD_CLOEXEC. This process may be re-executed. procMountFD
	// should remain open.
	if _, _, errno := unix.RawSyscall(unix.SYS_FCNTL, w.Fd(), unix.F_SETFD, 0); errno != 0 {
		util.Fatalf("error clearing CLOEXEC: %v", errno)
	}
	g.procMountFD = int(w.Fd())
	return func() {
		g.procMountFD = -1
		w.Close()
		cmd.Wait()
	}
}
   756  
   757  // unmountProcfs signals the proc unmounter process that procfs is no longer
   758  // needed.
   759  func (g *goferSyncFDs) unmountProcfs() {
   760  	if g.procMountFD < 0 {
   761  		return
   762  	}
   763  	umountProc(g.procMountFD)
   764  	g.procMountFD = -1
   765  }
   766  
   767  // syncUsernsForRootless waits on usernsFD to be closed and then sets
   768  // UID/GID to 0. Note that this function calls runtime.LockOSThread().
   769  // This function is a no-op if usernsFD is -1.
   770  //
   771  // Postcondition: All callers must re-exec themselves after this returns,
   772  // unless usernsFD was -1.
   773  func (g *goferSyncFDs) syncUsernsForRootless() {
   774  	if g.usernsFD < 0 {
   775  		return
   776  	}
   777  	syncUsernsForRootless(g.usernsFD)
   778  	g.usernsFD = -1
   779  }
   780  
// syncUsernsForRootless waits on fd to be closed and then sets
// UID/GID to 0. Note that this function calls runtime.LockOSThread() and
// never unlocks the thread. Any failure terminates the process via
// util.Fatalf.
//
// Postcondition: All callers must re-exec themselves after this returns.
func syncUsernsForRootless(fd int) {
	if err := waitForFD(fd, "userns sync FD"); err != nil {
		util.Fatalf("failed to sync on userns FD: %v", err)
	}

	// SETUID changes UID on the current system thread, so we have
	// to re-execute current binary.
	runtime.LockOSThread()
	// The raw syscalls are issued on the locked thread: first the UID, then
	// the GID.
	if _, _, errno := unix.RawSyscall(unix.SYS_SETUID, 0, 0, 0); errno != 0 {
		util.Fatalf("failed to set UID: %v", errno)
	}
	if _, _, errno := unix.RawSyscall(unix.SYS_SETGID, 0, 0, 0); errno != 0 {
		util.Fatalf("failed to set GID: %v", errno)
	}
}
   800  
   801  // syncNVProxy waits on nvproxyFD to be closed.
   802  // Used for synchronization during nvproxy setup which is done from the
   803  // non-gofer process.
   804  // This function is a no-op if nvProxySyncFD is -1.
   805  func (g *goferSyncFDs) syncNVProxy() {
   806  	if g.nvproxyFD < 0 {
   807  		return
   808  	}
   809  	if err := waitForFD(g.nvproxyFD, "nvproxy sync FD"); err != nil {
   810  		util.Fatalf("failed to sync on NVProxy FD: %v", err)
   811  	}
   812  	g.nvproxyFD = -1
   813  }