github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/runsc/cmd/gofer.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package cmd
    16  
    17  import (
    18  	"context"
    19  	"encoding/json"
    20  	"fmt"
    21  	"io"
    22  	"os"
    23  	"path/filepath"
    24  	"runtime"
    25  	"runtime/debug"
    26  	"strings"
    27  
    28  	"github.com/MerlinKodo/gvisor/pkg/log"
    29  	"github.com/MerlinKodo/gvisor/pkg/unet"
    30  	"github.com/MerlinKodo/gvisor/runsc/boot"
    31  	"github.com/MerlinKodo/gvisor/runsc/cmd/util"
    32  	"github.com/MerlinKodo/gvisor/runsc/config"
    33  	"github.com/MerlinKodo/gvisor/runsc/flag"
    34  	"github.com/MerlinKodo/gvisor/runsc/fsgofer"
    35  	"github.com/MerlinKodo/gvisor/runsc/fsgofer/filter"
    36  	"github.com/MerlinKodo/gvisor/runsc/profile"
    37  	"github.com/MerlinKodo/gvisor/runsc/specutils"
    38  	"github.com/google/subcommands"
    39  	specs "github.com/opencontainers/runtime-spec/specs-go"
    40  	"golang.org/x/sys/unix"
    41  )
    42  
    43  var caps = []string{
    44  	"CAP_CHOWN",
    45  	"CAP_DAC_OVERRIDE",
    46  	"CAP_DAC_READ_SEARCH",
    47  	"CAP_FOWNER",
    48  	"CAP_FSETID",
    49  	"CAP_SYS_CHROOT",
    50  }
    51  
    52  // goferCaps is the minimal set of capabilities needed by the Gofer to operate
    53  // on files.
    54  var goferCaps = &specs.LinuxCapabilities{
    55  	Bounding:  caps,
    56  	Effective: caps,
    57  	Permitted: caps,
    58  }
    59  
    // goferSyncFDs groups the file descriptors used to synchronize the Gofer
    // startup sequence with other processes.
    type goferSyncFDs struct {
    	// nvproxyFD is waited on until nvproxy-related setup is done; that setup
    	// creates mounts in the Gofer process's mount namespace.
    	// When set, it is the first FD the Gofer waits for.
    	nvproxyFD int

    	// usernsFD is waited on until the user namespace ID mappings of the
    	// Gofer's userns are established.
    	// When set, it is the second FD the Gofer waits for.
    	usernsFD int

    	// procMountFD must be closed once the procfs mount is no longer needed.
    	// The other end is read by the procfs unmounter process.
    	// When set, it is the last FD the Gofer interacts with and closes.
    	procMountFD int
    }
    79  
    80  // Gofer implements subcommands.Command for the "gofer" command, which starts a
    81  // filesystem gofer.  This command should not be called directly.
    82  type Gofer struct {
    83  	bundleDir      string
    84  	ioFDs          intFlags
    85  	applyCaps      bool
    86  	setUpRoot      bool
    87  	overlayMediums boot.OverlayMediumFlags
    88  
    89  	specFD        int
    90  	mountsFD      int
    91  	profileFDs    profile.FDArgs
    92  	syncFDs       goferSyncFDs
    93  	stopProfiling func()
    94  }
    95  
    96  // Name implements subcommands.Command.
    97  func (*Gofer) Name() string {
    98  	return "gofer"
    99  }
   100  
   101  // Synopsis implements subcommands.Command.
   102  func (g *Gofer) Synopsis() string {
   103  	return fmt.Sprintf("launch a gofer process that proxies access to container files")
   104  }
   105  
   106  // Usage implements subcommands.Command.
   107  func (*Gofer) Usage() string {
   108  	return `gofer [flags]`
   109  }
   110  
   111  // SetFlags implements subcommands.Command.
   112  func (g *Gofer) SetFlags(f *flag.FlagSet) {
   113  	f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory")
   114  	f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do")
   115  	f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process")
   116  
   117  	// Open FDs that are donated to the gofer.
   118  	f.Var(&g.ioFDs, "io-fds", "list of FDs to connect gofer servers. They must follow this order: root first, then mounts as defined in the spec")
   119  	f.Var(&g.overlayMediums, "overlay-mediums", "information about how the gofer mounts have been overlaid.")
   120  	f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec")
   121  	f.IntVar(&g.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to write list of mounts after they have been resolved (direct paths, no symlinks).")
   122  
   123  	// Add synchronization FD flags.
   124  	g.syncFDs.setFlags(f)
   125  
   126  	// Profiling flags.
   127  	g.profileFDs.SetFromFlags(f)
   128  }
   129  
   130  // Execute implements subcommands.Command.
   131  func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus {
   132  	if g.bundleDir == "" || len(g.ioFDs) < 1 || g.specFD < 0 {
   133  		f.Usage()
   134  		return subcommands.ExitUsageError
   135  	}
   136  
   137  	conf := args[0].(*config.Config)
   138  
   139  	// Set traceback level
   140  	debug.SetTraceback(conf.Traceback)
   141  
   142  	specFile := os.NewFile(uintptr(g.specFD), "spec file")
   143  	defer specFile.Close()
   144  	spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile, conf)
   145  	if err != nil {
   146  		util.Fatalf("reading spec: %v", err)
   147  	}
   148  
   149  	g.syncFDs.syncNVProxy()
   150  	g.syncFDs.syncUsernsForRootless()
   151  
   152  	if g.setUpRoot {
   153  		if err := g.setupRootFS(spec, conf); err != nil {
   154  			util.Fatalf("Error setting up root FS: %v", err)
   155  		}
   156  		if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   157  			cleanupUnmounter := g.syncFDs.spawnProcUnmounter()
   158  			defer cleanupUnmounter()
   159  		}
   160  	}
   161  	if g.applyCaps {
   162  		overrides := g.syncFDs.flags()
   163  		overrides["apply-caps"] = "false"
   164  		overrides["setup-root"] = "false"
   165  		args := prepareArgs(g.Name(), f, overrides)
   166  		util.Fatalf("setCapsAndCallSelf(%v, %v): %v", args, goferCaps, setCapsAndCallSelf(args, goferCaps))
   167  		panic("unreachable")
   168  	}
   169  
   170  	// Start profiling. This will be a noop if no profiling arguments were passed.
   171  	profileOpts := g.profileFDs.ToOpts()
   172  	g.stopProfiling = profile.Start(profileOpts)
   173  
   174  	// At this point we won't re-execute, so it's safe to limit via rlimits. Any
   175  	// limit >= 0 works. If the limit is lower than the current number of open
   176  	// files, then Setrlimit will succeed, and the next open will fail.
   177  	if conf.FDLimit > -1 {
   178  		rlimit := unix.Rlimit{
   179  			Cur: uint64(conf.FDLimit),
   180  			Max: uint64(conf.FDLimit),
   181  		}
   182  		switch err := unix.Setrlimit(unix.RLIMIT_NOFILE, &rlimit); err {
   183  		case nil:
   184  		case unix.EPERM:
   185  			log.Warningf("FD limit %d is higher than the current hard limit or system-wide maximum", conf.FDLimit)
   186  		default:
   187  			util.Fatalf("Failed to set RLIMIT_NOFILE: %v", err)
   188  		}
   189  	}
   190  
   191  	// Find what path is going to be served by this gofer.
   192  	root := spec.Root.Path
   193  	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   194  		root = "/root"
   195  	}
   196  
   197  	// Resolve mount points paths, then replace mounts from our spec and send the
   198  	// mount list over to the sandbox, so they are both in sync.
   199  	//
   200  	// Note that all mount points have been mounted in the proper location in
   201  	// setupRootFS().
   202  	cleanMounts, err := resolveMounts(conf, spec.Mounts, root)
   203  	if err != nil {
   204  		util.Fatalf("Failure to resolve mounts: %v", err)
   205  	}
   206  	spec.Mounts = cleanMounts
   207  	go func() {
   208  		if err := g.writeMounts(cleanMounts); err != nil {
   209  			panic(fmt.Sprintf("Failed to write mounts: %v", err))
   210  		}
   211  	}()
   212  
   213  	specutils.LogSpecDebug(spec, conf.OCISeccomp)
   214  
   215  	// fsgofer should run with a umask of 0, because we want to preserve file
   216  	// modes exactly as sent by the sandbox, which will have applied its own umask.
   217  	unix.Umask(0)
   218  
   219  	if err := fsgofer.OpenProcSelfFD(); err != nil {
   220  		util.Fatalf("failed to open /proc/self/fd: %v", err)
   221  	}
   222  
   223  	// procfs isn't needed anymore.
   224  	g.syncFDs.unmountProcfs()
   225  
   226  	if err := unix.Chroot(root); err != nil {
   227  		util.Fatalf("failed to chroot to %q: %v", root, err)
   228  	}
   229  	if err := unix.Chdir("/"); err != nil {
   230  		util.Fatalf("changing working dir: %v", err)
   231  	}
   232  	log.Infof("Process chroot'd to %q", root)
   233  
   234  	// Initialize filters.
   235  	opts := filter.Options{
   236  		UDSOpenEnabled:   conf.GetHostUDS().AllowOpen(),
   237  		UDSCreateEnabled: conf.GetHostUDS().AllowCreate(),
   238  		ProfileEnabled:   len(profileOpts) > 0,
   239  	}
   240  	if err := filter.Install(opts); err != nil {
   241  		util.Fatalf("installing seccomp filters: %v", err)
   242  	}
   243  
   244  	return g.serve(spec, conf, root)
   245  }
   246  
   247  func newSocket(ioFD int) *unet.Socket {
   248  	socket, err := unet.NewSocket(ioFD)
   249  	if err != nil {
   250  		util.Fatalf("creating server on FD %d: %v", ioFD, err)
   251  	}
   252  	return socket
   253  }
   254  
   255  func (g *Gofer) serve(spec *specs.Spec, conf *config.Config, root string) subcommands.ExitStatus {
   256  	type connectionConfig struct {
   257  		sock      *unet.Socket
   258  		mountPath string
   259  		readonly  bool
   260  	}
   261  	cfgs := make([]connectionConfig, 0, len(spec.Mounts)+1)
   262  	server := fsgofer.NewLisafsServer(fsgofer.Config{
   263  		// These are global options. Ignore readonly configuration, that is set on
   264  		// a per connection basis.
   265  		HostUDS:            conf.GetHostUDS(),
   266  		HostFifo:           conf.HostFifo,
   267  		DonateMountPointFD: conf.DirectFS,
   268  	})
   269  
   270  	// Start with root mount, then add any other additional mount as needed.
   271  	cfgs = append(cfgs, connectionConfig{
   272  		sock:      newSocket(g.ioFDs[0]),
   273  		mountPath: "/", // fsgofer process is always chroot()ed. So serve root.
   274  		readonly:  spec.Root.Readonly || g.overlayMediums[0].IsEnabled(),
   275  	})
   276  	log.Infof("Serving %q mapped to %q on FD %d (ro: %t)", "/", root, g.ioFDs[0], cfgs[0].readonly)
   277  
   278  	mountIdx := 1 // first one is the root
   279  	for _, m := range spec.Mounts {
   280  		if !specutils.IsGoferMount(m) {
   281  			continue
   282  		}
   283  
   284  		if !filepath.IsAbs(m.Destination) {
   285  			util.Fatalf("mount destination must be absolute: %q", m.Destination)
   286  		}
   287  		if mountIdx >= len(g.ioFDs) {
   288  			util.Fatalf("no FD found for mount. Did you forget --io-fd? FDs: %d, Mount: %+v", len(g.ioFDs), m)
   289  		}
   290  
   291  		cfgs = append(cfgs, connectionConfig{
   292  			sock:      newSocket(g.ioFDs[mountIdx]),
   293  			mountPath: m.Destination,
   294  			readonly:  specutils.IsReadonlyMount(m.Options) || g.overlayMediums[mountIdx].IsEnabled(),
   295  		})
   296  
   297  		log.Infof("Serving %q mapped on FD %d (ro: %t)", m.Destination, g.ioFDs[mountIdx], cfgs[mountIdx].readonly)
   298  		mountIdx++
   299  	}
   300  
   301  	if mountIdx != len(g.ioFDs) {
   302  		util.Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs))
   303  	}
   304  	cfgs = cfgs[:mountIdx]
   305  
   306  	for _, cfg := range cfgs {
   307  		conn, err := server.CreateConnection(cfg.sock, cfg.mountPath, cfg.readonly)
   308  		if err != nil {
   309  			util.Fatalf("starting connection on FD %d for gofer mount failed: %v", cfg.sock.FD(), err)
   310  		}
   311  		server.StartConnection(conn)
   312  	}
   313  	server.Wait()
   314  	server.Destroy()
   315  	log.Infof("All lisafs servers exited.")
   316  	if g.stopProfiling != nil {
   317  		g.stopProfiling()
   318  	}
   319  	return subcommands.ExitSuccess
   320  }
   321  
   322  func (g *Gofer) writeMounts(mounts []specs.Mount) error {
   323  	bytes, err := json.Marshal(mounts)
   324  	if err != nil {
   325  		return err
   326  	}
   327  
   328  	f := os.NewFile(uintptr(g.mountsFD), "mounts file")
   329  	defer f.Close()
   330  
   331  	for written := 0; written < len(bytes); {
   332  		w, err := f.Write(bytes[written:])
   333  		if err != nil {
   334  			return err
   335  		}
   336  		written += w
   337  	}
   338  	return nil
   339  }
   340  
   341  func (g *Gofer) setupRootFS(spec *specs.Spec, conf *config.Config) error {
   342  	// Convert all shared mounts into slaves to be sure that nothing will be
   343  	// propagated outside of our namespace.
   344  	procPath := "/proc"
   345  	if err := specutils.SafeMount("", "/", "", unix.MS_SLAVE|unix.MS_REC, "", procPath); err != nil {
   346  		util.Fatalf("error converting mounts: %v", err)
   347  	}
   348  
   349  	root := spec.Root.Path
   350  	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   351  		// runsc can't be re-executed without /proc, so we create a tmpfs mount,
   352  		// mount ./proc and ./root there, then move this mount to the root and after
   353  		// setCapsAndCallSelf, runsc will chroot into /root.
   354  		//
   355  		// We need a directory to construct a new root and we know that
   356  		// runsc can't start without /proc, so we can use it for this.
   357  		flags := uintptr(unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC)
   358  		if err := specutils.SafeMount("runsc-root", "/proc", "tmpfs", flags, "", procPath); err != nil {
   359  			util.Fatalf("error mounting tmpfs: %v", err)
   360  		}
   361  
   362  		// Prepare tree structure for pivot_root(2).
   363  		if err := os.Mkdir("/proc/proc", 0755); err != nil {
   364  			util.Fatalf("error creating /proc/proc: %v", err)
   365  		}
   366  		if err := os.Mkdir("/proc/root", 0755); err != nil {
   367  			util.Fatalf("error creating /proc/root: %v", err)
   368  		}
   369  		if err := os.Mkdir("/proc/etc", 0755); err != nil {
   370  			util.Fatalf("error creating /proc/etc: %v", err)
   371  		}
   372  		// This cannot use SafeMount because there's no available procfs. But we
   373  		// know that /proc is an empty tmpfs mount, so this is safe.
   374  		if err := unix.Mount("runsc-proc", "/proc/proc", "proc", flags|unix.MS_RDONLY, ""); err != nil {
   375  			util.Fatalf("error mounting proc: %v", err)
   376  		}
   377  		// self/fd is bind-mounted, so that the FD return by
   378  		// OpenProcSelfFD() does not allow escapes with walking ".." .
   379  		if err := unix.Mount("/proc/proc/self/fd", "/proc/proc/self/fd",
   380  			"", unix.MS_RDONLY|unix.MS_BIND|unix.MS_NOEXEC, ""); err != nil {
   381  			util.Fatalf("error mounting proc/self/fd: %v", err)
   382  		}
   383  		if err := copyFile("/proc/etc/localtime", "/etc/localtime"); err != nil {
   384  			log.Warningf("Failed to copy /etc/localtime: %v. UTC timezone will be used.", err)
   385  		}
   386  		root = "/proc/root"
   387  		procPath = "/proc/proc"
   388  	}
   389  
   390  	// Mount root path followed by submounts.
   391  	if err := specutils.SafeMount(spec.Root.Path, root, "bind", unix.MS_BIND|unix.MS_REC, "", procPath); err != nil {
   392  		return fmt.Errorf("mounting root on root (%q) err: %v", root, err)
   393  	}
   394  
   395  	flags := uint32(unix.MS_SLAVE | unix.MS_REC)
   396  	if spec.Linux != nil && spec.Linux.RootfsPropagation != "" {
   397  		flags = specutils.PropOptionsToFlags([]string{spec.Linux.RootfsPropagation})
   398  	}
   399  	if err := specutils.SafeMount("", root, "", uintptr(flags), "", procPath); err != nil {
   400  		return fmt.Errorf("mounting root (%q) with flags: %#x, err: %v", root, flags, err)
   401  	}
   402  
   403  	// Replace the current spec, with the clean spec with symlinks resolved.
   404  	if err := g.setupMounts(conf, spec.Mounts, root, procPath); err != nil {
   405  		util.Fatalf("error setting up FS: %v", err)
   406  	}
   407  
   408  	// Create working directory if needed.
   409  	if spec.Process.Cwd != "" {
   410  		dst, err := resolveSymlinks(root, spec.Process.Cwd)
   411  		if err != nil {
   412  			return fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err)
   413  		}
   414  		log.Infof("Create working directory %q if needed", spec.Process.Cwd)
   415  		if err := os.MkdirAll(dst, 0755); err != nil {
   416  			return fmt.Errorf("creating working directory %q: %v", spec.Process.Cwd, err)
   417  		}
   418  	}
   419  
   420  	// Check if root needs to be remounted as readonly.
   421  	if spec.Root.Readonly || g.overlayMediums[0].IsEnabled() {
   422  		// If root is a mount point but not read-only, we can change mount options
   423  		// to make it read-only for extra safety.
   424  		log.Infof("Remounting root as readonly: %q", root)
   425  		flags := uintptr(unix.MS_BIND | unix.MS_REMOUNT | unix.MS_RDONLY | unix.MS_REC)
   426  		if err := specutils.SafeMount(root, root, "bind", flags, "", procPath); err != nil {
   427  			return fmt.Errorf("remounting root as read-only with source: %q, target: %q, flags: %#x, err: %v", root, root, flags, err)
   428  		}
   429  	}
   430  
   431  	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   432  		if err := pivotRoot("/proc"); err != nil {
   433  			util.Fatalf("failed to change the root file system: %v", err)
   434  		}
   435  		if err := os.Chdir("/"); err != nil {
   436  			util.Fatalf("failed to change working directory")
   437  		}
   438  	}
   439  	return nil
   440  }
   441  
   442  // setupMounts bind mounts all mounts specified in the spec in their correct
   443  // location inside root. It will resolve relative paths and symlinks. It also
   444  // creates directories as needed.
   445  func (g *Gofer) setupMounts(conf *config.Config, mounts []specs.Mount, root, procPath string) error {
   446  	goferMntIdx := 1 // First index is for rootfs.
   447  	for _, m := range mounts {
   448  		if !specutils.IsGoferMount(m) {
   449  			continue
   450  		}
   451  
   452  		dst, err := resolveSymlinks(root, m.Destination)
   453  		if err != nil {
   454  			return fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err)
   455  		}
   456  
   457  		flags := specutils.OptionsToFlags(m.Options) | unix.MS_BIND
   458  		if g.overlayMediums[goferMntIdx].IsEnabled() {
   459  			// Force mount read-only if writes are not going to be sent to it.
   460  			flags |= unix.MS_RDONLY
   461  		}
   462  
   463  		log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags)
   464  		if err := specutils.SafeSetupAndMount(m.Source, dst, m.Type, flags, procPath); err != nil {
   465  			return fmt.Errorf("mounting %+v: %v", m, err)
   466  		}
   467  
   468  		// Set propagation options that cannot be set together with other options.
   469  		flags = specutils.PropOptionsToFlags(m.Options)
   470  		if flags != 0 {
   471  			if err := specutils.SafeMount("", dst, "", uintptr(flags), "", procPath); err != nil {
   472  				return fmt.Errorf("mount dst: %q, flags: %#x, err: %v", dst, flags, err)
   473  			}
   474  		}
   475  		goferMntIdx++
   476  	}
   477  	return nil
   478  }
   479  
   480  // resolveMounts resolved relative paths and symlinks to mount points.
   481  //
   482  // Note: mount points must already be in place for resolution to work.
   483  // Otherwise, it may follow symlinks to locations that would be overwritten
   484  // with another mount point and return the wrong location. In short, make sure
   485  // setupMounts() has been called before.
   486  func resolveMounts(conf *config.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
   487  	cleanMounts := make([]specs.Mount, 0, len(mounts))
   488  	for _, m := range mounts {
   489  		if !specutils.IsGoferMount(m) {
   490  			cleanMounts = append(cleanMounts, m)
   491  			continue
   492  		}
   493  		dst, err := resolveSymlinks(root, m.Destination)
   494  		if err != nil {
   495  			return nil, fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err)
   496  		}
   497  		relDst, err := filepath.Rel(root, dst)
   498  		if err != nil {
   499  			panic(fmt.Sprintf("%q could not be made relative to %q: %v", dst, root, err))
   500  		}
   501  
   502  		opts, err := adjustMountOptions(conf, filepath.Join(root, relDst), m.Options)
   503  		if err != nil {
   504  			return nil, err
   505  		}
   506  
   507  		cpy := m
   508  		cpy.Destination = filepath.Join("/", relDst)
   509  		cpy.Options = opts
   510  		cleanMounts = append(cleanMounts, cpy)
   511  	}
   512  	return cleanMounts, nil
   513  }
   514  
   515  // ResolveSymlinks walks 'rel' having 'root' as the root directory. If there are
   516  // symlinks, they are evaluated relative to 'root' to ensure the end result is
   517  // the same as if the process was running inside the container.
   518  func resolveSymlinks(root, rel string) (string, error) {
   519  	return resolveSymlinksImpl(root, root, rel, 255)
   520  }
   521  
   // resolveSymlinksImpl walks rel one component at a time starting at base,
   // never leaving root, and returns the resulting path.
   //
   // Symlinks found along the way are resolved recursively: absolute targets
   // restart from root, relative targets continue from the current base.
   // Components that do not exist are appended as-is. followCount bounds the
   // recursion depth; an error is returned once it reaches zero.
   func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, error) {
   	if followCount == 0 {
   		return "", fmt.Errorf("too many symlinks to follow, path: %q", filepath.Join(base, rel))
   	}

   	components := strings.Split(filepath.Clean(rel), string(filepath.Separator))
   	for _, component := range components {
   		if component == "" {
   			continue
   		}
   		// Join cleans the path, so a ".." component is applied right here.
   		candidate := filepath.Join(base, component)
   		if !strings.HasPrefix(candidate, root) {
   			// One cannot '..' their way out of root.
   			base = root
   			continue
   		}

   		info, err := os.Lstat(candidate)
   		switch {
   		case err == nil:
   			// Inspect the entry below.
   		case os.IsNotExist(err):
   			// Nothing on disk means there is no symlink to check; keep walking.
   			base = candidate
   			continue
   		default:
   			return "", err
   		}

   		if info.Mode()&os.ModeSymlink == 0 {
   			base = candidate
   			continue
   		}

   		target, err := os.Readlink(candidate)
   		if err != nil {
   			return "", err
   		}
   		start := base
   		if filepath.IsAbs(target) {
   			start = root
   		}
   		resolved, err := resolveSymlinksImpl(root, start, target, followCount-1)
   		if err != nil {
   			return "", err
   		}
   		base = resolved
   	}
   	return base, nil
   }
   566  
   567  // adjustMountOptions adds 'overlayfs_stale_read' if mounting over overlayfs.
   568  func adjustMountOptions(conf *config.Config, path string, opts []string) ([]string, error) {
   569  	rv := make([]string, len(opts))
   570  	copy(rv, opts)
   571  
   572  	statfs := unix.Statfs_t{}
   573  	if err := unix.Statfs(path, &statfs); err != nil {
   574  		return nil, err
   575  	}
   576  	if statfs.Type == unix.OVERLAYFS_SUPER_MAGIC {
   577  		rv = append(rv, "overlayfs_stale_read")
   578  	}
   579  	return rv, nil
   580  }
   581  
   582  // setFlags sets sync FD flags on the given FlagSet.
   583  func (g *goferSyncFDs) setFlags(f *flag.FlagSet) {
   584  	f.IntVar(&g.nvproxyFD, "sync-nvproxy-fd", -1, "file descriptor that the gofer waits on until nvproxy setup is done")
   585  	f.IntVar(&g.usernsFD, "sync-userns-fd", -1, "file descriptor the the gofer waits on until userns mappings are set up")
   586  	f.IntVar(&g.procMountFD, "proc-mount-sync-fd", -1, "file descriptor that the gofer writes to when /proc isn't needed anymore and can be unmounted")
   587  }
   588  
   589  // flags returns the flags necessary to pass along the current sync FD values
   590  // to a re-executed version of this process.
   591  func (g *goferSyncFDs) flags() map[string]string {
   592  	return map[string]string{
   593  		"sync-nvproxy-fd":    fmt.Sprintf("%d", g.nvproxyFD),
   594  		"sync-userns-fd":     fmt.Sprintf("%d", g.usernsFD),
   595  		"proc-mount-sync-fd": fmt.Sprintf("%d", g.procMountFD),
   596  	}
   597  }
   598  
   599  // waitForFD waits for the other end of a given FD to be closed.
   600  // `fd` is closed unconditionally after that.
   601  // This should only be called for actual FDs (i.e. `fd` >= 0).
   602  func waitForFD(fd int, fdName string) error {
   603  	log.Debugf("Waiting on %s %d...", fdName, fd)
   604  	f := os.NewFile(uintptr(fd), fdName)
   605  	defer f.Close()
   606  	var b [1]byte
   607  	if n, err := f.Read(b[:]); n != 0 || err != io.EOF {
   608  		return fmt.Errorf("failed to sync on %s: %v: %v", fdName, n, err)
   609  	}
   610  	log.Debugf("Synced on %s %d.", fdName, fd)
   611  	return nil
   612  }
   613  
   614  // spawnProcMounter executes the /proc unmounter process.
   615  // It returns a function to wait on the proc unmounter process, which
   616  // should be called (via defer) in case of errors in order to clean up the
   617  // unmounter process properly.
   618  // When procfs is no longer needed, `unmountProcfs` should be called.
   619  func (g *goferSyncFDs) spawnProcUnmounter() func() {
   620  	if g.procMountFD != -1 {
   621  		util.Fatalf("procMountFD is set")
   622  	}
   623  	// /proc is umounted from a forked process, because the
   624  	// current one may re-execute itself without capabilities.
   625  	cmd, w := execProcUmounter()
   626  	// Clear FD_CLOEXEC. This process may be re-executed. procMountFD
   627  	// should remain open.
   628  	if _, _, errno := unix.RawSyscall(unix.SYS_FCNTL, w.Fd(), unix.F_SETFD, 0); errno != 0 {
   629  		util.Fatalf("error clearing CLOEXEC: %v", errno)
   630  	}
   631  	g.procMountFD = int(w.Fd())
   632  	return func() {
   633  		g.procMountFD = -1
   634  		w.Close()
   635  		cmd.Wait()
   636  	}
   637  }
   638  
   639  // unmountProcfs signals the proc unmounter process that procfs is no longer
   640  // needed.
   641  func (g *goferSyncFDs) unmountProcfs() {
   642  	if g.procMountFD < 0 {
   643  		return
   644  	}
   645  	umountProc(g.procMountFD)
   646  	g.procMountFD = -1
   647  }
   648  
   649  // syncUsernsForRootless waits on usernsFD to be closed and then sets
   650  // UID/GID to 0. Note that this function calls runtime.LockOSThread().
   651  // This function is a no-op if usernsFD is -1.
   652  //
   653  // Postcondition: All callers must re-exec themselves after this returns,
   654  // unless usernsFD was -1.
   655  func (g *goferSyncFDs) syncUsernsForRootless() {
   656  	if g.usernsFD < 0 {
   657  		return
   658  	}
   659  	syncUsernsForRootless(g.usernsFD)
   660  	g.usernsFD = -1
   661  }
   662  
   663  // syncUsernsForRootless waits on usernsFD to be closed and then sets
   664  // UID/GID to 0. Note that this function calls runtime.LockOSThread().
   665  //
   666  // Postcondition: All callers must re-exec themselves after this returns.
   667  func syncUsernsForRootless(fd int) {
   668  	if err := waitForFD(fd, "userns sync FD"); err != nil {
   669  		util.Fatalf("failed to sync on userns FD: %v", err)
   670  	}
   671  
   672  	// SETUID changes UID on the current system thread, so we have
   673  	// to re-execute current binary.
   674  	runtime.LockOSThread()
   675  	if _, _, errno := unix.RawSyscall(unix.SYS_SETUID, 0, 0, 0); errno != 0 {
   676  		util.Fatalf("failed to set UID: %v", errno)
   677  	}
   678  	if _, _, errno := unix.RawSyscall(unix.SYS_SETGID, 0, 0, 0); errno != 0 {
   679  		util.Fatalf("failed to set GID: %v", errno)
   680  	}
   681  }
   682  
   683  // syncNVProxy waits on nvproxyFD to be closed.
   684  // Used for synchronization during nvproxy setup which is done from the
   685  // non-gofer process.
   686  // This function is a no-op if nvProxySyncFD is -1.
   687  func (g *goferSyncFDs) syncNVProxy() {
   688  	if g.nvproxyFD < 0 {
   689  		return
   690  	}
   691  	if err := waitForFD(g.nvproxyFD, "nvproxy sync FD"); err != nil {
   692  		util.Fatalf("failed to sync on NVProxy FD: %v", err)
   693  	}
   694  	g.nvproxyFD = -1
   695  }