github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/runsc/cmd/gofer.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package cmd
    16  
    17  import (
    18  	"context"
    19  	"encoding/json"
    20  	"fmt"
    21  	"io"
    22  	"os"
    23  	"path/filepath"
    24  	"runtime"
    25  	"runtime/debug"
    26  	"strings"
    27  
    28  	"github.com/google/subcommands"
    29  	specs "github.com/opencontainers/runtime-spec/specs-go"
    30  	"github.com/ttpreport/gvisor-ligolo/pkg/log"
    31  	"github.com/ttpreport/gvisor-ligolo/pkg/unet"
    32  	"github.com/ttpreport/gvisor-ligolo/runsc/boot"
    33  	"github.com/ttpreport/gvisor-ligolo/runsc/cmd/util"
    34  	"github.com/ttpreport/gvisor-ligolo/runsc/config"
    35  	"github.com/ttpreport/gvisor-ligolo/runsc/flag"
    36  	"github.com/ttpreport/gvisor-ligolo/runsc/fsgofer"
    37  	"github.com/ttpreport/gvisor-ligolo/runsc/fsgofer/filter"
    38  	"github.com/ttpreport/gvisor-ligolo/runsc/profile"
    39  	"github.com/ttpreport/gvisor-ligolo/runsc/specutils"
    40  	"golang.org/x/sys/unix"
    41  )
    42  
    43  var caps = []string{
    44  	"CAP_CHOWN",
    45  	"CAP_DAC_OVERRIDE",
    46  	"CAP_DAC_READ_SEARCH",
    47  	"CAP_FOWNER",
    48  	"CAP_FSETID",
    49  	"CAP_SYS_CHROOT",
    50  }
    51  
    52  // goferCaps is the minimal set of capabilities needed by the Gofer to operate
    53  // on files.
    54  var goferCaps = &specs.LinuxCapabilities{
    55  	Bounding:  caps,
    56  	Effective: caps,
    57  	Permitted: caps,
    58  }
    59  
// goferSyncFDs contains file descriptors that are used for synchronization
// of the Gofer startup process against other processes.
//
// Every field defaults to -1 (see setFlags), which means "not set".
type goferSyncFDs struct {
	// nvproxyFD is a file descriptor that is used to wait until
	// nvproxy-related setup is done. This setup involves creating mounts in the
	// Gofer process's mount namespace.
	// If this is set, this FD is the first that the Gofer waits for.
	nvproxyFD int
	// usernsFD is a file descriptor that is used to wait until
	// user namespace ID mappings are established in the Gofer's userns.
	// If this is set, this FD is the second that the Gofer waits for.
	usernsFD int
	// procMountFD is a file descriptor that has to be closed when the
	// procfs mount isn't needed anymore. It is read by the procfs unmounter
	// process.
	// If this is set, this FD is the last that the Gofer interacts with and
	// closes.
	procMountFD int
}
    79  
// Gofer implements subcommands.Command for the "gofer" command, which starts a
// filesystem gofer.  This command should not be called directly.
type Gofer struct {
	// bundleDir is the path to the root of the bundle directory ("bundle" flag).
	bundleDir      string
	// ioFDs lists the FDs the gofer servers connect on: root first, then the
	// mounts in the order they are defined in the spec ("io-fds" flag).
	ioFDs          intFlags
	// applyCaps, if true, makes Execute re-execute itself restricted to
	// goferCaps ("apply-caps" flag).
	applyCaps      bool
	// setUpRoot, if true, makes Execute call setupRootFS ("setup-root" flag).
	setUpRoot      bool
	// overlayMediums describes how the gofer mounts have been overlaid
	// ("overlay-mediums" flag). It is indexed like ioFDs: 0 is the root.
	overlayMediums boot.OverlayMediumFlags

	// specFD is the required FD with the container spec ("spec-fd" flag).
	specFD        int
	// mountsFD is the FD the resolved mount list is written to ("mounts-fd"
	// flag).
	mountsFD      int
	// profileFDs holds the profiling flags; it is converted to profile options
	// in Execute.
	profileFDs    profile.FDArgs
	// syncFDs holds the startup synchronization FDs.
	syncFDs       goferSyncFDs
	// stopProfiling is the function returned by profile.Start; serve calls it,
	// if non-nil, once all servers have exited.
	stopProfiling func()
}
    95  
    96  // Name implements subcommands.Command.
    97  func (*Gofer) Name() string {
    98  	return "gofer"
    99  }
   100  
   101  // Synopsis implements subcommands.Command.
   102  func (g *Gofer) Synopsis() string {
   103  	return fmt.Sprintf("launch a gofer process that proxies access to container files")
   104  }
   105  
   106  // Usage implements subcommands.Command.
   107  func (*Gofer) Usage() string {
   108  	return `gofer [flags]`
   109  }
   110  
// SetFlags implements subcommands.Command. It registers the gofer's own
// flags on f, followed by the synchronization FD flags and the profiling
// flags.
func (g *Gofer) SetFlags(f *flag.FlagSet) {
	f.StringVar(&g.bundleDir, "bundle", "", "path to the root of the bundle directory, defaults to the current directory")
	f.BoolVar(&g.applyCaps, "apply-caps", true, "if true, apply capabilities to restrict what the Gofer process can do")
	f.BoolVar(&g.setUpRoot, "setup-root", true, "if true, set up an empty root for the process")

	// Open FDs that are donated to the gofer.
	f.Var(&g.ioFDs, "io-fds", "list of FDs to connect gofer servers. They must follow this order: root first, then mounts as defined in the spec")
	f.Var(&g.overlayMediums, "overlay-mediums", "information about how the gofer mounts have been overlaid.")
	// FD flags default to -1, i.e. "not provided".
	f.IntVar(&g.specFD, "spec-fd", -1, "required fd with the container spec")
	f.IntVar(&g.mountsFD, "mounts-fd", -1, "mountsFD is the file descriptor to write list of mounts after they have been resolved (direct paths, no symlinks).")

	// Add synchronization FD flags.
	g.syncFDs.setFlags(f)

	// Profiling flags.
	g.profileFDs.SetFromFlags(f)
}
   129  
// Execute implements subcommands.Command.
//
// args[0] must be a *config.Config. The function runs in this order:
//  1. validates required flags and reads the container spec from specFD;
//  2. waits on the nvproxy and userns sync FDs;
//  3. if setUpRoot is set, builds the root filesystem via setupRootFS;
//  4. if applyCaps is set, re-executes itself through setCapsAndCallSelf with
//     --apply-caps=false and --setup-root=false, so steps 3 and 4 are skipped
//     in the re-executed process;
//  5. otherwise starts profiling, applies the FD rlimit, resolves and
//     publishes the mount list, chroots, installs seccomp filters and serves.
//
// Most failures terminate the process through util.Fatalf rather than being
// returned.
func (g *Gofer) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus {
	// bundle, at least one I/O FD (the root) and the spec FD are mandatory.
	if g.bundleDir == "" || len(g.ioFDs) < 1 || g.specFD < 0 {
		f.Usage()
		return subcommands.ExitUsageError
	}

	conf := args[0].(*config.Config)

	// Set traceback level
	debug.SetTraceback(conf.Traceback)

	specFile := os.NewFile(uintptr(g.specFD), "spec file")
	defer specFile.Close()
	spec, err := specutils.ReadSpecFromFile(g.bundleDir, specFile, conf)
	if err != nil {
		util.Fatalf("reading spec: %v", err)
	}

	// Both calls are no-ops when the corresponding FD is not set.
	g.syncFDs.syncNVProxy()
	g.syncFDs.syncUsernsForRootless()

	if g.setUpRoot {
		if err := g.setupRootFS(spec, conf); err != nil {
			util.Fatalf("Error setting up root FS: %v", err)
		}
		if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
			// spawnProcUnmounter stores the unmounter's FD in
			// g.syncFDs.procMountFD, so it is forwarded by flags() below.
			cleanupUnmounter := g.syncFDs.spawnProcUnmounter()
			defer cleanupUnmounter()
		}
	}
	if g.applyCaps {
		// Disable caps when calling myself again.
		// Note: minimal argument handling for the default case to keep it simple.
		args := os.Args
		args = append(
			args,
			"--apply-caps=false",
			"--setup-root=false",
		)
		args = append(args, g.syncFDs.flags()...)
		// NOTE(review): setCapsAndCallSelf is defined elsewhere; it is
		// presumably expected to exec and only return on failure — confirm.
		util.Fatalf("setCapsAndCallSelf(%v, %v): %v", args, goferCaps, setCapsAndCallSelf(args, goferCaps))
		panic("unreachable")
	}

	// Start profiling. This will be a noop if no profiling arguments were passed.
	profileOpts := g.profileFDs.ToOpts()
	g.stopProfiling = profile.Start(profileOpts)

	// At this point we won't re-execute, so it's safe to limit via rlimits. Any
	// limit >= 0 works. If the limit is lower than the current number of open
	// files, then Setrlimit will succeed, and the next open will fail.
	if conf.FDLimit > -1 {
		rlimit := unix.Rlimit{
			Cur: uint64(conf.FDLimit),
			Max: uint64(conf.FDLimit),
		}
		// EPERM is tolerated with a warning; any other error is fatal.
		switch err := unix.Setrlimit(unix.RLIMIT_NOFILE, &rlimit); err {
		case nil:
		case unix.EPERM:
			log.Warningf("FD limit %d is higher than the current hard limit or system-wide maximum", conf.FDLimit)
		default:
			util.Fatalf("Failed to set RLIMIT_NOFILE: %v", err)
		}
	}

	// Find what path is going to be served by this gofer.
	root := spec.Root.Path
	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
		root = "/root"
	}

	// Resolve mount points paths, then replace mounts from our spec and send the
	// mount list over to the sandbox, so they are both in sync.
	//
	// Note that all mount points have been mounted in the proper location in
	// setupRootFS().
	cleanMounts, err := resolveMounts(conf, spec.Mounts, root)
	if err != nil {
		util.Fatalf("Failure to resolve mounts: %v", err)
	}
	spec.Mounts = cleanMounts
	// The mount list is written from a separate goroutine; a write failure
	// panics the process.
	go func() {
		if err := g.writeMounts(cleanMounts); err != nil {
			panic(fmt.Sprintf("Failed to write mounts: %v", err))
		}
	}()

	specutils.LogSpecDebug(spec, conf.OCISeccomp)

	// fsgofer should run with a umask of 0, because we want to preserve file
	// modes exactly as sent by the sandbox, which will have applied its own umask.
	unix.Umask(0)

	if err := fsgofer.OpenProcSelfFD(); err != nil {
		util.Fatalf("failed to open /proc/self/fd: %v", err)
	}

	// procfs isn't needed anymore.
	g.syncFDs.unmountProcfs()

	if err := unix.Chroot(root); err != nil {
		util.Fatalf("failed to chroot to %q: %v", root, err)
	}
	if err := unix.Chdir("/"); err != nil {
		util.Fatalf("changing working dir: %v", err)
	}
	log.Infof("Process chroot'd to %q", root)

	// Initialize filters.
	opts := filter.Options{
		UDSOpenEnabled:   conf.GetHostUDS().AllowOpen(),
		UDSCreateEnabled: conf.GetHostUDS().AllowCreate(),
		ProfileEnabled:   len(profileOpts) > 0,
	}
	if err := filter.Install(opts); err != nil {
		util.Fatalf("installing seccomp filters: %v", err)
	}

	return g.serve(spec, conf, root)
}
   251  
   252  func newSocket(ioFD int) *unet.Socket {
   253  	socket, err := unet.NewSocket(ioFD)
   254  	if err != nil {
   255  		util.Fatalf("creating server on FD %d: %v", ioFD, err)
   256  	}
   257  	return socket
   258  }
   259  
   260  func (g *Gofer) serve(spec *specs.Spec, conf *config.Config, root string) subcommands.ExitStatus {
   261  	type connectionConfig struct {
   262  		sock      *unet.Socket
   263  		mountPath string
   264  		readonly  bool
   265  	}
   266  	cfgs := make([]connectionConfig, 0, len(spec.Mounts)+1)
   267  	server := fsgofer.NewLisafsServer(fsgofer.Config{
   268  		// These are global options. Ignore readonly configuration, that is set on
   269  		// a per connection basis.
   270  		HostUDS:            conf.GetHostUDS(),
   271  		HostFifo:           conf.HostFifo,
   272  		DonateMountPointFD: conf.DirectFS,
   273  	})
   274  
   275  	// Start with root mount, then add any other additional mount as needed.
   276  	cfgs = append(cfgs, connectionConfig{
   277  		sock:      newSocket(g.ioFDs[0]),
   278  		mountPath: "/", // fsgofer process is always chroot()ed. So serve root.
   279  		readonly:  spec.Root.Readonly || g.overlayMediums[0].IsEnabled(),
   280  	})
   281  	log.Infof("Serving %q mapped to %q on FD %d (ro: %t)", "/", root, g.ioFDs[0], cfgs[0].readonly)
   282  
   283  	mountIdx := 1 // first one is the root
   284  	for _, m := range spec.Mounts {
   285  		if !specutils.IsGoferMount(m) {
   286  			continue
   287  		}
   288  
   289  		if !filepath.IsAbs(m.Destination) {
   290  			util.Fatalf("mount destination must be absolute: %q", m.Destination)
   291  		}
   292  		if mountIdx >= len(g.ioFDs) {
   293  			util.Fatalf("no FD found for mount. Did you forget --io-fd? FDs: %d, Mount: %+v", len(g.ioFDs), m)
   294  		}
   295  
   296  		cfgs = append(cfgs, connectionConfig{
   297  			sock:      newSocket(g.ioFDs[mountIdx]),
   298  			mountPath: m.Destination,
   299  			readonly:  specutils.IsReadonlyMount(m.Options) || g.overlayMediums[mountIdx].IsEnabled(),
   300  		})
   301  
   302  		log.Infof("Serving %q mapped on FD %d (ro: %t)", m.Destination, g.ioFDs[mountIdx], cfgs[mountIdx].readonly)
   303  		mountIdx++
   304  	}
   305  
   306  	if mountIdx != len(g.ioFDs) {
   307  		util.Fatalf("too many FDs passed for mounts. mounts: %d, FDs: %d", mountIdx, len(g.ioFDs))
   308  	}
   309  	cfgs = cfgs[:mountIdx]
   310  
   311  	for _, cfg := range cfgs {
   312  		conn, err := server.CreateConnection(cfg.sock, cfg.mountPath, cfg.readonly)
   313  		if err != nil {
   314  			util.Fatalf("starting connection on FD %d for gofer mount failed: %v", cfg.sock.FD(), err)
   315  		}
   316  		server.StartConnection(conn)
   317  	}
   318  	server.Wait()
   319  	server.Destroy()
   320  	log.Infof("All lisafs servers exited.")
   321  	if g.stopProfiling != nil {
   322  		g.stopProfiling()
   323  	}
   324  	return subcommands.ExitSuccess
   325  }
   326  
   327  func (g *Gofer) writeMounts(mounts []specs.Mount) error {
   328  	bytes, err := json.Marshal(mounts)
   329  	if err != nil {
   330  		return err
   331  	}
   332  
   333  	f := os.NewFile(uintptr(g.mountsFD), "mounts file")
   334  	defer f.Close()
   335  
   336  	for written := 0; written < len(bytes); {
   337  		w, err := f.Write(bytes[written:])
   338  		if err != nil {
   339  			return err
   340  		}
   341  		written += w
   342  	}
   343  	return nil
   344  }
   345  
   346  func (g *Gofer) setupRootFS(spec *specs.Spec, conf *config.Config) error {
   347  	// Convert all shared mounts into slaves to be sure that nothing will be
   348  	// propagated outside of our namespace.
   349  	procPath := "/proc"
   350  	if err := specutils.SafeMount("", "/", "", unix.MS_SLAVE|unix.MS_REC, "", procPath); err != nil {
   351  		util.Fatalf("error converting mounts: %v", err)
   352  	}
   353  
   354  	root := spec.Root.Path
   355  	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   356  		// runsc can't be re-executed without /proc, so we create a tmpfs mount,
   357  		// mount ./proc and ./root there, then move this mount to the root and after
   358  		// setCapsAndCallSelf, runsc will chroot into /root.
   359  		//
   360  		// We need a directory to construct a new root and we know that
   361  		// runsc can't start without /proc, so we can use it for this.
   362  		flags := uintptr(unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC)
   363  		if err := specutils.SafeMount("runsc-root", "/proc", "tmpfs", flags, "", procPath); err != nil {
   364  			util.Fatalf("error mounting tmpfs: %v", err)
   365  		}
   366  
   367  		// Prepare tree structure for pivot_root(2).
   368  		if err := os.Mkdir("/proc/proc", 0755); err != nil {
   369  			util.Fatalf("error creating /proc/proc: %v", err)
   370  		}
   371  		if err := os.Mkdir("/proc/root", 0755); err != nil {
   372  			util.Fatalf("error creating /proc/root: %v", err)
   373  		}
   374  		if err := os.Mkdir("/proc/etc", 0755); err != nil {
   375  			util.Fatalf("error creating /proc/etc: %v", err)
   376  		}
   377  		// This cannot use SafeMount because there's no available procfs. But we
   378  		// know that /proc is an empty tmpfs mount, so this is safe.
   379  		if err := unix.Mount("runsc-proc", "/proc/proc", "proc", flags|unix.MS_RDONLY, ""); err != nil {
   380  			util.Fatalf("error mounting proc: %v", err)
   381  		}
   382  		// self/fd is bind-mounted, so that the FD return by
   383  		// OpenProcSelfFD() does not allow escapes with walking ".." .
   384  		if err := unix.Mount("/proc/proc/self/fd", "/proc/proc/self/fd",
   385  			"", unix.MS_RDONLY|unix.MS_BIND|unix.MS_NOEXEC, ""); err != nil {
   386  			util.Fatalf("error mounting proc/self/fd: %v", err)
   387  		}
   388  		if err := copyFile("/proc/etc/localtime", "/etc/localtime"); err != nil {
   389  			log.Warningf("Failed to copy /etc/localtime: %v. UTC timezone will be used.", err)
   390  		}
   391  		root = "/proc/root"
   392  		procPath = "/proc/proc"
   393  	}
   394  
   395  	// Mount root path followed by submounts.
   396  	if err := specutils.SafeMount(spec.Root.Path, root, "bind", unix.MS_BIND|unix.MS_REC, "", procPath); err != nil {
   397  		return fmt.Errorf("mounting root on root (%q) err: %v", root, err)
   398  	}
   399  
   400  	flags := uint32(unix.MS_SLAVE | unix.MS_REC)
   401  	if spec.Linux != nil && spec.Linux.RootfsPropagation != "" {
   402  		flags = specutils.PropOptionsToFlags([]string{spec.Linux.RootfsPropagation})
   403  	}
   404  	if err := specutils.SafeMount("", root, "", uintptr(flags), "", procPath); err != nil {
   405  		return fmt.Errorf("mounting root (%q) with flags: %#x, err: %v", root, flags, err)
   406  	}
   407  
   408  	// Replace the current spec, with the clean spec with symlinks resolved.
   409  	if err := g.setupMounts(conf, spec.Mounts, root, procPath); err != nil {
   410  		util.Fatalf("error setting up FS: %v", err)
   411  	}
   412  
   413  	// Create working directory if needed.
   414  	if spec.Process.Cwd != "" {
   415  		dst, err := resolveSymlinks(root, spec.Process.Cwd)
   416  		if err != nil {
   417  			return fmt.Errorf("resolving symlinks to %q: %v", spec.Process.Cwd, err)
   418  		}
   419  		log.Infof("Create working directory %q if needed", spec.Process.Cwd)
   420  		if err := os.MkdirAll(dst, 0755); err != nil {
   421  			return fmt.Errorf("creating working directory %q: %v", spec.Process.Cwd, err)
   422  		}
   423  	}
   424  
   425  	// Check if root needs to be remounted as readonly.
   426  	if spec.Root.Readonly || g.overlayMediums[0].IsEnabled() {
   427  		// If root is a mount point but not read-only, we can change mount options
   428  		// to make it read-only for extra safety.
   429  		log.Infof("Remounting root as readonly: %q", root)
   430  		flags := uintptr(unix.MS_BIND | unix.MS_REMOUNT | unix.MS_RDONLY | unix.MS_REC)
   431  		if err := specutils.SafeMount(root, root, "bind", flags, "", procPath); err != nil {
   432  			return fmt.Errorf("remounting root as read-only with source: %q, target: %q, flags: %#x, err: %v", root, root, flags, err)
   433  		}
   434  	}
   435  
   436  	if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot {
   437  		if err := pivotRoot("/proc"); err != nil {
   438  			util.Fatalf("failed to change the root file system: %v", err)
   439  		}
   440  		if err := os.Chdir("/"); err != nil {
   441  			util.Fatalf("failed to change working directory")
   442  		}
   443  	}
   444  	return nil
   445  }
   446  
   447  // setupMounts bind mounts all mounts specified in the spec in their correct
   448  // location inside root. It will resolve relative paths and symlinks. It also
   449  // creates directories as needed.
   450  func (g *Gofer) setupMounts(conf *config.Config, mounts []specs.Mount, root, procPath string) error {
   451  	goferMntIdx := 1 // First index is for rootfs.
   452  	for _, m := range mounts {
   453  		if !specutils.IsGoferMount(m) {
   454  			continue
   455  		}
   456  
   457  		dst, err := resolveSymlinks(root, m.Destination)
   458  		if err != nil {
   459  			return fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err)
   460  		}
   461  
   462  		flags := specutils.OptionsToFlags(m.Options) | unix.MS_BIND
   463  		if g.overlayMediums[goferMntIdx].IsEnabled() {
   464  			// Force mount read-only if writes are not going to be sent to it.
   465  			flags |= unix.MS_RDONLY
   466  		}
   467  
   468  		log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, flags)
   469  		if err := specutils.SafeSetupAndMount(m.Source, dst, m.Type, flags, procPath); err != nil {
   470  			return fmt.Errorf("mounting %+v: %v", m, err)
   471  		}
   472  
   473  		// Set propagation options that cannot be set together with other options.
   474  		flags = specutils.PropOptionsToFlags(m.Options)
   475  		if flags != 0 {
   476  			if err := specutils.SafeMount("", dst, "", uintptr(flags), "", procPath); err != nil {
   477  				return fmt.Errorf("mount dst: %q, flags: %#x, err: %v", dst, flags, err)
   478  			}
   479  		}
   480  		goferMntIdx++
   481  	}
   482  	return nil
   483  }
   484  
   485  // resolveMounts resolved relative paths and symlinks to mount points.
   486  //
   487  // Note: mount points must already be in place for resolution to work.
   488  // Otherwise, it may follow symlinks to locations that would be overwritten
   489  // with another mount point and return the wrong location. In short, make sure
   490  // setupMounts() has been called before.
   491  func resolveMounts(conf *config.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) {
   492  	cleanMounts := make([]specs.Mount, 0, len(mounts))
   493  	for _, m := range mounts {
   494  		if !specutils.IsGoferMount(m) {
   495  			cleanMounts = append(cleanMounts, m)
   496  			continue
   497  		}
   498  		dst, err := resolveSymlinks(root, m.Destination)
   499  		if err != nil {
   500  			return nil, fmt.Errorf("resolving symlinks to %q: %v", m.Destination, err)
   501  		}
   502  		relDst, err := filepath.Rel(root, dst)
   503  		if err != nil {
   504  			panic(fmt.Sprintf("%q could not be made relative to %q: %v", dst, root, err))
   505  		}
   506  
   507  		opts, err := adjustMountOptions(conf, filepath.Join(root, relDst), m.Options)
   508  		if err != nil {
   509  			return nil, err
   510  		}
   511  
   512  		cpy := m
   513  		cpy.Destination = filepath.Join("/", relDst)
   514  		cpy.Options = opts
   515  		cleanMounts = append(cleanMounts, cpy)
   516  	}
   517  	return cleanMounts, nil
   518  }
   519  
   520  // ResolveSymlinks walks 'rel' having 'root' as the root directory. If there are
   521  // symlinks, they are evaluated relative to 'root' to ensure the end result is
   522  // the same as if the process was running inside the container.
   523  func resolveSymlinks(root, rel string) (string, error) {
   524  	return resolveSymlinksImpl(root, root, rel, 255)
   525  }
   526  
   527  func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, error) {
   528  	if followCount == 0 {
   529  		return "", fmt.Errorf("too many symlinks to follow, path: %q", filepath.Join(base, rel))
   530  	}
   531  
   532  	rel = filepath.Clean(rel)
   533  	for _, name := range strings.Split(rel, string(filepath.Separator)) {
   534  		if name == "" {
   535  			continue
   536  		}
   537  		// Note that Join() resolves things like ".." and returns a clean path.
   538  		path := filepath.Join(base, name)
   539  		if !strings.HasPrefix(path, root) {
   540  			// One cannot '..' their way out of root.
   541  			base = root
   542  			continue
   543  		}
   544  		fi, err := os.Lstat(path)
   545  		if err != nil {
   546  			if !os.IsNotExist(err) {
   547  				return "", err
   548  			}
   549  			// Not found means there is no symlink to check. Just keep walking dirs.
   550  			base = path
   551  			continue
   552  		}
   553  		if fi.Mode()&os.ModeSymlink != 0 {
   554  			link, err := os.Readlink(path)
   555  			if err != nil {
   556  				return "", err
   557  			}
   558  			if filepath.IsAbs(link) {
   559  				base = root
   560  			}
   561  			base, err = resolveSymlinksImpl(root, base, link, followCount-1)
   562  			if err != nil {
   563  				return "", err
   564  			}
   565  			continue
   566  		}
   567  		base = path
   568  	}
   569  	return base, nil
   570  }
   571  
   572  // adjustMountOptions adds 'overlayfs_stale_read' if mounting over overlayfs.
   573  func adjustMountOptions(conf *config.Config, path string, opts []string) ([]string, error) {
   574  	rv := make([]string, len(opts))
   575  	copy(rv, opts)
   576  
   577  	statfs := unix.Statfs_t{}
   578  	if err := unix.Statfs(path, &statfs); err != nil {
   579  		return nil, err
   580  	}
   581  	if statfs.Type == unix.OVERLAYFS_SUPER_MAGIC {
   582  		rv = append(rv, "overlayfs_stale_read")
   583  	}
   584  	return rv, nil
   585  }
   586  
   587  // setFlags sets sync FD flags on the given FlagSet.
   588  func (g *goferSyncFDs) setFlags(f *flag.FlagSet) {
   589  	f.IntVar(&g.nvproxyFD, "sync-nvproxy-fd", -1, "file descriptor that the gofer waits on until nvproxy setup is done")
   590  	f.IntVar(&g.usernsFD, "sync-userns-fd", -1, "file descriptor the the gofer waits on until userns mappings are set up")
   591  	f.IntVar(&g.procMountFD, "proc-mount-sync-fd", -1, "file descriptor that the gofer writes to when /proc isn't needed anymore and can be unmounted")
   592  }
   593  
   594  // flags returns the flags necessary to pass along the current sync FD values
   595  // to a re-executed version of this process.
   596  func (g *goferSyncFDs) flags() []string {
   597  	return []string{
   598  		fmt.Sprintf("--sync-nvproxy-fd=%d", g.nvproxyFD),
   599  		fmt.Sprintf("--sync-userns-fd=%d", g.usernsFD),
   600  		fmt.Sprintf("--proc-mount-sync-fd=%d", g.procMountFD),
   601  	}
   602  }
   603  
   604  // waitForFD waits for the other end of a given FD to be closed.
   605  // `fd` is closed unconditionally after that.
   606  // This should only be called for actual FDs (i.e. `fd` >= 0).
   607  func waitForFD(fd int, fdName string) error {
   608  	log.Debugf("Waiting on %s %d...", fdName, fd)
   609  	f := os.NewFile(uintptr(fd), fdName)
   610  	defer f.Close()
   611  	var b [1]byte
   612  	if n, err := f.Read(b[:]); n != 0 || err != io.EOF {
   613  		return fmt.Errorf("failed to sync on %s: %v: %v", fdName, n, err)
   614  	}
   615  	log.Debugf("Synced on %s %d.", fdName, fd)
   616  	return nil
   617  }
   618  
   619  // spawnProcMounter executes the /proc unmounter process.
   620  // It returns a function to wait on the proc unmounter process, which
   621  // should be called (via defer) in case of errors in order to clean up the
   622  // unmounter process properly.
   623  // When procfs is no longer needed, `unmountProcfs` should be called.
   624  func (g *goferSyncFDs) spawnProcUnmounter() func() {
   625  	if g.procMountFD != -1 {
   626  		util.Fatalf("procMountFD is set")
   627  	}
   628  	// /proc is umounted from a forked process, because the
   629  	// current one may re-execute itself without capabilities.
   630  	cmd, w := execProcUmounter()
   631  	// Clear FD_CLOEXEC. This process may be re-executed. procMountFD
   632  	// should remain open.
   633  	if _, _, errno := unix.RawSyscall(unix.SYS_FCNTL, w.Fd(), unix.F_SETFD, 0); errno != 0 {
   634  		util.Fatalf("error clearing CLOEXEC: %v", errno)
   635  	}
   636  	g.procMountFD = int(w.Fd())
   637  	return func() {
   638  		g.procMountFD = -1
   639  		w.Close()
   640  		cmd.Wait()
   641  	}
   642  }
   643  
   644  // unmountProcfs signals the proc unmounter process that procfs is no longer
   645  // needed.
   646  func (g *goferSyncFDs) unmountProcfs() {
   647  	if g.procMountFD < 0 {
   648  		return
   649  	}
   650  	umountProc(g.procMountFD)
   651  	g.procMountFD = -1
   652  }
   653  
   654  // syncUsernsForRootless waits on usernsFD to be closed and then sets
   655  // UID/GID to 0. Note that this function calls runtime.LockOSThread().
   656  // This function is a no-op if usernsFD is -1.
   657  //
   658  // Postcondition: All callers must re-exec themselves after this returns,
   659  // unless usernsFD was -1.
   660  func (g *goferSyncFDs) syncUsernsForRootless() {
   661  	syncUsernsForRootless(g.usernsFD)
   662  	g.usernsFD = -1
   663  }
   664  
   665  // syncUsernsForRootless waits on usernsFD to be closed and then sets
   666  // UID/GID to 0. Note that this function calls runtime.LockOSThread().
   667  // This function is a no-op if usernsFD is -1.
   668  //
   669  // Postcondition: All callers must re-exec themselves after this returns,
   670  // unless fd is -1.
   671  func syncUsernsForRootless(fd int) {
   672  	if fd < 0 {
   673  		return
   674  	}
   675  	if err := waitForFD(fd, "userns sync FD"); err != nil {
   676  		util.Fatalf("failed to sync on userns FD: %v", err)
   677  	}
   678  
   679  	// SETUID changes UID on the current system thread, so we have
   680  	// to re-execute current binary.
   681  	runtime.LockOSThread()
   682  	if _, _, errno := unix.RawSyscall(unix.SYS_SETUID, 0, 0, 0); errno != 0 {
   683  		util.Fatalf("failed to set UID: %v", errno)
   684  	}
   685  	if _, _, errno := unix.RawSyscall(unix.SYS_SETGID, 0, 0, 0); errno != 0 {
   686  		util.Fatalf("failed to set GID: %v", errno)
   687  	}
   688  }
   689  
   690  // syncNVProxy waits on nvproxyFD to be closed.
   691  // Used for synchronization during nvproxy setup which is done from the
   692  // non-gofer process.
   693  // This function is a no-op if nvProxySyncFD is -1.
   694  func (g *goferSyncFDs) syncNVProxy() {
   695  	if g.nvproxyFD < 0 {
   696  		return
   697  	}
   698  	if err := waitForFD(g.nvproxyFD, "nvproxy sync FD"); err != nil {
   699  		util.Fatalf("failed to sync on NVProxy FD: %v", err)
   700  	}
   701  	g.nvproxyFD = -1
   702  }