github.com/rish1988/moby@v25.0.2+incompatible/daemon/containerfs_linux.go (about)

     1  package daemon // import "github.com/docker/docker/daemon"
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"fmt"
     7  	"os"
     8  	"path/filepath"
     9  	"runtime"
    10  	"strings"
    11  
    12  	"github.com/containerd/log"
    13  	"github.com/hashicorp/go-multierror"
    14  	"github.com/moby/sys/mount"
    15  	"github.com/moby/sys/symlink"
    16  	"golang.org/x/sys/unix"
    17  
    18  	"github.com/docker/docker/api/types"
    19  	"github.com/docker/docker/container"
    20  	"github.com/docker/docker/internal/compatcontext"
    21  	"github.com/docker/docker/internal/mounttree"
    22  	"github.com/docker/docker/internal/unshare"
    23  	"github.com/docker/docker/pkg/fileutils"
    24  )
    25  
// future is a unit of work queued on a containerFSView's todo channel.
type future struct {
	// fn is the function the view's worker goroutine calls.
	fn  func() error
	// res, when non-nil, receives the value returned by fn. It is left nil
	// by callers that do not wait for the result.
	res chan<- error
}
    30  
// containerFSView allows functions to be run in the context of a container's
// filesystem. Inside these functions, the root directory is the container root
// for all native OS filesystem APIs, including, but not limited to, the [os]
// and [golang.org/x/sys/unix] packages. The view of the container's filesystem
// is live and read-write. Each view has its own private set of tmpfs mounts.
// Any files written under a tmpfs mount are not visible to processes inside the
// container nor any other view of the container's filesystem, and vice versa.
//
// Each view has its own current working directory which is initialized to the
// root of the container filesystem and can be changed with [os.Chdir]. Changes
// to the current directory persist across successive [*containerFSView.RunInFS]
// and [*containerFSView.GoInFS] calls.
//
// Multiple views of the same container filesystem can coexist at the same time.
// Only one function can be running in a particular filesystem view at any given
// time. Calls to [*containerFSView.RunInFS] or [*containerFSView.GoInFS] will
// block while another function is running. If more than one call is blocked
// concurrently, the order they are unblocked is undefined.
type containerFSView struct {
	// d is the daemon that opened the view; Close uses it to unmount the
	// container and to log volume events.
	d    *Daemon
	// ctr is the container whose filesystem this view exposes.
	ctr  *container.Container
	// todo carries the functions submitted through RunInFS and GoInFS to the
	// goroutine that executes them. Close closes it to stop that goroutine.
	todo chan future
	// done is closed by the executing goroutine when it returns; Close
	// receives from it to wait for that.
	done chan error
}
    55  
    56  // openContainerFS opens a new view of the container's filesystem.
    57  func (daemon *Daemon) openContainerFS(container *container.Container) (_ *containerFSView, err error) {
    58  	ctx := context.TODO()
    59  
    60  	if err := daemon.Mount(container); err != nil {
    61  		return nil, err
    62  	}
    63  	defer func() {
    64  		if err != nil {
    65  			_ = daemon.Unmount(container)
    66  		}
    67  	}()
    68  
    69  	mounts, cleanup, err := daemon.setupMounts(ctx, container)
    70  	if err != nil {
    71  		return nil, err
    72  	}
    73  	defer func() {
    74  		ctx := compatcontext.WithoutCancel(ctx)
    75  		cleanup(ctx)
    76  		if err != nil {
    77  			_ = container.UnmountVolumes(ctx, daemon.LogVolumeEvent)
    78  		}
    79  	}()
    80  
    81  	// Setup in initial mount namespace complete. We're ready to unshare the
    82  	// mount namespace and bind the volume mounts into that private view of
    83  	// the container FS.
    84  	todo := make(chan future)
    85  	done := make(chan error)
    86  	err = unshare.Go(unix.CLONE_NEWNS,
    87  		func() error {
    88  			if err := mount.MakeRSlave("/"); err != nil {
    89  				return err
    90  			}
    91  			for _, m := range mounts {
    92  				dest, err := container.GetResourcePath(m.Destination)
    93  				if err != nil {
    94  					return err
    95  				}
    96  
    97  				var stat os.FileInfo
    98  				stat, err = os.Stat(m.Source)
    99  				if err != nil {
   100  					return err
   101  				}
   102  				if err := fileutils.CreateIfNotExists(dest, stat.IsDir()); err != nil {
   103  					return err
   104  				}
   105  
   106  				bindMode := "rbind"
   107  				if m.NonRecursive {
   108  					bindMode = "bind"
   109  				}
   110  				writeMode := "ro"
   111  				if m.Writable {
   112  					writeMode = "rw"
   113  					if m.ReadOnlyNonRecursive {
   114  						return errors.New("options conflict: Writable && ReadOnlyNonRecursive")
   115  					}
   116  					if m.ReadOnlyForceRecursive {
   117  						return errors.New("options conflict: Writable && ReadOnlyForceRecursive")
   118  					}
   119  				}
   120  				if m.ReadOnlyNonRecursive && m.ReadOnlyForceRecursive {
   121  					return errors.New("options conflict: ReadOnlyNonRecursive && ReadOnlyForceRecursive")
   122  				}
   123  
   124  				// openContainerFS() is called for temporary mounts
   125  				// outside the container. Soon these will be unmounted
   126  				// with lazy unmount option and given we have mounted
   127  				// them rbind, all the submounts will propagate if these
   128  				// are shared. If daemon is running in host namespace
   129  				// and has / as shared then these unmounts will
   130  				// propagate and unmount original mount as well. So make
   131  				// all these mounts rprivate.  Do not use propagation
   132  				// property of volume as that should apply only when
   133  				// mounting happens inside the container.
   134  				opts := strings.Join([]string{bindMode, writeMode, "rprivate"}, ",")
   135  				if err := mount.Mount(m.Source, dest, "", opts); err != nil {
   136  					return err
   137  				}
   138  
   139  				if !m.Writable && !m.ReadOnlyNonRecursive {
   140  					if err := makeMountRRO(dest); err != nil {
   141  						if m.ReadOnlyForceRecursive {
   142  							return err
   143  						} else {
   144  							log.G(context.TODO()).WithError(err).Debugf("Failed to make %q recursively read-only", dest)
   145  						}
   146  					}
   147  				}
   148  			}
   149  
   150  			return mounttree.SwitchRoot(container.BaseFS)
   151  		},
   152  		func() {
   153  			defer close(done)
   154  
   155  			for it := range todo {
   156  				err := it.fn()
   157  				if it.res != nil {
   158  					it.res <- err
   159  				}
   160  			}
   161  
   162  			// The thread will terminate when this goroutine returns, taking the
   163  			// mount namespace and all the volume bind-mounts with it.
   164  		},
   165  	)
   166  	if err != nil {
   167  		return nil, err
   168  	}
   169  	vw := &containerFSView{
   170  		d:    daemon,
   171  		ctr:  container,
   172  		todo: todo,
   173  		done: done,
   174  	}
   175  	runtime.SetFinalizer(vw, (*containerFSView).Close)
   176  	return vw, nil
   177  }
   178  
   179  // RunInFS synchronously runs fn in the context of the container filesytem and
   180  // passes through its return value.
   181  //
   182  // The container filesystem is only visible to functions called in the same
   183  // goroutine as fn. Goroutines started from fn will see the host's filesystem.
   184  func (vw *containerFSView) RunInFS(ctx context.Context, fn func() error) error {
   185  	res := make(chan error)
   186  	select {
   187  	case vw.todo <- future{fn: fn, res: res}:
   188  	case <-ctx.Done():
   189  		return ctx.Err()
   190  	}
   191  	return <-res
   192  }
   193  
   194  // GoInFS starts fn in the container FS. It blocks until fn is started but does
   195  // not wait until fn returns. An error is returned if ctx is canceled before fn
   196  // has been started.
   197  //
   198  // The container filesystem is only visible to functions called in the same
   199  // goroutine as fn. Goroutines started from fn will see the host's filesystem.
   200  func (vw *containerFSView) GoInFS(ctx context.Context, fn func()) error {
   201  	select {
   202  	case vw.todo <- future{fn: func() error { fn(); return nil }}:
   203  		return nil
   204  	case <-ctx.Done():
   205  		return ctx.Err()
   206  	}
   207  }
   208  
   209  // Close waits until any in-flight operations complete and frees all
   210  // resources associated with vw.
   211  func (vw *containerFSView) Close() error {
   212  	runtime.SetFinalizer(vw, nil)
   213  	close(vw.todo)
   214  	err := multierror.Append(nil, <-vw.done)
   215  	err = multierror.Append(err, vw.ctr.UnmountVolumes(context.TODO(), vw.d.LogVolumeEvent))
   216  	err = multierror.Append(err, vw.d.Unmount(vw.ctr))
   217  	return err.ErrorOrNil()
   218  }
   219  
   220  // Stat returns the metadata for path, relative to the current working directory
   221  // of vw inside the container filesystem view.
   222  func (vw *containerFSView) Stat(ctx context.Context, path string) (*types.ContainerPathStat, error) {
   223  	var stat *types.ContainerPathStat
   224  	err := vw.RunInFS(ctx, func() error {
   225  		lstat, err := os.Lstat(path)
   226  		if err != nil {
   227  			return err
   228  		}
   229  		var target string
   230  		if lstat.Mode()&os.ModeSymlink != 0 {
   231  			// Fully evaluate symlinks along path to the ultimate
   232  			// target, or as much as possible with broken links.
   233  			target, err = symlink.FollowSymlinkInScope(path, "/")
   234  			if err != nil {
   235  				return err
   236  			}
   237  		}
   238  		stat = &types.ContainerPathStat{
   239  			Name:       filepath.Base(path),
   240  			Size:       lstat.Size(),
   241  			Mode:       lstat.Mode(),
   242  			Mtime:      lstat.ModTime(),
   243  			LinkTarget: target,
   244  		}
   245  		return nil
   246  	})
   247  	return stat, err
   248  }
   249  
   250  // makeMountRRO makes the mount recursively read-only.
   251  func makeMountRRO(dest string) error {
   252  	attr := &unix.MountAttr{
   253  		Attr_set: unix.MOUNT_ATTR_RDONLY,
   254  	}
   255  	var err error
   256  	for {
   257  		err = unix.MountSetattr(-1, dest, unix.AT_RECURSIVE, attr)
   258  		if !errors.Is(err, unix.EINTR) {
   259  			break
   260  		}
   261  	}
   262  	if err != nil {
   263  		err = fmt.Errorf("failed to apply MOUNT_ATTR_RDONLY with AT_RECURSIVE to %q: %w", dest, err)
   264  	}
   265  	return err
   266  }