github.com/containerd/nerdctl@v1.7.7/pkg/mountutil/mountutil_linux.go (about)

     1  /*
     2     Copyright The containerd Authors.
     3  
     4     Licensed under the Apache License, Version 2.0 (the "License");
     5     you may not use this file except in compliance with the License.
     6     You may obtain a copy of the License at
     7  
     8         http://www.apache.org/licenses/LICENSE-2.0
     9  
    10     Unless required by applicable law or agreed to in writing, software
    11     distributed under the License is distributed on an "AS IS" BASIS,
    12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13     See the License for the specific language governing permissions and
    14     limitations under the License.
    15  */
    16  
    17  package mountutil
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"io/fs"
    23  	"os"
    24  	"path/filepath"
    25  	"strconv"
    26  	"strings"
    27  
    28  	"github.com/containerd/containerd/containers"
    29  	"github.com/containerd/containerd/mount"
    30  	"github.com/containerd/containerd/oci"
    31  	"github.com/containerd/log"
    32  	"github.com/containerd/nerdctl/pkg/mountutil/volumestore"
    33  	"github.com/docker/go-units"
    34  	mobymount "github.com/moby/sys/mount"
    35  	"github.com/opencontainers/runtime-spec/specs-go"
    36  	"golang.org/x/sys/unix"
    37  )
    38  
    39  /*
    40     Portions from https://github.com/moby/moby/blob/v20.10.5/daemon/oci_linux.go
    41     Portions from https://github.com/moby/moby/blob/v20.10.5/volume/mounts/linux_parser.go
    42     Copyright (C) Docker/Moby authors.
    43     Licensed under the Apache License, Version 2.0
    44     NOTICE: https://github.com/moby/moby/blob/v20.10.5/NOTICE
    45  */
    46  
    47  const (
    48  	DefaultMountType = "none"
    49  
    50  	// DefaultPropagationMode is the default propagation of mounts
    51  	// where user doesn't specify mount propagation explicitly.
    52  	// See also: https://github.com/moby/moby/blob/v20.10.7/volume/mounts/linux_parser.go#L145
    53  	DefaultPropagationMode = "rprivate"
    54  )
    55  
    56  // UnprivilegedMountFlags is from https://github.com/moby/moby/blob/v20.10.5/daemon/oci_linux.go#L420-L450
    57  //
    58  // Get the set of mount flags that are set on the mount that contains the given
    59  // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
    60  // bind-mounting "with options" will not fail with user namespaces, due to
    61  // kernel restrictions that require user namespace mounts to preserve
    62  // CL_UNPRIVILEGED locked flags.
    63  func UnprivilegedMountFlags(path string) ([]string, error) {
    64  	var statfs unix.Statfs_t
    65  	if err := unix.Statfs(path, &statfs); err != nil {
    66  		return nil, &fs.PathError{Op: "stat", Path: path, Err: err}
    67  	}
    68  
    69  	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
    70  	unprivilegedFlags := map[uint64]string{
    71  		unix.MS_RDONLY:     "ro",
    72  		unix.MS_NODEV:      "nodev",
    73  		unix.MS_NOEXEC:     "noexec",
    74  		unix.MS_NOSUID:     "nosuid",
    75  		unix.MS_NOATIME:    "noatime",
    76  		unix.MS_RELATIME:   "relatime",
    77  		unix.MS_NODIRATIME: "nodiratime",
    78  	}
    79  
    80  	var flags []string
    81  	for mask, flag := range unprivilegedFlags {
    82  		if uint64(statfs.Flags)&mask == mask {
    83  			flags = append(flags, flag)
    84  		}
    85  	}
    86  
    87  	return flags, nil
    88  }
    89  
    90  // parseVolumeOptions parses specified optsRaw with using information of
    91  // the volume type and the src directory when necessary.
    92  func parseVolumeOptions(vType, src, optsRaw string) ([]string, []oci.SpecOpts, error) {
    93  	return parseVolumeOptionsWithMountInfo(vType, src, optsRaw, getMountInfo)
    94  }
    95  
    96  // getMountInfo gets mount.Info of a directory.
    97  func getMountInfo(dir string) (mount.Info, error) {
    98  	sourcePath, err := filepath.EvalSymlinks(dir)
    99  	if err != nil {
   100  		return mount.Info{}, err
   101  	}
   102  	return mount.Lookup(sourcePath)
   103  }
   104  
   105  // parseVolumeOptionsWithMountInfo is the testable implementation
   106  // of parseVolumeOptions.
   107  func parseVolumeOptionsWithMountInfo(vType, src, optsRaw string, getMountInfoFunc func(string) (mount.Info, error)) ([]string, []oci.SpecOpts, error) {
   108  	var (
   109  		writeModeRawOpts   []string
   110  		propagationRawOpts []string
   111  		bindOpts           []string
   112  	)
   113  	for _, opt := range strings.Split(optsRaw, ",") {
   114  		switch opt {
   115  		case "rw", "ro", "rro":
   116  			writeModeRawOpts = append(writeModeRawOpts, opt)
   117  		case "private", "rprivate", "shared", "rshared", "slave", "rslave":
   118  			propagationRawOpts = append(propagationRawOpts, opt)
   119  		case "bind", "rbind":
   120  			// bind means not recursively bind-mounted, rbind is the opposite
   121  			bindOpts = append(bindOpts, opt)
   122  		case "":
   123  			// NOP
   124  		default:
   125  			log.L.Warnf("unsupported volume option %q", opt)
   126  		}
   127  	}
   128  
   129  	var opts []string
   130  	var specOpts []oci.SpecOpts
   131  
   132  	if len(bindOpts) > 0 && vType != Bind {
   133  		return nil, nil, fmt.Errorf("volume bind/rbind option is only supported for bind mount: %+v", bindOpts)
   134  	} else if len(bindOpts) > 1 {
   135  		return nil, nil, fmt.Errorf("duplicated bind/rbind option: %+v", bindOpts)
   136  	} else if len(bindOpts) > 0 {
   137  		opts = append(opts, bindOpts[0])
   138  	}
   139  
   140  	if len(writeModeRawOpts) > 1 {
   141  		return nil, nil, fmt.Errorf("duplicated read/write volume option: %+v", writeModeRawOpts)
   142  	} else if len(writeModeRawOpts) > 0 {
   143  		switch writeModeRawOpts[0] {
   144  		case "ro":
   145  			opts = append(opts, "ro")
   146  		case "rro":
   147  			// Mount option "rro" is supported since crun v1.4 / runc v1.1 (https://github.com/opencontainers/runc/pull/3272), with kernel >= 5.12.
   148  			// Older version of runc just ignores "rro", so we have to add "ro" too, to our best effort.
   149  			opts = append(opts, "ro", "rro")
   150  			if len(propagationRawOpts) != 1 || propagationRawOpts[0] != "rprivate" {
   151  				log.L.Warn("Mount option \"rro\" should be used in conjunction with \"rprivate\"")
   152  			}
   153  		case "rw":
   154  			// NOP
   155  		default:
   156  			// NOTREACHED
   157  			return nil, nil, fmt.Errorf("unexpected writeModeRawOpts[0]=%q", writeModeRawOpts[0])
   158  		}
   159  	}
   160  
   161  	if len(propagationRawOpts) > 1 {
   162  		return nil, nil, fmt.Errorf("duplicated volume propagation option: %+v", propagationRawOpts)
   163  	} else if len(propagationRawOpts) > 0 && vType != Bind {
   164  		return nil, nil, fmt.Errorf("volume propagation option is only supported for bind mount: %+v", propagationRawOpts)
   165  	} else if vType == Bind {
   166  		var pFlag string
   167  		var got string
   168  		if len(propagationRawOpts) > 0 {
   169  			got = propagationRawOpts[0]
   170  		}
   171  		switch got {
   172  		case "shared", "rshared":
   173  			pFlag = got
   174  			// a bind mount can be shared from shared mount
   175  			mi, err := getMountInfoFunc(src)
   176  			if err != nil {
   177  				return nil, nil, err
   178  			}
   179  			if err := ensureMountOptionalValue(mi, "shared:"); err != nil {
   180  				return nil, nil, err
   181  			}
   182  
   183  			// NOTE: Though OCI Runtime Spec doesn't explicitly describe, runc's default
   184  			//       of RootfsPropagation is unix.MS_SLAVE | unix.MS_REC (i.e. runc applies
   185  			//       "slave" to all mount points in the container recursively). This ends
   186  			//       up marking the bind src directories "slave" and preventing it to shared
   187  			//      with the host. So we set RootfsPropagation to "shared" here.
   188  			//
   189  			// See also:
   190  			// - OCI Runtime Spec: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config-linux.md#rootfs-mount-propagation
   191  			// - runc implementation: https://github.com/opencontainers/runc/blob/v1.0.0/libcontainer/rootfs_linux.go#L771-L777
   192  			specOpts = append(specOpts, func(ctx context.Context, cli oci.Client, c *containers.Container, s *oci.Spec) error {
   193  				switch s.Linux.RootfsPropagation {
   194  				case "shared", "rshared":
   195  					// NOP
   196  				default:
   197  					s.Linux.RootfsPropagation = "shared"
   198  				}
   199  				return nil
   200  			})
   201  		case "slave", "rslave":
   202  			pFlag = got
   203  			// a bind mount can be a slave of shared or an existing slave mount
   204  			mi, err := getMountInfoFunc(src)
   205  			if err != nil {
   206  				return nil, nil, err
   207  			}
   208  			if err := ensureMountOptionalValue(mi, "shared:", "master:"); err != nil {
   209  				return nil, nil, err
   210  			}
   211  
   212  			// See above comments about RootfsPropagation. Here we make sure that
   213  			// the mountpoint can be a slave of the host mount.
   214  			specOpts = append(specOpts, func(ctx context.Context, cli oci.Client, c *containers.Container, s *oci.Spec) error {
   215  				switch s.Linux.RootfsPropagation {
   216  				case "shared", "rshared", "slave", "rslave":
   217  					// NOP
   218  				default:
   219  					s.Linux.RootfsPropagation = "rslave"
   220  				}
   221  				return nil
   222  			})
   223  		case "private", "rprivate":
   224  			pFlag = got
   225  		default:
   226  			// No propagation is specified to this bind mount.
   227  			// NOTE: When RootfsPropagation is set (e.g. by other bind mount option), that
   228  			//       propagation mode will be applied to this bind mount as well. So we need
   229  			//       to set "rprivate" explicitly for preventing this bind mount from unexpectedly
   230  			//       shared with the host. This behaviour is compatible to docker:
   231  			//       https://github.com/moby/moby/blob/v20.10.7/volume/mounts/linux_parser.go#L320-L322
   232  			//
   233  			// TODO: directories managed by containerd (e.g. /var/lib/containerd, /run/containerd, ...)
   234  			//       should be marked as "rslave" instead of "rprivate". This is because allowing
   235  			//       containers to hold their private bind mounts will prevent containerd from remove
   236  			//       them. See also: https://github.com/moby/moby/pull/36055.
   237  			//       Unfortunately, containerd doesn't expose the locations of directories where it manages.
   238  			//       Current workaround is explicitly add "rshared" or "rslave" option to these bind mounts.
   239  			pFlag = DefaultPropagationMode
   240  		}
   241  		opts = append(opts, pFlag)
   242  	}
   243  
   244  	return opts, specOpts, nil
   245  }
   246  
   247  // ensure the mount of the specified directory has either of the specified
   248  // "optional" value in the entry in the /proc/<pid>/mountinfo file.
   249  //
   250  // For more details about "optional" field:
   251  // - https://github.com/moby/sys/blob/mountinfo/v0.4.1/mountinfo/mountinfo.go#L52-L56
   252  func ensureMountOptionalValue(mi mount.Info, vals ...string) error {
   253  	var hasValue bool
   254  	for _, opt := range strings.Split(mi.Optional, " ") {
   255  		for _, mark := range vals {
   256  			if strings.HasPrefix(opt, mark) {
   257  				hasValue = true
   258  			}
   259  		}
   260  	}
   261  	if !hasValue {
   262  		return fmt.Errorf("mountpoint %q doesn't have optional field neither of %+v", mi.Mountpoint, vals)
   263  	}
   264  	return nil
   265  }
   266  
   267  func ProcessFlagTmpfs(s string) (*Processed, error) {
   268  	split := strings.SplitN(s, ":", 2)
   269  	dst := split[0]
   270  	options := []string{"noexec", "nosuid", "nodev"}
   271  	if len(split) == 2 {
   272  		raw := append(options, strings.Split(split[1], ",")...)
   273  		var err error
   274  		options, err = mobymount.MergeTmpfsOptions(raw)
   275  		if err != nil {
   276  			return nil, err
   277  		}
   278  	}
   279  	res := &Processed{
   280  		Mount: specs.Mount{
   281  			Type:        "tmpfs",
   282  			Source:      "tmpfs",
   283  			Destination: dst,
   284  			Options:     options,
   285  		},
   286  		Type: Tmpfs,
   287  		Mode: strings.Join(options, ","),
   288  	}
   289  	return res, nil
   290  }
   291  
   292  func ProcessFlagMount(s string, volStore volumestore.VolumeStore) (*Processed, error) {
   293  	fields := strings.Split(s, ",")
   294  	var (
   295  		mountType        string
   296  		src              string
   297  		dst              string
   298  		bindPropagation  string
   299  		bindNonRecursive bool
   300  		rwOption         string
   301  		tmpfsSize        int64
   302  		tmpfsMode        os.FileMode
   303  		err              error
   304  	)
   305  
   306  	// set default values
   307  	mountType = Volume
   308  	tmpfsMode = os.FileMode(01777)
   309  
   310  	// three types of mount(and examples):
   311  	// --mount type=bind,source="$(pwd)"/target,target=/app2,readonly,bind-propagation=shared
   312  	// --mount type=tmpfs,destination=/app,tmpfs-mode=1770,tmpfs-size=1MB
   313  	// --mount type=volume,src=vol-1,dst=/app,readonly
   314  	// if type not specified, default will be set to volume
   315  	// --mount src=`pwd`/tmp,target=/app
   316  
   317  	for _, field := range fields {
   318  		parts := strings.SplitN(field, "=", 2)
   319  		key := strings.ToLower(parts[0])
   320  
   321  		if len(parts) == 1 {
   322  			switch key {
   323  			case "readonly", "ro", "rw", "rro":
   324  				rwOption = key
   325  				continue
   326  			case "bind-nonrecursive":
   327  				bindNonRecursive = true
   328  				continue
   329  			}
   330  		}
   331  
   332  		if len(parts) != 2 {
   333  			return nil, fmt.Errorf("invalid field '%s' must be a key=value pair", field)
   334  		}
   335  
   336  		value := parts[1]
   337  		switch key {
   338  		case "type":
   339  			switch value {
   340  			case "tmpfs":
   341  				mountType = Tmpfs
   342  			case "bind":
   343  				mountType = Bind
   344  			case "volume":
   345  			default:
   346  				return nil, fmt.Errorf("invalid mount type '%s' must be a volume/bind/tmpfs", value)
   347  			}
   348  		case "source", "src":
   349  			src = value
   350  		case "target", "dst", "destination":
   351  			dst = value
   352  		case "readonly", "ro", "rw", "rro":
   353  			trueValue, err := strconv.ParseBool(value)
   354  			if err != nil {
   355  				return nil, fmt.Errorf("invalid value for %s: %s", key, value)
   356  			}
   357  			if trueValue {
   358  				rwOption = key
   359  			}
   360  		case "bind-propagation":
   361  			// here don't validate the propagation value
   362  			// parseVolumeOptions will do that.
   363  			bindPropagation = value
   364  		case "bind-nonrecursive":
   365  			bindNonRecursive, err = strconv.ParseBool(value)
   366  			if err != nil {
   367  				return nil, fmt.Errorf("invalid value for %s: %s", key, value)
   368  			}
   369  		case "tmpfs-size":
   370  			tmpfsSize, err = units.RAMInBytes(value)
   371  			if err != nil {
   372  				return nil, fmt.Errorf("invalid value for %s: %s", key, value)
   373  			}
   374  		case "tmpfs-mode":
   375  			ui64, err := strconv.ParseUint(value, 8, 32)
   376  			if err != nil {
   377  				return nil, fmt.Errorf("invalid value for %s: %s", key, value)
   378  			}
   379  			tmpfsMode = os.FileMode(ui64)
   380  		default:
   381  			return nil, fmt.Errorf("unexpected key '%s' in '%s'", key, field)
   382  		}
   383  	}
   384  
   385  	// compose new fileds and join into a string
   386  	// to call legacy ProcessFlagTmpfs or ProcessFlagV function
   387  	fields = []string{}
   388  	options := []string{}
   389  	if rwOption != "" {
   390  		if rwOption == "readonly" {
   391  			rwOption = "ro"
   392  		}
   393  		options = append(options, rwOption)
   394  	}
   395  
   396  	switch mountType {
   397  	case Tmpfs:
   398  		fields = []string{dst}
   399  		if tmpfsMode != 0 {
   400  			options = append(options, fmt.Sprintf("mode=%o", tmpfsMode))
   401  		}
   402  		if tmpfsSize > 0 {
   403  			options = append(options, getTmpfsSize(tmpfsSize))
   404  		}
   405  	case Volume, Bind:
   406  		fields = []string{src, dst}
   407  		if bindPropagation != "" {
   408  			options = append(options, bindPropagation)
   409  		}
   410  		if mountType == Bind {
   411  			if bindNonRecursive {
   412  				options = append(options, "bind")
   413  			} else {
   414  				options = append(options, "rbind")
   415  			}
   416  		}
   417  	}
   418  
   419  	if len(options) > 0 {
   420  		optionsStr := strings.Join(options, ",")
   421  		fields = append(fields, optionsStr)
   422  	}
   423  	fieldsStr := strings.Join(fields, ":")
   424  
   425  	log.L.Debugf("Call legacy %s process, spec: %s ", mountType, fieldsStr)
   426  
   427  	switch mountType {
   428  	case Tmpfs:
   429  		return ProcessFlagTmpfs(fieldsStr)
   430  	case Volume, Bind:
   431  		// createDir=false for --mount option to disallow creating directories on host if not found
   432  		return ProcessFlagV(fieldsStr, volStore, false)
   433  	}
   434  	return nil, fmt.Errorf("invalid mount type '%s' must be a volume/bind/tmpfs", mountType)
   435  }
   436  
   437  // copy from https://github.com/moby/moby/blob/085c6a98d54720e70b28354ccec6da9b1b9e7fcf/volume/mounts/linux_parser.go#L375
   438  func getTmpfsSize(size int64) string {
   439  	// calculate suffix here, making this linux specific, but that is
   440  	// okay, since API is that way anyways.
   441  
   442  	// we do this by finding the suffix that divides evenly into the
   443  	// value, returning the value itself, with no suffix, if it fails.
   444  	//
   445  	// For the most part, we don't enforce any semantic to this values.
   446  	// The operating system will usually align this and enforce minimum
   447  	// and maximums.
   448  	var (
   449  		suffix string
   450  	)
   451  	for _, r := range []struct {
   452  		suffix  string
   453  		divisor int64
   454  	}{
   455  		{"g", 1 << 30},
   456  		{"m", 1 << 20},
   457  		{"k", 1 << 10},
   458  	} {
   459  		if size%r.divisor == 0 {
   460  			size = size / r.divisor
   461  			suffix = r.suffix
   462  			break
   463  		}
   464  	}
   465  
   466  	return fmt.Sprintf("size=%d%s", size, suffix)
   467  }