github.com/containerd/nerdctl@v1.7.7/pkg/mountutil/mountutil_linux.go (about) 1 /* 2 Copyright The containerd Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package mountutil 18 19 import ( 20 "context" 21 "fmt" 22 "io/fs" 23 "os" 24 "path/filepath" 25 "strconv" 26 "strings" 27 28 "github.com/containerd/containerd/containers" 29 "github.com/containerd/containerd/mount" 30 "github.com/containerd/containerd/oci" 31 "github.com/containerd/log" 32 "github.com/containerd/nerdctl/pkg/mountutil/volumestore" 33 "github.com/docker/go-units" 34 mobymount "github.com/moby/sys/mount" 35 "github.com/opencontainers/runtime-spec/specs-go" 36 "golang.org/x/sys/unix" 37 ) 38 39 /* 40 Portions from https://github.com/moby/moby/blob/v20.10.5/daemon/oci_linux.go 41 Portions from https://github.com/moby/moby/blob/v20.10.5/volume/mounts/linux_parser.go 42 Copyright (C) Docker/Moby authors. 43 Licensed under the Apache License, Version 2.0 44 NOTICE: https://github.com/moby/moby/blob/v20.10.5/NOTICE 45 */ 46 47 const ( 48 DefaultMountType = "none" 49 50 // DefaultPropagationMode is the default propagation of mounts 51 // where user doesn't specify mount propagation explicitly. 52 // See also: https://github.com/moby/moby/blob/v20.10.7/volume/mounts/linux_parser.go#L145 53 DefaultPropagationMode = "rprivate" 54 ) 55 56 // UnprivilegedMountFlags is from https://github.com/moby/moby/blob/v20.10.5/daemon/oci_linux.go#L420-L450 57 // 58 // Get the set of mount flags that are set on the mount that contains the given 59 // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that 60 // bind-mounting "with options" will not fail with user namespaces, due to 61 // kernel restrictions that require user namespace mounts to preserve 62 // CL_UNPRIVILEGED locked flags. 63 func UnprivilegedMountFlags(path string) ([]string, error) { 64 var statfs unix.Statfs_t 65 if err := unix.Statfs(path, &statfs); err != nil { 66 return nil, &fs.PathError{Op: "stat", Path: path, Err: err} 67 } 68 69 // The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048. 70 unprivilegedFlags := map[uint64]string{ 71 unix.MS_RDONLY: "ro", 72 unix.MS_NODEV: "nodev", 73 unix.MS_NOEXEC: "noexec", 74 unix.MS_NOSUID: "nosuid", 75 unix.MS_NOATIME: "noatime", 76 unix.MS_RELATIME: "relatime", 77 unix.MS_NODIRATIME: "nodiratime", 78 } 79 80 var flags []string 81 for mask, flag := range unprivilegedFlags { 82 if uint64(statfs.Flags)&mask == mask { 83 flags = append(flags, flag) 84 } 85 } 86 87 return flags, nil 88 } 89 90 // parseVolumeOptions parses specified optsRaw with using information of 91 // the volume type and the src directory when necessary. 92 func parseVolumeOptions(vType, src, optsRaw string) ([]string, []oci.SpecOpts, error) { 93 return parseVolumeOptionsWithMountInfo(vType, src, optsRaw, getMountInfo) 94 } 95 96 // getMountInfo gets mount.Info of a directory. 97 func getMountInfo(dir string) (mount.Info, error) { 98 sourcePath, err := filepath.EvalSymlinks(dir) 99 if err != nil { 100 return mount.Info{}, err 101 } 102 return mount.Lookup(sourcePath) 103 } 104 105 // parseVolumeOptionsWithMountInfo is the testable implementation 106 // of parseVolumeOptions. 107 func parseVolumeOptionsWithMountInfo(vType, src, optsRaw string, getMountInfoFunc func(string) (mount.Info, error)) ([]string, []oci.SpecOpts, error) { 108 var ( 109 writeModeRawOpts []string 110 propagationRawOpts []string 111 bindOpts []string 112 ) 113 for _, opt := range strings.Split(optsRaw, ",") { 114 switch opt { 115 case "rw", "ro", "rro": 116 writeModeRawOpts = append(writeModeRawOpts, opt) 117 case "private", "rprivate", "shared", "rshared", "slave", "rslave": 118 propagationRawOpts = append(propagationRawOpts, opt) 119 case "bind", "rbind": 120 // bind means not recursively bind-mounted, rbind is the opposite 121 bindOpts = append(bindOpts, opt) 122 case "": 123 // NOP 124 default: 125 log.L.Warnf("unsupported volume option %q", opt) 126 } 127 } 128 129 var opts []string 130 var specOpts []oci.SpecOpts 131 132 if len(bindOpts) > 0 && vType != Bind { 133 return nil, nil, fmt.Errorf("volume bind/rbind option is only supported for bind mount: %+v", bindOpts) 134 } else if len(bindOpts) > 1 { 135 return nil, nil, fmt.Errorf("duplicated bind/rbind option: %+v", bindOpts) 136 } else if len(bindOpts) > 0 { 137 opts = append(opts, bindOpts[0]) 138 } 139 140 if len(writeModeRawOpts) > 1 { 141 return nil, nil, fmt.Errorf("duplicated read/write volume option: %+v", writeModeRawOpts) 142 } else if len(writeModeRawOpts) > 0 { 143 switch writeModeRawOpts[0] { 144 case "ro": 145 opts = append(opts, "ro") 146 case "rro": 147 // Mount option "rro" is supported since crun v1.4 / runc v1.1 (https://github.com/opencontainers/runc/pull/3272), with kernel >= 5.12. 148 // Older version of runc just ignores "rro", so we have to add "ro" too, to our best effort. 149 opts = append(opts, "ro", "rro") 150 if len(propagationRawOpts) != 1 || propagationRawOpts[0] != "rprivate" { 151 log.L.Warn("Mount option \"rro\" should be used in conjunction with \"rprivate\"") 152 } 153 case "rw": 154 // NOP 155 default: 156 // NOTREACHED 157 return nil, nil, fmt.Errorf("unexpected writeModeRawOpts[0]=%q", writeModeRawOpts[0]) 158 } 159 } 160 161 if len(propagationRawOpts) > 1 { 162 return nil, nil, fmt.Errorf("duplicated volume propagation option: %+v", propagationRawOpts) 163 } else if len(propagationRawOpts) > 0 && vType != Bind { 164 return nil, nil, fmt.Errorf("volume propagation option is only supported for bind mount: %+v", propagationRawOpts) 165 } else if vType == Bind { 166 var pFlag string 167 var got string 168 if len(propagationRawOpts) > 0 { 169 got = propagationRawOpts[0] 170 } 171 switch got { 172 case "shared", "rshared": 173 pFlag = got 174 // a bind mount can be shared from shared mount 175 mi, err := getMountInfoFunc(src) 176 if err != nil { 177 return nil, nil, err 178 } 179 if err := ensureMountOptionalValue(mi, "shared:"); err != nil { 180 return nil, nil, err 181 } 182 183 // NOTE: Though OCI Runtime Spec doesn't explicitly describe, runc's default 184 // of RootfsPropagation is unix.MS_SLAVE | unix.MS_REC (i.e. runc applies 185 // "slave" to all mount points in the container recursively). This ends 186 // up marking the bind src directories "slave" and preventing it to shared 187 // with the host. So we set RootfsPropagation to "shared" here. 188 // 189 // See also: 190 // - OCI Runtime Spec: https://github.com/opencontainers/runtime-spec/blob/v1.0.2/config-linux.md#rootfs-mount-propagation 191 // - runc implementation: https://github.com/opencontainers/runc/blob/v1.0.0/libcontainer/rootfs_linux.go#L771-L777 192 specOpts = append(specOpts, func(ctx context.Context, cli oci.Client, c *containers.Container, s *oci.Spec) error { 193 switch s.Linux.RootfsPropagation { 194 case "shared", "rshared": 195 // NOP 196 default: 197 s.Linux.RootfsPropagation = "shared" 198 } 199 return nil 200 }) 201 case "slave", "rslave": 202 pFlag = got 203 // a bind mount can be a slave of shared or an existing slave mount 204 mi, err := getMountInfoFunc(src) 205 if err != nil { 206 return nil, nil, err 207 } 208 if err := ensureMountOptionalValue(mi, "shared:", "master:"); err != nil { 209 return nil, nil, err 210 } 211 212 // See above comments about RootfsPropagation. Here we make sure that 213 // the mountpoint can be a slave of the host mount. 214 specOpts = append(specOpts, func(ctx context.Context, cli oci.Client, c *containers.Container, s *oci.Spec) error { 215 switch s.Linux.RootfsPropagation { 216 case "shared", "rshared", "slave", "rslave": 217 // NOP 218 default: 219 s.Linux.RootfsPropagation = "rslave" 220 } 221 return nil 222 }) 223 case "private", "rprivate": 224 pFlag = got 225 default: 226 // No propagation is specified to this bind mount. 227 // NOTE: When RootfsPropagation is set (e.g. by other bind mount option), that 228 // propagation mode will be applied to this bind mount as well. So we need 229 // to set "rprivate" explicitly for preventing this bind mount from unexpectedly 230 // shared with the host. This behaviour is compatible to docker: 231 // https://github.com/moby/moby/blob/v20.10.7/volume/mounts/linux_parser.go#L320-L322 232 // 233 // TODO: directories managed by containerd (e.g. /var/lib/containerd, /run/containerd, ...) 234 // should be marked as "rslave" instead of "rprivate". This is because allowing 235 // containers to hold their private bind mounts will prevent containerd from remove 236 // them. See also: https://github.com/moby/moby/pull/36055. 237 // Unfortunately, containerd doesn't expose the locations of directories where it manages. 238 // Current workaround is explicitly add "rshared" or "rslave" option to these bind mounts. 239 pFlag = DefaultPropagationMode 240 } 241 opts = append(opts, pFlag) 242 } 243 244 return opts, specOpts, nil 245 } 246 247 // ensure the mount of the specified directory has either of the specified 248 // "optional" value in the entry in the /proc/<pid>/mountinfo file. 249 // 250 // For more details about "optional" field: 251 // - https://github.com/moby/sys/blob/mountinfo/v0.4.1/mountinfo/mountinfo.go#L52-L56 252 func ensureMountOptionalValue(mi mount.Info, vals ...string) error { 253 var hasValue bool 254 for _, opt := range strings.Split(mi.Optional, " ") { 255 for _, mark := range vals { 256 if strings.HasPrefix(opt, mark) { 257 hasValue = true 258 } 259 } 260 } 261 if !hasValue { 262 return fmt.Errorf("mountpoint %q doesn't have optional field neither of %+v", mi.Mountpoint, vals) 263 } 264 return nil 265 } 266 267 func ProcessFlagTmpfs(s string) (*Processed, error) { 268 split := strings.SplitN(s, ":", 2) 269 dst := split[0] 270 options := []string{"noexec", "nosuid", "nodev"} 271 if len(split) == 2 { 272 raw := append(options, strings.Split(split[1], ",")...) 273 var err error 274 options, err = mobymount.MergeTmpfsOptions(raw) 275 if err != nil { 276 return nil, err 277 } 278 } 279 res := &Processed{ 280 Mount: specs.Mount{ 281 Type: "tmpfs", 282 Source: "tmpfs", 283 Destination: dst, 284 Options: options, 285 }, 286 Type: Tmpfs, 287 Mode: strings.Join(options, ","), 288 } 289 return res, nil 290 } 291 292 func ProcessFlagMount(s string, volStore volumestore.VolumeStore) (*Processed, error) { 293 fields := strings.Split(s, ",") 294 var ( 295 mountType string 296 src string 297 dst string 298 bindPropagation string 299 bindNonRecursive bool 300 rwOption string 301 tmpfsSize int64 302 tmpfsMode os.FileMode 303 err error 304 ) 305 306 // set default values 307 mountType = Volume 308 tmpfsMode = os.FileMode(01777) 309 310 // three types of mount(and examples): 311 // --mount type=bind,source="$(pwd)"/target,target=/app2,readonly,bind-propagation=shared 312 // --mount type=tmpfs,destination=/app,tmpfs-mode=1770,tmpfs-size=1MB 313 // --mount type=volume,src=vol-1,dst=/app,readonly 314 // if type not specified, default will be set to volume 315 // --mount src=`pwd`/tmp,target=/app 316 317 for _, field := range fields { 318 parts := strings.SplitN(field, "=", 2) 319 key := strings.ToLower(parts[0]) 320 321 if len(parts) == 1 { 322 switch key { 323 case "readonly", "ro", "rw", "rro": 324 rwOption = key 325 continue 326 case "bind-nonrecursive": 327 bindNonRecursive = true 328 continue 329 } 330 } 331 332 if len(parts) != 2 { 333 return nil, fmt.Errorf("invalid field '%s' must be a key=value pair", field) 334 } 335 336 value := parts[1] 337 switch key { 338 case "type": 339 switch value { 340 case "tmpfs": 341 mountType = Tmpfs 342 case "bind": 343 mountType = Bind 344 case "volume": 345 default: 346 return nil, fmt.Errorf("invalid mount type '%s' must be a volume/bind/tmpfs", value) 347 } 348 case "source", "src": 349 src = value 350 case "target", "dst", "destination": 351 dst = value 352 case "readonly", "ro", "rw", "rro": 353 trueValue, err := strconv.ParseBool(value) 354 if err != nil { 355 return nil, fmt.Errorf("invalid value for %s: %s", key, value) 356 } 357 if trueValue { 358 rwOption = key 359 } 360 case "bind-propagation": 361 // here don't validate the propagation value 362 // parseVolumeOptions will do that. 363 bindPropagation = value 364 case "bind-nonrecursive": 365 bindNonRecursive, err = strconv.ParseBool(value) 366 if err != nil { 367 return nil, fmt.Errorf("invalid value for %s: %s", key, value) 368 } 369 case "tmpfs-size": 370 tmpfsSize, err = units.RAMInBytes(value) 371 if err != nil { 372 return nil, fmt.Errorf("invalid value for %s: %s", key, value) 373 } 374 case "tmpfs-mode": 375 ui64, err := strconv.ParseUint(value, 8, 32) 376 if err != nil { 377 return nil, fmt.Errorf("invalid value for %s: %s", key, value) 378 } 379 tmpfsMode = os.FileMode(ui64) 380 default: 381 return nil, fmt.Errorf("unexpected key '%s' in '%s'", key, field) 382 } 383 } 384 385 // compose new fileds and join into a string 386 // to call legacy ProcessFlagTmpfs or ProcessFlagV function 387 fields = []string{} 388 options := []string{} 389 if rwOption != "" { 390 if rwOption == "readonly" { 391 rwOption = "ro" 392 } 393 options = append(options, rwOption) 394 } 395 396 switch mountType { 397 case Tmpfs: 398 fields = []string{dst} 399 if tmpfsMode != 0 { 400 options = append(options, fmt.Sprintf("mode=%o", tmpfsMode)) 401 } 402 if tmpfsSize > 0 { 403 options = append(options, getTmpfsSize(tmpfsSize)) 404 } 405 case Volume, Bind: 406 fields = []string{src, dst} 407 if bindPropagation != "" { 408 options = append(options, bindPropagation) 409 } 410 if mountType == Bind { 411 if bindNonRecursive { 412 options = append(options, "bind") 413 } else { 414 options = append(options, "rbind") 415 } 416 } 417 } 418 419 if len(options) > 0 { 420 optionsStr := strings.Join(options, ",") 421 fields = append(fields, optionsStr) 422 } 423 fieldsStr := strings.Join(fields, ":") 424 425 log.L.Debugf("Call legacy %s process, spec: %s ", mountType, fieldsStr) 426 427 switch mountType { 428 case Tmpfs: 429 return ProcessFlagTmpfs(fieldsStr) 430 case Volume, Bind: 431 // createDir=false for --mount option to disallow creating directories on host if not found 432 return ProcessFlagV(fieldsStr, volStore, false) 433 } 434 return nil, fmt.Errorf("invalid mount type '%s' must be a volume/bind/tmpfs", mountType) 435 } 436 437 // copy from https://github.com/moby/moby/blob/085c6a98d54720e70b28354ccec6da9b1b9e7fcf/volume/mounts/linux_parser.go#L375 438 func getTmpfsSize(size int64) string { 439 // calculate suffix here, making this linux specific, but that is 440 // okay, since API is that way anyways. 441 442 // we do this by finding the suffix that divides evenly into the 443 // value, returning the value itself, with no suffix, if it fails. 444 // 445 // For the most part, we don't enforce any semantic to this values. 446 // The operating system will usually align this and enforce minimum 447 // and maximums. 448 var ( 449 suffix string 450 ) 451 for _, r := range []struct { 452 suffix string 453 divisor int64 454 }{ 455 {"g", 1 << 30}, 456 {"m", 1 << 20}, 457 {"k", 1 << 10}, 458 } { 459 if size%r.divisor == 0 { 460 size = size / r.divisor 461 suffix = r.suffix 462 break 463 } 464 } 465 466 return fmt.Sprintf("size=%d%s", size, suffix) 467 }