github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/overlay/overlay.go (about) 1 // Copyright 2020 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package overlay provides an overlay filesystem implementation, which 16 // synthesizes a filesystem by composing one or more immutable filesystems 17 // ("lower layers") with an optional mutable filesystem ("upper layer"). 18 // 19 // Lock order: 20 // 21 // directoryFD.mu / regularFileFD.mu 22 // filesystem.renameMu 23 // dentry.dirMu 24 // dentry.copyMu 25 // filesystem.devMu 26 // *** "memmap.Mappable locks" below this point 27 // dentry.mapsMu 28 // *** "memmap.Mappable locks taken by Translate" below this point 29 // dentry.dataMu 30 // 31 // Locking dentry.dirMu in multiple dentries requires that parent dentries are 32 // locked before child dentries, and that filesystem.renameMu is locked to 33 // stabilize this relationship. 
package overlay

import (
	"fmt"
	"strings"
	"sync/atomic"

	"github.com/metacubex/gvisor/pkg/abi/linux"
	"github.com/metacubex/gvisor/pkg/atomicbitops"
	"github.com/metacubex/gvisor/pkg/context"
	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
	"github.com/metacubex/gvisor/pkg/fspath"
	"github.com/metacubex/gvisor/pkg/refs"
	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
	"github.com/metacubex/gvisor/pkg/sentry/memmap"
	"github.com/metacubex/gvisor/pkg/sentry/vfs"
	"github.com/metacubex/gvisor/pkg/sync"
)

// Name is the default filesystem name.
const Name = "overlay"

// FilesystemType implements vfs.FilesystemType.
//
// +stateify savable
type FilesystemType struct{}

// Name implements vfs.FilesystemType.Name. It always returns the package-level
// Name constant.
func (FilesystemType) Name() string {
	return Name
}

// Release implements FilesystemType.Release. It is a no-op.
func (FilesystemType) Release(ctx context.Context) {}

// FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to
// FilesystemType.GetFilesystem.
//
// +stateify savable
type FilesystemOptions struct {
	// Callers passing FilesystemOptions to
	// overlay.FilesystemType.GetFilesystem() are responsible for ensuring that
	// the vfs.Mounts comprising the layers of the overlay filesystem do not
	// contain submounts.

	// If UpperRoot.Ok(), it is the root of the writable upper layer of the
	// overlay.
	UpperRoot vfs.VirtualDentry

	// LowerRoots contains the roots of the immutable lower layers of the
	// overlay. LowerRoots is immutable.
	LowerRoots []vfs.VirtualDentry
}

// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	vfsfs vfs.Filesystem

	// Immutable options.
	opts FilesystemOptions

	// creds is a copy of the filesystem's creator's credentials, which are
	// used for accesses to the filesystem's layers. creds is immutable.
	creds *auth.Credentials

	// dirDevMinor is the device minor number used for directories. dirDevMinor
	// is immutable.
	dirDevMinor uint32

	// lowerDevMinors maps device numbers from lower layer filesystems to
	// device minor numbers assigned to non-directory files originating from
	// that filesystem. (This remapping is necessary for lower layers because a
	// file on a lower layer, and that same file on an overlay, are
	// distinguishable because they will diverge after copy-up; this isn't true
	// for non-directory files already on the upper layer.) lowerDevMinors is
	// protected by devMu.
	devMu          devMutex `state:"nosave"`
	lowerDevMinors map[layerDevNumber]uint32

	// renameMu synchronizes renaming with non-renaming operations in order to
	// ensure consistent lock ordering between dentry.dirMu in different
	// dentries.
	renameMu renameRWMutex `state:"nosave"`

	// dirInoCache caches overlay-private directory inode numbers by mapped
	// bottommost device numbers and inode number. dirInoCache is protected by
	// dirInoCacheMu.
	dirInoCacheMu dirInoCacheMutex `state:"nosave"`
	dirInoCache   map[layerDevNoAndIno]uint64

	// lastDirIno is the last inode number assigned to a directory. lastDirIno
	// is protected by dirInoCacheMu.
	lastDirIno uint64

	// maxFilenameLen is the maximum filename length allowed by the overlayfs.
	maxFilenameLen uint64
}

// layerDevNumber is a device major/minor number pair reported by a layer. It
// is the key type of filesystem.lowerDevMinors.
//
// +stateify savable
type layerDevNumber struct {
	major uint32
	minor uint32
}

// layerDevNoAndIno combines a layer's device number with an inode number on
// that device. It is the key type of filesystem.dirInoCache.
//
// +stateify savable
type layerDevNoAndIno struct {
	layerDevNumber
	ino uint64
}

// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
147 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { 148 mopts := vfs.GenericParseMountOptions(opts.Data) 149 fsoptsRaw := opts.InternalData 150 fsopts, ok := fsoptsRaw.(FilesystemOptions) 151 if fsoptsRaw != nil && !ok { 152 ctx.Infof("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw) 153 return nil, nil, linuxerr.EINVAL 154 } 155 vfsroot := vfs.RootFromContext(ctx) 156 if vfsroot.Ok() { 157 defer vfsroot.DecRef(ctx) 158 } 159 160 if upperPathname, ok := mopts["upperdir"]; ok { 161 if fsopts.UpperRoot.Ok() { 162 ctx.Infof("overlay.FilesystemType.GetFilesystem: both upperdir and FilesystemOptions.UpperRoot are specified") 163 return nil, nil, linuxerr.EINVAL 164 } 165 delete(mopts, "upperdir") 166 // Linux overlayfs also requires a workdir when upperdir is 167 // specified; we don't, so silently ignore this option. 168 if workdir, ok := mopts["workdir"]; ok { 169 // Linux creates the "work" directory in `workdir`. 170 // Docker calls chown on it and fails if it doesn't 171 // exist. 
172 workdirPath := fspath.Parse(workdir + "/work") 173 if !workdirPath.Absolute { 174 ctx.Infof("overlay.FilesystemType.GetFilesystem: workdir %q must be absolute", workdir) 175 return nil, nil, linuxerr.EINVAL 176 } 177 pop := vfs.PathOperation{ 178 Root: vfsroot, 179 Start: vfsroot, 180 Path: workdirPath, 181 FollowFinalSymlink: false, 182 } 183 mode := vfs.MkdirOptions{ 184 Mode: linux.ModeUserAll, 185 } 186 if err := vfsObj.MkdirAt(ctx, creds, &pop, &mode); err != nil && err != linuxerr.EEXIST { 187 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to create %s/work: %v", workdir, err) 188 } 189 delete(mopts, "workdir") 190 } 191 upperPath := fspath.Parse(upperPathname) 192 if !upperPath.Absolute { 193 ctx.Infof("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname) 194 return nil, nil, linuxerr.EINVAL 195 } 196 upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ 197 Root: vfsroot, 198 Start: vfsroot, 199 Path: upperPath, 200 FollowFinalSymlink: true, 201 }, &vfs.GetDentryOptions{ 202 CheckSearchable: true, 203 }) 204 if err != nil { 205 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err) 206 return nil, nil, err 207 } 208 // TODO(b/286942303): Only tmpfs supports whiteouts and 209 // trusted.overlay attributes. Don't allow to use non-tmpfs 210 // mounts on upper levels for mounts created through the mount 211 // syscall. In gVisor configs, users can specify any 212 // configurations on their own risk. 
213 if !opts.InternalMount && upperRoot.Mount().Filesystem().FilesystemType().Name() != "tmpfs" { 214 return nil, nil, linuxerr.EINVAL 215 } 216 privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */) 217 upperRoot.DecRef(ctx) 218 if err != nil { 219 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err) 220 return nil, nil, err 221 } 222 defer privateUpperRoot.DecRef(ctx) 223 fsopts.UpperRoot = privateUpperRoot 224 } 225 226 if lowerPathnamesStr, ok := mopts["lowerdir"]; ok { 227 if len(fsopts.LowerRoots) != 0 { 228 ctx.Infof("overlay.FilesystemType.GetFilesystem: both lowerdir and FilesystemOptions.LowerRoots are specified") 229 return nil, nil, linuxerr.EINVAL 230 } 231 delete(mopts, "lowerdir") 232 lowerPathnames := strings.Split(lowerPathnamesStr, ":") 233 for _, lowerPathname := range lowerPathnames { 234 lowerPath := fspath.Parse(lowerPathname) 235 if !lowerPath.Absolute { 236 ctx.Infof("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname) 237 return nil, nil, linuxerr.EINVAL 238 } 239 lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ 240 Root: vfsroot, 241 Start: vfsroot, 242 Path: lowerPath, 243 FollowFinalSymlink: true, 244 }, &vfs.GetDentryOptions{ 245 CheckSearchable: true, 246 }) 247 if err != nil { 248 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err) 249 return nil, nil, err 250 } 251 privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */) 252 lowerRoot.DecRef(ctx) 253 if err != nil { 254 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err) 255 return nil, nil, err 256 } 257 defer privateLowerRoot.DecRef(ctx) 258 fsopts.LowerRoots = append(fsopts.LowerRoots, privateLowerRoot) 259 } 260 } 261 262 if len(mopts) != 0 { 263 
ctx.Infof("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts) 264 return nil, nil, linuxerr.EINVAL 265 } 266 267 if len(fsopts.LowerRoots) == 0 { 268 ctx.Infof("overlay.FilesystemType.GetFilesystem: at least one lower layer is required") 269 return nil, nil, linuxerr.EINVAL 270 } 271 if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() { 272 ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two lower layers are required when no upper layer is present") 273 return nil, nil, linuxerr.EINVAL 274 } 275 const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK 276 if len(fsopts.LowerRoots) > maxLowerLayers { 277 ctx.Infof("overlay.FilesystemType.GetFilesystem: %d lower layers specified, maximum %d", len(fsopts.LowerRoots), maxLowerLayers) 278 return nil, nil, linuxerr.EINVAL 279 } 280 281 // Allocate dirDevMinor. lowerDevMinors are allocated dynamically. 282 dirDevMinor, err := vfsObj.GetAnonBlockDevMinor() 283 if err != nil { 284 return nil, nil, err 285 } 286 287 // Take extra references held by the filesystem. 288 if fsopts.UpperRoot.Ok() { 289 fsopts.UpperRoot.IncRef() 290 } 291 for _, lowerRoot := range fsopts.LowerRoots { 292 lowerRoot.IncRef() 293 } 294 295 fs := &filesystem{ 296 opts: fsopts, 297 creds: creds.Fork(), 298 dirDevMinor: dirDevMinor, 299 lowerDevMinors: make(map[layerDevNumber]uint32), 300 dirInoCache: make(map[layerDevNoAndIno]uint64), 301 maxFilenameLen: linux.NAME_MAX, 302 } 303 fs.vfsfs.Init(vfsObj, &fstype, fs) 304 305 // Configure max filename length. Similar to what Linux does in 306 // fs/overlayfs/super.c:ovl_fill_super() -> ... -> ovl_check_namelen(). 
307 if fsopts.UpperRoot.Ok() { 308 if err := fs.updateMaxNameLen(ctx, creds, vfsObj, fs.opts.UpperRoot); err != nil { 309 ctx.Debugf("overlay.FilesystemType.GetFilesystem: failed to StatFSAt on upper layer root: %v", err) 310 } 311 } 312 for _, lowerRoot := range fsopts.LowerRoots { 313 if err := fs.updateMaxNameLen(ctx, creds, vfsObj, lowerRoot); err != nil { 314 ctx.Debugf("overlay.FilesystemType.GetFilesystem: failed to StatFSAt on lower layer root: %v", err) 315 } 316 } 317 318 // Construct the root dentry. 319 root := fs.newDentry() 320 root.refs = atomicbitops.FromInt64(1) 321 if fs.opts.UpperRoot.Ok() { 322 fs.opts.UpperRoot.IncRef() 323 root.copiedUp = atomicbitops.FromUint32(1) 324 root.upperVD = fs.opts.UpperRoot 325 } 326 for _, lowerRoot := range fs.opts.LowerRoots { 327 lowerRoot.IncRef() 328 root.lowerVDs = append(root.lowerVDs, lowerRoot) 329 } 330 rootTopVD := root.topLayer() 331 // Get metadata from the topmost layer. See fs.lookupLocked(). 332 const rootStatMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO 333 rootStat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ 334 Root: rootTopVD, 335 Start: rootTopVD, 336 }, &vfs.StatOptions{ 337 Mask: rootStatMask, 338 }) 339 if err != nil { 340 root.destroyLocked(ctx) 341 fs.vfsfs.DecRef(ctx) 342 return nil, nil, err 343 } 344 if rootStat.Mask&rootStatMask != rootStatMask { 345 root.destroyLocked(ctx) 346 fs.vfsfs.DecRef(ctx) 347 return nil, nil, linuxerr.EREMOTE 348 } 349 if isWhiteout(&rootStat) { 350 ctx.Infof("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout") 351 root.destroyLocked(ctx) 352 fs.vfsfs.DecRef(ctx) 353 return nil, nil, linuxerr.EINVAL 354 } 355 root.mode = atomicbitops.FromUint32(uint32(rootStat.Mode)) 356 root.uid = atomicbitops.FromUint32(rootStat.UID) 357 root.gid = atomicbitops.FromUint32(rootStat.GID) 358 if rootStat.Mode&linux.S_IFMT == linux.S_IFDIR { 359 root.devMajor = 
atomicbitops.FromUint32(linux.UNNAMED_MAJOR) 360 root.devMinor = atomicbitops.FromUint32(fs.dirDevMinor) 361 // For root dir, it is okay to use top most level's stat to compute inode 362 // number because we don't allow copy ups on root dentries. 363 root.ino.Store(fs.newDirIno(rootStat.DevMajor, rootStat.DevMinor, rootStat.Ino)) 364 } else if !root.upperVD.Ok() { 365 root.devMajor = atomicbitops.FromUint32(linux.UNNAMED_MAJOR) 366 rootDevMinor, err := fs.getLowerDevMinor(rootStat.DevMajor, rootStat.DevMinor) 367 if err != nil { 368 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to get device number for root: %v", err) 369 root.destroyLocked(ctx) 370 fs.vfsfs.DecRef(ctx) 371 return nil, nil, err 372 } 373 root.devMinor = atomicbitops.FromUint32(rootDevMinor) 374 root.ino.Store(rootStat.Ino) 375 } else { 376 root.devMajor = atomicbitops.FromUint32(rootStat.DevMajor) 377 root.devMinor = atomicbitops.FromUint32(rootStat.DevMinor) 378 root.ino.Store(rootStat.Ino) 379 } 380 381 return &fs.vfsfs, &root.vfsd, nil 382 } 383 384 // clonePrivateMount creates a non-recursive bind mount rooted at vd, not 385 // associated with any MountNamespace, and returns the root of the new mount. 386 // (This is required to ensure that each layer of an overlay comprises only a 387 // single mount, and therefore can't cross into e.g. the overlay filesystem 388 // itself, risking lock recursion.) A reference is held on the returned 389 // VirtualDentry. 390 func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forceReadOnly bool) (vfs.VirtualDentry, error) { 391 oldmnt := vd.Mount() 392 opts := oldmnt.Options() 393 if forceReadOnly { 394 opts.ReadOnly = true 395 } 396 newmnt := vfsObj.NewDisconnectedMount(oldmnt.Filesystem(), vd.Dentry(), &opts) 397 // Take a reference on the dentry which will be owned by the returned 398 // VirtualDentry. 
399 d := vd.Dentry() 400 d.IncRef() 401 return vfs.MakeVirtualDentry(newmnt, d), nil 402 } 403 404 // Release implements vfs.FilesystemImpl.Release. 405 func (fs *filesystem) Release(ctx context.Context) { 406 vfsObj := fs.vfsfs.VirtualFilesystem() 407 vfsObj.PutAnonBlockDevMinor(fs.dirDevMinor) 408 for _, lowerDevMinor := range fs.lowerDevMinors { 409 vfsObj.PutAnonBlockDevMinor(lowerDevMinor) 410 } 411 if fs.opts.UpperRoot.Ok() { 412 fs.opts.UpperRoot.DecRef(ctx) 413 } 414 for _, lowerRoot := range fs.opts.LowerRoots { 415 lowerRoot.DecRef(ctx) 416 } 417 } 418 419 // updateMaxNameLen is analogous to fs/overlayfs/super.c:ovl_check_namelen(). 420 func (fs *filesystem) updateMaxNameLen(ctx context.Context, creds *auth.Credentials, vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry) error { 421 statfs, err := vfsObj.StatFSAt(ctx, creds, &vfs.PathOperation{ 422 Root: vd, 423 Start: vd, 424 }) 425 if err != nil { 426 return err 427 } 428 if statfs.NameLength > fs.maxFilenameLen { 429 fs.maxFilenameLen = statfs.NameLength 430 } 431 return nil 432 } 433 434 func (fs *filesystem) statFS(ctx context.Context) (linux.Statfs, error) { 435 // Always statfs the root of the topmost layer. Compare Linux's 436 // fs/overlayfs/super.c:ovl_statfs(). 
437 var rootVD vfs.VirtualDentry 438 if fs.opts.UpperRoot.Ok() { 439 rootVD = fs.opts.UpperRoot 440 } else { 441 rootVD = fs.opts.LowerRoots[0] 442 } 443 fsstat, err := fs.vfsfs.VirtualFilesystem().StatFSAt(ctx, fs.creds, &vfs.PathOperation{ 444 Root: rootVD, 445 Start: rootVD, 446 }) 447 if err != nil { 448 return linux.Statfs{}, err 449 } 450 fsstat.Type = linux.OVERLAYFS_SUPER_MAGIC 451 return fsstat, nil 452 } 453 454 func (fs *filesystem) newDirIno(layerMajor, layerMinor uint32, layerIno uint64) uint64 { 455 fs.dirInoCacheMu.Lock() 456 defer fs.dirInoCacheMu.Unlock() 457 orig := layerDevNoAndIno{ 458 layerDevNumber: layerDevNumber{layerMajor, layerMinor}, 459 ino: layerIno, 460 } 461 if ino, ok := fs.dirInoCache[orig]; ok { 462 return ino 463 } 464 fs.lastDirIno++ 465 newIno := fs.lastDirIno 466 fs.dirInoCache[orig] = newIno 467 return newIno 468 } 469 470 func (fs *filesystem) getLowerDevMinor(layerMajor, layerMinor uint32) (uint32, error) { 471 fs.devMu.Lock() 472 defer fs.devMu.Unlock() 473 orig := layerDevNumber{layerMajor, layerMinor} 474 if minor, ok := fs.lowerDevMinors[orig]; ok { 475 return minor, nil 476 } 477 minor, err := fs.vfsfs.VirtualFilesystem().GetAnonBlockDevMinor() 478 if err != nil { 479 return 0, err 480 } 481 fs.lowerDevMinors[orig] = minor 482 return minor, nil 483 } 484 485 // IsDescendant implements vfs.FilesystemImpl.IsDescendant. 486 func (fs *filesystem) IsDescendant(vfsroot, vd vfs.VirtualDentry) bool { 487 return genericIsDescendant(vfsroot.Dentry(), vd.Dentry().Impl().(*dentry)) 488 } 489 490 // dentry implements vfs.DentryImpl. 491 // 492 // +stateify savable 493 type dentry struct { 494 vfsd vfs.Dentry 495 496 refs atomicbitops.Int64 497 498 // fs is the owning filesystem. fs is immutable. 499 fs *filesystem 500 501 // mode, uid, and gid are the file mode, owner, and group of the file in 502 // the topmost layer (and therefore the overlay file as well), and are used 503 // for permission checks on this dentry. 
These fields are protected by 504 // copyMu. 505 mode atomicbitops.Uint32 506 uid atomicbitops.Uint32 507 gid atomicbitops.Uint32 508 509 // copiedUp is 1 if this dentry has been copied-up (i.e. upperVD.Ok()) and 510 // 0 otherwise. 511 copiedUp atomicbitops.Uint32 512 513 // parent is the dentry corresponding to this dentry's parent directory. 514 // name is this dentry's name in parent. If this dentry is a filesystem 515 // root, parent is nil and name is the empty string. parent and name are 516 // protected by fs.renameMu. 517 parent atomic.Pointer[dentry] `state:".(*dentry)"` 518 name string 519 520 // If this dentry represents a directory, children maps the names of 521 // children for which dentries have been instantiated to those dentries, 522 // and dirents (if not nil) is a cache of dirents as returned by 523 // directoryFDs representing this directory. children is protected by 524 // dirMu. 525 dirMu dirMutex `state:"nosave"` 526 children map[string]*dentry 527 dirents []vfs.Dirent 528 529 // upperVD and lowerVDs are the files from the overlay filesystem's layers 530 // that comprise the file on the overlay filesystem. 531 // 532 // If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e. 533 // be copied up) with copyMu locked for writing; otherwise, it is 534 // immutable. lowerVDs is always immutable. 535 copyMu sync.RWMutex `state:"nosave"` 536 upperVD vfs.VirtualDentry 537 lowerVDs []vfs.VirtualDentry 538 539 // inlineLowerVDs backs lowerVDs in the common case where len(lowerVDs) <= 540 // len(inlineLowerVDs). 541 inlineLowerVDs [1]vfs.VirtualDentry 542 543 // devMajor, devMinor, and ino are the device major/minor and inode numbers 544 // used by this dentry. These fields are protected by copyMu. 
545 devMajor atomicbitops.Uint32 546 devMinor atomicbitops.Uint32 547 ino atomicbitops.Uint64 548 549 // If this dentry represents a regular file, then: 550 // 551 // - mapsMu is used to synchronize between copy-up and memmap.Mappable 552 // methods on dentry preceding mm.MemoryManager.activeMu in the lock order. 553 // 554 // - dataMu is used to synchronize between copy-up and 555 // dentry.(memmap.Mappable).Translate. 556 // 557 // - lowerMappings tracks memory mappings of the file. lowerMappings is 558 // used to invalidate mappings of the lower layer when the file is copied 559 // up to ensure that they remain coherent with subsequent writes to the 560 // file. (Note that, as of this writing, Linux overlayfs does not do this; 561 // this feature is a gVisor extension.) lowerMappings is protected by 562 // mapsMu. 563 // 564 // - If this dentry is copied-up, then wrappedMappable is the Mappable 565 // obtained from a call to the current top layer's 566 // FileDescription.ConfigureMMap(). Once wrappedMappable becomes non-nil 567 // (from a call to regularFileFD.ensureMappable()), it cannot become nil. 568 // wrappedMappable is protected by mapsMu and dataMu. 569 // 570 // - isMappable is non-zero iff wrappedMappable is non-nil. isMappable is 571 // accessed using atomic memory operations. 572 // 573 // - wrappedMappable is protected by mapsMu and dataMu. In addition, 574 // it has to be immutable if copyMu is taken for write. 575 // copyUpMaybeSyntheticMountpointLocked relies on this behavior. 576 mapsMu mapsMutex `state:"nosave"` 577 lowerMappings memmap.MappingSet 578 dataMu dataRWMutex `state:"nosave"` 579 wrappedMappable memmap.Mappable 580 isMappable atomicbitops.Uint32 581 582 locks vfs.FileLocks 583 584 // watches is the set of inotify watches on the file represented by this dentry. 
585 // 586 // Note that hard links to the same file will not share the same set of 587 // watches, due to the fact that we do not have inode structures in this 588 // overlay implementation. 589 watches vfs.Watches 590 } 591 592 // newDentry creates a new dentry. The dentry initially has no references; it 593 // is the caller's responsibility to set the dentry's reference count and/or 594 // call dentry.destroy() as appropriate. The dentry is initially invalid in 595 // that it contains no layers; the caller is responsible for setting them. 596 func (fs *filesystem) newDentry() *dentry { 597 d := &dentry{ 598 fs: fs, 599 } 600 d.lowerVDs = d.inlineLowerVDs[:0] 601 d.vfsd.Init(d) 602 refs.Register(d) 603 return d 604 } 605 606 // IncRef implements vfs.DentryImpl.IncRef. 607 func (d *dentry) IncRef() { 608 // d.refs may be 0 if d.fs.renameMu is locked, which serializes against 609 // d.checkDropLocked(). 610 r := d.refs.Add(1) 611 if d.LogRefs() { 612 refs.LogIncRef(d, r) 613 } 614 } 615 616 // TryIncRef implements vfs.DentryImpl.TryIncRef. 617 func (d *dentry) TryIncRef() bool { 618 for { 619 r := d.refs.Load() 620 if r <= 0 { 621 return false 622 } 623 if d.refs.CompareAndSwap(r, r+1) { 624 if d.LogRefs() { 625 refs.LogTryIncRef(d, r+1) 626 } 627 return true 628 } 629 } 630 } 631 632 // DecRef implements vfs.DentryImpl.DecRef. 
633 func (d *dentry) DecRef(ctx context.Context) { 634 r := d.refs.Add(-1) 635 if d.LogRefs() { 636 refs.LogDecRef(d, r) 637 } 638 if r == 0 { 639 d.fs.renameMu.Lock() 640 d.checkDropLocked(ctx) 641 d.fs.renameMu.Unlock() 642 } else if r < 0 { 643 panic("overlay.dentry.DecRef() called without holding a reference") 644 } 645 } 646 647 func (d *dentry) decRefLocked(ctx context.Context) { 648 r := d.refs.Add(-1) 649 if d.LogRefs() { 650 refs.LogDecRef(d, r) 651 } 652 if r == 0 { 653 d.checkDropLocked(ctx) 654 } else if r < 0 { 655 panic("overlay.dentry.decRefLocked() called without holding a reference") 656 } 657 } 658 659 // checkDropLocked should be called after d's reference count becomes 0 or it 660 // becomes deleted. 661 // 662 // Preconditions: d.fs.renameMu must be locked for writing. 663 func (d *dentry) checkDropLocked(ctx context.Context) { 664 // Dentries with a positive reference count must be retained. (The only way 665 // to obtain a reference on a dentry with zero references is via path 666 // resolution, which requires renameMu, so if d.refs is zero then it will 667 // remain zero while we hold renameMu for writing.) Dentries with a 668 // negative reference count have already been destroyed. 669 if d.refs.Load() != 0 { 670 return 671 } 672 673 // Make sure that we do not lose watches on dentries that have not been 674 // deleted. Note that overlayfs never calls VFS.InvalidateDentry(), so 675 // d.vfsd.IsDead() indicates that d was deleted. 676 if !d.vfsd.IsDead() && d.watches.Size() > 0 { 677 return 678 } 679 680 // Refs is still zero; destroy it. 681 d.destroyLocked(ctx) 682 return 683 } 684 685 // destroyLocked destroys the dentry. 686 // 687 // Preconditions: 688 // - d.fs.renameMu must be locked for writing. 689 // - d.refs == 0. 690 func (d *dentry) destroyLocked(ctx context.Context) { 691 switch d.refs.Load() { 692 case 0: 693 // Mark the dentry destroyed. 
694 d.refs.Store(-1) 695 case -1: 696 panic("overlay.dentry.destroyLocked() called on already destroyed dentry") 697 default: 698 panic("overlay.dentry.destroyLocked() called with references on the dentry") 699 } 700 701 if d.upperVD.Ok() { 702 d.upperVD.DecRef(ctx) 703 } 704 for _, lowerVD := range d.lowerVDs { 705 lowerVD.DecRef(ctx) 706 } 707 708 d.watches.HandleDeletion(ctx) 709 710 if parent := d.parent.Load(); parent != nil { 711 parent.dirMu.Lock() 712 if !d.vfsd.IsDead() { 713 delete(parent.children, d.name) 714 } 715 parent.dirMu.Unlock() 716 // Drop the reference held by d on its parent without recursively 717 // locking d.fs.renameMu. 718 parent.decRefLocked(ctx) 719 } 720 refs.Unregister(d) 721 } 722 723 // RefType implements refs.CheckedObject.Type. 724 func (d *dentry) RefType() string { 725 return "overlay.dentry" 726 } 727 728 // LeakMessage implements refs.CheckedObject.LeakMessage. 729 func (d *dentry) LeakMessage() string { 730 return fmt.Sprintf("[overlay.dentry %p] reference count of %d instead of -1", d, d.refs.Load()) 731 } 732 733 // LogRefs implements refs.CheckedObject.LogRefs. 734 // 735 // This should only be set to true for debugging purposes, as it can generate an 736 // extremely large amount of output and drastically degrade performance. 737 func (d *dentry) LogRefs() bool { 738 return false 739 } 740 741 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. 742 func (d *dentry) InotifyWithParent(ctx context.Context, events uint32, cookie uint32, et vfs.EventType) { 743 if d.isDir() { 744 events |= linux.IN_ISDIR 745 } 746 747 // overlayfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates 748 // that d was deleted. 749 deleted := d.vfsd.IsDead() 750 751 d.fs.renameMu.RLock() 752 // The ordering below is important, Linux always notifies the parent first. 
753 if parent := d.parent.Load(); parent != nil { 754 parent.watches.Notify(ctx, d.name, events, cookie, et, deleted) 755 } 756 d.watches.Notify(ctx, "", events, cookie, et, deleted) 757 d.fs.renameMu.RUnlock() 758 } 759 760 // Watches implements vfs.DentryImpl.Watches. 761 func (d *dentry) Watches() *vfs.Watches { 762 return &d.watches 763 } 764 765 // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. 766 func (d *dentry) OnZeroWatches(ctx context.Context) { 767 if d.refs.Load() == 0 { 768 d.fs.renameMu.Lock() 769 d.checkDropLocked(ctx) 770 d.fs.renameMu.Unlock() 771 } 772 } 773 774 // iterLayers invokes yield on each layer comprising d, from top to bottom. If 775 // any call to yield returns false, iterLayer stops iteration. 776 func (d *dentry) iterLayers(yield func(vd vfs.VirtualDentry, isUpper bool) bool) { 777 if d.isCopiedUp() { 778 if !yield(d.upperVD, true) { 779 return 780 } 781 } 782 for _, lowerVD := range d.lowerVDs { 783 if !yield(lowerVD, false) { 784 return 785 } 786 } 787 } 788 789 func (d *dentry) topLayerInfo() (vd vfs.VirtualDentry, isUpper bool) { 790 if d.isCopiedUp() { 791 return d.upperVD, true 792 } 793 return d.lowerVDs[0], false 794 } 795 796 func (d *dentry) topLayer() vfs.VirtualDentry { 797 vd, _ := d.topLayerInfo() 798 return vd 799 } 800 801 func (d *dentry) topLookupLayer() lookupLayer { 802 if d.upperVD.Ok() { 803 return lookupLayerUpper 804 } 805 return lookupLayerLower 806 } 807 808 func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { 809 return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(d.mode.Load()), auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())) 810 } 811 812 func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { 813 mode := linux.FileMode(d.mode.Load()) 814 kuid := auth.KUID(d.uid.Load()) 815 kgid := auth.KGID(d.gid.Load()) 816 if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { 817 return 
err 818 } 819 return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) 820 } 821 822 // statInternalMask is the set of stat fields that is set by 823 // dentry.statInternalTo(). 824 const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO 825 826 // statInternalTo writes fields to stat that are stored in d, and therefore do 827 // not requiring invoking StatAt on the overlay's layers. 828 func (d *dentry) statInternalTo(ctx context.Context, opts *vfs.StatOptions, stat *linux.Statx) { 829 stat.Mask |= statInternalMask 830 if d.isDir() { 831 // Linux sets nlink to 1 for merged directories 832 // (fs/overlayfs/inode.c:ovl_getattr()); we set it to 2 because this is 833 // correct more often ("." and the directory's entry in its parent), 834 // and some of our tests expect this. 835 stat.Nlink = 2 836 } 837 stat.UID = d.uid.Load() 838 stat.GID = d.gid.Load() 839 stat.Mode = uint16(d.mode.Load()) 840 stat.Ino = d.ino.Load() 841 stat.DevMajor = d.devMajor.Load() 842 stat.DevMinor = d.devMinor.Load() 843 } 844 845 // Preconditions: d.copyMu must be locked for writing. 846 func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) { 847 if opts.Stat.Mask&linux.STATX_MODE != 0 { 848 d.mode.Store((d.mode.RacyLoad() & linux.S_IFMT) | uint32(opts.Stat.Mode&^linux.S_IFMT)) 849 } 850 if opts.Stat.Mask&linux.STATX_UID != 0 { 851 d.uid.Store(opts.Stat.UID) 852 } 853 if opts.Stat.Mask&linux.STATX_GID != 0 { 854 d.gid.Store(opts.Stat.GID) 855 } 856 } 857 858 func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error { 859 return vfs.CheckDeleteSticky( 860 creds, 861 linux.FileMode(d.mode.Load()), 862 auth.KUID(d.uid.Load()), 863 auth.KUID(child.uid.Load()), 864 auth.KGID(child.gid.Load()), 865 ) 866 } 867 868 // newChildOwnerStat returns a Statx for configuring the UID, GID, and mode of 869 // children. 
870 func (d *dentry) newChildOwnerStat(mode linux.FileMode, creds *auth.Credentials) linux.Statx { 871 stat := linux.Statx{ 872 Mask: uint32(linux.STATX_UID | linux.STATX_GID), 873 UID: uint32(creds.EffectiveKUID), 874 GID: uint32(creds.EffectiveKGID), 875 } 876 // Set GID and possibly the SGID bit if the parent is an SGID directory. 877 d.copyMu.RLock() 878 defer d.copyMu.RUnlock() 879 if d.mode.Load()&linux.ModeSetGID == linux.ModeSetGID { 880 stat.GID = d.gid.Load() 881 if stat.Mode&linux.ModeDirectory == linux.ModeDirectory { 882 stat.Mode = uint16(mode) | linux.ModeSetGID 883 stat.Mask |= linux.STATX_MODE 884 } 885 } 886 return stat 887 } 888 889 // fileDescription is embedded by overlay implementations of 890 // vfs.FileDescriptionImpl. 891 // 892 // +stateify savable 893 type fileDescription struct { 894 vfsfd vfs.FileDescription 895 vfs.FileDescriptionDefaultImpl 896 vfs.LockFD 897 } 898 899 func (fd *fileDescription) filesystem() *filesystem { 900 return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) 901 } 902 903 func (fd *fileDescription) dentry() *dentry { 904 return fd.vfsfd.Dentry().Impl().(*dentry) 905 } 906 907 // ListXattr implements vfs.FileDescriptionImpl.ListXattr. 908 func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { 909 return fd.filesystem().listXattr(ctx, fd.dentry(), size) 910 } 911 912 // GetXattr implements vfs.FileDescriptionImpl.GetXattr. 913 func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { 914 return fd.filesystem().getXattr(ctx, fd.dentry(), auth.CredentialsFromContext(ctx), &opts) 915 } 916 917 // SetXattr implements vfs.FileDescriptionImpl.SetXattr. 
918 func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { 919 fs := fd.filesystem() 920 fs.renameMu.RLock() 921 defer fs.renameMu.RUnlock() 922 return fs.setXattrLocked(ctx, fd.dentry(), fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), &opts) 923 } 924 925 // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. 926 func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { 927 fs := fd.filesystem() 928 fs.renameMu.RLock() 929 defer fs.renameMu.RUnlock() 930 return fs.removeXattrLocked(ctx, fd.dentry(), fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), name) 931 }