github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/fsimpl/overlay/overlay.go (about) 1 // Copyright 2020 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package overlay provides an overlay filesystem implementation, which 16 // synthesizes a filesystem by composing one or more immutable filesystems 17 // ("lower layers") with an optional mutable filesystem ("upper layer"). 18 // 19 // Lock order: 20 // 21 // directoryFD.mu / regularFileFD.mu 22 // filesystem.renameMu 23 // dentry.dirMu 24 // dentry.copyMu 25 // filesystem.devMu 26 // *** "memmap.Mappable locks" below this point 27 // dentry.mapsMu 28 // *** "memmap.Mappable locks taken by Translate" below this point 29 // dentry.dataMu 30 // 31 // Locking dentry.dirMu in multiple dentries requires that parent dentries are 32 // locked before child dentries, and that filesystem.renameMu is locked to 33 // stabilize this relationship. 
package overlay

import (
	"fmt"
	"strings"

	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
	"github.com/MerlinKodo/gvisor/pkg/context"
	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
	"github.com/MerlinKodo/gvisor/pkg/fspath"
	"github.com/MerlinKodo/gvisor/pkg/refs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
	"github.com/MerlinKodo/gvisor/pkg/sentry/memmap"
	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
	"github.com/MerlinKodo/gvisor/pkg/sync"
)

// Name is the default filesystem name.
const Name = "overlay"

// FilesystemType implements vfs.FilesystemType.
//
// +stateify savable
type FilesystemType struct{}

// Name implements vfs.FilesystemType.Name. It returns the package-level
// constant Name.
func (FilesystemType) Name() string {
	return Name
}

// Release implements vfs.FilesystemType.Release. It is a no-op.
func (FilesystemType) Release(ctx context.Context) {}

// FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to
// FilesystemType.GetFilesystem.
//
// +stateify savable
type FilesystemOptions struct {
	// Callers passing FilesystemOptions to
	// overlay.FilesystemType.GetFilesystem() are responsible for ensuring that
	// the vfs.Mounts comprising the layers of the overlay filesystem do not
	// contain submounts.

	// If UpperRoot.Ok(), it is the root of the writable upper layer of the
	// overlay.
	UpperRoot vfs.VirtualDentry

	// LowerRoots contains the roots of the immutable lower layers of the
	// overlay. LowerRoots is immutable.
	LowerRoots []vfs.VirtualDentry
}

// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	vfsfs vfs.Filesystem

	// Immutable options.
	opts FilesystemOptions

	// creds is a copy of the filesystem's creator's credentials, which are
	// used for accesses to the filesystem's layers. creds is immutable.
	creds *auth.Credentials

	// dirDevMinor is the device minor number used for directories. dirDevMinor
	// is immutable.
	dirDevMinor uint32

	// lowerDevMinors maps device numbers from lower layer filesystems to
	// device minor numbers assigned to non-directory files originating from
	// that filesystem. (This remapping is necessary for lower layers because a
	// file on a lower layer, and that same file on an overlay, are
	// distinguishable because they will diverge after copy-up; this isn't true
	// for non-directory files already on the upper layer.) lowerDevMinors is
	// protected by devMu.
	devMu          devMutex `state:"nosave"`
	lowerDevMinors map[layerDevNumber]uint32

	// renameMu synchronizes renaming with non-renaming operations in order to
	// ensure consistent lock ordering between dentry.dirMu in different
	// dentries.
	renameMu renameRWMutex `state:"nosave"`

	// dirInoCache caches overlay-private directory inode numbers by mapped
	// bottommost device numbers and inode number. dirInoCache is protected by
	// dirInoCacheMu.
	dirInoCacheMu dirInoCacheMutex `state:"nosave"`
	dirInoCache   map[layerDevNoAndIno]uint64

	// lastDirIno is the last inode number assigned to a directory. lastDirIno
	// is protected by dirInoCacheMu.
	lastDirIno uint64

	// maxFilenameLen is the maximum filename length allowed by the overlayfs.
	maxFilenameLen uint64
}

// layerDevNumber is a (major, minor) device number pair identifying a layer
// filesystem's device. It is the key type of filesystem.lowerDevMinors.
//
// +stateify savable
type layerDevNumber struct {
	major uint32
	minor uint32
}

// layerDevNoAndIno pairs a layer device number with an inode number on that
// device. It is the key type of filesystem.dirInoCache.
//
// +stateify savable
type layerDevNoAndIno struct {
	layerDevNumber
	ino uint64
}

// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
146 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { 147 mopts := vfs.GenericParseMountOptions(opts.Data) 148 fsoptsRaw := opts.InternalData 149 fsopts, ok := fsoptsRaw.(FilesystemOptions) 150 if fsoptsRaw != nil && !ok { 151 ctx.Infof("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw) 152 return nil, nil, linuxerr.EINVAL 153 } 154 vfsroot := vfs.RootFromContext(ctx) 155 if vfsroot.Ok() { 156 defer vfsroot.DecRef(ctx) 157 } 158 159 if upperPathname, ok := mopts["upperdir"]; ok { 160 if fsopts.UpperRoot.Ok() { 161 ctx.Infof("overlay.FilesystemType.GetFilesystem: both upperdir and FilesystemOptions.UpperRoot are specified") 162 return nil, nil, linuxerr.EINVAL 163 } 164 delete(mopts, "upperdir") 165 // Linux overlayfs also requires a workdir when upperdir is 166 // specified; we don't, so silently ignore this option. 167 if workdir, ok := mopts["workdir"]; ok { 168 // Linux creates the "work" directory in `workdir`. 169 // Docker calls chown on it and fails if it doesn't 170 // exist. 
171 workdirPath := fspath.Parse(workdir + "/work") 172 if !workdirPath.Absolute { 173 ctx.Infof("overlay.FilesystemType.GetFilesystem: workdir %q must be absolute", workdir) 174 return nil, nil, linuxerr.EINVAL 175 } 176 pop := vfs.PathOperation{ 177 Root: vfsroot, 178 Start: vfsroot, 179 Path: workdirPath, 180 FollowFinalSymlink: false, 181 } 182 mode := vfs.MkdirOptions{ 183 Mode: linux.ModeUserAll, 184 } 185 if err := vfsObj.MkdirAt(ctx, creds, &pop, &mode); err != nil && err != linuxerr.EEXIST { 186 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to create %s/work: %v", workdir, err) 187 } 188 delete(mopts, "workdir") 189 } 190 upperPath := fspath.Parse(upperPathname) 191 if !upperPath.Absolute { 192 ctx.Infof("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname) 193 return nil, nil, linuxerr.EINVAL 194 } 195 upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ 196 Root: vfsroot, 197 Start: vfsroot, 198 Path: upperPath, 199 FollowFinalSymlink: true, 200 }, &vfs.GetDentryOptions{ 201 CheckSearchable: true, 202 }) 203 if err != nil { 204 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err) 205 return nil, nil, err 206 } 207 privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */) 208 upperRoot.DecRef(ctx) 209 if err != nil { 210 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err) 211 return nil, nil, err 212 } 213 defer privateUpperRoot.DecRef(ctx) 214 fsopts.UpperRoot = privateUpperRoot 215 } 216 217 if lowerPathnamesStr, ok := mopts["lowerdir"]; ok { 218 if len(fsopts.LowerRoots) != 0 { 219 ctx.Infof("overlay.FilesystemType.GetFilesystem: both lowerdir and FilesystemOptions.LowerRoots are specified") 220 return nil, nil, linuxerr.EINVAL 221 } 222 delete(mopts, "lowerdir") 223 lowerPathnames := strings.Split(lowerPathnamesStr, ":") 224 for _, 
lowerPathname := range lowerPathnames { 225 lowerPath := fspath.Parse(lowerPathname) 226 if !lowerPath.Absolute { 227 ctx.Infof("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname) 228 return nil, nil, linuxerr.EINVAL 229 } 230 lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ 231 Root: vfsroot, 232 Start: vfsroot, 233 Path: lowerPath, 234 FollowFinalSymlink: true, 235 }, &vfs.GetDentryOptions{ 236 CheckSearchable: true, 237 }) 238 if err != nil { 239 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err) 240 return nil, nil, err 241 } 242 privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */) 243 lowerRoot.DecRef(ctx) 244 if err != nil { 245 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err) 246 return nil, nil, err 247 } 248 defer privateLowerRoot.DecRef(ctx) 249 fsopts.LowerRoots = append(fsopts.LowerRoots, privateLowerRoot) 250 } 251 } 252 253 if len(mopts) != 0 { 254 ctx.Infof("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts) 255 return nil, nil, linuxerr.EINVAL 256 } 257 258 if len(fsopts.LowerRoots) == 0 { 259 ctx.Infof("overlay.FilesystemType.GetFilesystem: at least one lower layer is required") 260 return nil, nil, linuxerr.EINVAL 261 } 262 if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() { 263 ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two lower layers are required when no upper layer is present") 264 return nil, nil, linuxerr.EINVAL 265 } 266 const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK 267 if len(fsopts.LowerRoots) > maxLowerLayers { 268 ctx.Infof("overlay.FilesystemType.GetFilesystem: %d lower layers specified, maximum %d", len(fsopts.LowerRoots), maxLowerLayers) 269 return nil, nil, linuxerr.EINVAL 270 } 271 272 // Allocate dirDevMinor. lowerDevMinors are allocated dynamically. 
273 dirDevMinor, err := vfsObj.GetAnonBlockDevMinor() 274 if err != nil { 275 return nil, nil, err 276 } 277 278 // Take extra references held by the filesystem. 279 if fsopts.UpperRoot.Ok() { 280 fsopts.UpperRoot.IncRef() 281 } 282 for _, lowerRoot := range fsopts.LowerRoots { 283 lowerRoot.IncRef() 284 } 285 286 fs := &filesystem{ 287 opts: fsopts, 288 creds: creds.Fork(), 289 dirDevMinor: dirDevMinor, 290 lowerDevMinors: make(map[layerDevNumber]uint32), 291 dirInoCache: make(map[layerDevNoAndIno]uint64), 292 maxFilenameLen: linux.NAME_MAX, 293 } 294 fs.vfsfs.Init(vfsObj, &fstype, fs) 295 296 // Configure max filename length. Similar to what Linux does in 297 // fs/overlayfs/super.c:ovl_fill_super() -> ... -> ovl_check_namelen(). 298 if fsopts.UpperRoot.Ok() { 299 if err := fs.updateMaxNameLen(ctx, creds, vfsObj, fs.opts.UpperRoot); err != nil { 300 ctx.Debugf("overlay.FilesystemType.GetFilesystem: failed to StatFSAt on upper layer root: %v", err) 301 } 302 } 303 for _, lowerRoot := range fsopts.LowerRoots { 304 if err := fs.updateMaxNameLen(ctx, creds, vfsObj, lowerRoot); err != nil { 305 ctx.Debugf("overlay.FilesystemType.GetFilesystem: failed to StatFSAt on lower layer root: %v", err) 306 } 307 } 308 309 // Construct the root dentry. 310 root := fs.newDentry() 311 root.refs = atomicbitops.FromInt64(1) 312 if fs.opts.UpperRoot.Ok() { 313 fs.opts.UpperRoot.IncRef() 314 root.copiedUp = atomicbitops.FromUint32(1) 315 root.upperVD = fs.opts.UpperRoot 316 } 317 for _, lowerRoot := range fs.opts.LowerRoots { 318 lowerRoot.IncRef() 319 root.lowerVDs = append(root.lowerVDs, lowerRoot) 320 } 321 rootTopVD := root.topLayer() 322 // Get metadata from the topmost layer. See fs.lookupLocked(). 
323 const rootStatMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO 324 rootStat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ 325 Root: rootTopVD, 326 Start: rootTopVD, 327 }, &vfs.StatOptions{ 328 Mask: rootStatMask, 329 }) 330 if err != nil { 331 root.destroyLocked(ctx) 332 fs.vfsfs.DecRef(ctx) 333 return nil, nil, err 334 } 335 if rootStat.Mask&rootStatMask != rootStatMask { 336 root.destroyLocked(ctx) 337 fs.vfsfs.DecRef(ctx) 338 return nil, nil, linuxerr.EREMOTE 339 } 340 if isWhiteout(&rootStat) { 341 ctx.Infof("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout") 342 root.destroyLocked(ctx) 343 fs.vfsfs.DecRef(ctx) 344 return nil, nil, linuxerr.EINVAL 345 } 346 root.mode = atomicbitops.FromUint32(uint32(rootStat.Mode)) 347 root.uid = atomicbitops.FromUint32(rootStat.UID) 348 root.gid = atomicbitops.FromUint32(rootStat.GID) 349 if rootStat.Mode&linux.S_IFMT == linux.S_IFDIR { 350 root.devMajor = atomicbitops.FromUint32(linux.UNNAMED_MAJOR) 351 root.devMinor = atomicbitops.FromUint32(fs.dirDevMinor) 352 // For root dir, it is okay to use top most level's stat to compute inode 353 // number because we don't allow copy ups on root dentries. 
354 root.ino.Store(fs.newDirIno(rootStat.DevMajor, rootStat.DevMinor, rootStat.Ino)) 355 } else if !root.upperVD.Ok() { 356 root.devMajor = atomicbitops.FromUint32(linux.UNNAMED_MAJOR) 357 rootDevMinor, err := fs.getLowerDevMinor(rootStat.DevMajor, rootStat.DevMinor) 358 if err != nil { 359 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to get device number for root: %v", err) 360 root.destroyLocked(ctx) 361 fs.vfsfs.DecRef(ctx) 362 return nil, nil, err 363 } 364 root.devMinor = atomicbitops.FromUint32(rootDevMinor) 365 root.ino.Store(rootStat.Ino) 366 } else { 367 root.devMajor = atomicbitops.FromUint32(rootStat.DevMajor) 368 root.devMinor = atomicbitops.FromUint32(rootStat.DevMinor) 369 root.ino.Store(rootStat.Ino) 370 } 371 372 return &fs.vfsfs, &root.vfsd, nil 373 } 374 375 // clonePrivateMount creates a non-recursive bind mount rooted at vd, not 376 // associated with any MountNamespace, and returns the root of the new mount. 377 // (This is required to ensure that each layer of an overlay comprises only a 378 // single mount, and therefore can't cross into e.g. the overlay filesystem 379 // itself, risking lock recursion.) A reference is held on the returned 380 // VirtualDentry. 381 func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forceReadOnly bool) (vfs.VirtualDentry, error) { 382 oldmnt := vd.Mount() 383 opts := oldmnt.Options() 384 if forceReadOnly { 385 opts.ReadOnly = true 386 } 387 newmnt := vfsObj.NewDisconnectedMount(oldmnt.Filesystem(), vd.Dentry(), &opts) 388 // Take a reference on the dentry which will be owned by the returned 389 // VirtualDentry. 390 d := vd.Dentry() 391 d.IncRef() 392 return vfs.MakeVirtualDentry(newmnt, d), nil 393 } 394 395 // Release implements vfs.FilesystemImpl.Release. 
396 func (fs *filesystem) Release(ctx context.Context) { 397 vfsObj := fs.vfsfs.VirtualFilesystem() 398 vfsObj.PutAnonBlockDevMinor(fs.dirDevMinor) 399 for _, lowerDevMinor := range fs.lowerDevMinors { 400 vfsObj.PutAnonBlockDevMinor(lowerDevMinor) 401 } 402 if fs.opts.UpperRoot.Ok() { 403 fs.opts.UpperRoot.DecRef(ctx) 404 } 405 for _, lowerRoot := range fs.opts.LowerRoots { 406 lowerRoot.DecRef(ctx) 407 } 408 } 409 410 // updateMaxNameLen is analogous to fs/overlayfs/super.c:ovl_check_namelen(). 411 func (fs *filesystem) updateMaxNameLen(ctx context.Context, creds *auth.Credentials, vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry) error { 412 statfs, err := vfsObj.StatFSAt(ctx, creds, &vfs.PathOperation{ 413 Root: vd, 414 Start: vd, 415 }) 416 if err != nil { 417 return err 418 } 419 if statfs.NameLength > fs.maxFilenameLen { 420 fs.maxFilenameLen = statfs.NameLength 421 } 422 return nil 423 } 424 425 func (fs *filesystem) statFS(ctx context.Context) (linux.Statfs, error) { 426 // Always statfs the root of the topmost layer. Compare Linux's 427 // fs/overlayfs/super.c:ovl_statfs(). 
428 var rootVD vfs.VirtualDentry 429 if fs.opts.UpperRoot.Ok() { 430 rootVD = fs.opts.UpperRoot 431 } else { 432 rootVD = fs.opts.LowerRoots[0] 433 } 434 fsstat, err := fs.vfsfs.VirtualFilesystem().StatFSAt(ctx, fs.creds, &vfs.PathOperation{ 435 Root: rootVD, 436 Start: rootVD, 437 }) 438 if err != nil { 439 return linux.Statfs{}, err 440 } 441 fsstat.Type = linux.OVERLAYFS_SUPER_MAGIC 442 return fsstat, nil 443 } 444 445 func (fs *filesystem) newDirIno(layerMajor, layerMinor uint32, layerIno uint64) uint64 { 446 fs.dirInoCacheMu.Lock() 447 defer fs.dirInoCacheMu.Unlock() 448 orig := layerDevNoAndIno{ 449 layerDevNumber: layerDevNumber{layerMajor, layerMinor}, 450 ino: layerIno, 451 } 452 if ino, ok := fs.dirInoCache[orig]; ok { 453 return ino 454 } 455 fs.lastDirIno++ 456 newIno := fs.lastDirIno 457 fs.dirInoCache[orig] = newIno 458 return newIno 459 } 460 461 func (fs *filesystem) getLowerDevMinor(layerMajor, layerMinor uint32) (uint32, error) { 462 fs.devMu.Lock() 463 defer fs.devMu.Unlock() 464 orig := layerDevNumber{layerMajor, layerMinor} 465 if minor, ok := fs.lowerDevMinors[orig]; ok { 466 return minor, nil 467 } 468 minor, err := fs.vfsfs.VirtualFilesystem().GetAnonBlockDevMinor() 469 if err != nil { 470 return 0, err 471 } 472 fs.lowerDevMinors[orig] = minor 473 return minor, nil 474 } 475 476 // dentry implements vfs.DentryImpl. 477 // 478 // +stateify savable 479 type dentry struct { 480 vfsd vfs.Dentry 481 482 refs atomicbitops.Int64 483 484 // fs is the owning filesystem. fs is immutable. 485 fs *filesystem 486 487 // mode, uid, and gid are the file mode, owner, and group of the file in 488 // the topmost layer (and therefore the overlay file as well), and are used 489 // for permission checks on this dentry. These fields are protected by 490 // copyMu. 491 mode atomicbitops.Uint32 492 uid atomicbitops.Uint32 493 gid atomicbitops.Uint32 494 495 // copiedUp is 1 if this dentry has been copied-up (i.e. upperVD.Ok()) and 496 // 0 otherwise. 
497 copiedUp atomicbitops.Uint32 498 499 // parent is the dentry corresponding to this dentry's parent directory. 500 // name is this dentry's name in parent. If this dentry is a filesystem 501 // root, parent is nil and name is the empty string. parent and name are 502 // protected by fs.renameMu. 503 parent *dentry 504 name string 505 506 // If this dentry represents a directory, children maps the names of 507 // children for which dentries have been instantiated to those dentries, 508 // and dirents (if not nil) is a cache of dirents as returned by 509 // directoryFDs representing this directory. children is protected by 510 // dirMu. 511 dirMu dirMutex `state:"nosave"` 512 children map[string]*dentry 513 dirents []vfs.Dirent 514 515 // upperVD and lowerVDs are the files from the overlay filesystem's layers 516 // that comprise the file on the overlay filesystem. 517 // 518 // If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e. 519 // be copied up) with copyMu locked for writing; otherwise, it is 520 // immutable. lowerVDs is always immutable. 521 copyMu sync.RWMutex `state:"nosave"` 522 upperVD vfs.VirtualDentry 523 lowerVDs []vfs.VirtualDentry 524 525 // inlineLowerVDs backs lowerVDs in the common case where len(lowerVDs) <= 526 // len(inlineLowerVDs). 527 inlineLowerVDs [1]vfs.VirtualDentry 528 529 // devMajor, devMinor, and ino are the device major/minor and inode numbers 530 // used by this dentry. These fields are protected by copyMu. 531 devMajor atomicbitops.Uint32 532 devMinor atomicbitops.Uint32 533 ino atomicbitops.Uint64 534 535 // If this dentry represents a regular file, then: 536 // 537 // - mapsMu is used to synchronize between copy-up and memmap.Mappable 538 // methods on dentry preceding mm.MemoryManager.activeMu in the lock order. 539 // 540 // - dataMu is used to synchronize between copy-up and 541 // dentry.(memmap.Mappable).Translate. 542 // 543 // - lowerMappings tracks memory mappings of the file. 
lowerMappings is 544 // used to invalidate mappings of the lower layer when the file is copied 545 // up to ensure that they remain coherent with subsequent writes to the 546 // file. (Note that, as of this writing, Linux overlayfs does not do this; 547 // this feature is a gVisor extension.) lowerMappings is protected by 548 // mapsMu. 549 // 550 // - If this dentry is copied-up, then wrappedMappable is the Mappable 551 // obtained from a call to the current top layer's 552 // FileDescription.ConfigureMMap(). Once wrappedMappable becomes non-nil 553 // (from a call to regularFileFD.ensureMappable()), it cannot become nil. 554 // wrappedMappable is protected by mapsMu and dataMu. 555 // 556 // - isMappable is non-zero iff wrappedMappable is non-nil. isMappable is 557 // accessed using atomic memory operations. 558 // 559 // - wrappedMappable is protected by mapsMu and dataMu. In addition, 560 // it has to be immutable if copyMu is taken for write. 561 // copyUpMaybeSyntheticMountpointLocked relies on this behavior. 562 mapsMu mapsMutex `state:"nosave"` 563 lowerMappings memmap.MappingSet 564 dataMu dataRWMutex `state:"nosave"` 565 wrappedMappable memmap.Mappable 566 isMappable atomicbitops.Uint32 567 568 locks vfs.FileLocks 569 570 // watches is the set of inotify watches on the file repesented by this dentry. 571 // 572 // Note that hard links to the same file will not share the same set of 573 // watches, due to the fact that we do not have inode structures in this 574 // overlay implementation. 575 watches vfs.Watches 576 } 577 578 // newDentry creates a new dentry. The dentry initially has no references; it 579 // is the caller's responsibility to set the dentry's reference count and/or 580 // call dentry.destroy() as appropriate. The dentry is initially invalid in 581 // that it contains no layers; the caller is responsible for setting them. 
582 func (fs *filesystem) newDentry() *dentry { 583 d := &dentry{ 584 fs: fs, 585 } 586 d.lowerVDs = d.inlineLowerVDs[:0] 587 d.vfsd.Init(d) 588 refs.Register(d) 589 return d 590 } 591 592 // IncRef implements vfs.DentryImpl.IncRef. 593 func (d *dentry) IncRef() { 594 // d.refs may be 0 if d.fs.renameMu is locked, which serializes against 595 // d.checkDropLocked(). 596 r := d.refs.Add(1) 597 if d.LogRefs() { 598 refs.LogIncRef(d, r) 599 } 600 } 601 602 // TryIncRef implements vfs.DentryImpl.TryIncRef. 603 func (d *dentry) TryIncRef() bool { 604 for { 605 r := d.refs.Load() 606 if r <= 0 { 607 return false 608 } 609 if d.refs.CompareAndSwap(r, r+1) { 610 if d.LogRefs() { 611 refs.LogTryIncRef(d, r+1) 612 } 613 return true 614 } 615 } 616 } 617 618 // DecRef implements vfs.DentryImpl.DecRef. 619 func (d *dentry) DecRef(ctx context.Context) { 620 r := d.refs.Add(-1) 621 if d.LogRefs() { 622 refs.LogDecRef(d, r) 623 } 624 if r == 0 { 625 d.fs.renameMu.Lock() 626 d.checkDropLocked(ctx) 627 d.fs.renameMu.Unlock() 628 } else if r < 0 { 629 panic("overlay.dentry.DecRef() called without holding a reference") 630 } 631 } 632 633 func (d *dentry) decRefLocked(ctx context.Context) { 634 r := d.refs.Add(-1) 635 if d.LogRefs() { 636 refs.LogDecRef(d, r) 637 } 638 if r == 0 { 639 d.checkDropLocked(ctx) 640 } else if r < 0 { 641 panic("overlay.dentry.decRefLocked() called without holding a reference") 642 } 643 } 644 645 // checkDropLocked should be called after d's reference count becomes 0 or it 646 // becomes deleted. 647 // 648 // Preconditions: d.fs.renameMu must be locked for writing. 649 func (d *dentry) checkDropLocked(ctx context.Context) { 650 // Dentries with a positive reference count must be retained. (The only way 651 // to obtain a reference on a dentry with zero references is via path 652 // resolution, which requires renameMu, so if d.refs is zero then it will 653 // remain zero while we hold renameMu for writing.) 
Dentries with a 654 // negative reference count have already been destroyed. 655 if d.refs.Load() != 0 { 656 return 657 } 658 659 // Make sure that we do not lose watches on dentries that have not been 660 // deleted. Note that overlayfs never calls VFS.InvalidateDentry(), so 661 // d.vfsd.IsDead() indicates that d was deleted. 662 if !d.vfsd.IsDead() && d.watches.Size() > 0 { 663 return 664 } 665 666 // Refs is still zero; destroy it. 667 d.destroyLocked(ctx) 668 return 669 } 670 671 // destroyLocked destroys the dentry. 672 // 673 // Preconditions: 674 // - d.fs.renameMu must be locked for writing. 675 // - d.refs == 0. 676 func (d *dentry) destroyLocked(ctx context.Context) { 677 switch d.refs.Load() { 678 case 0: 679 // Mark the dentry destroyed. 680 d.refs.Store(-1) 681 case -1: 682 panic("overlay.dentry.destroyLocked() called on already destroyed dentry") 683 default: 684 panic("overlay.dentry.destroyLocked() called with references on the dentry") 685 } 686 687 if d.upperVD.Ok() { 688 d.upperVD.DecRef(ctx) 689 } 690 for _, lowerVD := range d.lowerVDs { 691 lowerVD.DecRef(ctx) 692 } 693 694 d.watches.HandleDeletion(ctx) 695 696 if d.parent != nil { 697 d.parent.dirMu.Lock() 698 if !d.vfsd.IsDead() { 699 delete(d.parent.children, d.name) 700 } 701 d.parent.dirMu.Unlock() 702 // Drop the reference held by d on its parent without recursively 703 // locking d.fs.renameMu. 704 d.parent.decRefLocked(ctx) 705 } 706 refs.Unregister(d) 707 } 708 709 // RefType implements refs.CheckedObject.Type. 710 func (d *dentry) RefType() string { 711 return "overlay.dentry" 712 } 713 714 // LeakMessage implements refs.CheckedObject.LeakMessage. 715 func (d *dentry) LeakMessage() string { 716 return fmt.Sprintf("[overlay.dentry %p] reference count of %d instead of -1", d, d.refs.Load()) 717 } 718 719 // LogRefs implements refs.CheckedObject.LogRefs. 
720 // 721 // This should only be set to true for debugging purposes, as it can generate an 722 // extremely large amount of output and drastically degrade performance. 723 func (d *dentry) LogRefs() bool { 724 return false 725 } 726 727 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. 728 func (d *dentry) InotifyWithParent(ctx context.Context, events uint32, cookie uint32, et vfs.EventType) { 729 if d.isDir() { 730 events |= linux.IN_ISDIR 731 } 732 733 // overlayfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates 734 // that d was deleted. 735 deleted := d.vfsd.IsDead() 736 737 d.fs.renameMu.RLock() 738 // The ordering below is important, Linux always notifies the parent first. 739 if d.parent != nil { 740 d.parent.watches.Notify(ctx, d.name, events, cookie, et, deleted) 741 } 742 d.watches.Notify(ctx, "", events, cookie, et, deleted) 743 d.fs.renameMu.RUnlock() 744 } 745 746 // Watches implements vfs.DentryImpl.Watches. 747 func (d *dentry) Watches() *vfs.Watches { 748 return &d.watches 749 } 750 751 // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. 752 func (d *dentry) OnZeroWatches(ctx context.Context) { 753 if d.refs.Load() == 0 { 754 d.fs.renameMu.Lock() 755 d.checkDropLocked(ctx) 756 d.fs.renameMu.Unlock() 757 } 758 } 759 760 // iterLayers invokes yield on each layer comprising d, from top to bottom. If 761 // any call to yield returns false, iterLayer stops iteration. 
762 func (d *dentry) iterLayers(yield func(vd vfs.VirtualDentry, isUpper bool) bool) { 763 if d.isCopiedUp() { 764 if !yield(d.upperVD, true) { 765 return 766 } 767 } 768 for _, lowerVD := range d.lowerVDs { 769 if !yield(lowerVD, false) { 770 return 771 } 772 } 773 } 774 775 func (d *dentry) topLayerInfo() (vd vfs.VirtualDentry, isUpper bool) { 776 if d.isCopiedUp() { 777 return d.upperVD, true 778 } 779 return d.lowerVDs[0], false 780 } 781 782 func (d *dentry) topLayer() vfs.VirtualDentry { 783 vd, _ := d.topLayerInfo() 784 return vd 785 } 786 787 func (d *dentry) topLookupLayer() lookupLayer { 788 if d.upperVD.Ok() { 789 return lookupLayerUpper 790 } 791 return lookupLayerLower 792 } 793 794 func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { 795 return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(d.mode.Load()), auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())) 796 } 797 798 func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { 799 mode := linux.FileMode(d.mode.Load()) 800 kuid := auth.KUID(d.uid.Load()) 801 kgid := auth.KGID(d.gid.Load()) 802 if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { 803 return err 804 } 805 return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) 806 } 807 808 // statInternalMask is the set of stat fields that is set by 809 // dentry.statInternalTo(). 810 const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO 811 812 // statInternalTo writes fields to stat that are stored in d, and therefore do 813 // not requiring invoking StatAt on the overlay's layers. 
814 func (d *dentry) statInternalTo(ctx context.Context, opts *vfs.StatOptions, stat *linux.Statx) { 815 stat.Mask |= statInternalMask 816 if d.isDir() { 817 // Linux sets nlink to 1 for merged directories 818 // (fs/overlayfs/inode.c:ovl_getattr()); we set it to 2 because this is 819 // correct more often ("." and the directory's entry in its parent), 820 // and some of our tests expect this. 821 stat.Nlink = 2 822 } 823 stat.UID = d.uid.Load() 824 stat.GID = d.gid.Load() 825 stat.Mode = uint16(d.mode.Load()) 826 stat.Ino = d.ino.Load() 827 stat.DevMajor = d.devMajor.Load() 828 stat.DevMinor = d.devMinor.Load() 829 } 830 831 // Preconditions: d.copyMu must be locked for writing. 832 func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) { 833 if opts.Stat.Mask&linux.STATX_MODE != 0 { 834 d.mode.Store((d.mode.RacyLoad() & linux.S_IFMT) | uint32(opts.Stat.Mode&^linux.S_IFMT)) 835 } 836 if opts.Stat.Mask&linux.STATX_UID != 0 { 837 d.uid.Store(opts.Stat.UID) 838 } 839 if opts.Stat.Mask&linux.STATX_GID != 0 { 840 d.gid.Store(opts.Stat.GID) 841 } 842 } 843 844 func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error { 845 return vfs.CheckDeleteSticky( 846 creds, 847 linux.FileMode(d.mode.Load()), 848 auth.KUID(d.uid.Load()), 849 auth.KUID(child.uid.Load()), 850 auth.KGID(child.gid.Load()), 851 ) 852 } 853 854 // newChildOwnerStat returns a Statx for configuring the UID, GID, and mode of 855 // children. 856 func (d *dentry) newChildOwnerStat(mode linux.FileMode, creds *auth.Credentials) linux.Statx { 857 stat := linux.Statx{ 858 Mask: uint32(linux.STATX_UID | linux.STATX_GID), 859 UID: uint32(creds.EffectiveKUID), 860 GID: uint32(creds.EffectiveKGID), 861 } 862 // Set GID and possibly the SGID bit if the parent is an SGID directory. 
863 d.copyMu.RLock() 864 defer d.copyMu.RUnlock() 865 if d.mode.Load()&linux.ModeSetGID == linux.ModeSetGID { 866 stat.GID = d.gid.Load() 867 if stat.Mode&linux.ModeDirectory == linux.ModeDirectory { 868 stat.Mode = uint16(mode) | linux.ModeSetGID 869 stat.Mask |= linux.STATX_MODE 870 } 871 } 872 return stat 873 } 874 875 // fileDescription is embedded by overlay implementations of 876 // vfs.FileDescriptionImpl. 877 // 878 // +stateify savable 879 type fileDescription struct { 880 vfsfd vfs.FileDescription 881 vfs.FileDescriptionDefaultImpl 882 vfs.LockFD 883 } 884 885 func (fd *fileDescription) filesystem() *filesystem { 886 return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) 887 } 888 889 func (fd *fileDescription) dentry() *dentry { 890 return fd.vfsfd.Dentry().Impl().(*dentry) 891 } 892 893 // ListXattr implements vfs.FileDescriptionImpl.ListXattr. 894 func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { 895 return fd.filesystem().listXattr(ctx, fd.dentry(), size) 896 } 897 898 // GetXattr implements vfs.FileDescriptionImpl.GetXattr. 899 func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { 900 return fd.filesystem().getXattr(ctx, fd.dentry(), auth.CredentialsFromContext(ctx), &opts) 901 } 902 903 // SetXattr implements vfs.FileDescriptionImpl.SetXattr. 904 func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { 905 fs := fd.filesystem() 906 fs.renameMu.RLock() 907 defer fs.renameMu.RUnlock() 908 return fs.setXattrLocked(ctx, fd.dentry(), fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), &opts) 909 } 910 911 // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. 
912 func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { 913 fs := fd.filesystem() 914 fs.renameMu.RLock() 915 defer fs.renameMu.RUnlock() 916 return fs.removeXattrLocked(ctx, fd.dentry(), fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), name) 917 }