github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/fsimpl/overlay/overlay.go (about) 1 // Copyright 2020 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package overlay provides an overlay filesystem implementation, which 16 // synthesizes a filesystem by composing one or more immutable filesystems 17 // ("lower layers") with an optional mutable filesystem ("upper layer"). 18 // 19 // Lock order: 20 // 21 // directoryFD.mu / regularFileFD.mu 22 // filesystem.renameMu 23 // dentry.dirMu 24 // dentry.copyMu 25 // filesystem.devMu 26 // *** "memmap.Mappable locks" below this point 27 // dentry.mapsMu 28 // *** "memmap.Mappable locks taken by Translate" below this point 29 // dentry.dataMu 30 // 31 // Locking dentry.dirMu in multiple dentries requires that parent dentries are 32 // locked before child dentries, and that filesystem.renameMu is locked to 33 // stabilize this relationship. 
package overlay

import (
	"fmt"
	"strings"

	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
	"github.com/nicocha30/gvisor-ligolo/pkg/context"
	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
	"github.com/nicocha30/gvisor-ligolo/pkg/fspath"
	"github.com/nicocha30/gvisor-ligolo/pkg/refs"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/memmap"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
)

// Name is the default filesystem name.
const Name = "overlay"

// FilesystemType implements vfs.FilesystemType.
//
// +stateify savable
type FilesystemType struct{}

// Name implements vfs.FilesystemType.Name. It returns the package-level Name
// constant.
func (FilesystemType) Name() string {
	return Name
}

// Release implements vfs.FilesystemType.Release. It is a no-op.
func (FilesystemType) Release(ctx context.Context) {}

// FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to
// FilesystemType.GetFilesystem.
//
// +stateify savable
type FilesystemOptions struct {
	// Callers passing FilesystemOptions to
	// overlay.FilesystemType.GetFilesystem() are responsible for ensuring that
	// the vfs.Mounts comprising the layers of the overlay filesystem do not
	// contain submounts.

	// If UpperRoot.Ok(), it is the root of the writable upper layer of the
	// overlay.
	UpperRoot vfs.VirtualDentry

	// LowerRoots contains the roots of the immutable lower layers of the
	// overlay. LowerRoots is immutable.
	LowerRoots []vfs.VirtualDentry
}

// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	vfsfs vfs.Filesystem

	// Immutable options.
	opts FilesystemOptions

	// creds is a copy of the filesystem's creator's credentials, which are
	// used for accesses to the filesystem's layers. creds is immutable.
	creds *auth.Credentials

	// dirDevMinor is the device minor number used for directories. dirDevMinor
	// is immutable.
	dirDevMinor uint32

	// lowerDevMinors maps device numbers from lower layer filesystems to
	// device minor numbers assigned to non-directory files originating from
	// that filesystem. (This remapping is necessary for lower layers because a
	// file on a lower layer, and that same file on an overlay, are
	// distinguishable because they will diverge after copy-up; this isn't true
	// for non-directory files already on the upper layer.) lowerDevMinors is
	// protected by devMu.
	devMu          devMutex `state:"nosave"`
	lowerDevMinors map[layerDevNumber]uint32

	// renameMu synchronizes renaming with non-renaming operations in order to
	// ensure consistent lock ordering between dentry.dirMu in different
	// dentries.
	renameMu renameRWMutex `state:"nosave"`

	// dirInoCache caches overlay-private directory inode numbers by mapped
	// bottommost device numbers and inode number. dirInoCache is protected by
	// dirInoCacheMu.
	dirInoCacheMu dirInoCacheMutex `state:"nosave"`
	dirInoCache   map[layerDevNoAndIno]uint64

	// lastDirIno is the last inode number assigned to a directory. lastDirIno
	// is protected by dirInoCacheMu.
	lastDirIno uint64

	// maxFilenameLen is the maximum filename length allowed by the overlayfs.
	// GetFilesystem initializes it to linux.NAME_MAX and updateMaxNameLen may
	// raise it based on the layers' statfs results.
	maxFilenameLen uint64
}

// layerDevNumber identifies a layer filesystem's device by its major and minor
// numbers. It is used as the key of filesystem.lowerDevMinors.
//
// +stateify savable
type layerDevNumber struct {
	major uint32
	minor uint32
}

// layerDevNoAndIno pairs a layer device number with an inode number on that
// device. It is used as the key of filesystem.dirInoCache.
//
// +stateify savable
type layerDevNoAndIno struct {
	layerDevNumber
	ino uint64
}

// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
146 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { 147 mopts := vfs.GenericParseMountOptions(opts.Data) 148 fsoptsRaw := opts.InternalData 149 fsopts, ok := fsoptsRaw.(FilesystemOptions) 150 if fsoptsRaw != nil && !ok { 151 ctx.Infof("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw) 152 return nil, nil, linuxerr.EINVAL 153 } 154 vfsroot := vfs.RootFromContext(ctx) 155 if vfsroot.Ok() { 156 defer vfsroot.DecRef(ctx) 157 } 158 159 if upperPathname, ok := mopts["upperdir"]; ok { 160 if fsopts.UpperRoot.Ok() { 161 ctx.Infof("overlay.FilesystemType.GetFilesystem: both upperdir and FilesystemOptions.UpperRoot are specified") 162 return nil, nil, linuxerr.EINVAL 163 } 164 delete(mopts, "upperdir") 165 // Linux overlayfs also requires a workdir when upperdir is 166 // specified; we don't, so silently ignore this option. 
167 delete(mopts, "workdir") 168 upperPath := fspath.Parse(upperPathname) 169 if !upperPath.Absolute { 170 ctx.Infof("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname) 171 return nil, nil, linuxerr.EINVAL 172 } 173 upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ 174 Root: vfsroot, 175 Start: vfsroot, 176 Path: upperPath, 177 FollowFinalSymlink: true, 178 }, &vfs.GetDentryOptions{ 179 CheckSearchable: true, 180 }) 181 if err != nil { 182 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err) 183 return nil, nil, err 184 } 185 privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */) 186 upperRoot.DecRef(ctx) 187 if err != nil { 188 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err) 189 return nil, nil, err 190 } 191 defer privateUpperRoot.DecRef(ctx) 192 fsopts.UpperRoot = privateUpperRoot 193 } 194 195 if lowerPathnamesStr, ok := mopts["lowerdir"]; ok { 196 if len(fsopts.LowerRoots) != 0 { 197 ctx.Infof("overlay.FilesystemType.GetFilesystem: both lowerdir and FilesystemOptions.LowerRoots are specified") 198 return nil, nil, linuxerr.EINVAL 199 } 200 delete(mopts, "lowerdir") 201 lowerPathnames := strings.Split(lowerPathnamesStr, ":") 202 for _, lowerPathname := range lowerPathnames { 203 lowerPath := fspath.Parse(lowerPathname) 204 if !lowerPath.Absolute { 205 ctx.Infof("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname) 206 return nil, nil, linuxerr.EINVAL 207 } 208 lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ 209 Root: vfsroot, 210 Start: vfsroot, 211 Path: lowerPath, 212 FollowFinalSymlink: true, 213 }, &vfs.GetDentryOptions{ 214 CheckSearchable: true, 215 }) 216 if err != nil { 217 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err) 218 return nil, 
nil, err 219 } 220 privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */) 221 lowerRoot.DecRef(ctx) 222 if err != nil { 223 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err) 224 return nil, nil, err 225 } 226 defer privateLowerRoot.DecRef(ctx) 227 fsopts.LowerRoots = append(fsopts.LowerRoots, privateLowerRoot) 228 } 229 } 230 231 if len(mopts) != 0 { 232 ctx.Infof("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts) 233 return nil, nil, linuxerr.EINVAL 234 } 235 236 if len(fsopts.LowerRoots) == 0 { 237 ctx.Infof("overlay.FilesystemType.GetFilesystem: at least one lower layer is required") 238 return nil, nil, linuxerr.EINVAL 239 } 240 if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() { 241 ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two lower layers are required when no upper layer is present") 242 return nil, nil, linuxerr.EINVAL 243 } 244 const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK 245 if len(fsopts.LowerRoots) > maxLowerLayers { 246 ctx.Infof("overlay.FilesystemType.GetFilesystem: %d lower layers specified, maximum %d", len(fsopts.LowerRoots), maxLowerLayers) 247 return nil, nil, linuxerr.EINVAL 248 } 249 250 // Allocate dirDevMinor. lowerDevMinors are allocated dynamically. 251 dirDevMinor, err := vfsObj.GetAnonBlockDevMinor() 252 if err != nil { 253 return nil, nil, err 254 } 255 256 // Take extra references held by the filesystem. 257 if fsopts.UpperRoot.Ok() { 258 fsopts.UpperRoot.IncRef() 259 } 260 for _, lowerRoot := range fsopts.LowerRoots { 261 lowerRoot.IncRef() 262 } 263 264 fs := &filesystem{ 265 opts: fsopts, 266 creds: creds.Fork(), 267 dirDevMinor: dirDevMinor, 268 lowerDevMinors: make(map[layerDevNumber]uint32), 269 dirInoCache: make(map[layerDevNoAndIno]uint64), 270 maxFilenameLen: linux.NAME_MAX, 271 } 272 fs.vfsfs.Init(vfsObj, &fstype, fs) 273 274 // Configure max filename length. 
Similar to what Linux does in 275 // fs/overlayfs/super.c:ovl_fill_super() -> ... -> ovl_check_namelen(). 276 if fsopts.UpperRoot.Ok() { 277 if err := fs.updateMaxNameLen(ctx, creds, vfsObj, fs.opts.UpperRoot); err != nil { 278 ctx.Debugf("overlay.FilesystemType.GetFilesystem: failed to StatFSAt on upper layer root: %v", err) 279 } 280 } 281 for _, lowerRoot := range fsopts.LowerRoots { 282 if err := fs.updateMaxNameLen(ctx, creds, vfsObj, lowerRoot); err != nil { 283 ctx.Debugf("overlay.FilesystemType.GetFilesystem: failed to StatFSAt on lower layer root: %v", err) 284 } 285 } 286 287 // Construct the root dentry. 288 root := fs.newDentry() 289 root.refs = atomicbitops.FromInt64(1) 290 if fs.opts.UpperRoot.Ok() { 291 fs.opts.UpperRoot.IncRef() 292 root.copiedUp = atomicbitops.FromUint32(1) 293 root.upperVD = fs.opts.UpperRoot 294 } 295 for _, lowerRoot := range fs.opts.LowerRoots { 296 lowerRoot.IncRef() 297 root.lowerVDs = append(root.lowerVDs, lowerRoot) 298 } 299 rootTopVD := root.topLayer() 300 // Get metadata from the topmost layer. See fs.lookupLocked(). 
301 const rootStatMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO 302 rootStat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ 303 Root: rootTopVD, 304 Start: rootTopVD, 305 }, &vfs.StatOptions{ 306 Mask: rootStatMask, 307 }) 308 if err != nil { 309 root.destroyLocked(ctx) 310 fs.vfsfs.DecRef(ctx) 311 return nil, nil, err 312 } 313 if rootStat.Mask&rootStatMask != rootStatMask { 314 root.destroyLocked(ctx) 315 fs.vfsfs.DecRef(ctx) 316 return nil, nil, linuxerr.EREMOTE 317 } 318 if isWhiteout(&rootStat) { 319 ctx.Infof("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout") 320 root.destroyLocked(ctx) 321 fs.vfsfs.DecRef(ctx) 322 return nil, nil, linuxerr.EINVAL 323 } 324 root.mode = atomicbitops.FromUint32(uint32(rootStat.Mode)) 325 root.uid = atomicbitops.FromUint32(rootStat.UID) 326 root.gid = atomicbitops.FromUint32(rootStat.GID) 327 if rootStat.Mode&linux.S_IFMT == linux.S_IFDIR { 328 root.devMajor = atomicbitops.FromUint32(linux.UNNAMED_MAJOR) 329 root.devMinor = atomicbitops.FromUint32(fs.dirDevMinor) 330 // For root dir, it is okay to use top most level's stat to compute inode 331 // number because we don't allow copy ups on root dentries. 
332 root.ino.Store(fs.newDirIno(rootStat.DevMajor, rootStat.DevMinor, rootStat.Ino)) 333 } else if !root.upperVD.Ok() { 334 root.devMajor = atomicbitops.FromUint32(linux.UNNAMED_MAJOR) 335 rootDevMinor, err := fs.getLowerDevMinor(rootStat.DevMajor, rootStat.DevMinor) 336 if err != nil { 337 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to get device number for root: %v", err) 338 root.destroyLocked(ctx) 339 fs.vfsfs.DecRef(ctx) 340 return nil, nil, err 341 } 342 root.devMinor = atomicbitops.FromUint32(rootDevMinor) 343 root.ino.Store(rootStat.Ino) 344 } else { 345 root.devMajor = atomicbitops.FromUint32(rootStat.DevMajor) 346 root.devMinor = atomicbitops.FromUint32(rootStat.DevMinor) 347 root.ino.Store(rootStat.Ino) 348 } 349 350 return &fs.vfsfs, &root.vfsd, nil 351 } 352 353 // clonePrivateMount creates a non-recursive bind mount rooted at vd, not 354 // associated with any MountNamespace, and returns the root of the new mount. 355 // (This is required to ensure that each layer of an overlay comprises only a 356 // single mount, and therefore can't cross into e.g. the overlay filesystem 357 // itself, risking lock recursion.) A reference is held on the returned 358 // VirtualDentry. 359 func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forceReadOnly bool) (vfs.VirtualDentry, error) { 360 oldmnt := vd.Mount() 361 opts := oldmnt.Options() 362 if forceReadOnly { 363 opts.ReadOnly = true 364 } 365 newmnt := vfsObj.NewDisconnectedMount(oldmnt.Filesystem(), vd.Dentry(), &opts) 366 // Take a reference on the dentry which will be owned by the returned 367 // VirtualDentry. 368 d := vd.Dentry() 369 d.IncRef() 370 return vfs.MakeVirtualDentry(newmnt, d), nil 371 } 372 373 // Release implements vfs.FilesystemImpl.Release. 
374 func (fs *filesystem) Release(ctx context.Context) { 375 vfsObj := fs.vfsfs.VirtualFilesystem() 376 vfsObj.PutAnonBlockDevMinor(fs.dirDevMinor) 377 for _, lowerDevMinor := range fs.lowerDevMinors { 378 vfsObj.PutAnonBlockDevMinor(lowerDevMinor) 379 } 380 if fs.opts.UpperRoot.Ok() { 381 fs.opts.UpperRoot.DecRef(ctx) 382 } 383 for _, lowerRoot := range fs.opts.LowerRoots { 384 lowerRoot.DecRef(ctx) 385 } 386 } 387 388 // updateMaxNameLen is analogous to fs/overlayfs/super.c:ovl_check_namelen(). 389 func (fs *filesystem) updateMaxNameLen(ctx context.Context, creds *auth.Credentials, vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry) error { 390 statfs, err := vfsObj.StatFSAt(ctx, creds, &vfs.PathOperation{ 391 Root: vd, 392 Start: vd, 393 }) 394 if err != nil { 395 return err 396 } 397 if statfs.NameLength > fs.maxFilenameLen { 398 fs.maxFilenameLen = statfs.NameLength 399 } 400 return nil 401 } 402 403 func (fs *filesystem) statFS(ctx context.Context) (linux.Statfs, error) { 404 // Always statfs the root of the topmost layer. Compare Linux's 405 // fs/overlayfs/super.c:ovl_statfs(). 
406 var rootVD vfs.VirtualDentry 407 if fs.opts.UpperRoot.Ok() { 408 rootVD = fs.opts.UpperRoot 409 } else { 410 rootVD = fs.opts.LowerRoots[0] 411 } 412 fsstat, err := fs.vfsfs.VirtualFilesystem().StatFSAt(ctx, fs.creds, &vfs.PathOperation{ 413 Root: rootVD, 414 Start: rootVD, 415 }) 416 if err != nil { 417 return linux.Statfs{}, err 418 } 419 fsstat.Type = linux.OVERLAYFS_SUPER_MAGIC 420 return fsstat, nil 421 } 422 423 func (fs *filesystem) newDirIno(layerMajor, layerMinor uint32, layerIno uint64) uint64 { 424 fs.dirInoCacheMu.Lock() 425 defer fs.dirInoCacheMu.Unlock() 426 orig := layerDevNoAndIno{ 427 layerDevNumber: layerDevNumber{layerMajor, layerMinor}, 428 ino: layerIno, 429 } 430 if ino, ok := fs.dirInoCache[orig]; ok { 431 return ino 432 } 433 fs.lastDirIno++ 434 newIno := fs.lastDirIno 435 fs.dirInoCache[orig] = newIno 436 return newIno 437 } 438 439 func (fs *filesystem) getLowerDevMinor(layerMajor, layerMinor uint32) (uint32, error) { 440 fs.devMu.Lock() 441 defer fs.devMu.Unlock() 442 orig := layerDevNumber{layerMajor, layerMinor} 443 if minor, ok := fs.lowerDevMinors[orig]; ok { 444 return minor, nil 445 } 446 minor, err := fs.vfsfs.VirtualFilesystem().GetAnonBlockDevMinor() 447 if err != nil { 448 return 0, err 449 } 450 fs.lowerDevMinors[orig] = minor 451 return minor, nil 452 } 453 454 // dentry implements vfs.DentryImpl. 455 // 456 // +stateify savable 457 type dentry struct { 458 vfsd vfs.Dentry 459 460 refs atomicbitops.Int64 461 462 // fs is the owning filesystem. fs is immutable. 463 fs *filesystem 464 465 // mode, uid, and gid are the file mode, owner, and group of the file in 466 // the topmost layer (and therefore the overlay file as well), and are used 467 // for permission checks on this dentry. These fields are protected by 468 // copyMu. 469 mode atomicbitops.Uint32 470 uid atomicbitops.Uint32 471 gid atomicbitops.Uint32 472 473 // copiedUp is 1 if this dentry has been copied-up (i.e. upperVD.Ok()) and 474 // 0 otherwise. 
475 copiedUp atomicbitops.Uint32 476 477 // parent is the dentry corresponding to this dentry's parent directory. 478 // name is this dentry's name in parent. If this dentry is a filesystem 479 // root, parent is nil and name is the empty string. parent and name are 480 // protected by fs.renameMu. 481 parent *dentry 482 name string 483 484 // If this dentry represents a directory, children maps the names of 485 // children for which dentries have been instantiated to those dentries, 486 // and dirents (if not nil) is a cache of dirents as returned by 487 // directoryFDs representing this directory. children is protected by 488 // dirMu. 489 dirMu dirMutex `state:"nosave"` 490 children map[string]*dentry 491 dirents []vfs.Dirent 492 493 // upperVD and lowerVDs are the files from the overlay filesystem's layers 494 // that comprise the file on the overlay filesystem. 495 // 496 // If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e. 497 // be copied up) with copyMu locked for writing; otherwise, it is 498 // immutable. lowerVDs is always immutable. 499 copyMu sync.RWMutex `state:"nosave"` 500 upperVD vfs.VirtualDentry 501 lowerVDs []vfs.VirtualDentry 502 503 // inlineLowerVDs backs lowerVDs in the common case where len(lowerVDs) <= 504 // len(inlineLowerVDs). 505 inlineLowerVDs [1]vfs.VirtualDentry 506 507 // devMajor, devMinor, and ino are the device major/minor and inode numbers 508 // used by this dentry. These fields are protected by copyMu. 509 devMajor atomicbitops.Uint32 510 devMinor atomicbitops.Uint32 511 ino atomicbitops.Uint64 512 513 // If this dentry represents a regular file, then: 514 // 515 // - mapsMu is used to synchronize between copy-up and memmap.Mappable 516 // methods on dentry preceding mm.MemoryManager.activeMu in the lock order. 517 // 518 // - dataMu is used to synchronize between copy-up and 519 // dentry.(memmap.Mappable).Translate. 520 // 521 // - lowerMappings tracks memory mappings of the file. 
lowerMappings is 522 // used to invalidate mappings of the lower layer when the file is copied 523 // up to ensure that they remain coherent with subsequent writes to the 524 // file. (Note that, as of this writing, Linux overlayfs does not do this; 525 // this feature is a gVisor extension.) lowerMappings is protected by 526 // mapsMu. 527 // 528 // - If this dentry is copied-up, then wrappedMappable is the Mappable 529 // obtained from a call to the current top layer's 530 // FileDescription.ConfigureMMap(). Once wrappedMappable becomes non-nil 531 // (from a call to regularFileFD.ensureMappable()), it cannot become nil. 532 // wrappedMappable is protected by mapsMu and dataMu. 533 // 534 // - isMappable is non-zero iff wrappedMappable is non-nil. isMappable is 535 // accessed using atomic memory operations. 536 // 537 // - wrappedMappable is protected by mapsMu and dataMu. In addition, 538 // it has to be immutable if copyMu is taken for write. 539 // copyUpMaybeSyntheticMountpointLocked relies on this behavior. 540 mapsMu mapsMutex `state:"nosave"` 541 lowerMappings memmap.MappingSet 542 dataMu dataRWMutex `state:"nosave"` 543 wrappedMappable memmap.Mappable 544 isMappable atomicbitops.Uint32 545 546 locks vfs.FileLocks 547 548 // watches is the set of inotify watches on the file repesented by this dentry. 549 // 550 // Note that hard links to the same file will not share the same set of 551 // watches, due to the fact that we do not have inode structures in this 552 // overlay implementation. 553 watches vfs.Watches 554 } 555 556 // newDentry creates a new dentry. The dentry initially has no references; it 557 // is the caller's responsibility to set the dentry's reference count and/or 558 // call dentry.destroy() as appropriate. The dentry is initially invalid in 559 // that it contains no layers; the caller is responsible for setting them. 
560 func (fs *filesystem) newDentry() *dentry { 561 d := &dentry{ 562 fs: fs, 563 } 564 d.lowerVDs = d.inlineLowerVDs[:0] 565 d.vfsd.Init(d) 566 refs.Register(d) 567 return d 568 } 569 570 // IncRef implements vfs.DentryImpl.IncRef. 571 func (d *dentry) IncRef() { 572 // d.refs may be 0 if d.fs.renameMu is locked, which serializes against 573 // d.checkDropLocked(). 574 r := d.refs.Add(1) 575 if d.LogRefs() { 576 refs.LogIncRef(d, r) 577 } 578 } 579 580 // TryIncRef implements vfs.DentryImpl.TryIncRef. 581 func (d *dentry) TryIncRef() bool { 582 for { 583 r := d.refs.Load() 584 if r <= 0 { 585 return false 586 } 587 if d.refs.CompareAndSwap(r, r+1) { 588 if d.LogRefs() { 589 refs.LogTryIncRef(d, r+1) 590 } 591 return true 592 } 593 } 594 } 595 596 // DecRef implements vfs.DentryImpl.DecRef. 597 func (d *dentry) DecRef(ctx context.Context) { 598 r := d.refs.Add(-1) 599 if d.LogRefs() { 600 refs.LogDecRef(d, r) 601 } 602 if r == 0 { 603 d.fs.renameMu.Lock() 604 d.checkDropLocked(ctx) 605 d.fs.renameMu.Unlock() 606 } else if r < 0 { 607 panic("overlay.dentry.DecRef() called without holding a reference") 608 } 609 } 610 611 func (d *dentry) decRefLocked(ctx context.Context) { 612 r := d.refs.Add(-1) 613 if d.LogRefs() { 614 refs.LogDecRef(d, r) 615 } 616 if r == 0 { 617 d.checkDropLocked(ctx) 618 } else if r < 0 { 619 panic("overlay.dentry.decRefLocked() called without holding a reference") 620 } 621 } 622 623 // checkDropLocked should be called after d's reference count becomes 0 or it 624 // becomes deleted. 625 // 626 // Preconditions: d.fs.renameMu must be locked for writing. 627 func (d *dentry) checkDropLocked(ctx context.Context) { 628 // Dentries with a positive reference count must be retained. (The only way 629 // to obtain a reference on a dentry with zero references is via path 630 // resolution, which requires renameMu, so if d.refs is zero then it will 631 // remain zero while we hold renameMu for writing.) 
Dentries with a 632 // negative reference count have already been destroyed. 633 if d.refs.Load() != 0 { 634 return 635 } 636 637 // Make sure that we do not lose watches on dentries that have not been 638 // deleted. Note that overlayfs never calls VFS.InvalidateDentry(), so 639 // d.vfsd.IsDead() indicates that d was deleted. 640 if !d.vfsd.IsDead() && d.watches.Size() > 0 { 641 return 642 } 643 644 // Refs is still zero; destroy it. 645 d.destroyLocked(ctx) 646 return 647 } 648 649 // destroyLocked destroys the dentry. 650 // 651 // Preconditions: 652 // - d.fs.renameMu must be locked for writing. 653 // - d.refs == 0. 654 func (d *dentry) destroyLocked(ctx context.Context) { 655 switch d.refs.Load() { 656 case 0: 657 // Mark the dentry destroyed. 658 d.refs.Store(-1) 659 case -1: 660 panic("overlay.dentry.destroyLocked() called on already destroyed dentry") 661 default: 662 panic("overlay.dentry.destroyLocked() called with references on the dentry") 663 } 664 665 if d.upperVD.Ok() { 666 d.upperVD.DecRef(ctx) 667 } 668 for _, lowerVD := range d.lowerVDs { 669 lowerVD.DecRef(ctx) 670 } 671 672 d.watches.HandleDeletion(ctx) 673 674 if d.parent != nil { 675 d.parent.dirMu.Lock() 676 if !d.vfsd.IsDead() { 677 delete(d.parent.children, d.name) 678 } 679 d.parent.dirMu.Unlock() 680 // Drop the reference held by d on its parent without recursively 681 // locking d.fs.renameMu. 682 d.parent.decRefLocked(ctx) 683 } 684 refs.Unregister(d) 685 } 686 687 // RefType implements refs.CheckedObject.Type. 688 func (d *dentry) RefType() string { 689 return "overlay.dentry" 690 } 691 692 // LeakMessage implements refs.CheckedObject.LeakMessage. 693 func (d *dentry) LeakMessage() string { 694 return fmt.Sprintf("[overlay.dentry %p] reference count of %d instead of -1", d, d.refs.Load()) 695 } 696 697 // LogRefs implements refs.CheckedObject.LogRefs. 
698 // 699 // This should only be set to true for debugging purposes, as it can generate an 700 // extremely large amount of output and drastically degrade performance. 701 func (d *dentry) LogRefs() bool { 702 return false 703 } 704 705 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. 706 func (d *dentry) InotifyWithParent(ctx context.Context, events uint32, cookie uint32, et vfs.EventType) { 707 if d.isDir() { 708 events |= linux.IN_ISDIR 709 } 710 711 // overlayfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates 712 // that d was deleted. 713 deleted := d.vfsd.IsDead() 714 715 d.fs.renameMu.RLock() 716 // The ordering below is important, Linux always notifies the parent first. 717 if d.parent != nil { 718 d.parent.watches.Notify(ctx, d.name, events, cookie, et, deleted) 719 } 720 d.watches.Notify(ctx, "", events, cookie, et, deleted) 721 d.fs.renameMu.RUnlock() 722 } 723 724 // Watches implements vfs.DentryImpl.Watches. 725 func (d *dentry) Watches() *vfs.Watches { 726 return &d.watches 727 } 728 729 // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. 730 func (d *dentry) OnZeroWatches(ctx context.Context) { 731 if d.refs.Load() == 0 { 732 d.fs.renameMu.Lock() 733 d.checkDropLocked(ctx) 734 d.fs.renameMu.Unlock() 735 } 736 } 737 738 // iterLayers invokes yield on each layer comprising d, from top to bottom. If 739 // any call to yield returns false, iterLayer stops iteration. 
740 func (d *dentry) iterLayers(yield func(vd vfs.VirtualDentry, isUpper bool) bool) { 741 if d.isCopiedUp() { 742 if !yield(d.upperVD, true) { 743 return 744 } 745 } 746 for _, lowerVD := range d.lowerVDs { 747 if !yield(lowerVD, false) { 748 return 749 } 750 } 751 } 752 753 func (d *dentry) topLayerInfo() (vd vfs.VirtualDentry, isUpper bool) { 754 if d.isCopiedUp() { 755 return d.upperVD, true 756 } 757 return d.lowerVDs[0], false 758 } 759 760 func (d *dentry) topLayer() vfs.VirtualDentry { 761 vd, _ := d.topLayerInfo() 762 return vd 763 } 764 765 func (d *dentry) topLookupLayer() lookupLayer { 766 if d.upperVD.Ok() { 767 return lookupLayerUpper 768 } 769 return lookupLayerLower 770 } 771 772 func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { 773 return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(d.mode.Load()), auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())) 774 } 775 776 func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { 777 mode := linux.FileMode(d.mode.Load()) 778 kuid := auth.KUID(d.uid.Load()) 779 kgid := auth.KGID(d.gid.Load()) 780 if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { 781 return err 782 } 783 return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) 784 } 785 786 // statInternalMask is the set of stat fields that is set by 787 // dentry.statInternalTo(). 788 const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO 789 790 // statInternalTo writes fields to stat that are stored in d, and therefore do 791 // not requiring invoking StatAt on the overlay's layers. 
792 func (d *dentry) statInternalTo(ctx context.Context, opts *vfs.StatOptions, stat *linux.Statx) { 793 stat.Mask |= statInternalMask 794 if d.isDir() { 795 // Linux sets nlink to 1 for merged directories 796 // (fs/overlayfs/inode.c:ovl_getattr()); we set it to 2 because this is 797 // correct more often ("." and the directory's entry in its parent), 798 // and some of our tests expect this. 799 stat.Nlink = 2 800 } 801 stat.UID = d.uid.Load() 802 stat.GID = d.gid.Load() 803 stat.Mode = uint16(d.mode.Load()) 804 stat.Ino = d.ino.Load() 805 stat.DevMajor = d.devMajor.Load() 806 stat.DevMinor = d.devMinor.Load() 807 } 808 809 // Preconditions: d.copyMu must be locked for writing. 810 func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) { 811 if opts.Stat.Mask&linux.STATX_MODE != 0 { 812 d.mode.Store((d.mode.RacyLoad() & linux.S_IFMT) | uint32(opts.Stat.Mode&^linux.S_IFMT)) 813 } 814 if opts.Stat.Mask&linux.STATX_UID != 0 { 815 d.uid.Store(opts.Stat.UID) 816 } 817 if opts.Stat.Mask&linux.STATX_GID != 0 { 818 d.gid.Store(opts.Stat.GID) 819 } 820 } 821 822 func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error { 823 return vfs.CheckDeleteSticky( 824 creds, 825 linux.FileMode(d.mode.Load()), 826 auth.KUID(d.uid.Load()), 827 auth.KUID(child.uid.Load()), 828 auth.KGID(child.gid.Load()), 829 ) 830 } 831 832 // newChildOwnerStat returns a Statx for configuring the UID, GID, and mode of 833 // children. 834 func (d *dentry) newChildOwnerStat(mode linux.FileMode, creds *auth.Credentials) linux.Statx { 835 stat := linux.Statx{ 836 Mask: uint32(linux.STATX_UID | linux.STATX_GID), 837 UID: uint32(creds.EffectiveKUID), 838 GID: uint32(creds.EffectiveKGID), 839 } 840 // Set GID and possibly the SGID bit if the parent is an SGID directory. 
841 d.copyMu.RLock() 842 defer d.copyMu.RUnlock() 843 if d.mode.Load()&linux.ModeSetGID == linux.ModeSetGID { 844 stat.GID = d.gid.Load() 845 if stat.Mode&linux.ModeDirectory == linux.ModeDirectory { 846 stat.Mode = uint16(mode) | linux.ModeSetGID 847 stat.Mask |= linux.STATX_MODE 848 } 849 } 850 return stat 851 } 852 853 // fileDescription is embedded by overlay implementations of 854 // vfs.FileDescriptionImpl. 855 // 856 // +stateify savable 857 type fileDescription struct { 858 vfsfd vfs.FileDescription 859 vfs.FileDescriptionDefaultImpl 860 vfs.LockFD 861 } 862 863 func (fd *fileDescription) filesystem() *filesystem { 864 return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) 865 } 866 867 func (fd *fileDescription) dentry() *dentry { 868 return fd.vfsfd.Dentry().Impl().(*dentry) 869 } 870 871 // ListXattr implements vfs.FileDescriptionImpl.ListXattr. 872 func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { 873 return fd.filesystem().listXattr(ctx, fd.dentry(), size) 874 } 875 876 // GetXattr implements vfs.FileDescriptionImpl.GetXattr. 877 func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { 878 return fd.filesystem().getXattr(ctx, fd.dentry(), auth.CredentialsFromContext(ctx), &opts) 879 } 880 881 // SetXattr implements vfs.FileDescriptionImpl.SetXattr. 882 func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { 883 fs := fd.filesystem() 884 fs.renameMu.RLock() 885 defer fs.renameMu.RUnlock() 886 return fs.setXattrLocked(ctx, fd.dentry(), fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), &opts) 887 } 888 889 // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. 
890 func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { 891 fs := fd.filesystem() 892 fs.renameMu.RLock() 893 defer fs.renameMu.RUnlock() 894 return fs.removeXattrLocked(ctx, fd.dentry(), fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), name) 895 }