// Source: github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fsimpl/overlay/overlay.go

// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package overlay provides an overlay filesystem implementation, which
// synthesizes a filesystem by composing one or more immutable filesystems
// ("lower layers") with an optional mutable filesystem ("upper layer").
//
// Lock order:
//
//	directoryFD.mu / regularFileFD.mu
//	filesystem.renameMu
//	dentry.dirMu
//	dentry.copyMu
//	filesystem.devMu
//	*** "memmap.Mappable locks" below this point
//	dentry.mapsMu
//	*** "memmap.Mappable locks taken by Translate" below this point
//	dentry.dataMu
//
// Locking dentry.dirMu in multiple dentries requires that parent dentries are
// locked before child dentries, and that filesystem.renameMu is locked to
// stabilize this relationship.
34 package overlay 35 36 import ( 37 "fmt" 38 "strings" 39 "sync/atomic" 40 41 "github.com/SagerNet/gvisor/pkg/abi/linux" 42 "github.com/SagerNet/gvisor/pkg/context" 43 "github.com/SagerNet/gvisor/pkg/errors/linuxerr" 44 "github.com/SagerNet/gvisor/pkg/fspath" 45 "github.com/SagerNet/gvisor/pkg/refsvfs2" 46 "github.com/SagerNet/gvisor/pkg/sentry/kernel/auth" 47 "github.com/SagerNet/gvisor/pkg/sentry/memmap" 48 "github.com/SagerNet/gvisor/pkg/sentry/vfs" 49 "github.com/SagerNet/gvisor/pkg/sync" 50 ) 51 52 // Name is the default filesystem name. 53 const Name = "overlay" 54 55 // FilesystemType implements vfs.FilesystemType. 56 // 57 // +stateify savable 58 type FilesystemType struct{} 59 60 // Name implements vfs.FilesystemType.Name. 61 func (FilesystemType) Name() string { 62 return Name 63 } 64 65 // Release implements FilesystemType.Release. 66 func (FilesystemType) Release(ctx context.Context) {} 67 68 // FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to 69 // FilesystemType.GetFilesystem. 70 // 71 // +stateify savable 72 type FilesystemOptions struct { 73 // Callers passing FilesystemOptions to 74 // overlay.FilesystemType.GetFilesystem() are responsible for ensuring that 75 // the vfs.Mounts comprising the layers of the overlay filesystem do not 76 // contain submounts. 77 78 // If UpperRoot.Ok(), it is the root of the writable upper layer of the 79 // overlay. 80 UpperRoot vfs.VirtualDentry 81 82 // LowerRoots contains the roots of the immutable lower layers of the 83 // overlay. LowerRoots is immutable. 84 LowerRoots []vfs.VirtualDentry 85 } 86 87 // filesystem implements vfs.FilesystemImpl. 88 // 89 // +stateify savable 90 type filesystem struct { 91 vfsfs vfs.Filesystem 92 93 // Immutable options. 94 opts FilesystemOptions 95 96 // creds is a copy of the filesystem's creator's credentials, which are 97 // used for accesses to the filesystem's layers. creds is immutable. 
98 creds *auth.Credentials 99 100 // privateDevMinors maps device numbers from layer filesystems to device 101 // minor numbers assigned to files originating from that filesystem. 102 // 103 // For non-directory files, this remapping is necessary for lower layers 104 // because a file on a lower layer, and that same file on an overlay, are 105 // distinguishable because they will diverge after copy-up. (Once a 106 // non-directory file has been copied up, its contents on the upper layer 107 // completely determine its contents in the overlay, so this is no longer 108 // true; but we still do the mapping for consistency.) 109 // 110 // For directories, this remapping may be necessary even if the directory 111 // exists on the upper layer due to directory merging; rather than make the 112 // mapping conditional on whether the directory is opaque, we again 113 // unconditionally apply the mapping unconditionally. 114 // 115 // privateDevMinors is protected by devMu. 116 devMu sync.Mutex `state:"nosave"` 117 privateDevMinors map[layerDevNumber]uint32 118 119 // renameMu synchronizes renaming with non-renaming operations in order to 120 // ensure consistent lock ordering between dentry.dirMu in different 121 // dentries. 122 renameMu sync.RWMutex `state:"nosave"` 123 } 124 125 // +stateify savable 126 type layerDevNumber struct { 127 major uint32 128 minor uint32 129 } 130 131 // GetFilesystem implements vfs.FilesystemType.GetFilesystem. 
132 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { 133 mopts := vfs.GenericParseMountOptions(opts.Data) 134 fsoptsRaw := opts.InternalData 135 fsopts, ok := fsoptsRaw.(FilesystemOptions) 136 if fsoptsRaw != nil && !ok { 137 ctx.Infof("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw) 138 return nil, nil, linuxerr.EINVAL 139 } 140 vfsroot := vfs.RootFromContext(ctx) 141 if vfsroot.Ok() { 142 defer vfsroot.DecRef(ctx) 143 } 144 145 if upperPathname, ok := mopts["upperdir"]; ok { 146 if fsopts.UpperRoot.Ok() { 147 ctx.Infof("overlay.FilesystemType.GetFilesystem: both upperdir and FilesystemOptions.UpperRoot are specified") 148 return nil, nil, linuxerr.EINVAL 149 } 150 delete(mopts, "upperdir") 151 // Linux overlayfs also requires a workdir when upperdir is 152 // specified; we don't, so silently ignore this option. 
153 delete(mopts, "workdir") 154 upperPath := fspath.Parse(upperPathname) 155 if !upperPath.Absolute { 156 ctx.Infof("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname) 157 return nil, nil, linuxerr.EINVAL 158 } 159 upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ 160 Root: vfsroot, 161 Start: vfsroot, 162 Path: upperPath, 163 FollowFinalSymlink: true, 164 }, &vfs.GetDentryOptions{ 165 CheckSearchable: true, 166 }) 167 if err != nil { 168 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err) 169 return nil, nil, err 170 } 171 privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */) 172 upperRoot.DecRef(ctx) 173 if err != nil { 174 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err) 175 return nil, nil, err 176 } 177 defer privateUpperRoot.DecRef(ctx) 178 fsopts.UpperRoot = privateUpperRoot 179 } 180 181 if lowerPathnamesStr, ok := mopts["lowerdir"]; ok { 182 if len(fsopts.LowerRoots) != 0 { 183 ctx.Infof("overlay.FilesystemType.GetFilesystem: both lowerdir and FilesystemOptions.LowerRoots are specified") 184 return nil, nil, linuxerr.EINVAL 185 } 186 delete(mopts, "lowerdir") 187 lowerPathnames := strings.Split(lowerPathnamesStr, ":") 188 for _, lowerPathname := range lowerPathnames { 189 lowerPath := fspath.Parse(lowerPathname) 190 if !lowerPath.Absolute { 191 ctx.Infof("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname) 192 return nil, nil, linuxerr.EINVAL 193 } 194 lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ 195 Root: vfsroot, 196 Start: vfsroot, 197 Path: lowerPath, 198 FollowFinalSymlink: true, 199 }, &vfs.GetDentryOptions{ 200 CheckSearchable: true, 201 }) 202 if err != nil { 203 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err) 204 return nil, 
nil, err 205 } 206 privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */) 207 lowerRoot.DecRef(ctx) 208 if err != nil { 209 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err) 210 return nil, nil, err 211 } 212 defer privateLowerRoot.DecRef(ctx) 213 fsopts.LowerRoots = append(fsopts.LowerRoots, privateLowerRoot) 214 } 215 } 216 217 if len(mopts) != 0 { 218 ctx.Infof("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts) 219 return nil, nil, linuxerr.EINVAL 220 } 221 222 if len(fsopts.LowerRoots) == 0 { 223 ctx.Infof("overlay.FilesystemType.GetFilesystem: at least one lower layer is required") 224 return nil, nil, linuxerr.EINVAL 225 } 226 if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() { 227 ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two lower layers are required when no upper layer is present") 228 return nil, nil, linuxerr.EINVAL 229 } 230 const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK 231 if len(fsopts.LowerRoots) > maxLowerLayers { 232 ctx.Infof("overlay.FilesystemType.GetFilesystem: %d lower layers specified, maximum %d", len(fsopts.LowerRoots), maxLowerLayers) 233 return nil, nil, linuxerr.EINVAL 234 } 235 236 // Take extra references held by the filesystem. 237 if fsopts.UpperRoot.Ok() { 238 fsopts.UpperRoot.IncRef() 239 } 240 for _, lowerRoot := range fsopts.LowerRoots { 241 lowerRoot.IncRef() 242 } 243 244 fs := &filesystem{ 245 opts: fsopts, 246 creds: creds.Fork(), 247 privateDevMinors: make(map[layerDevNumber]uint32), 248 } 249 fs.vfsfs.Init(vfsObj, &fstype, fs) 250 251 // Construct the root dentry. 
252 root := fs.newDentry() 253 root.refs = 1 254 if fs.opts.UpperRoot.Ok() { 255 fs.opts.UpperRoot.IncRef() 256 root.copiedUp = 1 257 root.upperVD = fs.opts.UpperRoot 258 } 259 for _, lowerRoot := range fs.opts.LowerRoots { 260 lowerRoot.IncRef() 261 root.lowerVDs = append(root.lowerVDs, lowerRoot) 262 } 263 rootTopVD := root.topLayer() 264 // Get metadata from the topmost layer. See fs.lookupLocked(). 265 const rootStatMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO 266 rootStat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ 267 Root: rootTopVD, 268 Start: rootTopVD, 269 }, &vfs.StatOptions{ 270 Mask: rootStatMask, 271 }) 272 if err != nil { 273 root.destroyLocked(ctx) 274 fs.vfsfs.DecRef(ctx) 275 return nil, nil, err 276 } 277 if rootStat.Mask&rootStatMask != rootStatMask { 278 root.destroyLocked(ctx) 279 fs.vfsfs.DecRef(ctx) 280 return nil, nil, linuxerr.EREMOTE 281 } 282 if isWhiteout(&rootStat) { 283 ctx.Infof("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout") 284 root.destroyLocked(ctx) 285 fs.vfsfs.DecRef(ctx) 286 return nil, nil, linuxerr.EINVAL 287 } 288 root.mode = uint32(rootStat.Mode) 289 root.uid = rootStat.UID 290 root.gid = rootStat.GID 291 root.devMajor = linux.UNNAMED_MAJOR 292 rootDevMinor, err := fs.getPrivateDevMinor(rootStat.DevMajor, rootStat.DevMinor) 293 if err != nil { 294 ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to get device number for root: %v", err) 295 root.destroyLocked(ctx) 296 fs.vfsfs.DecRef(ctx) 297 return nil, nil, err 298 } 299 root.devMinor = rootDevMinor 300 root.ino = rootStat.Ino 301 302 return &fs.vfsfs, &root.vfsd, nil 303 } 304 305 // clonePrivateMount creates a non-recursive bind mount rooted at vd, not 306 // associated with any MountNamespace, and returns the root of the new mount. 307 // (This is required to ensure that each layer of an overlay comprises only a 308 // single mount, and therefore can't cross into e.g. 
the overlay filesystem 309 // itself, risking lock recursion.) A reference is held on the returned 310 // VirtualDentry. 311 func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forceReadOnly bool) (vfs.VirtualDentry, error) { 312 oldmnt := vd.Mount() 313 opts := oldmnt.Options() 314 if forceReadOnly { 315 opts.ReadOnly = true 316 } 317 newmnt, err := vfsObj.NewDisconnectedMount(oldmnt.Filesystem(), vd.Dentry(), &opts) 318 if err != nil { 319 return vfs.VirtualDentry{}, err 320 } 321 // Take a reference on the dentry which will be owned by the returned 322 // VirtualDentry. 323 d := vd.Dentry() 324 d.IncRef() 325 return vfs.MakeVirtualDentry(newmnt, d), nil 326 } 327 328 // Release implements vfs.FilesystemImpl.Release. 329 func (fs *filesystem) Release(ctx context.Context) { 330 vfsObj := fs.vfsfs.VirtualFilesystem() 331 for _, devMinor := range fs.privateDevMinors { 332 vfsObj.PutAnonBlockDevMinor(devMinor) 333 } 334 if fs.opts.UpperRoot.Ok() { 335 fs.opts.UpperRoot.DecRef(ctx) 336 } 337 for _, lowerRoot := range fs.opts.LowerRoots { 338 lowerRoot.DecRef(ctx) 339 } 340 } 341 342 func (fs *filesystem) statFS(ctx context.Context) (linux.Statfs, error) { 343 // Always statfs the root of the topmost layer. Compare Linux's 344 // fs/overlayfs/super.c:ovl_statfs(). 
345 var rootVD vfs.VirtualDentry 346 if fs.opts.UpperRoot.Ok() { 347 rootVD = fs.opts.UpperRoot 348 } else { 349 rootVD = fs.opts.LowerRoots[0] 350 } 351 fsstat, err := fs.vfsfs.VirtualFilesystem().StatFSAt(ctx, fs.creds, &vfs.PathOperation{ 352 Root: rootVD, 353 Start: rootVD, 354 }) 355 if err != nil { 356 return linux.Statfs{}, err 357 } 358 fsstat.Type = linux.OVERLAYFS_SUPER_MAGIC 359 return fsstat, nil 360 } 361 362 func (fs *filesystem) getPrivateDevMinor(layerMajor, layerMinor uint32) (uint32, error) { 363 fs.devMu.Lock() 364 defer fs.devMu.Unlock() 365 orig := layerDevNumber{layerMajor, layerMinor} 366 if minor, ok := fs.privateDevMinors[orig]; ok { 367 return minor, nil 368 } 369 minor, err := fs.vfsfs.VirtualFilesystem().GetAnonBlockDevMinor() 370 if err != nil { 371 return 0, err 372 } 373 fs.privateDevMinors[orig] = minor 374 return minor, nil 375 } 376 377 // dentry implements vfs.DentryImpl. 378 // 379 // +stateify savable 380 type dentry struct { 381 vfsd vfs.Dentry 382 383 refs int64 384 385 // fs is the owning filesystem. fs is immutable. 386 fs *filesystem 387 388 // mode, uid, and gid are the file mode, owner, and group of the file in 389 // the topmost layer (and therefore the overlay file as well), and are used 390 // for permission checks on this dentry. These fields are protected by 391 // copyMu and accessed using atomic memory operations. 392 mode uint32 393 uid uint32 394 gid uint32 395 396 // copiedUp is 1 if this dentry has been copied-up (i.e. upperVD.Ok()) and 397 // 0 otherwise. copiedUp is accessed using atomic memory operations. 398 copiedUp uint32 399 400 // parent is the dentry corresponding to this dentry's parent directory. 401 // name is this dentry's name in parent. If this dentry is a filesystem 402 // root, parent is nil and name is the empty string. parent and name are 403 // protected by fs.renameMu. 
404 parent *dentry 405 name string 406 407 // If this dentry represents a directory, children maps the names of 408 // children for which dentries have been instantiated to those dentries, 409 // and dirents (if not nil) is a cache of dirents as returned by 410 // directoryFDs representing this directory. children is protected by 411 // dirMu. 412 dirMu sync.Mutex `state:"nosave"` 413 children map[string]*dentry 414 dirents []vfs.Dirent 415 416 // upperVD and lowerVDs are the files from the overlay filesystem's layers 417 // that comprise the file on the overlay filesystem. 418 // 419 // If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e. 420 // be copied up) with copyMu locked for writing; otherwise, it is 421 // immutable. lowerVDs is always immutable. 422 copyMu sync.RWMutex `state:"nosave"` 423 upperVD vfs.VirtualDentry 424 lowerVDs []vfs.VirtualDentry 425 426 // inlineLowerVDs backs lowerVDs in the common case where len(lowerVDs) <= 427 // len(inlineLowerVDs). 428 inlineLowerVDs [1]vfs.VirtualDentry 429 430 // devMajor, devMinor, and ino are the device major/minor and inode numbers 431 // used by this dentry. These fields are protected by copyMu and accessed 432 // using atomic memory operations. 433 devMajor uint32 434 devMinor uint32 435 ino uint64 436 437 // If this dentry represents a regular file, then: 438 // 439 // - mapsMu is used to synchronize between copy-up and memmap.Mappable 440 // methods on dentry preceding mm.MemoryManager.activeMu in the lock order. 441 // 442 // - dataMu is used to synchronize between copy-up and 443 // dentry.(memmap.Mappable).Translate. 444 // 445 // - lowerMappings tracks memory mappings of the file. lowerMappings is 446 // used to invalidate mappings of the lower layer when the file is copied 447 // up to ensure that they remain coherent with subsequent writes to the 448 // file. (Note that, as of this writing, Linux overlayfs does not do this; 449 // this feature is a gVisor extension.) 
lowerMappings is protected by 450 // mapsMu. 451 // 452 // - If this dentry is copied-up, then wrappedMappable is the Mappable 453 // obtained from a call to the current top layer's 454 // FileDescription.ConfigureMMap(). Once wrappedMappable becomes non-nil 455 // (from a call to regularFileFD.ensureMappable()), it cannot become nil. 456 // wrappedMappable is protected by mapsMu and dataMu. 457 // 458 // - isMappable is non-zero iff wrappedMappable is non-nil. isMappable is 459 // accessed using atomic memory operations. 460 mapsMu sync.Mutex `state:"nosave"` 461 lowerMappings memmap.MappingSet 462 dataMu sync.RWMutex `state:"nosave"` 463 wrappedMappable memmap.Mappable 464 isMappable uint32 465 466 locks vfs.FileLocks 467 468 // watches is the set of inotify watches on the file repesented by this dentry. 469 // 470 // Note that hard links to the same file will not share the same set of 471 // watches, due to the fact that we do not have inode structures in this 472 // overlay implementation. 473 watches vfs.Watches 474 } 475 476 // newDentry creates a new dentry. The dentry initially has no references; it 477 // is the caller's responsibility to set the dentry's reference count and/or 478 // call dentry.destroy() as appropriate. The dentry is initially invalid in 479 // that it contains no layers; the caller is responsible for setting them. 480 func (fs *filesystem) newDentry() *dentry { 481 d := &dentry{ 482 fs: fs, 483 } 484 d.lowerVDs = d.inlineLowerVDs[:0] 485 d.vfsd.Init(d) 486 refsvfs2.Register(d) 487 return d 488 } 489 490 // IncRef implements vfs.DentryImpl.IncRef. 491 func (d *dentry) IncRef() { 492 // d.refs may be 0 if d.fs.renameMu is locked, which serializes against 493 // d.checkDropLocked(). 494 r := atomic.AddInt64(&d.refs, 1) 495 if d.LogRefs() { 496 refsvfs2.LogIncRef(d, r) 497 } 498 } 499 500 // TryIncRef implements vfs.DentryImpl.TryIncRef. 
501 func (d *dentry) TryIncRef() bool { 502 for { 503 r := atomic.LoadInt64(&d.refs) 504 if r <= 0 { 505 return false 506 } 507 if atomic.CompareAndSwapInt64(&d.refs, r, r+1) { 508 if d.LogRefs() { 509 refsvfs2.LogTryIncRef(d, r+1) 510 } 511 return true 512 } 513 } 514 } 515 516 // DecRef implements vfs.DentryImpl.DecRef. 517 func (d *dentry) DecRef(ctx context.Context) { 518 r := atomic.AddInt64(&d.refs, -1) 519 if d.LogRefs() { 520 refsvfs2.LogDecRef(d, r) 521 } 522 if r == 0 { 523 d.fs.renameMu.Lock() 524 d.checkDropLocked(ctx) 525 d.fs.renameMu.Unlock() 526 } else if r < 0 { 527 panic("overlay.dentry.DecRef() called without holding a reference") 528 } 529 } 530 531 func (d *dentry) decRefLocked(ctx context.Context) { 532 r := atomic.AddInt64(&d.refs, -1) 533 if d.LogRefs() { 534 refsvfs2.LogDecRef(d, r) 535 } 536 if r == 0 { 537 d.checkDropLocked(ctx) 538 } else if r < 0 { 539 panic("overlay.dentry.decRefLocked() called without holding a reference") 540 } 541 } 542 543 // checkDropLocked should be called after d's reference count becomes 0 or it 544 // becomes deleted. 545 // 546 // Preconditions: d.fs.renameMu must be locked for writing. 547 func (d *dentry) checkDropLocked(ctx context.Context) { 548 // Dentries with a positive reference count must be retained. (The only way 549 // to obtain a reference on a dentry with zero references is via path 550 // resolution, which requires renameMu, so if d.refs is zero then it will 551 // remain zero while we hold renameMu for writing.) Dentries with a 552 // negative reference count have already been destroyed. 553 if atomic.LoadInt64(&d.refs) != 0 { 554 return 555 } 556 557 // Make sure that we do not lose watches on dentries that have not been 558 // deleted. Note that overlayfs never calls VFS.InvalidateDentry(), so 559 // d.vfsd.IsDead() indicates that d was deleted. 560 if !d.vfsd.IsDead() && d.watches.Size() > 0 { 561 return 562 } 563 564 // Refs is still zero; destroy it. 
565 d.destroyLocked(ctx) 566 return 567 } 568 569 // destroyLocked destroys the dentry. 570 // 571 // Preconditions: 572 // * d.fs.renameMu must be locked for writing. 573 // * d.refs == 0. 574 func (d *dentry) destroyLocked(ctx context.Context) { 575 switch atomic.LoadInt64(&d.refs) { 576 case 0: 577 // Mark the dentry destroyed. 578 atomic.StoreInt64(&d.refs, -1) 579 case -1: 580 panic("overlay.dentry.destroyLocked() called on already destroyed dentry") 581 default: 582 panic("overlay.dentry.destroyLocked() called with references on the dentry") 583 } 584 585 if d.upperVD.Ok() { 586 d.upperVD.DecRef(ctx) 587 } 588 for _, lowerVD := range d.lowerVDs { 589 lowerVD.DecRef(ctx) 590 } 591 592 d.watches.HandleDeletion(ctx) 593 594 if d.parent != nil { 595 d.parent.dirMu.Lock() 596 if !d.vfsd.IsDead() { 597 delete(d.parent.children, d.name) 598 } 599 d.parent.dirMu.Unlock() 600 // Drop the reference held by d on its parent without recursively 601 // locking d.fs.renameMu. 602 d.parent.decRefLocked(ctx) 603 } 604 refsvfs2.Unregister(d) 605 } 606 607 // RefType implements refsvfs2.CheckedObject.Type. 608 func (d *dentry) RefType() string { 609 return "overlay.dentry" 610 } 611 612 // LeakMessage implements refsvfs2.CheckedObject.LeakMessage. 613 func (d *dentry) LeakMessage() string { 614 return fmt.Sprintf("[overlay.dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs)) 615 } 616 617 // LogRefs implements refsvfs2.CheckedObject.LogRefs. 618 // 619 // This should only be set to true for debugging purposes, as it can generate an 620 // extremely large amount of output and drastically degrade performance. 621 func (d *dentry) LogRefs() bool { 622 return false 623 } 624 625 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. 
626 func (d *dentry) InotifyWithParent(ctx context.Context, events uint32, cookie uint32, et vfs.EventType) { 627 if d.isDir() { 628 events |= linux.IN_ISDIR 629 } 630 631 // overlayfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates 632 // that d was deleted. 633 deleted := d.vfsd.IsDead() 634 635 d.fs.renameMu.RLock() 636 // The ordering below is important, Linux always notifies the parent first. 637 if d.parent != nil { 638 d.parent.watches.Notify(ctx, d.name, events, cookie, et, deleted) 639 } 640 d.watches.Notify(ctx, "", events, cookie, et, deleted) 641 d.fs.renameMu.RUnlock() 642 } 643 644 // Watches implements vfs.DentryImpl.Watches. 645 func (d *dentry) Watches() *vfs.Watches { 646 return &d.watches 647 } 648 649 // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. 650 func (d *dentry) OnZeroWatches(ctx context.Context) { 651 if atomic.LoadInt64(&d.refs) == 0 { 652 d.fs.renameMu.Lock() 653 d.checkDropLocked(ctx) 654 d.fs.renameMu.Unlock() 655 } 656 } 657 658 // iterLayers invokes yield on each layer comprising d, from top to bottom. If 659 // any call to yield returns false, iterLayer stops iteration. 
660 func (d *dentry) iterLayers(yield func(vd vfs.VirtualDentry, isUpper bool) bool) { 661 if d.isCopiedUp() { 662 if !yield(d.upperVD, true) { 663 return 664 } 665 } 666 for _, lowerVD := range d.lowerVDs { 667 if !yield(lowerVD, false) { 668 return 669 } 670 } 671 } 672 673 func (d *dentry) topLayerInfo() (vd vfs.VirtualDentry, isUpper bool) { 674 if d.isCopiedUp() { 675 return d.upperVD, true 676 } 677 return d.lowerVDs[0], false 678 } 679 680 func (d *dentry) topLayer() vfs.VirtualDentry { 681 vd, _ := d.topLayerInfo() 682 return vd 683 } 684 685 func (d *dentry) topLookupLayer() lookupLayer { 686 if d.upperVD.Ok() { 687 return lookupLayerUpper 688 } 689 return lookupLayerLower 690 } 691 692 func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { 693 return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) 694 } 695 696 func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { 697 mode := linux.FileMode(atomic.LoadUint32(&d.mode)) 698 kuid := auth.KUID(atomic.LoadUint32(&d.uid)) 699 kgid := auth.KGID(atomic.LoadUint32(&d.gid)) 700 if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { 701 return err 702 } 703 return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) 704 } 705 706 // statInternalMask is the set of stat fields that is set by 707 // dentry.statInternalTo(). 708 const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO 709 710 // statInternalTo writes fields to stat that are stored in d, and therefore do 711 // not requiring invoking StatAt on the overlay's layers. 
712 func (d *dentry) statInternalTo(ctx context.Context, opts *vfs.StatOptions, stat *linux.Statx) { 713 stat.Mask |= statInternalMask 714 if d.isDir() { 715 // Linux sets nlink to 1 for merged directories 716 // (fs/overlayfs/inode.c:ovl_getattr()); we set it to 2 because this is 717 // correct more often ("." and the directory's entry in its parent), 718 // and some of our tests expect this. 719 stat.Nlink = 2 720 } 721 stat.UID = atomic.LoadUint32(&d.uid) 722 stat.GID = atomic.LoadUint32(&d.gid) 723 stat.Mode = uint16(atomic.LoadUint32(&d.mode)) 724 stat.Ino = atomic.LoadUint64(&d.ino) 725 stat.DevMajor = atomic.LoadUint32(&d.devMajor) 726 stat.DevMinor = atomic.LoadUint32(&d.devMinor) 727 } 728 729 // Preconditions: d.copyMu must be locked for writing. 730 func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) { 731 if opts.Stat.Mask&linux.STATX_MODE != 0 { 732 atomic.StoreUint32(&d.mode, (d.mode&linux.S_IFMT)|uint32(opts.Stat.Mode&^linux.S_IFMT)) 733 } 734 if opts.Stat.Mask&linux.STATX_UID != 0 { 735 atomic.StoreUint32(&d.uid, opts.Stat.UID) 736 } 737 if opts.Stat.Mask&linux.STATX_GID != 0 { 738 atomic.StoreUint32(&d.gid, opts.Stat.GID) 739 } 740 } 741 742 func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error { 743 return vfs.CheckDeleteSticky( 744 creds, 745 linux.FileMode(atomic.LoadUint32(&d.mode)), 746 auth.KUID(atomic.LoadUint32(&d.uid)), 747 auth.KUID(atomic.LoadUint32(&child.uid)), 748 auth.KGID(atomic.LoadUint32(&child.gid)), 749 ) 750 } 751 752 // newChildOwnerStat returns a Statx for configuring the UID, GID, and mode of 753 // children. 754 func (d *dentry) newChildOwnerStat(mode linux.FileMode, creds *auth.Credentials) linux.Statx { 755 stat := linux.Statx{ 756 Mask: uint32(linux.STATX_UID | linux.STATX_GID), 757 UID: uint32(creds.EffectiveKUID), 758 GID: uint32(creds.EffectiveKGID), 759 } 760 // Set GID and possibly the SGID bit if the parent is an SGID directory. 
761 d.copyMu.RLock() 762 defer d.copyMu.RUnlock() 763 if atomic.LoadUint32(&d.mode)&linux.ModeSetGID == linux.ModeSetGID { 764 stat.GID = atomic.LoadUint32(&d.gid) 765 if stat.Mode&linux.ModeDirectory == linux.ModeDirectory { 766 stat.Mode = uint16(mode) | linux.ModeSetGID 767 stat.Mask |= linux.STATX_MODE 768 } 769 } 770 return stat 771 } 772 773 // fileDescription is embedded by overlay implementations of 774 // vfs.FileDescriptionImpl. 775 // 776 // +stateify savable 777 type fileDescription struct { 778 vfsfd vfs.FileDescription 779 vfs.FileDescriptionDefaultImpl 780 vfs.LockFD 781 } 782 783 func (fd *fileDescription) filesystem() *filesystem { 784 return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) 785 } 786 787 func (fd *fileDescription) dentry() *dentry { 788 return fd.vfsfd.Dentry().Impl().(*dentry) 789 } 790 791 // ListXattr implements vfs.FileDescriptionImpl.ListXattr. 792 func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { 793 return fd.filesystem().listXattr(ctx, fd.dentry(), size) 794 } 795 796 // GetXattr implements vfs.FileDescriptionImpl.GetXattr. 797 func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { 798 return fd.filesystem().getXattr(ctx, fd.dentry(), auth.CredentialsFromContext(ctx), &opts) 799 } 800 801 // SetXattr implements vfs.FileDescriptionImpl.SetXattr. 802 func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { 803 fs := fd.filesystem() 804 d := fd.dentry() 805 806 fs.renameMu.RLock() 807 err := fs.setXattrLocked(ctx, d, fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), &opts) 808 fs.renameMu.RUnlock() 809 if err != nil { 810 return err 811 } 812 813 d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) 814 return nil 815 } 816 817 // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. 
818 func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { 819 fs := fd.filesystem() 820 d := fd.dentry() 821 822 fs.renameMu.RLock() 823 err := fs.removeXattrLocked(ctx, d, fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), name) 824 fs.renameMu.RUnlock() 825 if err != nil { 826 return err 827 } 828 829 d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) 830 return nil 831 }