gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/fsimpl/gofer/gofer.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package gofer provides a filesystem implementation that is backed by a 9p 16 // server, interchangeably referred to as "gofers" throughout this package. 17 // 18 // Lock order: 19 // 20 // regularFileFD/directoryFD.mu 21 // filesystem.renameMu 22 // dentry.cachingMu 23 // dentryCache.mu 24 // dentry.opMu 25 // dentry.childrenMu 26 // filesystem.syncMu 27 // dentry.metadataMu 28 // *** "memmap.Mappable locks" below this point 29 // dentry.mapsMu 30 // *** "memmap.Mappable locks taken by Translate" below this point 31 // dentry.handleMu 32 // dentry.dataMu 33 // filesystem.inoMu 34 // specialFileFD.mu 35 // specialFileFD.bufMu 36 // 37 // Locking dentry.opMu and dentry.metadataMu in multiple dentries requires that 38 // either ancestor dentries are locked before descendant dentries, or that 39 // filesystem.renameMu is locked for writing. 
package gofer

import (
	"fmt"
	"path"
	"strconv"
	"strings"
	"sync/atomic"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/atomicbitops"
	"gvisor.dev/gvisor/pkg/cleanup"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/lisafs"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/refs"
	fslock "gvisor.dev/gvisor/pkg/sentry/fsimpl/lock"
	"gvisor.dev/gvisor/pkg/sentry/fsutil"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/unet"
)

// Name is the default filesystem name. It is the value returned by
// FilesystemType.Name.
const Name = "9p"

// Mount option names for goferfs. All of them are consumed by
// FilesystemType.GetFilesystem, which rejects any option it does not
// recognize.
const (
	// Connection options: moptTransport must be transportModeFD, and
	// moptReadFD and moptWriteFD must both be present and name the same file
	// descriptor (see getFDFromMountOptionsMap).
	moptTransport = "trans"
	moptReadFD    = "rfdno"
	moptWriteFD   = "wfdno"

	// Options that carry a value.
	moptAname   = "aname"   // absolute attach path; defaults to "/"
	moptDfltUID = "dfltuid" // base-10 uint32; defaults to _V9FS_DEFUID
	moptDfltGID = "dfltgid" // base-10 uint32; defaults to _V9FS_DEFGID
	moptCache   = "cache"   // one of the cache* values below
	moptDcache  = "dcache"  // dentry cache size; negative values keep the default

	// Flag options: only their presence is checked, any value is ignored.
	moptForcePageCache           = "force_page_cache"
	moptLimitHostFDTranslation   = "limit_host_fd_translation"
	moptOverlayfsStaleRead       = "overlayfs_stale_read"
	moptDisableFileHandleSharing = "disable_file_handle_sharing"
	moptDisableFifoOpen          = "disable_fifo_open"

	// Directfs options.
	moptDirectfs = "directfs"
)

// Valid values for the "cache" mount option.
const (
	cacheFSCache             = "fscache"              // selects InteropModeExclusive
	cacheFSCacheWritethrough = "fscache_writethrough" // selects InteropModeWritethrough
	cacheRemoteRevalidating  = "remote_revalidating"  // selects InteropModeShared
)

// SupportedMountOptions is the set of mount options that can be set externally.
103 var SupportedMountOptions = []string{moptOverlayfsStaleRead, moptDisableFileHandleSharing, moptDcache} 104 105 const ( 106 defaultMaxCachedDentries = 1000 107 maxCachedNegativeChildren = 1000 108 ) 109 110 // stringFixedCache is a fixed sized cache, once initialized, 111 // its size never changes. 112 // 113 // +stateify savable 114 type stringFixedCache struct { 115 // namesList stores negative names with fifo list. 116 // name stored in namesList only means it used to be negative 117 // at the moment you pushed it to the list. 118 namesList stringList 119 size uint64 120 } 121 122 func (cache *stringFixedCache) isInited() bool { 123 return cache.size != 0 124 } 125 126 func (cache *stringFixedCache) init(size uint64) { 127 elements := make([]stringListElem, size) 128 for i := uint64(0); i < size; i++ { 129 cache.namesList.PushFront(&elements[i]) 130 } 131 cache.size = size 132 } 133 134 // Update will push name to the front of the list, 135 // and pop the tail value. 136 func (cache *stringFixedCache) add(name string) string { 137 tail := cache.namesList.Back() 138 victimName := tail.str 139 tail.str = name 140 cache.namesList.Remove(tail) 141 cache.namesList.PushFront(tail) 142 return victimName 143 } 144 145 // +stateify savable 146 type dentryCache struct { 147 // maxCachedDentries is the maximum number of cacheable dentries. 148 // maxCachedDentries is immutable. 149 maxCachedDentries uint64 150 // mu protects the below fields. 151 mu sync.Mutex `state:"nosave"` 152 // dentries contains all dentries with 0 references. Due to race conditions, 153 // it may also contain dentries with non-zero references. 154 dentries dentryList 155 // dentriesLen is the number of dentries in dentries. 156 dentriesLen uint64 157 } 158 159 // SetDentryCacheSize sets the size of the global gofer dentry cache. 
//
// A negative size is ignored. Only the first accepted call takes effect: once
// globalDentryCache is non-nil, later calls log a warning and leave it
// untouched. No locking is performed here.
func SetDentryCacheSize(size int) {
	if size < 0 {
		return
	}
	if globalDentryCache != nil {
		log.Warningf("Global dentry cache has already been initialized. Ignoring subsequent attempt.")
		return
	}
	globalDentryCache = &dentryCache{maxCachedDentries: uint64(size)}
}

// globalDentryCache is a global cache of dentries across all gofer clients.
// It is nil unless SetDentryCacheSize has been called; when non-nil,
// GetFilesystem uses it instead of creating a per-filesystem cache.
var globalDentryCache *dentryCache

// Valid values for "trans" mount option.
const transportModeFD = "fd"

// FilesystemType implements vfs.FilesystemType.
//
// +stateify savable
type FilesystemType struct{}

// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	vfsfs vfs.Filesystem

	// mf is used to allocate memory that caches regular file contents. mf is
	// immutable.
	mf *pgalloc.MemoryFile `state:"nosave"`

	// Immutable options.
	opts  filesystemOptions
	iopts InternalFilesystemOptions

	// client is the LISAFS client used for communicating with the server. client
	// is immutable.
	client *lisafs.Client `state:"nosave"`

	// clock is a realtime clock used to set timestamps in file operations.
	clock ktime.Clock

	// devMinor is the filesystem's minor device number. devMinor is immutable.
	devMinor uint32

	// root is the root dentry. root is immutable.
	root *dentry

	// renameMu serves two purposes:
	//
	//   - It synchronizes path resolution with renaming initiated by this
	//     client.
	//
	//   - It is held by path resolution to ensure that reachable dentries remain
	//     valid. A dentry is reachable by path resolution if it has a non-zero
	//     reference count (such that it is usable as vfs.ResolvingPath.Start() or
	//     is reachable from its children), or if it is a child dentry (such that
	//     it is reachable from its parent).
	renameMu sync.RWMutex `state:"nosave"`

	// dentryCache is the cache that dentries of this filesystem are placed in.
	// GetFilesystem sets it to globalDentryCache when that is non-nil, and
	// otherwise to a private cache limited to opts.dcache entries.
	dentryCache *dentryCache

	// syncableDentries contains all non-synthetic dentries. specialFileFDs
	// contains all open specialFileFDs. These fields are protected by syncMu.
	syncMu           sync.Mutex `state:"nosave"`
	syncableDentries dentryList
	specialFileFDs   specialFDList

	// inoByKey maps previously-observed device ID and host inode numbers to
	// internal inode numbers assigned to those files. inoByKey is not preserved
	// across checkpoint/restore because inode numbers may be reused between
	// different gofer processes, so inode numbers may be repeated for different
	// files across checkpoint/restore. inoByKey is protected by inoMu.
	inoMu    sync.Mutex        `state:"nosave"`
	inoByKey map[inoKey]uint64 `state:"nosave"`

	// lastIno is the last inode number assigned to a file. lastIno is accessed
	// using atomic memory operations.
	lastIno atomicbitops.Uint64

	// savedDentryRW records open read/write handles during save/restore.
	savedDentryRW map[*dentry]savedDentryRW

	// released is nonzero once filesystem.Release has been called.
	released atomicbitops.Int32
}

// filesystemOptions holds the settings that GetFilesystem derives from the
// mount option string.
//
// +stateify savable
type filesystemOptions struct {
	// fd is the file descriptor given by the "rfdno"/"wfdno" mount options;
	// initClientAndGetRoot wraps it in the socket used to talk to the server.
	fd int
	// aname is the cleaned, absolute attach path from the "aname" mount
	// option, or "/" if the option is absent.
	aname   string
	interop InteropMode // derived from the "cache" mount option
	// dfltuid and dfltgid come from the "dfltuid"/"dfltgid" mount options and
	// default to _V9FS_DEFUID and _V9FS_DEFGID.
	dfltuid auth.KUID
	dfltgid auth.KGID

	// dcache is the maximum number of dentries that can be cached. This is
	// effective only if globalDentryCache is not being used.
	dcache uint64

	// If forcePageCache is true, host FDs may not be used for application
	// memory mappings even if available; instead, the client must perform its
	// own caching of regular file pages. This is primarily useful for testing.
	forcePageCache bool

	// If limitHostFDTranslation is true, apply maxFillRange() constraints to
	// host FD mappings returned by dentry.(memmap.Mappable).Translate(). This
	// makes memory accounting behavior more consistent between cases where
	// host FDs are / are not available, but may increase the frequency of
	// sentry-handled page faults on files for which a host FD is available.
	limitHostFDTranslation bool

	// If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote
	// filesystem may not be coherent with writable host FDs opened later, so
	// all uses of the former must be replaced by uses of the latter. This is
	// usually only the case when the remote filesystem is a Linux overlayfs
	// mount. (Prior to Linux 4.18, patch series centered on commit
	// d1d04ef8572b "ovl: stack file ops", both I/O and memory mappings were
	// incoherent between pre-copy-up and post-copy-up FDs; after that patch
	// series, only memory mappings are incoherent.)
	overlayfsStaleRead bool

	// If regularFilesUseSpecialFileFD is true, application FDs representing
	// regular files will use distinct file handles for each FD, in the same
	// way that application FDs representing "special files" such as sockets
	// do. Note that this disables client caching for regular files. This option
	// may regress performance due to excessive Open RPCs. This option is not
	// supported with overlayfsStaleRead for now. It is set by the
	// "disable_file_handle_sharing" mount option.
	regularFilesUseSpecialFileFD bool

	// If disableFifoOpen is true, application attempts to open(2) a host FIFO
	// are disallowed.
	disableFifoOpen bool

	// directfs holds options for directfs mode.
	directfs directfsOpts
}

// directfsOpts holds the options specific to directfs mode.
//
// +stateify savable
type directfsOpts struct {
	// If directfs is enabled, the gofer client does not make RPCs to the gofer
	// process. Instead, it makes host syscalls to perform file operations.
	enabled bool
}

// InteropMode controls the client's interaction with other remote filesystem
// users.
//
// +stateify savable
type InteropMode uint32

const (
	// InteropModeExclusive is appropriate when the filesystem client is the
	// only user of the remote filesystem.
	//
	//   - The client may cache arbitrary filesystem state (file data, metadata,
	//     filesystem structure, etc.).
	//
	//   - Client changes to filesystem state may be sent to the remote
	//     filesystem asynchronously, except when server permission checks are
	//     necessary.
	//
	//   - File timestamps are based on client clocks. This ensures that users of
	//     the client observe timestamps that are coherent with their own clocks
	//     and consistent with Linux's semantics (in particular, it is not always
	//     possible for clients to set arbitrary atimes and mtimes depending on the
	//     remote filesystem implementation, and never possible for clients to set
	//     arbitrary ctimes.)
	InteropModeExclusive InteropMode = iota

	// InteropModeWritethrough is appropriate when there are read-only users of
	// the remote filesystem that expect to observe changes made by the
	// filesystem client.
	//
	//   - The client may cache arbitrary filesystem state.
	//
	//   - Client changes to filesystem state must be sent to the remote
	//     filesystem synchronously.
	//
	//   - File timestamps are based on client clocks. As a corollary, access
	//     timestamp changes from other remote filesystem users will not be visible
	//     to the client.
	InteropModeWritethrough

	// InteropModeShared is appropriate when there are users of the remote
	// filesystem that may mutate its state other than the client.
	//
	//   - The client must verify ("revalidate") cached filesystem state before
	//     using it.
	//
	//   - Client changes to filesystem state must be sent to the remote
	//     filesystem synchronously.
	//
	//   - File timestamps are based on server clocks. This is necessary to
	//     ensure that timestamp changes are synchronized between remote filesystem
	//     users.
	//
	// Note that the correctness of InteropModeShared depends on the server
	// correctly implementing 9P fids (i.e. each fid immutably represents a
	// single filesystem object), even in the presence of remote filesystem
	// mutations from other users. If this is violated, the behavior of the
	// client is undefined.
	InteropModeShared
)

// InternalFilesystemOptions may be passed as
// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
// GetFilesystem rejects InternalData of any other non-nil type with EINVAL.
//
// +stateify savable
type InternalFilesystemOptions struct {
	// If UniqueID is non-empty, it is an opaque string used to reassociate the
	// filesystem with a new server FD during restoration from checkpoint.
	UniqueID vfs.RestoreID

	// If LeakConnection is true, do not close the connection to the server
	// when the Filesystem is released. This is necessary for deployments in
	// which servers can handle only a single client and report failure if that
	// client disconnects.
	LeakConnection bool

	// If OpenSocketsByConnecting is true, silently translate attempts to open
	// files identifying as sockets to connect RPCs.
	OpenSocketsByConnecting bool
}

// _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default
// UIDs and GIDs used for files that do not provide a specific owner or group
// respectively.
const (
	// uint32(-2) doesn't work in Go.
	_V9FS_DEFUID = auth.KUID(4294967294)
	_V9FS_DEFGID = auth.KGID(4294967294)
)

// Name implements vfs.FilesystemType.Name.
func (FilesystemType) Name() string {
	return Name
}

// Release implements vfs.FilesystemType.Release. It does nothing.
func (FilesystemType) Release(ctx context.Context) {}

// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
403 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { 404 mf := pgalloc.MemoryFileFromContext(ctx) 405 if mf == nil { 406 ctx.Warningf("gofer.FilesystemType.GetFilesystem: CtxMemoryFile is nil") 407 return nil, nil, linuxerr.EINVAL 408 } 409 410 mopts := vfs.GenericParseMountOptions(opts.Data) 411 var fsopts filesystemOptions 412 413 fd, err := getFDFromMountOptionsMap(ctx, mopts) 414 if err != nil { 415 return nil, nil, err 416 } 417 fsopts.fd = fd 418 419 // Get the attach name. 420 fsopts.aname = "/" 421 if aname, ok := mopts[moptAname]; ok { 422 delete(mopts, moptAname) 423 if !path.IsAbs(aname) { 424 ctx.Warningf("gofer.FilesystemType.GetFilesystem: aname is not absolute: %s=%s", moptAname, aname) 425 return nil, nil, linuxerr.EINVAL 426 } 427 fsopts.aname = path.Clean(aname) 428 } 429 430 // Parse the cache policy. For historical reasons, this defaults to the 431 // least generally-applicable option, InteropModeExclusive. 432 fsopts.interop = InteropModeExclusive 433 if cache, ok := mopts[moptCache]; ok { 434 delete(mopts, moptCache) 435 switch cache { 436 case cacheFSCache: 437 fsopts.interop = InteropModeExclusive 438 case cacheFSCacheWritethrough: 439 fsopts.interop = InteropModeWritethrough 440 case cacheRemoteRevalidating: 441 fsopts.interop = InteropModeShared 442 default: 443 ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: %s=%s", moptCache, cache) 444 return nil, nil, linuxerr.EINVAL 445 } 446 } 447 448 // Parse the dentry cache size. 
449 fsopts.dcache = defaultMaxCachedDentries 450 if dcacheStr, ok := mopts[moptDcache]; ok { 451 delete(mopts, moptDcache) 452 dcache, err := strconv.ParseInt(dcacheStr, 10, 64) 453 if err != nil { 454 ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid dcache: %s=%s", moptDcache, dcacheStr) 455 return nil, nil, linuxerr.EINVAL 456 } 457 if dcache >= 0 { 458 fsopts.dcache = uint64(dcache) 459 } 460 } 461 462 // Parse the default UID and GID. 463 fsopts.dfltuid = _V9FS_DEFUID 464 if dfltuidstr, ok := mopts[moptDfltUID]; ok { 465 delete(mopts, moptDfltUID) 466 dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32) 467 if err != nil { 468 ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: %s=%s", moptDfltUID, dfltuidstr) 469 return nil, nil, linuxerr.EINVAL 470 } 471 // In Linux, dfltuid is interpreted as a UID and is converted to a KUID 472 // in the caller's user namespace, but goferfs isn't 473 // application-mountable. 474 fsopts.dfltuid = auth.KUID(dfltuid) 475 } 476 fsopts.dfltgid = _V9FS_DEFGID 477 if dfltgidstr, ok := mopts[moptDfltGID]; ok { 478 delete(mopts, moptDfltGID) 479 dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32) 480 if err != nil { 481 ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: %s=%s", moptDfltGID, dfltgidstr) 482 return nil, nil, linuxerr.EINVAL 483 } 484 fsopts.dfltgid = auth.KGID(dfltgid) 485 } 486 487 // Handle simple flags. 
488 if _, ok := mopts[moptDisableFileHandleSharing]; ok { 489 delete(mopts, moptDisableFileHandleSharing) 490 fsopts.regularFilesUseSpecialFileFD = true 491 } 492 if _, ok := mopts[moptDisableFifoOpen]; ok { 493 delete(mopts, moptDisableFifoOpen) 494 fsopts.disableFifoOpen = true 495 } 496 if _, ok := mopts[moptForcePageCache]; ok { 497 delete(mopts, moptForcePageCache) 498 fsopts.forcePageCache = true 499 } 500 if _, ok := mopts[moptLimitHostFDTranslation]; ok { 501 delete(mopts, moptLimitHostFDTranslation) 502 fsopts.limitHostFDTranslation = true 503 } 504 if _, ok := mopts[moptOverlayfsStaleRead]; ok { 505 delete(mopts, moptOverlayfsStaleRead) 506 fsopts.overlayfsStaleRead = true 507 } 508 if _, ok := mopts[moptDirectfs]; ok { 509 delete(mopts, moptDirectfs) 510 fsopts.directfs.enabled = true 511 } 512 // fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying 513 // "cache=none". 514 515 // Check for unparsed options. 516 if len(mopts) != 0 { 517 ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts) 518 return nil, nil, linuxerr.EINVAL 519 } 520 521 // Validation. 522 if fsopts.regularFilesUseSpecialFileFD && fsopts.overlayfsStaleRead { 523 // These options are not supported together. To support this, when a dentry 524 // is opened writably for the first time, we need to iterate over all the 525 // specialFileFDs of that dentry that represent a regular file and call 526 // fd.hostFileMapper.RegenerateMappings(writable_fd). 527 ctx.Warningf("gofer.FilesystemType.GetFilesystem: regularFilesUseSpecialFileFD and overlayfsStaleRead options are not supported together.") 528 return nil, nil, linuxerr.EINVAL 529 } 530 531 // Handle internal options. 
532 iopts, ok := opts.InternalData.(InternalFilesystemOptions) 533 if opts.InternalData != nil && !ok { 534 ctx.Warningf("gofer.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted gofer.InternalFilesystemOptions", opts.InternalData) 535 return nil, nil, linuxerr.EINVAL 536 } 537 // If !ok, iopts being the zero value is correct. 538 539 // Construct the filesystem object. 540 devMinor, err := vfsObj.GetAnonBlockDevMinor() 541 if err != nil { 542 return nil, nil, err 543 } 544 fs := &filesystem{ 545 mf: mf, 546 opts: fsopts, 547 iopts: iopts, 548 clock: ktime.RealtimeClockFromContext(ctx), 549 devMinor: devMinor, 550 inoByKey: make(map[inoKey]uint64), 551 } 552 553 // Did the user configure a global dentry cache? 554 if globalDentryCache != nil { 555 fs.dentryCache = globalDentryCache 556 } else { 557 fs.dentryCache = &dentryCache{maxCachedDentries: fsopts.dcache} 558 } 559 560 fs.vfsfs.Init(vfsObj, &fstype, fs) 561 562 rootInode, rootHostFD, err := fs.initClientAndGetRoot(ctx) 563 if err != nil { 564 fs.vfsfs.DecRef(ctx) 565 return nil, nil, err 566 } 567 if fs.opts.directfs.enabled { 568 fs.root, err = fs.getDirectfsRootDentry(ctx, rootHostFD, fs.client.NewFD(rootInode.ControlFD)) 569 } else { 570 fs.root, err = fs.newLisafsDentry(ctx, &rootInode) 571 } 572 if err != nil { 573 fs.vfsfs.DecRef(ctx) 574 return nil, nil, err 575 } 576 // Set the root's reference count to 2. One reference is returned to the 577 // caller, and the other is held by fs to prevent the root from being "cached" 578 // and subsequently evicted. 579 fs.root.refs = atomicbitops.FromInt64(2) 580 return &fs.vfsfs, &fs.root.vfsd, nil 581 } 582 583 // initClientAndGetRoot initializes fs.client and returns the root inode for 584 // this mount point. It handles the attach point (fs.opts.aname) resolution. 
//
// The int result is the host FD for the mount point: it is non-negative only
// when directfs is enabled, and -1 otherwise and on every error path.
func (fs *filesystem) initClientAndGetRoot(ctx context.Context) (lisafs.Inode, int, error) {
	sock, err := unet.NewSocket(fs.opts.fd)
	if err != nil {
		return lisafs.Inode{}, -1, err
	}

	ctx.UninterruptibleSleepStart(false)
	defer ctx.UninterruptibleSleepFinish(false)

	var (
		rootInode  lisafs.Inode
		rootHostFD int
	)
	fs.client, rootInode, rootHostFD, err = lisafs.NewClient(sock)
	if err != nil {
		return lisafs.Inode{}, -1, err
	}

	// The cleanup closes the host FD (if any) and the root control FD. It is
	// deferred for the failure returns below and released just before the
	// successful return. The closure reads rootHostFD when it runs, so the
	// reset to -1 in the non-directfs branch keeps it from closing that FD a
	// second time.
	cu := cleanup.Make(func() {
		if rootHostFD >= 0 {
			_ = unix.Close(rootHostFD)
		}
		rootControlFD := fs.client.NewFD(rootInode.ControlFD)
		rootControlFD.Close(ctx, false /* flush */)
	})
	defer cu.Clean()

	if fs.opts.directfs.enabled {
		if fs.opts.aname != "/" {
			log.Warningf("directfs does not support aname filesystem option: aname=%q", fs.opts.aname)
			return lisafs.Inode{}, -1, unix.EINVAL
		}
		if rootHostFD < 0 {
			log.Warningf("Mount RPC did not return host FD to mount point with directfs enabled")
			return lisafs.Inode{}, -1, unix.EINVAL
		}
	} else {
		if rootHostFD >= 0 {
			log.Warningf("Mount RPC returned a host FD to mount point without directfs, we didn't ask for it")
			_ = unix.Close(rootHostFD)
			rootHostFD = -1
		}
		// Use flipcall channels with lisafs because it makes a lot of RPCs.
		if err := fs.client.StartChannels(); err != nil {
			return lisafs.Inode{}, -1, err
		}
		rootInode, err = fs.handleAnameLisafs(ctx, rootInode)
		if err != nil {
			return lisafs.Inode{}, -1, err
		}
	}
	cu.Release()
	return rootInode, rootHostFD, nil
}

// getFDFromMountOptionsMap extracts the connection file descriptor from mopts.
//
// It requires "trans" to be transportModeFD and both "rfdno" and "wfdno" to be
// present, to parse as integers, and to be equal. Options are deleted from
// mopts as they are consumed; on a failure, the options examined up to that
// point may already have been removed. Every failure is logged and reported
// as (-1, EINVAL).
func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int, error) {
	// Check that the transport is "fd".
	trans, ok := mopts[moptTransport]
	if !ok || trans != transportModeFD {
		ctx.Warningf("gofer.getFDFromMountOptionsMap: transport must be specified as '%s=%s'", moptTransport, transportModeFD)
		return -1, linuxerr.EINVAL
	}
	delete(mopts, moptTransport)

	// Check that read and write FDs are provided and identical.
	rfdstr, ok := mopts[moptReadFD]
	if !ok {
		ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD must be specified as '%s=<file descriptor>'", moptReadFD)
		return -1, linuxerr.EINVAL
	}
	delete(mopts, moptReadFD)
	rfd, err := strconv.Atoi(rfdstr)
	if err != nil {
		ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid read FD: %s=%s", moptReadFD, rfdstr)
		return -1, linuxerr.EINVAL
	}
	wfdstr, ok := mopts[moptWriteFD]
	if !ok {
		ctx.Warningf("gofer.getFDFromMountOptionsMap: write FD must be specified as '%s=<file descriptor>'", moptWriteFD)
		return -1, linuxerr.EINVAL
	}
	delete(mopts, moptWriteFD)
	wfd, err := strconv.Atoi(wfdstr)
	if err != nil {
		ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid write FD: %s=%s", moptWriteFD, wfdstr)
		return -1, linuxerr.EINVAL
	}
	if rfd != wfd {
		ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD (%d) and write FD (%d) must be equal", rfd, wfd)
		return -1, linuxerr.EINVAL
	}
	return rfd, nil
}

// Release implements vfs.FilesystemImpl.Release.
//
// It marks the filesystem as released, writes back and drops cached file data
// for every syncable dentry, closes their host FDs, optionally drops the
// remaining dentry references (only when leak checking is enabled), closes the
// server connection unless iopts.LeakConnection is set, and finally returns
// the anonymous block device minor number.
func (fs *filesystem) Release(ctx context.Context) {
	fs.released.Store(1)

	mf := fs.mf
	fs.syncMu.Lock()
	for elem := fs.syncableDentries.Front(); elem != nil; elem = elem.Next() {
		d := elem.d
		// handleMu is taken before dataMu, matching the package lock order.
		d.handleMu.Lock()
		d.dataMu.Lock()
		if d.isWriteHandleOk() {
			// Write dirty cached data to the remote file. A failure is only
			// logged; the cached pages are discarded below regardless.
			h := d.writeHandle()
			if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil {
				log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err)
			}
			// TODO(jamieliu): Do we need to flushf/fsync d?
		}
		// Discard cached pages.
		d.cache.DropAll(mf)
		d.dirty.RemoveAll()
		d.dataMu.Unlock()
		// Close host FDs if they exist.
		d.closeHostFDs()
		d.handleMu.Unlock()
	}
	// There can't be any specialFileFDs still using fs, since each such
	// FileDescription would hold a reference on a Mount holding a reference on
	// fs.
	fs.syncMu.Unlock()

	// If leak checking is enabled, release all outstanding references in the
	// filesystem. We deliberately avoid doing this outside of leak checking; we
	// have released all external resources above rather than relying on dentry
	// destructors. fs.root may be nil if creating the client or initializing the
	// root dentry failed in GetFilesystem.
	if refs.GetLeakMode() != refs.NoLeakChecking && fs.root != nil {
		fs.renameMu.Lock()
		fs.root.releaseSyntheticRecursiveLocked(ctx)
		fs.evictAllCachedDentriesLocked(ctx)
		fs.renameMu.Unlock()

		// An extra reference was held by the filesystem on the root to prevent it from
		// being cached/evicted.
		fs.root.DecRef(ctx)
	}

	if !fs.iopts.LeakConnection {
		// Close the connection to the server. This implicitly closes all FDs.
		// fs.client is nil if GetFilesystem failed before the client was
		// created.
		if fs.client != nil {
			fs.client.Close()
		}
	}

	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
}

// releaseSyntheticRecursiveLocked traverses the tree with root d and decrements
// the reference count on every synthetic dentry. Synthetic dentries have one
// reference for existence that should be dropped during filesystem.Release.
//
// Precondition: d.fs.renameMu is locked for writing.
741 func (d *dentry) releaseSyntheticRecursiveLocked(ctx context.Context) { 742 if d.isSynthetic() { 743 d.decRefNoCaching() 744 d.checkCachingLocked(ctx, true /* renameMuWriteLocked */) 745 } 746 if d.isDir() { 747 var children []*dentry 748 d.childrenMu.Lock() 749 for _, child := range d.children { 750 children = append(children, child) 751 } 752 d.childrenMu.Unlock() 753 for _, child := range children { 754 if child != nil { 755 child.releaseSyntheticRecursiveLocked(ctx) 756 } 757 } 758 } 759 } 760 761 // inoKey is the key used to identify the inode backed by this dentry. 762 // 763 // +stateify savable 764 type inoKey struct { 765 ino uint64 766 devMinor uint32 767 devMajor uint32 768 } 769 770 func inoKeyFromStatx(stat *linux.Statx) inoKey { 771 return inoKey{ 772 ino: stat.Ino, 773 devMinor: stat.DevMinor, 774 devMajor: stat.DevMajor, 775 } 776 } 777 778 func inoKeyFromStat(stat *unix.Stat_t) inoKey { 779 return inoKey{ 780 ino: stat.Ino, 781 devMinor: unix.Minor(stat.Dev), 782 devMajor: unix.Major(stat.Dev), 783 } 784 } 785 786 // dentry implements vfs.DentryImpl. 787 // 788 // +stateify savable 789 type dentry struct { 790 vfsd vfs.Dentry 791 792 // refs is the reference count. Each dentry holds a reference on its 793 // parent, even if disowned. An additional reference is held on all 794 // synthetic dentries until they are unlinked or invalidated. When refs 795 // reaches 0, the dentry may be added to the cache or destroyed. If refs == 796 // -1, the dentry has already been destroyed. refs is accessed using atomic 797 // memory operations. 798 refs atomicbitops.Int64 799 800 // fs is the owning filesystem. fs is immutable. 801 fs *filesystem 802 803 // parent is this dentry's parent directory. Each dentry holds a reference 804 // on its parent. If this dentry is a filesystem root, parent is nil. 805 // parent is protected by filesystem.renameMu. 
806 parent atomic.Pointer[dentry] `state:".(*dentry)"` 807 808 // name is the name of this dentry in its parent. If this dentry is a 809 // filesystem root, name is the empty string. name is protected by 810 // filesystem.renameMu. 811 name string 812 813 // inoKey is used to identify this dentry's inode. 814 inoKey inoKey 815 816 // If deleted is non-zero, the file represented by this dentry has been 817 // deleted is accessed using atomic memory operations. 818 deleted atomicbitops.Uint32 819 820 // cachingMu is used to synchronize concurrent dentry caching attempts on 821 // this dentry. 822 cachingMu sync.Mutex `state:"nosave"` 823 824 // If cached is true, this dentry is part of filesystem.dentryCache. cached 825 // is protected by cachingMu. 826 cached bool 827 828 // cacheEntry links dentry into filesystem.dentryCache.dentries. It is 829 // protected by filesystem.dentryCache.mu. 830 cacheEntry dentryListElem 831 832 // syncableListEntry links dentry into filesystem.syncableDentries. It is 833 // protected by filesystem.syncMu. 834 syncableListEntry dentryListElem 835 836 // opMu synchronizes operations on this dentry. Operations that mutate 837 // the dentry tree must hold this lock for writing. Operations that 838 // only read the tree must hold for reading. 839 opMu sync.RWMutex `state:"nosave"` 840 841 // childrenMu protects the cached children data for this dentry. 842 childrenMu sync.Mutex `state:"nosave"` 843 844 // If this dentry represents a directory, children contains: 845 // 846 // - Mappings of child filenames to dentries representing those children. 847 // 848 // - Mappings of child filenames that are known not to exist to nil 849 // dentries (only if InteropModeShared is not in effect and the directory 850 // is not synthetic). 851 // 852 // +checklocks:childrenMu 853 children map[string]*dentry 854 855 // If this dentry represents a directory, negativeChildrenCache cache 856 // names of negative children. 
negativeChildrenCache is not saved since 857 // dentry.prepareSaveRecursive() drops all negative children. 858 // 859 // +checklocks:childrenMu 860 negativeChildrenCache stringFixedCache `state:"nosave"` 861 // If this dentry represents a directory, negativeChildren is the number of 862 // negative children cached in dentry.children. negativeChildren is not 863 // saved since dentry.prepareSaveRecursive() drops all negative children. 864 // 865 // +checklocks:childrenMu 866 negativeChildren int `state:"nosave"` 867 868 // If this dentry represents a directory, syntheticChildren is the number 869 // of child dentries for which dentry.isSynthetic() == true. 870 // 871 // +checklocks:childrenMu 872 syntheticChildren int 873 874 // If this dentry represents a directory, 875 // dentry.cachedMetadataAuthoritative() == true, and dirents is not 876 // nil, then dirents is a cache of all entries in the directory, in the 877 // order they were returned by the server. childrenSet just stores the 878 // `Name` field of all dirents in a set for fast query. dirents and 879 // childrenSet share the same lifecycle. 880 // 881 // +checklocks:childrenMu 882 dirents []vfs.Dirent `state:"nosave"` 883 // +checklocks:childrenMu 884 childrenSet map[string]struct{} `state:"nosave"` 885 886 // Cached metadata; protected by metadataMu. 887 // To access: 888 // - In situations where consistency is not required (like stat), these 889 // can be accessed using atomic operations only (without locking). 890 // - Lock metadataMu and can access without atomic operations. 891 // To mutate: 892 // - Lock metadataMu and use atomic operations to update because we might 893 // have atomic readers that don't hold the lock. 894 metadataMu sync.Mutex `state:"nosave"` 895 ino uint64 // immutable 896 mode atomicbitops.Uint32 // type is immutable, perms are mutable 897 uid atomicbitops.Uint32 // auth.KUID, but stored as raw uint32 for sync/atomic 898 gid atomicbitops.Uint32 // auth.KGID, but ... 
899 blockSize atomicbitops.Uint32 // 0 if unknown 900 // Timestamps, all nsecs from the Unix epoch. 901 atime atomicbitops.Int64 902 mtime atomicbitops.Int64 903 ctime atomicbitops.Int64 904 btime atomicbitops.Int64 905 // File size, which differs from other metadata in two ways: 906 // 907 // - We make a best-effort attempt to keep it up to date even if 908 // !dentry.cachedMetadataAuthoritative() for the sake of O_APPEND writes. 909 // 910 // - size is protected by both metadataMu and dataMu (i.e. both must be 911 // locked to mutate it; locking either is sufficient to access it). 912 size atomicbitops.Uint64 913 // If this dentry does not represent a synthetic file, deleted is 0, and 914 // atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the 915 // remote file's timestamps, which should be updated when this dentry is 916 // evicted. 917 atimeDirty atomicbitops.Uint32 918 mtimeDirty atomicbitops.Uint32 919 920 // nlink counts the number of hard links to this dentry. It's updated and 921 // accessed using atomic operations. It's not protected by metadataMu like the 922 // other metadata fields. 923 nlink atomicbitops.Uint32 924 925 mapsMu sync.Mutex `state:"nosave"` 926 927 // If this dentry represents a regular file, mappings tracks mappings of 928 // the file into memmap.MappingSpaces. mappings is protected by mapsMu. 929 mappings memmap.MappingSet 930 931 // - If this dentry represents a regular file or directory, readFD (if not 932 // -1) is a host FD used for reads by all regularFileFDs/directoryFDs 933 // representing this dentry. 934 // 935 // - If this dentry represents a regular file, writeFD (if not -1) is a host 936 // FD used for writes by all regularFileFDs representing this dentry. 937 // 938 // - If this dentry represents a regular file, mmapFD is the host FD used 939 // for memory mappings. If mmapFD is -1, no such FD is available, and the 940 // internal page cache implementation is used for memory mappings instead. 
941 // 942 // These fields are protected by handleMu. readFD, writeFD, and mmapFD are 943 // additionally written using atomic memory operations, allowing them to be 944 // read (albeit racily) with atomic.LoadInt32() without locking handleMu. 945 // 946 // readFD and writeFD may or may not be the same file descriptor. Once either 947 // transitions from closed (-1) to open, it may be mutated with handleMu 948 // locked, but cannot be closed until the dentry is destroyed. 949 // 950 // mmapFD is 951 // always either -1 or equal to readFD; if the file has been opened for 952 // writing, it is additionally either -1 or equal to writeFD. 953 handleMu sync.RWMutex `state:"nosave"` 954 readFD atomicbitops.Int32 `state:"nosave"` 955 writeFD atomicbitops.Int32 `state:"nosave"` 956 mmapFD atomicbitops.Int32 `state:"nosave"` 957 958 dataMu sync.RWMutex `state:"nosave"` 959 960 // If this dentry represents a regular file that is client-cached, cache 961 // maps offsets into the cached file to offsets into 962 // filesystem.mf that store the file's data. cache is 963 // protected by dataMu. 964 cache fsutil.FileRangeSet 965 966 // If this dentry represents a regular file that is client-cached, dirty 967 // tracks dirty segments in cache. dirty is protected by dataMu. 968 dirty fsutil.DirtySet 969 970 // pf implements memmap.File for mappings of hostFD. 971 pf dentryPlatformFile 972 973 // If this dentry represents a symbolic link, InteropModeShared is not in 974 // effect, and haveTarget is true, target is the symlink target. haveTarget 975 // and target are protected by dataMu. 976 haveTarget bool 977 target string 978 979 // If this dentry represents a synthetic socket file, endpoint is the 980 // transport endpoint bound to this file. 981 endpoint transport.BoundEndpoint 982 983 // If this dentry represents a synthetic named pipe, pipe is the pipe 984 // endpoint bound to this file.
985 pipe *pipe.VFSPipe 986 987 locks vfs.FileLocks 988 989 // Inotify watches for this dentry. 990 // 991 // Note that inotify may behave unexpectedly in the presence of hard links, 992 // because dentries corresponding to the same file have separate inotify 993 // watches when they should share the same set. This is the case because it is 994 // impossible for us to know for sure whether two dentries correspond to the 995 // same underlying file (see the gofer filesystem section fo vfs/inotify.md for 996 // a more in-depth discussion on this matter). 997 watches vfs.Watches 998 999 // impl is the specific dentry implementation for non-synthetic dentries. 1000 // impl is immutable. 1001 // 1002 // If impl is nil, this dentry represents a synthetic file, i.e. a 1003 // file that does not exist on the host filesystem. As of this writing, the 1004 // only files that can be synthetic are sockets, pipes, and directories. 1005 impl any 1006 } 1007 1008 // +stateify savable 1009 type stringListElem struct { 1010 // str is the string that this elem represents. 1011 str string 1012 stringEntry 1013 } 1014 1015 // +stateify savable 1016 type dentryListElem struct { 1017 // d is the dentry that this elem represents. 1018 d *dentry 1019 dentryEntry 1020 } 1021 1022 func (fs *filesystem) inoFromKey(key inoKey) uint64 { 1023 fs.inoMu.Lock() 1024 defer fs.inoMu.Unlock() 1025 1026 if ino, ok := fs.inoByKey[key]; ok { 1027 return ino 1028 } 1029 ino := fs.nextIno() 1030 fs.inoByKey[key] = ino 1031 return ino 1032 } 1033 1034 func (fs *filesystem) nextIno() uint64 { 1035 return fs.lastIno.Add(1) 1036 } 1037 1038 // init must be called before first use of d. 1039 func (d *dentry) init(impl any) { 1040 d.pf.dentry = d 1041 d.cacheEntry.d = d 1042 d.syncableListEntry.d = d 1043 // Nested impl-inheritance pattern. In memory it looks like: 1044 // [[[ vfs.Dentry ] dentry ] dentryImpl ] 1045 // All 3 abstractions are allocated in one allocation. 
We achieve this by 1046 // making each outer dentry implementation hold the inner dentry by value. 1047 // Then the outer most dentry is allocated and we initialize fields inward. 1048 // Each inner dentry has a pointer to the next level of implementation. 1049 d.impl = impl 1050 d.vfsd.Init(d) 1051 refs.Register(d) 1052 } 1053 1054 func (d *dentry) isSynthetic() bool { 1055 return d.impl == nil 1056 } 1057 1058 func (d *dentry) cachedMetadataAuthoritative() bool { 1059 return d.fs.opts.interop != InteropModeShared || d.isSynthetic() 1060 } 1061 1062 // updateMetadataFromStatxLocked is called to update d's metadata after an update 1063 // from the remote filesystem. 1064 // Precondition: d.metadataMu must be locked. 1065 // +checklocks:d.metadataMu 1066 func (d *lisafsDentry) updateMetadataFromStatxLocked(stat *linux.Statx) { 1067 if stat.Mask&linux.STATX_TYPE != 0 { 1068 if got, want := stat.Mode&linux.FileTypeMask, d.fileType(); uint32(got) != want { 1069 panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got)) 1070 } 1071 } 1072 if stat.Mask&linux.STATX_MODE != 0 { 1073 d.mode.Store(uint32(stat.Mode)) 1074 } 1075 if stat.Mask&linux.STATX_UID != 0 { 1076 d.uid.Store(dentryUID(lisafs.UID(stat.UID))) 1077 } 1078 if stat.Mask&linux.STATX_GID != 0 { 1079 d.gid.Store(dentryGID(lisafs.GID(stat.GID))) 1080 } 1081 if stat.Blksize != 0 { 1082 d.blockSize.Store(stat.Blksize) 1083 } 1084 // Don't override newer client-defined timestamps with old server-defined 1085 // ones. 
1086 if stat.Mask&linux.STATX_ATIME != 0 && d.atimeDirty.Load() == 0 { 1087 d.atime.Store(dentryTimestamp(stat.Atime)) 1088 } 1089 if stat.Mask&linux.STATX_MTIME != 0 && d.mtimeDirty.Load() == 0 { 1090 d.mtime.Store(dentryTimestamp(stat.Mtime)) 1091 } 1092 if stat.Mask&linux.STATX_CTIME != 0 { 1093 d.ctime.Store(dentryTimestamp(stat.Ctime)) 1094 } 1095 if stat.Mask&linux.STATX_BTIME != 0 { 1096 d.btime.Store(dentryTimestamp(stat.Btime)) 1097 } 1098 if stat.Mask&linux.STATX_NLINK != 0 { 1099 d.nlink.Store(stat.Nlink) 1100 } 1101 if stat.Mask&linux.STATX_SIZE != 0 { 1102 d.updateSizeLocked(stat.Size) 1103 } 1104 } 1105 1106 // updateMetadataFromStatLocked is similar to updateMetadataFromStatxLocked, 1107 // except that it takes a unix.Stat_t argument. 1108 // Precondition: d.metadataMu must be locked. 1109 // +checklocks:d.metadataMu 1110 func (d *directfsDentry) updateMetadataFromStatLocked(stat *unix.Stat_t) error { 1111 if got, want := stat.Mode&unix.S_IFMT, d.fileType(); got != want { 1112 panic(fmt.Sprintf("direct.dentry file type changed from %#o to %#o", want, got)) 1113 } 1114 d.mode.Store(stat.Mode) 1115 d.uid.Store(stat.Uid) 1116 d.gid.Store(stat.Gid) 1117 d.blockSize.Store(uint32(stat.Blksize)) 1118 // Don't override newer client-defined timestamps with old host-defined 1119 // ones. 1120 if d.atimeDirty.Load() == 0 { 1121 d.atime.Store(dentryTimestampFromUnix(stat.Atim)) 1122 } 1123 if d.mtimeDirty.Load() == 0 { 1124 d.mtime.Store(dentryTimestampFromUnix(stat.Mtim)) 1125 } 1126 d.ctime.Store(dentryTimestampFromUnix(stat.Ctim)) 1127 d.nlink.Store(uint32(stat.Nlink)) 1128 d.updateSizeLocked(uint64(stat.Size)) 1129 return nil 1130 } 1131 1132 // Preconditions: !d.isSynthetic(). 1133 // Preconditions: d.metadataMu is locked. 1134 // +checklocks:d.metadataMu 1135 func (d *dentry) refreshSizeLocked(ctx context.Context) error { 1136 d.handleMu.RLock() 1137 1138 // Can use RacyLoad() because handleMu is locked. 
1139 if d.writeFD.RacyLoad() < 0 { 1140 d.handleMu.RUnlock() 1141 // Use a suitable FD if we don't have a writable host FD. 1142 return d.updateMetadataLocked(ctx, noHandle) 1143 } 1144 1145 // Using statx(2) with a minimal mask is faster than fstat(2). 1146 var stat unix.Statx_t 1147 // Can use RacyLoad() because handleMu is locked. 1148 err := unix.Statx(int(d.writeFD.RacyLoad()), "", unix.AT_EMPTY_PATH, unix.STATX_SIZE, &stat) 1149 d.handleMu.RUnlock() // must be released before updateSizeLocked() 1150 if err != nil { 1151 return err 1152 } 1153 d.updateSizeLocked(stat.Size) 1154 return nil 1155 } 1156 1157 // Preconditions: !d.isSynthetic(). 1158 func (d *dentry) updateMetadata(ctx context.Context) error { 1159 // d.metadataMu must be locked *before* we stat so that we do not end up 1160 // updating stale attributes in d.updateMetadataFromStatLocked(). 1161 d.metadataMu.Lock() 1162 defer d.metadataMu.Unlock() 1163 return d.updateMetadataLocked(ctx, noHandle) 1164 } 1165 1166 func (d *dentry) fileType() uint32 { 1167 return d.mode.Load() & linux.S_IFMT 1168 } 1169 1170 func (d *dentry) statTo(stat *linux.Statx) { 1171 stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME 1172 stat.Blksize = d.blockSize.Load() 1173 stat.Nlink = d.nlink.Load() 1174 if stat.Nlink == 0 { 1175 // The remote filesystem doesn't support link count; just make 1176 // something up. This is consistent with Linux, where 1177 // fs/inode.c:inode_init_always() initializes link count to 1, and 1178 // fs/9p/vfs_inode_dotl.c:v9fs_stat2inode_dotl() doesn't touch it if 1179 // it's not provided by the remote filesystem. 
1180 stat.Nlink = 1 1181 } 1182 stat.UID = d.uid.Load() 1183 stat.GID = d.gid.Load() 1184 stat.Mode = uint16(d.mode.Load()) 1185 stat.Ino = uint64(d.ino) 1186 stat.Size = d.size.Load() 1187 // This is consistent with regularFileFD.Seek(), which treats regular files 1188 // as having no holes. 1189 stat.Blocks = (stat.Size + 511) / 512 1190 stat.Atime = linux.NsecToStatxTimestamp(d.atime.Load()) 1191 stat.Btime = linux.NsecToStatxTimestamp(d.btime.Load()) 1192 stat.Ctime = linux.NsecToStatxTimestamp(d.ctime.Load()) 1193 stat.Mtime = linux.NsecToStatxTimestamp(d.mtime.Load()) 1194 stat.DevMajor = linux.UNNAMED_MAJOR 1195 stat.DevMinor = d.fs.devMinor 1196 } 1197 1198 // Precondition: fs.renameMu is locked. 1199 func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions, mnt *vfs.Mount) error { 1200 stat := &opts.Stat 1201 if stat.Mask == 0 { 1202 return nil 1203 } 1204 if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { 1205 return linuxerr.EPERM 1206 } 1207 mode := linux.FileMode(d.mode.Load()) 1208 if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())); err != nil { 1209 return err 1210 } 1211 if err := mnt.CheckBeginWrite(); err != nil { 1212 return err 1213 } 1214 defer mnt.EndWrite() 1215 1216 if stat.Mask&linux.STATX_SIZE != 0 { 1217 // Reject attempts to truncate files other than regular files, since 1218 // filesystem implementations may return the wrong errno. 1219 switch mode.FileType() { 1220 case linux.S_IFREG: 1221 // ok 1222 case linux.S_IFDIR: 1223 return linuxerr.EISDIR 1224 default: 1225 return linuxerr.EINVAL 1226 } 1227 } 1228 1229 var now int64 1230 if d.cachedMetadataAuthoritative() { 1231 // Truncate updates mtime. 
1232 if stat.Mask&(linux.STATX_SIZE|linux.STATX_MTIME) == linux.STATX_SIZE { 1233 stat.Mask |= linux.STATX_MTIME 1234 stat.Mtime = linux.StatxTimestamp{ 1235 Nsec: linux.UTIME_NOW, 1236 } 1237 } 1238 1239 // Use client clocks for timestamps. 1240 now = d.fs.clock.Now().Nanoseconds() 1241 if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW { 1242 stat.Atime = linux.NsecToStatxTimestamp(now) 1243 } 1244 if stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec == linux.UTIME_NOW { 1245 stat.Mtime = linux.NsecToStatxTimestamp(now) 1246 } 1247 } 1248 1249 d.metadataMu.Lock() 1250 defer d.metadataMu.Unlock() 1251 1252 // As with Linux, if the UID, GID, or file size is changing, we have to 1253 // clear permission bits. Note that when set, clearSGID may cause 1254 // permissions to be updated. 1255 clearSGID := (stat.Mask&linux.STATX_UID != 0 && stat.UID != d.uid.Load()) || 1256 (stat.Mask&linux.STATX_GID != 0 && stat.GID != d.gid.Load()) || 1257 stat.Mask&linux.STATX_SIZE != 0 1258 if clearSGID { 1259 if stat.Mask&linux.STATX_MODE != 0 { 1260 stat.Mode = uint16(vfs.ClearSUIDAndSGID(uint32(stat.Mode))) 1261 } else { 1262 oldMode := d.mode.Load() 1263 if updatedMode := vfs.ClearSUIDAndSGID(oldMode); updatedMode != oldMode { 1264 stat.Mode = uint16(updatedMode) 1265 stat.Mask |= linux.STATX_MODE 1266 } 1267 } 1268 } 1269 1270 // failureMask indicates which attributes could not be set on the remote 1271 // filesystem. p9 returns an error if any of the attributes could not be set 1272 // but that leads to inconsistency as the server could have set a few 1273 // attributes successfully but a later failure will cause the successful ones 1274 // to not be updated in the dentry cache. 
1275 var failureMask uint32 1276 var failureErr error 1277 if !d.isSynthetic() { 1278 if stat.Mask != 0 { 1279 if err := d.prepareSetStat(ctx, stat); err != nil { 1280 return err 1281 } 1282 d.handleMu.RLock() 1283 if stat.Mask&linux.STATX_SIZE != 0 { 1284 // d.dataMu must be held around the update to both the remote 1285 // file's size and d.size to serialize with writeback (which 1286 // might otherwise write data back up to the old d.size after 1287 // the remote file has been truncated). 1288 d.dataMu.Lock() 1289 } 1290 var err error 1291 failureMask, failureErr, err = d.setStatLocked(ctx, stat) 1292 d.handleMu.RUnlock() 1293 if err != nil { 1294 if stat.Mask&linux.STATX_SIZE != 0 { 1295 d.dataMu.Unlock() // +checklocksforce: locked conditionally above 1296 } 1297 return err 1298 } 1299 if stat.Mask&linux.STATX_SIZE != 0 { 1300 if failureMask&linux.STATX_SIZE == 0 { 1301 // d.size should be kept up to date, and privatized 1302 // copy-on-write mappings of truncated pages need to be 1303 // invalidated, even if InteropModeShared is in effect. 1304 d.updateSizeAndUnlockDataMuLocked(stat.Size) // +checklocksforce: locked conditionally above 1305 } else { 1306 d.dataMu.Unlock() // +checklocksforce: locked conditionally above 1307 } 1308 } 1309 } 1310 if d.fs.opts.interop == InteropModeShared { 1311 // There's no point to updating d's metadata in this case since 1312 // it'll be overwritten by revalidation before the next time it's 1313 // used anyway. (InteropModeShared inhibits client caching of 1314 // regular file data, so there's no cache to truncate either.) 
1315 return nil 1316 } 1317 } 1318 if stat.Mask&linux.STATX_MODE != 0 && failureMask&linux.STATX_MODE == 0 { 1319 d.mode.Store(d.fileType() | uint32(stat.Mode)) 1320 } 1321 if stat.Mask&linux.STATX_UID != 0 && failureMask&linux.STATX_UID == 0 { 1322 d.uid.Store(stat.UID) 1323 } 1324 if stat.Mask&linux.STATX_GID != 0 && failureMask&linux.STATX_GID == 0 { 1325 d.gid.Store(stat.GID) 1326 } 1327 // Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because 1328 // if d.cachedMetadataAuthoritative() then we converted stat.Atime and 1329 // stat.Mtime to client-local timestamps above, and if 1330 // !d.cachedMetadataAuthoritative() then we returned after calling 1331 // d.file.setAttr(). For the same reason, now must have been initialized. 1332 if stat.Mask&linux.STATX_ATIME != 0 && failureMask&linux.STATX_ATIME == 0 { 1333 d.atime.Store(stat.Atime.ToNsec()) 1334 d.atimeDirty.Store(0) 1335 } 1336 if stat.Mask&linux.STATX_MTIME != 0 && failureMask&linux.STATX_MTIME == 0 { 1337 d.mtime.Store(stat.Mtime.ToNsec()) 1338 d.mtimeDirty.Store(0) 1339 } 1340 d.ctime.Store(now) 1341 if failureMask != 0 { 1342 // Setting some attribute failed on the remote filesystem. 1343 return failureErr 1344 } 1345 return nil 1346 } 1347 1348 // doAllocate performs an allocate operation on d. Note that d.metadataMu will 1349 // be held when allocate is called. 1350 func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error { 1351 d.metadataMu.Lock() 1352 defer d.metadataMu.Unlock() 1353 1354 // Allocating a smaller size is a noop. 1355 size := offset + length 1356 if d.cachedMetadataAuthoritative() && size <= d.size.RacyLoad() { 1357 return nil 1358 } 1359 1360 err := allocate() 1361 if err != nil { 1362 return err 1363 } 1364 d.updateSizeLocked(size) 1365 if d.cachedMetadataAuthoritative() { 1366 d.touchCMtimeLocked() 1367 } 1368 return nil 1369 } 1370 1371 // Preconditions: d.metadataMu must be locked. 
1372 func (d *dentry) updateSizeLocked(newSize uint64) { 1373 d.dataMu.Lock() 1374 d.updateSizeAndUnlockDataMuLocked(newSize) 1375 } 1376 1377 // Preconditions: d.metadataMu and d.dataMu must be locked. 1378 // 1379 // Postconditions: d.dataMu is unlocked. 1380 // +checklocksrelease:d.dataMu 1381 func (d *dentry) updateSizeAndUnlockDataMuLocked(newSize uint64) { 1382 oldSize := d.size.RacyLoad() 1383 d.size.Store(newSize) 1384 // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings 1385 // below. This allows concurrent calls to Read/Translate/etc. These 1386 // functions synchronize with truncation by refusing to use cache 1387 // contents beyond the new d.size. (We are still holding d.metadataMu, 1388 // so we can't race with Write or another truncate.) 1389 d.dataMu.Unlock() 1390 if newSize < oldSize { 1391 oldpgend, _ := hostarch.PageRoundUp(oldSize) 1392 newpgend, _ := hostarch.PageRoundUp(newSize) 1393 if oldpgend != newpgend { 1394 d.mapsMu.Lock() 1395 d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ 1396 // Compare Linux's mm/truncate.c:truncate_setsize() => 1397 // truncate_pagecache() => 1398 // mm/memory.c:unmap_mapping_range(evencows=1). 1399 InvalidatePrivate: true, 1400 }) 1401 d.mapsMu.Unlock() 1402 } 1403 // We are now guaranteed that there are no translations of 1404 // truncated pages, and can remove them from the cache. Since 1405 // truncated pages have been removed from the remote file, they 1406 // should be dropped without being written back. 
1407 d.dataMu.Lock() 1408 d.cache.Truncate(newSize, d.fs.mf) 1409 d.dirty.KeepClean(memmap.MappableRange{newSize, oldpgend}) 1410 d.dataMu.Unlock() 1411 } 1412 } 1413 1414 func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { 1415 return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(d.mode.Load()), auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())) 1416 } 1417 1418 func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { 1419 // Deny access to the "system" namespaces since applications 1420 // may expect these to affect kernel behavior in unimplemented ways 1421 // (b/148380782). Allow all other extended attributes to be passed through 1422 // to the remote filesystem. This is inconsistent with Linux's 9p client, 1423 // but consistent with other filesystems (e.g. FUSE). 1424 // 1425 // NOTE(b/202533394): Also disallow "trusted" namespace for now. This is 1426 // consistent with the VFS1 gofer client. 
1427 if strings.HasPrefix(name, linux.XATTR_SYSTEM_PREFIX) || strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) { 1428 return linuxerr.EOPNOTSUPP 1429 } 1430 mode := linux.FileMode(d.mode.Load()) 1431 kuid := auth.KUID(d.uid.Load()) 1432 kgid := auth.KGID(d.gid.Load()) 1433 if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { 1434 return err 1435 } 1436 return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) 1437 } 1438 1439 func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error { 1440 return vfs.CheckDeleteSticky( 1441 creds, 1442 linux.FileMode(d.mode.Load()), 1443 auth.KUID(d.uid.Load()), 1444 auth.KUID(child.uid.Load()), 1445 auth.KGID(child.gid.Load()), 1446 ) 1447 } 1448 1449 func dentryUID(uid lisafs.UID) uint32 { 1450 if !uid.Ok() { 1451 return uint32(auth.OverflowUID) 1452 } 1453 return uint32(uid) 1454 } 1455 1456 func dentryGID(gid lisafs.GID) uint32 { 1457 if !gid.Ok() { 1458 return uint32(auth.OverflowGID) 1459 } 1460 return uint32(gid) 1461 } 1462 1463 // IncRef implements vfs.DentryImpl.IncRef. 1464 func (d *dentry) IncRef() { 1465 // d.refs may be 0 if d.fs.renameMu is locked, which serializes against 1466 // d.checkCachingLocked(). 1467 r := d.refs.Add(1) 1468 if d.LogRefs() { 1469 refs.LogIncRef(d, r) 1470 } 1471 } 1472 1473 // TryIncRef implements vfs.DentryImpl.TryIncRef. 1474 func (d *dentry) TryIncRef() bool { 1475 for { 1476 r := d.refs.Load() 1477 if r <= 0 { 1478 return false 1479 } 1480 if d.refs.CompareAndSwap(r, r+1) { 1481 if d.LogRefs() { 1482 refs.LogTryIncRef(d, r+1) 1483 } 1484 return true 1485 } 1486 } 1487 } 1488 1489 // DecRef implements vfs.DentryImpl.DecRef. 
1490 func (d *dentry) DecRef(ctx context.Context) { 1491 if d.decRefNoCaching() == 0 { 1492 d.checkCachingLocked(ctx, false /* renameMuWriteLocked */) 1493 } 1494 } 1495 1496 // decRefNoCaching decrements d's reference count without calling 1497 // d.checkCachingLocked, even if d's reference count reaches 0; callers are 1498 // responsible for ensuring that d.checkCachingLocked will be called later. 1499 func (d *dentry) decRefNoCaching() int64 { 1500 r := d.refs.Add(-1) 1501 if d.LogRefs() { 1502 refs.LogDecRef(d, r) 1503 } 1504 if r < 0 { 1505 panic("gofer.dentry.decRefNoCaching() called without holding a reference") 1506 } 1507 return r 1508 } 1509 1510 // RefType implements refs.CheckedObject.Type. 1511 func (d *dentry) RefType() string { 1512 return "gofer.dentry" 1513 } 1514 1515 // LeakMessage implements refs.CheckedObject.LeakMessage. 1516 func (d *dentry) LeakMessage() string { 1517 return fmt.Sprintf("[gofer.dentry %p] reference count of %d instead of -1", d, d.refs.Load()) 1518 } 1519 1520 // LogRefs implements refs.CheckedObject.LogRefs. 1521 // 1522 // This should only be set to true for debugging purposes, as it can generate an 1523 // extremely large amount of output and drastically degrade performance. 1524 func (d *dentry) LogRefs() bool { 1525 return false 1526 } 1527 1528 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. 1529 func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { 1530 if d.isDir() { 1531 events |= linux.IN_ISDIR 1532 } 1533 1534 d.fs.renameMu.RLock() 1535 // The ordering below is important, Linux always notifies the parent first. 1536 if parent := d.parent.Load(); parent != nil { 1537 parent.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted()) 1538 } 1539 d.watches.Notify(ctx, "", events, cookie, et, d.isDeleted()) 1540 d.fs.renameMu.RUnlock() 1541 } 1542 1543 // Watches implements vfs.DentryImpl.Watches. 
1544 func (d *dentry) Watches() *vfs.Watches { 1545 return &d.watches 1546 } 1547 1548 // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. 1549 // 1550 // If no watches are left on this dentry and it has no references, cache it. 1551 func (d *dentry) OnZeroWatches(ctx context.Context) { 1552 d.checkCachingLocked(ctx, false /* renameMuWriteLocked */) 1553 } 1554 1555 // checkCachingLocked should be called after d's reference count becomes 0 or 1556 // it becomes disowned. 1557 // 1558 // For performance, checkCachingLocked can also be called after d's reference 1559 // count becomes non-zero, so that d can be removed from the LRU cache. This 1560 // may help in reducing the size of the cache and hence reduce evictions. Note 1561 // that this is not necessary for correctness. 1562 // 1563 // It may be called on a destroyed dentry. For example, 1564 // renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times 1565 // for the same dentry when the dentry is visited more than once in the same 1566 // operation. One of the calls may destroy the dentry, so subsequent calls will 1567 // do nothing. 1568 // 1569 // Preconditions: d.fs.renameMu must be locked for writing if 1570 // renameMuWriteLocked is true; it may be temporarily unlocked. 1571 func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked bool) { 1572 d.cachingMu.Lock() 1573 refs := d.refs.Load() 1574 if refs == -1 { 1575 // Dentry has already been destroyed. 1576 d.cachingMu.Unlock() 1577 return 1578 } 1579 if refs > 0 { 1580 // fs.dentryCache.dentries is permitted to contain dentries with non-zero 1581 // refs, which are skipped by fs.evictCachedDentryLocked() upon reaching 1582 // the end of the LRU. But it is still beneficial to remove d from the 1583 // cache as we are already holding d.cachingMu. Keeping a cleaner cache 1584 // also reduces the number of evictions (which is expensive as it acquires 1585 // fs.renameMu). 
1586 d.removeFromCacheLocked() 1587 d.cachingMu.Unlock() 1588 return 1589 } 1590 // Deleted and invalidated dentries with zero references are no longer 1591 // reachable by path resolution and should be dropped immediately. 1592 if d.vfsd.IsDead() { 1593 d.removeFromCacheLocked() 1594 d.cachingMu.Unlock() 1595 if !renameMuWriteLocked { 1596 // Need to lock d.fs.renameMu for writing as needed by d.destroyLocked(). 1597 d.fs.renameMu.Lock() 1598 defer d.fs.renameMu.Unlock() 1599 // Now that renameMu is locked for writing, no more refs can be taken on 1600 // d because path resolution requires renameMu for reading at least. 1601 if d.refs.Load() != 0 { 1602 // Destroy d only if its ref is still 0. If not, either someone took a 1603 // ref on it or it got destroyed before fs.renameMu could be acquired. 1604 return 1605 } 1606 } 1607 if d.isDeleted() { 1608 d.watches.HandleDeletion(ctx) 1609 } 1610 d.destroyLocked(ctx) // +checklocksforce: renameMu must be acquired at this point. 1611 return 1612 } 1613 if d.vfsd.IsEvictable() { 1614 d.cachingMu.Unlock() 1615 // Attempt to evict. 1616 if renameMuWriteLocked { 1617 d.evictLocked(ctx) // +checklocksforce: renameMu is locked in this case. 1618 return 1619 } 1620 d.evict(ctx) 1621 return 1622 } 1623 // If d still has inotify watches and it is not deleted or invalidated, it 1624 // can't be evicted. Otherwise, we will lose its watches, even if a new 1625 // dentry is created for the same file in the future. Note that the size of 1626 // d.watches cannot concurrently transition from zero to non-zero, because 1627 // adding a watch requires holding a reference on d. 1628 if d.watches.Size() > 0 { 1629 // As in the refs > 0 case, removing d is beneficial. 1630 d.removeFromCacheLocked() 1631 d.cachingMu.Unlock() 1632 return 1633 } 1634 1635 if d.fs.released.Load() != 0 { 1636 d.cachingMu.Unlock() 1637 if !renameMuWriteLocked { 1638 // Need to lock d.fs.renameMu to access d.parent. 
Lock it for writing as 1639 // needed by d.destroyLocked() later. 1640 d.fs.renameMu.Lock() 1641 defer d.fs.renameMu.Unlock() 1642 } 1643 if parent := d.parent.Load(); parent != nil { 1644 parent.childrenMu.Lock() 1645 delete(parent.children, d.name) 1646 parent.childrenMu.Unlock() 1647 } 1648 d.destroyLocked(ctx) // +checklocksforce: see above. 1649 return 1650 } 1651 1652 d.fs.dentryCache.mu.Lock() 1653 // If d is already cached, just move it to the front of the LRU. 1654 if d.cached { 1655 d.fs.dentryCache.dentries.Remove(&d.cacheEntry) 1656 d.fs.dentryCache.dentries.PushFront(&d.cacheEntry) 1657 d.fs.dentryCache.mu.Unlock() 1658 d.cachingMu.Unlock() 1659 return 1660 } 1661 // Cache the dentry, then evict the least recently used cached dentry if 1662 // the cache becomes over-full. 1663 d.fs.dentryCache.dentries.PushFront(&d.cacheEntry) 1664 d.fs.dentryCache.dentriesLen++ 1665 d.cached = true 1666 shouldEvict := d.fs.dentryCache.dentriesLen > d.fs.dentryCache.maxCachedDentries 1667 d.fs.dentryCache.mu.Unlock() 1668 d.cachingMu.Unlock() 1669 1670 if shouldEvict { 1671 if !renameMuWriteLocked { 1672 // Need to lock d.fs.renameMu for writing as needed by 1673 // d.evictCachedDentryLocked(). 1674 d.fs.renameMu.Lock() 1675 defer d.fs.renameMu.Unlock() 1676 } 1677 d.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above. 1678 } 1679 } 1680 1681 // Preconditions: d.cachingMu must be locked. 1682 func (d *dentry) removeFromCacheLocked() { 1683 if d.cached { 1684 d.fs.dentryCache.mu.Lock() 1685 d.fs.dentryCache.dentries.Remove(&d.cacheEntry) 1686 d.fs.dentryCache.dentriesLen-- 1687 d.fs.dentryCache.mu.Unlock() 1688 d.cached = false 1689 } 1690 } 1691 1692 // Precondition: fs.renameMu must be locked for writing; it may be temporarily 1693 // unlocked. 
// +checklocks:fs.renameMu
func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) {
	// Evict one cached dentry per iteration until the cache length counter
	// drops to zero.
	// NOTE(review): dentriesLen is read here without fs.dentryCache.mu, while
	// its writers in this file update it under that mutex -- confirm that
	// this unlocked read is intended.
	for fs.dentryCache.dentriesLen != 0 {
		fs.evictCachedDentryLocked(ctx)
	}
}

// evictCachedDentryLocked evicts the dentry at the back of the dentry cache
// list, if there is one. Dentries are pushed onto the front of that list when
// cached, so the back entry is the least recently used one.
//
// Preconditions:
// - fs.renameMu must be locked for writing; it may be temporarily unlocked.
//
// +checklocks:fs.renameMu
func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) {
	// Only the lookup of the victim happens under dentryCache.mu; the victim
	// is removed from the list later, by evictLocked.
	fs.dentryCache.mu.Lock()
	victim := fs.dentryCache.dentries.Back()
	fs.dentryCache.mu.Unlock()
	if victim == nil {
		// fs.dentryCache.dentries may have become empty between when it was
		// checked and when we locked fs.dentryCache.mu.
		return
	}

	if victim.d.fs == fs {
		victim.d.evictLocked(ctx) // +checklocksforce: owned as precondition, victim.fs == fs
		return
	}

	// The dentry cache is shared between all gofer filesystems and the victim is
	// from another filesystem. Have that filesystem do the work. We unlock
	// fs.renameMu to prevent deadlock: two filesystems could otherwise wait on
	// each others' renameMu.
	fs.renameMu.Unlock()
	// Re-acquire fs.renameMu on return so that the caller's locking
	// precondition holds again.
	defer fs.renameMu.Lock()
	victim.d.evict(ctx)
}

// evict locks d.fs.renameMu for writing and evicts d via evictLocked.
//
// Preconditions:
// - d.fs.renameMu must not be locked for writing.
func (d *dentry) evict(ctx context.Context) {
	d.fs.renameMu.Lock()
	defer d.fs.renameMu.Unlock()
	d.evictLocked(ctx)
}

// evictLocked removes d from the dentry cache and, if d has no references and
// no watches, detaches it from its parent and destroys it.
//
// Preconditions:
// - d.fs.renameMu must be locked for writing; it may be temporarily unlocked.
//
// +checklocks:d.fs.renameMu
func (d *dentry) evictLocked(ctx context.Context) {
	d.cachingMu.Lock()
	d.removeFromCacheLocked()
	// d.refs or d.watches.Size() may have become non-zero from an earlier path
	// resolution since it was inserted into fs.dentryCache.dentries.
	if d.refs.Load() != 0 || d.watches.Size() != 0 {
		d.cachingMu.Unlock()
		return
	}
	if parent := d.parent.Load(); parent != nil {
		parent.opMu.Lock()
		if !d.vfsd.IsDead() {
			// Note that d can't be a mount point (in any mount namespace), since VFS
			// holds references on mount points.
			rcs := d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &d.vfsd)
			for _, rc := range rcs {
				rc.DecRef(ctx)
			}

			parent.childrenMu.Lock()
			delete(parent.children, d.name)
			parent.childrenMu.Unlock()

			// We're only deleting the dentry, not the file it
			// represents, so we don't need to update
			// victim parent.dirents etc.
		}
		parent.opMu.Unlock()
	}
	// Safe to unlock cachingMu now that d.vfsd.IsDead(). Henceforth any
	// concurrent caching attempts on d will attempt to destroy it and so will
	// try to acquire fs.renameMu (which we have already acquired). Hence,
	// fs.renameMu will synchronize the destroy attempts.
	d.cachingMu.Unlock()
	d.destroyLocked(ctx) // +checklocksforce: owned as precondition.
}

// destroyDisconnected destroys an uncached, unparented dentry. There are no
// locking preconditions.
//
// It writes dirty cached data back through the write handle (when one is
// available), drops the cache, closes host FDs, removes non-synthetic dentries
// from the filesystem's syncable set, and marks d as destroyed by storing -1
// in d.refs.
func (d *dentry) destroyDisconnected(ctx context.Context) {
	mf := d.fs.mf

	d.handleMu.Lock()
	d.dataMu.Lock()

	if d.isWriteHandleOk() {
		// Write dirty pages back to the remote filesystem.
		h := d.writeHandle()
		// A write-back failure is only logged; destruction continues.
		// NOTE(review): the log prefix below names destroyLocked although this
		// function is destroyDisconnected -- confirm whether that is intentional.
		if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil {
			log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err)
		}
	}
	// Discard cached data.
	if !d.cache.IsEmpty() {
		mf.MarkAllUnevictable(d)
		d.cache.DropAll(mf)
		d.dirty.RemoveAll()
	}
	d.dataMu.Unlock()

	// Close any resources held by the implementation.
	d.destroyImpl(ctx)

	// Can use RacyLoad() because handleMu is locked.
	if d.readFD.RacyLoad() >= 0 {
		_ = unix.Close(int(d.readFD.RacyLoad()))
	}
	// readFD and writeFD may hold the same descriptor; close it only once.
	if d.writeFD.RacyLoad() >= 0 && d.readFD.RacyLoad() != d.writeFD.RacyLoad() {
		_ = unix.Close(int(d.writeFD.RacyLoad()))
	}
	d.readFD = atomicbitops.FromInt32(-1)
	d.writeFD = atomicbitops.FromInt32(-1)
	d.mmapFD = atomicbitops.FromInt32(-1)
	d.handleMu.Unlock()

	if !d.isSynthetic() {
		// Note that it's possible that d.atimeDirty or d.mtimeDirty are true,
		// i.e. client and server timestamps may differ (because e.g. a client
		// write was serviced by the page cache, and only written back to the
		// remote file later). Ideally, we'd write client timestamps back to
		// the remote filesystem so that timestamps for a new dentry
		// instantiated for the same file would remain coherent. Unfortunately,
		// this turns out to be too expensive in many cases, so for now we
		// don't do this.

		// Remove d from the set of syncable dentries.
		d.fs.syncMu.Lock()
		d.fs.syncableDentries.Remove(&d.syncableListEntry)
		d.fs.syncMu.Unlock()
	}

	// Drop references and stop tracking this child.
	d.refs.Store(-1)
	refs.Unregister(d)
}

// destroyLocked destroys the dentry.
//
// It panics if d.refs is anything other than 0 on entry.
//
// Preconditions:
// - d.fs.renameMu must be locked for writing; it may be temporarily unlocked.
// - d.refs == 0.
// - d.parent.children[d.name] != d, i.e. d is not reachable by path traversal
// from its former parent dentry.
//
// +checklocks:d.fs.renameMu
func (d *dentry) destroyLocked(ctx context.Context) {
	switch d.refs.Load() {
	case 0:
		// Mark the dentry destroyed.
		d.refs.Store(-1)
	case -1:
		panic("dentry.destroyLocked() called on already destroyed dentry")
	default:
		panic("dentry.destroyLocked() called with references on the dentry")
	}

	// Allow the following to proceed without renameMu locked to improve
	// scalability.
	d.fs.renameMu.Unlock()

	// No locks need to be held during destroyDisconnected.
	d.destroyDisconnected(ctx)

	d.fs.renameMu.Lock()

	// Drop the reference held by d on its parent without recursively locking
	// d.fs.renameMu.

	if parent := d.parent.Load(); parent != nil && parent.decRefNoCaching() == 0 {
		parent.checkCachingLocked(ctx, true /* renameMuWriteLocked */)
	}
}

// isDeleted returns true if d.deleted holds a non-zero value.
func (d *dentry) isDeleted() bool {
	return d.deleted.Load() != 0
}

// setDeleted stores 1 in d.deleted, after which isDeleted returns true.
func (d *dentry) setDeleted() {
	d.deleted.Store(1)
}

// listXattr returns the result of listXattrImpl for d. Synthetic dentries
// report no extended attributes and no error.
func (d *dentry) listXattr(ctx context.Context, size uint64) ([]string, error) {
	if d.isSynthetic() {
		return nil, nil
	}

	return d.listXattrImpl(ctx, size)
}

// getXattr returns the value of the extended attribute named by opts.Name.
// It returns ENODATA for synthetic dentries, and otherwise checks read
// permission on the attribute before delegating to getXattrImpl.
func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
	if d.isSynthetic() {
		return "", linuxerr.ENODATA
	}
	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
		return "", err
	}
	return d.getXattrImpl(ctx, opts)
}

// setXattr sets the extended attribute described by opts. It returns EPERM
// for synthetic dentries, and otherwise checks write permission on the
// attribute before delegating to setXattrImpl.
func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
	if d.isSynthetic() {
		return linuxerr.EPERM
	}
	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
		return err
	}
	return d.setXattrImpl(ctx, opts)
}

// removeXattr removes the extended attribute called name. It returns EPERM
// for synthetic dentries, and otherwise checks write permission on the
// attribute before delegating to removeXattrImpl.
func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error {
	if d.isSynthetic() {
		return linuxerr.EPERM
	}
	if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
		return err
	}
	return d.removeXattrImpl(ctx, name)
}

// ensureSharedHandle ensures that d holds handles sufficient for the
// requested access (read and/or write). If the current handles already
// suffice and trunc is false, it returns without opening anything; otherwise
// it opens a new handle and updates d.readFD, d.writeFD and d.mmapFD to
// match.
//
// Preconditions:
// - !d.isSynthetic().
// - d.isRegularFile() || d.isDir().
// - fs.renameMu is locked.
func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
	// O_TRUNC unconditionally requires us to obtain a new handle (opened with
	// O_TRUNC).
	if !trunc {
		// Fast path: check under the read lock only.
		d.handleMu.RLock()
		canReuseCurHandle := (!read || d.isReadHandleOk()) && (!write || d.isWriteHandleOk())
		d.handleMu.RUnlock()
		if canReuseCurHandle {
			// Current handles are sufficient.
			return nil
		}
	}

	d.handleMu.Lock()
	// Re-check under the write lock: the handles may have changed between
	// releasing the read lock above and acquiring the write lock.
	needNewHandle := (read && !d.isReadHandleOk()) || (write && !d.isWriteHandleOk()) || trunc
	if !needNewHandle {
		d.handleMu.Unlock()
		return nil
	}

	// FDs collected here are closed only after d.handleMu is released. The
	// backing array avoids a heap allocation for the slice.
	var fdsToCloseArr [2]int32
	fdsToClose := fdsToCloseArr[:0]
	invalidateTranslations := false
	// Get a new handle. If this file has been opened for both reading and
	// writing, try to get a single handle that is usable for both:
	//
	// - Writable memory mappings of a host FD require that the host FD is
	//   opened for both reading and writing.
	//
	// - NOTE(b/141991141): Some filesystems may not ensure coherence
	//   between multiple handles for the same file.
	openReadable := d.isReadHandleOk() || read
	openWritable := d.isWriteHandleOk() || write
	h, err := d.openHandle(ctx, openReadable, openWritable, trunc)
	if linuxerr.Equals(linuxerr.EACCES, err) && (openReadable != read || openWritable != write) {
		// It may not be possible to use a single handle for both
		// reading and writing, since permissions on the file may have
		// changed to e.g. disallow reading after previously being
		// opened for reading. In this case, we have no choice but to
		// use separate handles for reading and writing.
		ctx.Debugf("gofer.dentry.ensureSharedHandle: bifurcating read/write handles for dentry %p", d)
		openReadable = read
		openWritable = write
		h, err = d.openHandle(ctx, openReadable, openWritable, trunc)
	}
	if err != nil {
		d.handleMu.Unlock()
		return err
	}

	// Update d.readFD and d.writeFD
	if h.fd >= 0 {
		if openReadable && openWritable && (d.readFD.RacyLoad() < 0 || d.writeFD.RacyLoad() < 0 || d.readFD.RacyLoad() != d.writeFD.RacyLoad()) {
			// Replace existing FDs with this one.
			if d.readFD.RacyLoad() >= 0 {
				// We already have a readable FD that may be in use by
				// concurrent callers of d.pf.FD().
				if d.fs.opts.overlayfsStaleRead {
					// If overlayfsStaleRead is in effect, then the new FD
					// may not be coherent with the existing one, so we
					// have no choice but to switch to mappings of the new
					// FD in both the application and sentry.
					if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil {
						d.handleMu.Unlock()
						ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err)
						h.close(ctx)
						return err
					}
					fdsToClose = append(fdsToClose, d.readFD.RacyLoad())
					invalidateTranslations = true
					d.readFD.Store(h.fd)
				} else {
					// Otherwise, we want to avoid invalidating existing
					// memmap.Translations (which is expensive); instead, use
					// dup3 to make the old file descriptor refer to the new
					// file description, then close the new file descriptor
					// (which is no longer needed). Racing callers of d.pf.FD()
					// may use the old or new file description, but this
					// doesn't matter since they refer to the same file, and
					// any racing mappings must be read-only.
					if err := unix.Dup3(int(h.fd), int(d.readFD.RacyLoad()), unix.O_CLOEXEC); err != nil {
						// Capture the old FD number for the log message
						// before releasing d.handleMu.
						oldFD := d.readFD.RacyLoad()
						d.handleMu.Unlock()
						ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldFD, err)
						h.close(ctx)
						return err
					}
					fdsToClose = append(fdsToClose, h.fd)
					h.fd = d.readFD.RacyLoad()
				}
			} else {
				d.readFD.Store(h.fd)
			}
			// Schedule the previous write FD for closing unless it is the FD
			// that is being kept.
			if d.writeFD.RacyLoad() != h.fd && d.writeFD.RacyLoad() >= 0 {
				fdsToClose = append(fdsToClose, d.writeFD.RacyLoad())
			}
			d.writeFD.Store(h.fd)
			d.mmapFD.Store(h.fd)
		} else if openReadable && d.readFD.RacyLoad() < 0 {
			readHandleWasOk := d.isReadHandleOk()
			d.readFD.Store(h.fd)
			// If the file has not been opened for writing, the new FD may
			// be used for read-only memory mappings. If the file was
			// previously opened for reading (without an FD), then existing
			// translations of the file may use the internal page cache;
			// invalidate those mappings.
			if !d.isWriteHandleOk() {
				invalidateTranslations = readHandleWasOk
				d.mmapFD.Store(h.fd)
			}
		} else if openWritable && d.writeFD.RacyLoad() < 0 {
			d.writeFD.Store(h.fd)
			if d.readFD.RacyLoad() >= 0 {
				// We have an existing read-only FD, but the file has just
				// been opened for writing, so we need to start supporting
				// writable memory mappings. However, the new FD is not
				// readable, so we have no FD that can be used to create
				// writable memory mappings. Switch to using the internal
				// page cache.
				invalidateTranslations = true
				d.mmapFD.Store(-1)
			}
		} else {
			// The new FD is not useful.
			fdsToClose = append(fdsToClose, h.fd)
		}
	} else if openWritable && d.writeFD.RacyLoad() < 0 && d.mmapFD.RacyLoad() >= 0 {
		// We have an existing read-only FD, but the file has just been
		// opened for writing, so we need to start supporting writable
		// memory mappings. However, we have no writable host FD. Switch to
		// using the internal page cache.
		invalidateTranslations = true
		d.mmapFD.Store(-1)
	}

	d.updateHandles(ctx, h, openReadable, openWritable)
	d.handleMu.Unlock()

	if invalidateTranslations {
		// Invalidate application mappings that may be using an old FD; they
		// will be replaced with mappings using the new FD after future calls
		// to d.Translate(). This requires holding d.mapsMu, which precedes
		// d.handleMu in the lock order.
		d.mapsMu.Lock()
		d.mappings.InvalidateAll(memmap.InvalidateOpts{})
		d.mapsMu.Unlock()
	}
	// Close superseded FDs now that d.handleMu is no longer held. Close
	// errors are not checked.
	for _, fd := range fdsToClose {
		unix.Close(int(fd))
	}

	return nil
}

// syncRemoteFile calls syncRemoteFileLocked with d.handleMu held for reading.
func (d *dentry) syncRemoteFile(ctx context.Context) error {
	d.handleMu.RLock()
	defer d.handleMu.RUnlock()
	return d.syncRemoteFileLocked(ctx)
}

// syncRemoteFileLocked syncs d's write handle and then its read handle.
//
// NOTE(review): any result of the two sync calls is discarded and this
// function always returns nil, so the error checks performed by callers such
// as syncCachedFile can never fire -- confirm that this is intended.
//
// Preconditions: d.handleMu must be locked.
func (d *dentry) syncRemoteFileLocked(ctx context.Context) error {
	// Prefer syncing write handles over read handles, since some remote
	// filesystem implementations may not sync changes made through write
	// handles otherwise.
	wh := d.writeHandle()
	wh.sync(ctx)
	rh := d.readHandle()
	rh.sync(ctx)
	return nil
}

// syncCachedFile writes d's dirty cached pages back through the write handle
// (when one is available) and then syncs the remote file.
//
// When forFilesystemSync is true, an error from the remote sync is returned
// only if d is a regular file with a usable write handle; otherwise it is
// logged at debug level and nil is returned.
func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) error {
	d.handleMu.RLock()
	defer d.handleMu.RUnlock()
	if d.isWriteHandleOk() {
		// Write back dirty pages to the remote file.
		d.dataMu.Lock()
		h := d.writeHandle()
		err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), d.fs.mf, h.writeFromBlocksAt)
		d.dataMu.Unlock()
		if err != nil {
			return err
		}
	}
	if err := d.syncRemoteFileLocked(ctx); err != nil {
		if !forFilesystemSync {
			return err
		}
		// Only return err if we can reasonably have expected sync to succeed
		// (d is a regular file and was opened for writing).
		if d.isRegularFile() && d.isWriteHandleOk() {
			return err
		}
		ctx.Debugf("gofer.dentry.syncCachedFile: syncing non-writable or non-regular-file dentry failed: %v", err)
	}
	return nil
}

// incLinks increments link count.
func (d *dentry) incLinks() {
	if d.nlink.Load() == 0 {
		// The remote filesystem doesn't support link count.
		return
	}
	d.nlink.Add(1)
}

// decLinks decrements link count.
func (d *dentry) decLinks() {
	if d.nlink.Load() == 0 {
		// The remote filesystem doesn't support link count.
		return
	}
	// ^uint32(0) has every bit set, so adding it wraps around and subtracts
	// one from the unsigned counter.
	d.nlink.Add(^uint32(0))
}

// fileDescription is embedded by gofer implementations of
// vfs.FileDescriptionImpl.
//
// +stateify savable
type fileDescription struct {
	// vfsfd is the VFS file description from which the mount and dentry are
	// obtained (see filesystem() and dentry()).
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.LockFD

	// lockLogging guards the informational message logged by LockBSD and
	// LockPOSIX. It is not saved.
	lockLogging sync.Once `state:"nosave"`
}

// filesystem returns the gofer filesystem of the mount that fd was opened on.
// The type assertion panics if the mount's filesystem is not a *filesystem.
func (fd *fileDescription) filesystem() *filesystem {
	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
}

// dentry returns the gofer dentry that fd refers to. The type assertion
// panics if the underlying dentry implementation is not a *dentry.
func (fd *fileDescription) dentry() *dentry {
	return fd.vfsfd.Dentry().Impl().(*dentry)
}

// Stat implements vfs.FileDescriptionImpl.Stat.
2162 func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 2163 d := fd.dentry() 2164 const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME) 2165 if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { 2166 // Use specialFileFD.handle.fileLisa for the Stat if available, for the 2167 // same reason that we try to use open FD in updateMetadataLocked(). 2168 var err error 2169 if sffd, ok := fd.vfsfd.Impl().(*specialFileFD); ok { 2170 err = sffd.updateMetadata(ctx) 2171 } else { 2172 err = d.updateMetadata(ctx) 2173 } 2174 if err != nil { 2175 return linux.Statx{}, err 2176 } 2177 } 2178 var stat linux.Statx 2179 d.statTo(&stat) 2180 return stat, nil 2181 } 2182 2183 // SetStat implements vfs.FileDescriptionImpl.SetStat. 2184 func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { 2185 fs := fd.filesystem() 2186 fs.renameMu.RLock() 2187 defer fs.renameMu.RUnlock() 2188 return fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts, fd.vfsfd.Mount()) 2189 } 2190 2191 // ListXattr implements vfs.FileDescriptionImpl.ListXattr. 2192 func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { 2193 return fd.dentry().listXattr(ctx, size) 2194 } 2195 2196 // GetXattr implements vfs.FileDescriptionImpl.GetXattr. 2197 func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { 2198 return fd.dentry().getXattr(ctx, auth.CredentialsFromContext(ctx), &opts) 2199 } 2200 2201 // SetXattr implements vfs.FileDescriptionImpl.SetXattr. 
2202 func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { 2203 return fd.dentry().setXattr(ctx, auth.CredentialsFromContext(ctx), &opts) 2204 } 2205 2206 // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. 2207 func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { 2208 return fd.dentry().removeXattr(ctx, auth.CredentialsFromContext(ctx), name) 2209 } 2210 2211 // LockBSD implements vfs.FileDescriptionImpl.LockBSD. 2212 func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block bool) error { 2213 fd.lockLogging.Do(func() { 2214 log.Infof("File lock using gofer file handled internally.") 2215 }) 2216 return fd.LockFD.LockBSD(ctx, uid, ownerPID, t, block) 2217 } 2218 2219 // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. 2220 func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block bool) error { 2221 fd.lockLogging.Do(func() { 2222 log.Infof("Range lock using gofer file handled internally.") 2223 }) 2224 return fd.Locks().LockPOSIX(ctx, uid, ownerPID, t, r, block) 2225 } 2226 2227 // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 2228 func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { 2229 return fd.Locks().UnlockPOSIX(ctx, uid, r) 2230 } 2231 2232 // resolvingPath is just a wrapper around *vfs.ResolvingPath. It additionally 2233 // holds some information around the intent behind resolving the path. 2234 type resolvingPath struct { 2235 *vfs.ResolvingPath 2236 2237 // excludeLast indicates whether the intent is to resolve until the last path 2238 // component. If true, the last path component should remain unresolved. 
2239 excludeLast bool 2240 } 2241 2242 func resolvingPathFull(rp *vfs.ResolvingPath) resolvingPath { 2243 return resolvingPath{ResolvingPath: rp, excludeLast: false} 2244 } 2245 2246 func resolvingPathParent(rp *vfs.ResolvingPath) resolvingPath { 2247 return resolvingPath{ResolvingPath: rp, excludeLast: true} 2248 } 2249 2250 func (rp *resolvingPath) done() bool { 2251 if rp.excludeLast { 2252 return rp.Final() 2253 } 2254 return rp.Done() 2255 } 2256 2257 func (rp *resolvingPath) copy() resolvingPath { 2258 return resolvingPath{ 2259 ResolvingPath: rp.ResolvingPath.Copy(), 2260 excludeLast: rp.excludeLast, 2261 } 2262 } 2263 2264 // Precondition: !rp.done() && rp.Component() is not "." or "..". 2265 func (rp *resolvingPath) getComponents(emit func(string) bool) { 2266 rp.GetComponents(rp.excludeLast, emit) 2267 }