github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fsimpl/gofer/gofer.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package gofer provides a filesystem implementation that is backed by a 9p 16 // server, interchangeably referred to as "gofers" throughout this package. 17 // 18 // Lock order: 19 // regularFileFD/directoryFD.mu 20 // filesystem.renameMu 21 // dentry.cachingMu 22 // filesystem.cacheMu 23 // dentry.dirMu 24 // filesystem.syncMu 25 // dentry.metadataMu 26 // *** "memmap.Mappable locks" below this point 27 // dentry.mapsMu 28 // *** "memmap.Mappable locks taken by Translate" below this point 29 // dentry.handleMu 30 // dentry.dataMu 31 // filesystem.inoMu 32 // specialFileFD.mu 33 // specialFileFD.bufMu 34 // 35 // Locking dentry.dirMu and dentry.metadataMu in multiple dentries requires that 36 // either ancestor dentries are locked before descendant dentries, or that 37 // filesystem.renameMu is locked for writing. 
38 package gofer 39 40 import ( 41 "fmt" 42 "strconv" 43 "strings" 44 "sync/atomic" 45 46 "golang.org/x/sys/unix" 47 "github.com/SagerNet/gvisor/pkg/abi/linux" 48 "github.com/SagerNet/gvisor/pkg/context" 49 "github.com/SagerNet/gvisor/pkg/errors/linuxerr" 50 "github.com/SagerNet/gvisor/pkg/hostarch" 51 "github.com/SagerNet/gvisor/pkg/log" 52 "github.com/SagerNet/gvisor/pkg/p9" 53 refs_vfs1 "github.com/SagerNet/gvisor/pkg/refs" 54 "github.com/SagerNet/gvisor/pkg/refsvfs2" 55 "github.com/SagerNet/gvisor/pkg/sentry/fs/fsutil" 56 fslock "github.com/SagerNet/gvisor/pkg/sentry/fs/lock" 57 "github.com/SagerNet/gvisor/pkg/sentry/kernel/auth" 58 "github.com/SagerNet/gvisor/pkg/sentry/kernel/pipe" 59 ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time" 60 "github.com/SagerNet/gvisor/pkg/sentry/memmap" 61 "github.com/SagerNet/gvisor/pkg/sentry/pgalloc" 62 "github.com/SagerNet/gvisor/pkg/sentry/socket/unix/transport" 63 "github.com/SagerNet/gvisor/pkg/sentry/vfs" 64 "github.com/SagerNet/gvisor/pkg/sync" 65 "github.com/SagerNet/gvisor/pkg/syserror" 66 "github.com/SagerNet/gvisor/pkg/unet" 67 ) 68 69 // Name is the default filesystem name. 70 const Name = "9p" 71 72 // Mount option names for goferfs. 73 const ( 74 moptTransport = "trans" 75 moptReadFD = "rfdno" 76 moptWriteFD = "wfdno" 77 moptAname = "aname" 78 moptDfltUID = "dfltuid" 79 moptDfltGID = "dfltgid" 80 moptMsize = "msize" 81 moptVersion = "version" 82 moptDentryCacheLimit = "dentry_cache_limit" 83 moptCache = "cache" 84 moptForcePageCache = "force_page_cache" 85 moptLimitHostFDTranslation = "limit_host_fd_translation" 86 moptOverlayfsStaleRead = "overlayfs_stale_read" 87 ) 88 89 // Valid values for the "cache" mount option. 90 const ( 91 cacheNone = "none" 92 cacheFSCache = "fscache" 93 cacheFSCacheWritethrough = "fscache_writethrough" 94 cacheRemoteRevalidating = "remote_revalidating" 95 ) 96 97 // Valid values for "trans" mount option. 
const transportModeFD = "fd"

// FilesystemType implements vfs.FilesystemType.
//
// +stateify savable
type FilesystemType struct{}

// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	// vfsfs is the vfs.Filesystem associated with this filesystem.
	// FilesystemType.GetFilesystem initializes it and returns a pointer to it
	// to the caller.
	vfsfs vfs.Filesystem

	// mfp is used to allocate memory that caches regular file contents. mfp is
	// immutable.
	mfp pgalloc.MemoryFileProvider

	// Immutable options.
	opts  filesystemOptions
	iopts InternalFilesystemOptions

	// client is the client used by this filesystem. client is immutable.
	client *p9.Client `state:"nosave"`

	// clock is a realtime clock used to set timestamps in file operations.
	clock ktime.Clock

	// devMinor is the filesystem's minor device number. devMinor is immutable.
	devMinor uint32

	// root is the root dentry. root is immutable.
	root *dentry

	// renameMu serves two purposes:
	//
	// - It synchronizes path resolution with renaming initiated by this
	// client.
	//
	// - It is held by path resolution to ensure that reachable dentries remain
	// valid. A dentry is reachable by path resolution if it has a non-zero
	// reference count (such that it is usable as vfs.ResolvingPath.Start() or
	// is reachable from its children), or if it is a child dentry (such that
	// it is reachable from its parent).
	renameMu sync.RWMutex `state:"nosave"`

	// cachedDentries contains all dentries with 0 references. (Due to race
	// conditions, it may also contain dentries with non-zero references.)
	// cachedDentriesLen is the number of dentries in cachedDentries. These fields
	// are protected by cacheMu.
	cacheMu           sync.Mutex `state:"nosave"`
	cachedDentries    dentryList
	cachedDentriesLen uint64

	// syncableDentries contains all non-synthetic dentries. specialFileFDs
	// contains all open specialFileFDs. These fields are protected by syncMu.
	syncMu           sync.Mutex `state:"nosave"`
	syncableDentries map[*dentry]struct{}
	specialFileFDs   map[*specialFileFD]struct{}

	// inoByQIDPath maps previously-observed QID.Paths to inode numbers
	// assigned to those paths. inoByQIDPath is not preserved across
	// checkpoint/restore because QIDs may be reused between different gofer
	// processes, so QIDs may be repeated for different files across
	// checkpoint/restore. inoByQIDPath is protected by inoMu.
	inoMu        sync.Mutex        `state:"nosave"`
	inoByQIDPath map[uint64]uint64 `state:"nosave"`

	// lastIno is the last inode number assigned to a file. lastIno is accessed
	// using atomic memory operations.
	lastIno uint64

	// savedDentryRW records open read/write handles during save/restore.
	savedDentryRW map[*dentry]savedDentryRW

	// released is nonzero once filesystem.Release has been called. It is accessed
	// with atomic memory operations.
	released int32
}

// filesystemOptions holds the options that FilesystemType.GetFilesystem
// parses from the mount option string. They are stored in filesystem.opts
// and treated as immutable afterwards.
//
// +stateify savable
type filesystemOptions struct {
	// "Standard" 9P options. GetFilesystem fills these in from the mount
	// options: fd from "rfdno"/"wfdno" (which must be equal), aname from
	// "aname", dfltuid/dfltgid from "dfltuid"/"dfltgid", msize from "msize",
	// and version from "version".
	fd      int
	aname   string
	interop InteropMode // derived from the "cache" mount option
	dfltuid auth.KUID
	dfltgid auth.KGID
	msize   uint32
	version string

	// maxCachedDentries is the maximum size of filesystem.cachedDentries.
	maxCachedDentries uint64

	// If forcePageCache is true, host FDs may not be used for application
	// memory mappings even if available; instead, the client must perform its
	// own caching of regular file pages. This is primarily useful for testing.
	forcePageCache bool

	// If limitHostFDTranslation is true, apply maxFillRange() constraints to
	// host FD mappings returned by dentry.(memmap.Mappable).Translate(). This
	// makes memory accounting behavior more consistent between cases where
	// host FDs are / are not available, but may increase the frequency of
	// sentry-handled page faults on files for which a host FD is available.
	limitHostFDTranslation bool

	// If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote
	// filesystem may not be coherent with writable host FDs opened later, so
	// all uses of the former must be replaced by uses of the latter. This is
	// usually only the case when the remote filesystem is a Linux overlayfs
	// mount. (Prior to Linux 4.18, patch series centered on commit
	// d1d04ef8572b "ovl: stack file ops", both I/O and memory mappings were
	// incoherent between pre-copy-up and post-copy-up FDs; after that patch
	// series, only memory mappings are incoherent.)
	overlayfsStaleRead bool

	// If regularFilesUseSpecialFileFD is true, application FDs representing
	// regular files will use distinct file handles for each FD, in the same
	// way that application FDs representing "special files" such as sockets
	// do. Note that this disables client caching and mmap for regular files.
	// GetFilesystem sets this only for the "cache=none" mount option.
	regularFilesUseSpecialFileFD bool
}

// InteropMode controls the client's interaction with other remote filesystem
// users.
//
// +stateify savable
type InteropMode uint32

const (
	// InteropModeExclusive is appropriate when the filesystem client is the
	// only user of the remote filesystem.
	//
	// - The client may cache arbitrary filesystem state (file data, metadata,
	// filesystem structure, etc.).
	//
	// - Client changes to filesystem state may be sent to the remote
	// filesystem asynchronously, except when server permission checks are
	// necessary.
	//
	// - File timestamps are based on client clocks. This ensures that users of
	// the client observe timestamps that are coherent with their own clocks
	// and consistent with Linux's semantics (in particular, it is not always
	// possible for clients to set arbitrary atimes and mtimes depending on the
	// remote filesystem implementation, and never possible for clients to set
	// arbitrary ctimes.)
	InteropModeExclusive InteropMode = iota

	// InteropModeWritethrough is appropriate when there are read-only users of
	// the remote filesystem that expect to observe changes made by the
	// filesystem client.
	//
	// - The client may cache arbitrary filesystem state.
	//
	// - Client changes to filesystem state must be sent to the remote
	// filesystem synchronously.
	//
	// - File timestamps are based on client clocks. As a corollary, access
	// timestamp changes from other remote filesystem users will not be visible
	// to the client.
	InteropModeWritethrough

	// InteropModeShared is appropriate when there are users of the remote
	// filesystem that may mutate its state other than the client.
	//
	// - The client must verify ("revalidate") cached filesystem state before
	// using it.
	//
	// - Client changes to filesystem state must be sent to the remote
	// filesystem synchronously.
	//
	// - File timestamps are based on server clocks. This is necessary to
	// ensure that timestamp changes are synchronized between remote filesystem
	// users.
	//
	// Note that the correctness of InteropModeShared depends on the server
	// correctly implementing 9P fids (i.e. each fid immutably represents a
	// single filesystem object), even in the presence of remote filesystem
	// mutations from other users. If this is violated, the behavior of the
	// client is undefined.
	InteropModeShared
)

// InternalFilesystemOptions may be passed as
// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
//
// +stateify savable
type InternalFilesystemOptions struct {
	// If UniqueID is non-empty, it is an opaque string used to reassociate the
	// filesystem with a new server FD during restoration from checkpoint.
	UniqueID string

	// If LeakConnection is true, do not close the connection to the server
	// when the Filesystem is released. This is necessary for deployments in
	// which servers can handle only a single client and report failure if that
	// client disconnects.
	LeakConnection bool

	// If OpenSocketsByConnecting is true, silently translate attempts to open
	// files identifying as sockets to connect RPCs.
	OpenSocketsByConnecting bool
}

// _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default
// UIDs and GIDs used for files that do not provide a specific owner or group
// respectively. GetFilesystem uses them when the "dfltuid"/"dfltgid" mount
// options are absent.
const (
	// uint32(-2) doesn't work in Go.
	_V9FS_DEFUID = auth.KUID(4294967294)
	_V9FS_DEFGID = auth.KGID(4294967294)
)

// Name implements vfs.FilesystemType.Name. It returns the package-level Name
// constant.
func (FilesystemType) Name() string {
	return Name
}

// Release implements vfs.FilesystemType.Release. It is a no-op.
func (FilesystemType) Release(ctx context.Context) {}

// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
318 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { 319 mfp := pgalloc.MemoryFileProviderFromContext(ctx) 320 if mfp == nil { 321 ctx.Warningf("gofer.FilesystemType.GetFilesystem: context does not provide a pgalloc.MemoryFileProvider") 322 return nil, nil, linuxerr.EINVAL 323 } 324 325 mopts := vfs.GenericParseMountOptions(opts.Data) 326 var fsopts filesystemOptions 327 328 fd, err := getFDFromMountOptionsMap(ctx, mopts) 329 if err != nil { 330 return nil, nil, err 331 } 332 fsopts.fd = fd 333 334 // Get the attach name. 335 fsopts.aname = "/" 336 if aname, ok := mopts[moptAname]; ok { 337 delete(mopts, moptAname) 338 fsopts.aname = aname 339 } 340 341 // Parse the cache policy. For historical reasons, this defaults to the 342 // least generally-applicable option, InteropModeExclusive. 343 fsopts.interop = InteropModeExclusive 344 if cache, ok := mopts[moptCache]; ok { 345 delete(mopts, moptCache) 346 switch cache { 347 case cacheFSCache: 348 fsopts.interop = InteropModeExclusive 349 case cacheFSCacheWritethrough: 350 fsopts.interop = InteropModeWritethrough 351 case cacheNone: 352 fsopts.regularFilesUseSpecialFileFD = true 353 fallthrough 354 case cacheRemoteRevalidating: 355 fsopts.interop = InteropModeShared 356 default: 357 ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: %s=%s", moptCache, cache) 358 return nil, nil, linuxerr.EINVAL 359 } 360 } 361 362 // Parse the default UID and GID. 
363 fsopts.dfltuid = _V9FS_DEFUID 364 if dfltuidstr, ok := mopts[moptDfltUID]; ok { 365 delete(mopts, moptDfltUID) 366 dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32) 367 if err != nil { 368 ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: %s=%s", moptDfltUID, dfltuidstr) 369 return nil, nil, linuxerr.EINVAL 370 } 371 // In Linux, dfltuid is interpreted as a UID and is converted to a KUID 372 // in the caller's user namespace, but goferfs isn't 373 // application-mountable. 374 fsopts.dfltuid = auth.KUID(dfltuid) 375 } 376 fsopts.dfltgid = _V9FS_DEFGID 377 if dfltgidstr, ok := mopts[moptDfltGID]; ok { 378 delete(mopts, moptDfltGID) 379 dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32) 380 if err != nil { 381 ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: %s=%s", moptDfltGID, dfltgidstr) 382 return nil, nil, linuxerr.EINVAL 383 } 384 fsopts.dfltgid = auth.KGID(dfltgid) 385 } 386 387 // Parse the 9P message size. 388 fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M 389 if msizestr, ok := mopts[moptMsize]; ok { 390 delete(mopts, moptMsize) 391 msize, err := strconv.ParseUint(msizestr, 10, 32) 392 if err != nil { 393 ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid message size: %s=%s", moptMsize, msizestr) 394 return nil, nil, linuxerr.EINVAL 395 } 396 fsopts.msize = uint32(msize) 397 } 398 399 // Parse the 9P protocol version. 400 fsopts.version = p9.HighestVersionString() 401 if version, ok := mopts[moptVersion]; ok { 402 delete(mopts, moptVersion) 403 fsopts.version = version 404 } 405 406 // Parse the dentry cache limit. 
407 fsopts.maxCachedDentries = 1000 408 if str, ok := mopts[moptDentryCacheLimit]; ok { 409 delete(mopts, moptDentryCacheLimit) 410 maxCachedDentries, err := strconv.ParseUint(str, 10, 64) 411 if err != nil { 412 ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid dentry cache limit: %s=%s", moptDentryCacheLimit, str) 413 return nil, nil, linuxerr.EINVAL 414 } 415 fsopts.maxCachedDentries = maxCachedDentries 416 } 417 418 // Handle simple flags. 419 if _, ok := mopts[moptForcePageCache]; ok { 420 delete(mopts, moptForcePageCache) 421 fsopts.forcePageCache = true 422 } 423 if _, ok := mopts[moptLimitHostFDTranslation]; ok { 424 delete(mopts, moptLimitHostFDTranslation) 425 fsopts.limitHostFDTranslation = true 426 } 427 if _, ok := mopts[moptOverlayfsStaleRead]; ok { 428 delete(mopts, moptOverlayfsStaleRead) 429 fsopts.overlayfsStaleRead = true 430 } 431 // fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying 432 // "cache=none". 433 434 // Check for unparsed options. 435 if len(mopts) != 0 { 436 ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts) 437 return nil, nil, linuxerr.EINVAL 438 } 439 440 // Handle internal options. 441 iopts, ok := opts.InternalData.(InternalFilesystemOptions) 442 if opts.InternalData != nil && !ok { 443 ctx.Warningf("gofer.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted gofer.InternalFilesystemOptions", opts.InternalData) 444 return nil, nil, linuxerr.EINVAL 445 } 446 // If !ok, iopts being the zero value is correct. 447 448 // Construct the filesystem object. 
449 devMinor, err := vfsObj.GetAnonBlockDevMinor() 450 if err != nil { 451 return nil, nil, err 452 } 453 fs := &filesystem{ 454 mfp: mfp, 455 opts: fsopts, 456 iopts: iopts, 457 clock: ktime.RealtimeClockFromContext(ctx), 458 devMinor: devMinor, 459 syncableDentries: make(map[*dentry]struct{}), 460 specialFileFDs: make(map[*specialFileFD]struct{}), 461 inoByQIDPath: make(map[uint64]uint64), 462 } 463 fs.vfsfs.Init(vfsObj, &fstype, fs) 464 465 // Connect to the server. 466 if err := fs.dial(ctx); err != nil { 467 return nil, nil, err 468 } 469 470 // Perform attach to obtain the filesystem root. 471 ctx.UninterruptibleSleepStart(false) 472 attached, err := fs.client.Attach(fsopts.aname) 473 ctx.UninterruptibleSleepFinish(false) 474 if err != nil { 475 fs.vfsfs.DecRef(ctx) 476 return nil, nil, err 477 } 478 attachFile := p9file{attached} 479 qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask()) 480 if err != nil { 481 attachFile.close(ctx) 482 fs.vfsfs.DecRef(ctx) 483 return nil, nil, err 484 } 485 486 // Construct the root dentry. 487 root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr) 488 if err != nil { 489 attachFile.close(ctx) 490 fs.vfsfs.DecRef(ctx) 491 return nil, nil, err 492 } 493 // Set the root's reference count to 2. One reference is returned to the 494 // caller, and the other is held by fs to prevent the root from being "cached" 495 // and subsequently evicted. 496 root.refs = 2 497 fs.root = root 498 499 return &fs.vfsfs, &root.vfsd, nil 500 } 501 502 func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int, error) { 503 // Check that the transport is "fd". 504 trans, ok := mopts[moptTransport] 505 if !ok || trans != transportModeFD { 506 ctx.Warningf("gofer.getFDFromMountOptionsMap: transport must be specified as '%s=%s'", moptTransport, transportModeFD) 507 return -1, linuxerr.EINVAL 508 } 509 delete(mopts, moptTransport) 510 511 // Check that read and write FDs are provided and identical. 
512 rfdstr, ok := mopts[moptReadFD] 513 if !ok { 514 ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD must be specified as '%s=<file descriptor>'", moptReadFD) 515 return -1, linuxerr.EINVAL 516 } 517 delete(mopts, moptReadFD) 518 rfd, err := strconv.Atoi(rfdstr) 519 if err != nil { 520 ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid read FD: %s=%s", moptReadFD, rfdstr) 521 return -1, linuxerr.EINVAL 522 } 523 wfdstr, ok := mopts[moptWriteFD] 524 if !ok { 525 ctx.Warningf("gofer.getFDFromMountOptionsMap: write FD must be specified as '%s=<file descriptor>'", moptWriteFD) 526 return -1, linuxerr.EINVAL 527 } 528 delete(mopts, moptWriteFD) 529 wfd, err := strconv.Atoi(wfdstr) 530 if err != nil { 531 ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid write FD: %s=%s", moptWriteFD, wfdstr) 532 return -1, linuxerr.EINVAL 533 } 534 if rfd != wfd { 535 ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD (%d) and write FD (%d) must be equal", rfd, wfd) 536 return -1, linuxerr.EINVAL 537 } 538 return rfd, nil 539 } 540 541 // Preconditions: fs.client == nil. 542 func (fs *filesystem) dial(ctx context.Context) error { 543 // Establish a connection with the server. 544 conn, err := unet.NewSocket(fs.opts.fd) 545 if err != nil { 546 return err 547 } 548 549 // Perform version negotiation with the server. 550 ctx.UninterruptibleSleepStart(false) 551 client, err := p9.NewClient(conn, fs.opts.msize, fs.opts.version) 552 ctx.UninterruptibleSleepFinish(false) 553 if err != nil { 554 conn.Close() 555 return err 556 } 557 // Ownership of conn has been transferred to client. 558 559 fs.client = client 560 return nil 561 } 562 563 // Release implements vfs.FilesystemImpl.Release. 
564 func (fs *filesystem) Release(ctx context.Context) { 565 atomic.StoreInt32(&fs.released, 1) 566 567 mf := fs.mfp.MemoryFile() 568 fs.syncMu.Lock() 569 for d := range fs.syncableDentries { 570 d.handleMu.Lock() 571 d.dataMu.Lock() 572 if h := d.writeHandleLocked(); h.isOpen() { 573 // Write dirty cached data to the remote file. 574 if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil { 575 log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err) 576 } 577 // TODO(jamieliu): Do we need to flushf/fsync d? 578 } 579 // Discard cached pages. 580 d.cache.DropAll(mf) 581 d.dirty.RemoveAll() 582 d.dataMu.Unlock() 583 // Close host FDs if they exist. 584 if d.readFD >= 0 { 585 unix.Close(int(d.readFD)) 586 } 587 if d.writeFD >= 0 && d.readFD != d.writeFD { 588 unix.Close(int(d.writeFD)) 589 } 590 d.readFD = -1 591 d.writeFD = -1 592 d.mmapFD = -1 593 d.handleMu.Unlock() 594 } 595 // There can't be any specialFileFDs still using fs, since each such 596 // FileDescription would hold a reference on a Mount holding a reference on 597 // fs. 598 fs.syncMu.Unlock() 599 600 // If leak checking is enabled, release all outstanding references in the 601 // filesystem. We deliberately avoid doing this outside of leak checking; we 602 // have released all external resources above rather than relying on dentry 603 // destructors. 604 if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { 605 fs.renameMu.Lock() 606 fs.root.releaseSyntheticRecursiveLocked(ctx) 607 fs.evictAllCachedDentriesLocked(ctx) 608 fs.renameMu.Unlock() 609 610 // An extra reference was held by the filesystem on the root to prevent it from 611 // being cached/evicted. 612 fs.root.DecRef(ctx) 613 } 614 615 if !fs.iopts.LeakConnection { 616 // Close the connection to the server. This implicitly clunks all fids. 
617 fs.client.Close() 618 } 619 620 fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) 621 } 622 623 // releaseSyntheticRecursiveLocked traverses the tree with root d and decrements 624 // the reference count on every synthetic dentry. Synthetic dentries have one 625 // reference for existence that should be dropped during filesystem.Release. 626 // 627 // Precondition: d.fs.renameMu is locked for writing. 628 func (d *dentry) releaseSyntheticRecursiveLocked(ctx context.Context) { 629 if d.isSynthetic() { 630 d.decRefNoCaching() 631 d.checkCachingLocked(ctx, true /* renameMuWriteLocked */) 632 } 633 if d.isDir() { 634 var children []*dentry 635 d.dirMu.Lock() 636 for _, child := range d.children { 637 children = append(children, child) 638 } 639 d.dirMu.Unlock() 640 for _, child := range children { 641 if child != nil { 642 child.releaseSyntheticRecursiveLocked(ctx) 643 } 644 } 645 } 646 } 647 648 // dentry implements vfs.DentryImpl. 649 // 650 // +stateify savable 651 type dentry struct { 652 vfsd vfs.Dentry 653 654 // refs is the reference count. Each dentry holds a reference on its 655 // parent, even if disowned. An additional reference is held on all 656 // synthetic dentries until they are unlinked or invalidated. When refs 657 // reaches 0, the dentry may be added to the cache or destroyed. If refs == 658 // -1, the dentry has already been destroyed. refs is accessed using atomic 659 // memory operations. 660 refs int64 661 662 // fs is the owning filesystem. fs is immutable. 663 fs *filesystem 664 665 // parent is this dentry's parent directory. Each dentry holds a reference 666 // on its parent. If this dentry is a filesystem root, parent is nil. 667 // parent is protected by filesystem.renameMu. 668 parent *dentry 669 670 // name is the name of this dentry in its parent. If this dentry is a 671 // filesystem root, name is the empty string. name is protected by 672 // filesystem.renameMu. 
673 name string 674 675 // qidPath is the p9.QID.Path for this file. qidPath is immutable. 676 qidPath uint64 677 678 // file is the unopened p9.File that backs this dentry. file is immutable. 679 // 680 // If file.isNil(), this dentry represents a synthetic file, i.e. a file 681 // that does not exist on the remote filesystem. As of this writing, the 682 // only files that can be synthetic are sockets, pipes, and directories. 683 file p9file `state:"nosave"` 684 685 // If deleted is non-zero, the file represented by this dentry has been 686 // deleted. deleted is accessed using atomic memory operations. 687 deleted uint32 688 689 // cachingMu is used to synchronize concurrent dentry caching attempts on 690 // this dentry. 691 cachingMu sync.Mutex `state:"nosave"` 692 693 // If cached is true, dentryEntry links dentry into 694 // filesystem.cachedDentries. cached and dentryEntry are protected by 695 // cachingMu. 696 cached bool 697 dentryEntry 698 699 dirMu sync.Mutex `state:"nosave"` 700 701 // If this dentry represents a directory, children contains: 702 // 703 // - Mappings of child filenames to dentries representing those children. 704 // 705 // - Mappings of child filenames that are known not to exist to nil 706 // dentries (only if InteropModeShared is not in effect and the directory 707 // is not synthetic). 708 // 709 // children is protected by dirMu. 710 children map[string]*dentry 711 712 // If this dentry represents a directory, syntheticChildren is the number 713 // of child dentries for which dentry.isSynthetic() == true. 714 // syntheticChildren is protected by dirMu. 715 syntheticChildren int 716 717 // If this dentry represents a directory, 718 // dentry.cachedMetadataAuthoritative() == true, and dirents is not nil, it 719 // is a cache of all entries in the directory, in the order they were 720 // returned by the server. dirents is protected by dirMu. 721 dirents []vfs.Dirent 722 723 // Cached metadata; protected by metadataMu. 
724 // To access: 725 // - In situations where consistency is not required (like stat), these 726 // can be accessed using atomic operations only (without locking). 727 // - Lock metadataMu and can access without atomic operations. 728 // To mutate: 729 // - Lock metadataMu and use atomic operations to update because we might 730 // have atomic readers that don't hold the lock. 731 metadataMu sync.Mutex `state:"nosave"` 732 ino uint64 // immutable 733 mode uint32 // type is immutable, perms are mutable 734 uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic 735 gid uint32 // auth.KGID, but ... 736 blockSize uint32 // 0 if unknown 737 // Timestamps, all nsecs from the Unix epoch. 738 atime int64 739 mtime int64 740 ctime int64 741 btime int64 742 // File size, which differs from other metadata in two ways: 743 // 744 // - We make a best-effort attempt to keep it up to date even if 745 // !dentry.cachedMetadataAuthoritative() for the sake of O_APPEND writes. 746 // 747 // - size is protected by both metadataMu and dataMu (i.e. both must be 748 // locked to mutate it; locking either is sufficient to access it). 749 size uint64 750 // If this dentry does not represent a synthetic file, deleted is 0, and 751 // atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the 752 // remote file's timestamps, which should be updated when this dentry is 753 // evicted. 754 atimeDirty uint32 755 mtimeDirty uint32 756 757 // nlink counts the number of hard links to this dentry. It's updated and 758 // accessed using atomic operations. It's not protected by metadataMu like the 759 // other metadata fields. 760 nlink uint32 761 762 mapsMu sync.Mutex `state:"nosave"` 763 764 // If this dentry represents a regular file, mappings tracks mappings of 765 // the file into memmap.MappingSpaces. mappings is protected by mapsMu. 
766 mappings memmap.MappingSet 767 768 // - If this dentry represents a regular file or directory, readFile is the 769 // p9.File used for reads by all regularFileFDs/directoryFDs representing 770 // this dentry, and readFD (if not -1) is a host FD equivalent to readFile 771 // used as a faster alternative. 772 // 773 // - If this dentry represents a regular file, writeFile is the p9.File 774 // used for writes by all regularFileFDs representing this dentry, and 775 // writeFD (if not -1) is a host FD equivalent to writeFile used as a 776 // faster alternative. 777 // 778 // - If this dentry represents a regular file, mmapFD is the host FD used 779 // for memory mappings. If mmapFD is -1, no such FD is available, and the 780 // internal page cache implementation is used for memory mappings instead. 781 // 782 // These fields are protected by handleMu. readFD, writeFD, and mmapFD are 783 // additionally written using atomic memory operations, allowing them to be 784 // read (albeit racily) with atomic.LoadInt32() without locking handleMu. 785 // 786 // readFile and writeFile may or may not represent the same p9.File. Once 787 // either p9.File transitions from closed (isNil() == true) to open 788 // (isNil() == false), it may be mutated with handleMu locked, but cannot 789 // be closed until the dentry is destroyed. 790 // 791 // readFD and writeFD may or may not be the same file descriptor. mmapFD is 792 // always either -1 or equal to readFD; if !writeFile.isNil() (the file has 793 // been opened for writing), it is additionally either -1 or equal to 794 // writeFD. 
795 handleMu sync.RWMutex `state:"nosave"` 796 readFile p9file `state:"nosave"` 797 writeFile p9file `state:"nosave"` 798 readFD int32 `state:"nosave"` 799 writeFD int32 `state:"nosave"` 800 mmapFD int32 `state:"nosave"` 801 802 dataMu sync.RWMutex `state:"nosave"` 803 804 // If this dentry represents a regular file that is client-cached, cache 805 // maps offsets into the cached file to offsets into 806 // filesystem.mfp.MemoryFile() that store the file's data. cache is 807 // protected by dataMu. 808 cache fsutil.FileRangeSet 809 810 // If this dentry represents a regular file that is client-cached, dirty 811 // tracks dirty segments in cache. dirty is protected by dataMu. 812 dirty fsutil.DirtySet 813 814 // pf implements platform.File for mappings of hostFD. 815 pf dentryPlatformFile 816 817 // If this dentry represents a symbolic link, InteropModeShared is not in 818 // effect, and haveTarget is true, target is the symlink target. haveTarget 819 // and target are protected by dataMu. 820 haveTarget bool 821 target string 822 823 // If this dentry represents a synthetic socket file, endpoint is the 824 // transport endpoint bound to this file. 825 endpoint transport.BoundEndpoint 826 827 // If this dentry represents a synthetic named pipe, pipe is the pipe 828 // endpoint bound to this file. 829 pipe *pipe.VFSPipe 830 831 locks vfs.FileLocks 832 833 // Inotify watches for this dentry. 834 // 835 // Note that inotify may behave unexpectedly in the presence of hard links, 836 // because dentries corresponding to the same file have separate inotify 837 // watches when they should share the same set. This is the case because it is 838 // impossible for us to know for sure whether two dentries correspond to the 839 // same underlying file (see the gofer filesystem section fo vfs/inotify.md for 840 // a more in-depth discussion on this matter). 
watches vfs.Watches
}

// dentryAttrMask returns a p9.AttrMask enabling all attributes used by the
// gofer client.
//
// NLink is not part of the mask built here; newDentry and
// updateFromP9AttrsLocked only consume the link count when the mask they are
// given has NLink set.
func dentryAttrMask() p9.AttrMask {
	return p9.AttrMask{
		Mode:  true,
		UID:   true,
		GID:   true,
		ATime: true,
		MTime: true,
		CTime: true,
		Size:  true,
		BTime: true,
	}
}

// newDentry creates a new dentry representing the given file. The dentry
// initially has no references, but is not cached; it is the caller's
// responsibility to set the dentry's reference count and/or call
// dentry.checkCachingLocked() as appropriate.
//
// newDentry returns EIO if mask does not include the file mode, or if the file
// is a regular file and mask does not include its size.
//
// Preconditions: !file.isNil().
func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) {
	if !mask.Mode {
		ctx.Warningf("can't create gofer.dentry without file type")
		return nil, syserror.EIO
	}
	if attr.Mode.FileType() == p9.ModeRegular && !mask.Size {
		ctx.Warningf("can't create regular file gofer.dentry without file size")
		return nil, syserror.EIO
	}

	// uid/gid start out as the mount's default IDs and blockSize as the page
	// size; each is overridden below when attr supplies a value. The host FDs
	// start out as -1.
	d := &dentry{
		fs:        fs,
		qidPath:   qid.Path,
		file:      file,
		ino:       fs.inoFromQIDPath(qid.Path),
		mode:      uint32(attr.Mode),
		uid:       uint32(fs.opts.dfltuid),
		gid:       uint32(fs.opts.dfltgid),
		blockSize: hostarch.PageSize,
		readFD:    -1,
		writeFD:   -1,
		mmapFD:    -1,
	}
	d.pf.dentry = d
	if mask.UID {
		d.uid = dentryUIDFromP9UID(attr.UID)
	}
	if mask.GID {
		d.gid = dentryGIDFromP9GID(attr.GID)
	}
	if mask.Size {
		d.size = attr.Size
	}
	// BlockSize has no mask bit, so a non-zero value is treated as valid.
	if attr.BlockSize != 0 {
		d.blockSize = uint32(attr.BlockSize)
	}
	if mask.ATime {
		d.atime = dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds)
	}
	if mask.MTime {
		d.mtime = dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds)
	}
	if mask.CTime {
		d.ctime = dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds)
	}
	if mask.BTime {
		d.btime = dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds)
	}
	if mask.NLink {
		d.nlink = uint32(attr.NLink)
	}
	d.vfsd.Init(d)
	refsvfs2.Register(d)
	// Track d in fs.syncableDentries, which is guarded by fs.syncMu.
	fs.syncMu.Lock()
	fs.syncableDentries[d] = struct{}{}
	fs.syncMu.Unlock()
	return d, nil
}

// inoFromQIDPath returns the inode number associated with qidPath. If
// fs.inoByQIDPath has no entry for qidPath yet, a new inode number is obtained
// from fs.nextIno() and recorded. fs.inoMu is held for the whole lookup.
func (fs *filesystem) inoFromQIDPath(qidPath uint64) uint64 {
	fs.inoMu.Lock()
	defer fs.inoMu.Unlock()
	if ino, ok := fs.inoByQIDPath[qidPath]; ok {
		return ino
	}
	ino := fs.nextIno()
	fs.inoByQIDPath[qidPath] = ino
	return ino
}

// nextIno atomically increments fs.lastIno and returns the new value.
func (fs *filesystem) nextIno() uint64 {
	return atomic.AddUint64(&fs.lastIno, 1)
}

// isSynthetic returns true if d has no p9 file backing it, i.e.
// d.file.isNil().
func (d *dentry) isSynthetic() bool {
	return d.file.isNil()
}

// cachedMetadataAuthoritative returns true if d is synthetic or the
// filesystem is not in InteropModeShared.
func (d *dentry) cachedMetadataAuthoritative() bool {
	return d.fs.opts.interop != InteropModeShared || d.isSynthetic()
}

// updateFromP9AttrsLocked is called to update d's metadata after an update
// from the remote filesystem. Only the attributes enabled in mask are applied
// (plus the block size, whenever attr.BlockSize is non-zero). It panics if
// attr reports a different file type than d already has.
//
// Precondition: d.metadataMu must be locked.
// +checklocks:d.metadataMu
func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) {
	if mask.Mode {
		if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want {
			panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got))
		}
		atomic.StoreUint32(&d.mode, uint32(attr.Mode))
	}
	if mask.UID {
		atomic.StoreUint32(&d.uid, dentryUIDFromP9UID(attr.UID))
	}
	if mask.GID {
		atomic.StoreUint32(&d.gid, dentryGIDFromP9GID(attr.GID))
	}
	// There is no P9_GETATTR_* bit for I/O block size.
	if attr.BlockSize != 0 {
		atomic.StoreUint32(&d.blockSize, uint32(attr.BlockSize))
	}
	// Don't override newer client-defined timestamps with old server-defined
	// ones.
	if mask.ATime && atomic.LoadUint32(&d.atimeDirty) == 0 {
		atomic.StoreInt64(&d.atime, dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds))
	}
	if mask.MTime && atomic.LoadUint32(&d.mtimeDirty) == 0 {
		atomic.StoreInt64(&d.mtime, dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds))
	}
	if mask.CTime {
		atomic.StoreInt64(&d.ctime, dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds))
	}
	if mask.BTime {
		atomic.StoreInt64(&d.btime, dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds))
	}
	if mask.NLink {
		atomic.StoreUint32(&d.nlink, uint32(attr.NLink))
	}
	if mask.Size {
		d.updateSizeLocked(attr.Size)
	}
}

// refreshSizeLocked updates d's cached size. If a writable host FD is
// available the size is obtained with statx(2) on that FD; otherwise it falls
// back to a full getattr via d.updateFromGetattrLocked().
//
// Preconditions: !d.isSynthetic().
// Preconditions: d.metadataMu is locked.
// +checklocks:d.metadataMu
func (d *dentry) refreshSizeLocked(ctx context.Context) error {
	// handleMu protects d.writeFD; it is held only while the FD is in use.
	d.handleMu.RLock()

	if d.writeFD < 0 {
		d.handleMu.RUnlock()
		// Ask the gofer if we don't have a host FD.
		return d.updateFromGetattrLocked(ctx)
	}

	var stat unix.Statx_t
	err := unix.Statx(int(d.writeFD), "", unix.AT_EMPTY_PATH, unix.STATX_SIZE, &stat)
	d.handleMu.RUnlock() // must be released before updateSizeLocked()
	if err != nil {
		return err
	}
	d.updateSizeLocked(stat.Size)
	return nil
}

// updateFromGetattr locks d.metadataMu and refreshes d's metadata with
// d.updateFromGetattrLocked().
//
// Preconditions: !d.isSynthetic().
func (d *dentry) updateFromGetattr(ctx context.Context) error {
	// d.metadataMu must be locked *before* we getAttr so that we do not end up
	// updating stale attributes in d.updateFromP9AttrsLocked().
	d.metadataMu.Lock()
	defer d.metadataMu.Unlock()
	return d.updateFromGetattrLocked(ctx)
}

// updateFromGetattrLocked issues a getAttr for d and applies the result with
// d.updateFromP9AttrsLocked().
//
// Preconditions:
// * !d.isSynthetic().
// * d.metadataMu is locked.
// +checklocks:d.metadataMu
func (d *dentry) updateFromGetattrLocked(ctx context.Context) error {
	// Use d.readFile or d.writeFile, which represent 9P FIDs that have been
	// opened, in preference to d.file, which represents a 9P fid that has not.
	// This may be significantly more efficient in some implementations. Prefer
	// d.writeFile over d.readFile since some filesystem implementations may
	// update a writable handle's metadata after writes to that handle, without
	// making metadata updates immediately visible to read-only handles
	// representing the same file.
	d.handleMu.RLock()
	// handleMuRLocked records whether handleMu is still read-locked during the
	// getAttr call; it is only dropped early when falling back to d.file.
	handleMuRLocked := true
	var file p9file
	switch {
	case !d.writeFile.isNil():
		file = d.writeFile
	case !d.readFile.isNil():
		file = d.readFile
	default:
		file = d.file
		d.handleMu.RUnlock()
		handleMuRLocked = false
	}

	_, attrMask, attr, err := file.getAttr(ctx, dentryAttrMask())
	if handleMuRLocked {
		// handleMu must be released before updateFromP9AttrsLocked().
		d.handleMu.RUnlock() // +checklocksforce: complex case.
	}
	if err != nil {
		return err
	}
	d.updateFromP9AttrsLocked(attrMask, &attr)
	return nil
}

// fileType returns the file type bits (S_IFMT) of d's mode.
func (d *dentry) fileType() uint32 {
	return atomic.LoadUint32(&d.mode) & linux.S_IFMT
}

// statTo fills stat from d's cached metadata. All cached fields are read with
// atomic loads; no locks are taken here.
func (d *dentry) statTo(stat *linux.Statx) {
	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME
	stat.Blksize = atomic.LoadUint32(&d.blockSize)
	stat.Nlink = atomic.LoadUint32(&d.nlink)
	if stat.Nlink == 0 {
		// The remote filesystem doesn't support link count; just make
		// something up. This is consistent with Linux, where
		// fs/inode.c:inode_init_always() initializes link count to 1, and
		// fs/9p/vfs_inode_dotl.c:v9fs_stat2inode_dotl() doesn't touch it if
		// it's not provided by the remote filesystem.
		stat.Nlink = 1
	}
	stat.UID = atomic.LoadUint32(&d.uid)
	stat.GID = atomic.LoadUint32(&d.gid)
	stat.Mode = uint16(atomic.LoadUint32(&d.mode))
	stat.Ino = uint64(d.ino)
	stat.Size = atomic.LoadUint64(&d.size)
	// This is consistent with regularFileFD.Seek(), which treats regular files
	// as having no holes.
	stat.Blocks = (stat.Size + 511) / 512
	stat.Atime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.atime))
	stat.Btime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.btime))
	stat.Ctime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.ctime))
	stat.Mtime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.mtime))
	stat.DevMajor = linux.UNNAMED_MAJOR
	stat.DevMinor = d.fs.devMinor
}

// setStat applies the attribute changes described by opts.Stat to d, sending
// them to the remote file when d is not synthetic and then updating d's cached
// metadata.
//
// It returns nil without doing anything if the mask is empty, EPERM if the
// mask contains anything other than mode, UID, GID, atime, mtime or size,
// EISDIR/EINVAL for a size change on a directory/other non-regular file, and
// otherwise any error from vfs.CheckSetStat, mnt.CheckBeginWrite or the remote
// setAttr.
//
// Note that opts.Stat is modified in place (mask bits, mode and timestamps may
// be rewritten below).
func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions, mnt *vfs.Mount) error {
	stat := &opts.Stat
	if stat.Mask == 0 {
		return nil
	}
	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 {
		return linuxerr.EPERM
	}
	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
	if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
		return err
	}
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()

	if stat.Mask&linux.STATX_SIZE != 0 {
		// Reject attempts to truncate files other than regular files, since
		// filesystem implementations may return the wrong errno.
		switch mode.FileType() {
		case linux.S_IFREG:
			// ok
		case linux.S_IFDIR:
			return syserror.EISDIR
		default:
			return linuxerr.EINVAL
		}
	}

	// now is only assigned when cached metadata is authoritative; the other
	// case returns before now is used (see the note further down).
	var now int64
	if d.cachedMetadataAuthoritative() {
		// Truncate updates mtime.
		if stat.Mask&(linux.STATX_SIZE|linux.STATX_MTIME) == linux.STATX_SIZE {
			stat.Mask |= linux.STATX_MTIME
			stat.Mtime = linux.StatxTimestamp{
				Nsec: linux.UTIME_NOW,
			}
		}

		// Use client clocks for timestamps.
		now = d.fs.clock.Now().Nanoseconds()
		if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW {
			stat.Atime = linux.NsecToStatxTimestamp(now)
		}
		if stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec == linux.UTIME_NOW {
			stat.Mtime = linux.NsecToStatxTimestamp(now)
		}
	}

	d.metadataMu.Lock()
	defer d.metadataMu.Unlock()

	// As with Linux, if the UID, GID, or file size is changing, we have to
	// clear permission bits. Note that when set, clearSGID may cause
	// permissions to be updated.
	clearSGID := (stat.Mask&linux.STATX_UID != 0 && stat.UID != atomic.LoadUint32(&d.uid)) ||
		(stat.Mask&linux.STATX_GID != 0 && stat.GID != atomic.LoadUint32(&d.gid)) ||
		stat.Mask&linux.STATX_SIZE != 0
	if clearSGID {
		if stat.Mask&linux.STATX_MODE != 0 {
			stat.Mode = uint16(vfs.ClearSUIDAndSGID(uint32(stat.Mode)))
		} else {
			// The caller did not ask for a mode change; only add one if
			// clearing the bits actually alters the current mode.
			oldMode := atomic.LoadUint32(&d.mode)
			if updatedMode := vfs.ClearSUIDAndSGID(oldMode); updatedMode != oldMode {
				stat.Mode = uint16(updatedMode)
				stat.Mask |= linux.STATX_MODE
			}
		}
	}

	if !d.isSynthetic() {
		if stat.Mask != 0 {
			if err := d.file.setAttr(ctx, p9.SetAttrMask{
				Permissions:        stat.Mask&linux.STATX_MODE != 0,
				UID:                stat.Mask&linux.STATX_UID != 0,
				GID:                stat.Mask&linux.STATX_GID != 0,
				Size:               stat.Mask&linux.STATX_SIZE != 0,
				ATime:              stat.Mask&linux.STATX_ATIME != 0,
				MTime:              stat.Mask&linux.STATX_MTIME != 0,
				ATimeNotSystemTime: stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW,
				MTimeNotSystemTime: stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW,
			}, p9.SetAttr{
				Permissions:      p9.FileMode(stat.Mode),
				UID:              p9.UID(stat.UID),
				GID:              p9.GID(stat.GID),
				Size:             stat.Size,
				ATimeSeconds:     uint64(stat.Atime.Sec),
				ATimeNanoSeconds: uint64(stat.Atime.Nsec),
				MTimeSeconds:     uint64(stat.Mtime.Sec),
				MTimeNanoSeconds: uint64(stat.Mtime.Nsec),
			}); err != nil {
				return err
			}
			if stat.Mask&linux.STATX_SIZE != 0 {
				// d.size should be kept up to date, and privatized
				// copy-on-write mappings of truncated pages need to be
				// invalidated, even if InteropModeShared is in effect.
				d.updateSizeLocked(stat.Size)
			}
		}
		if d.fs.opts.interop == InteropModeShared {
			// There's no point to updating d's metadata in this case since
			// it'll be overwritten by revalidation before the next time it's
			// used anyway. (InteropModeShared inhibits client caching of
			// regular file data, so there's no cache to truncate either.)
			return nil
		}
	}
	// Mirror the accepted changes into d's cached metadata.
	if stat.Mask&linux.STATX_MODE != 0 {
		atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode))
	}
	if stat.Mask&linux.STATX_UID != 0 {
		atomic.StoreUint32(&d.uid, stat.UID)
	}
	if stat.Mask&linux.STATX_GID != 0 {
		atomic.StoreUint32(&d.gid, stat.GID)
	}
	// Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because
	// if d.cachedMetadataAuthoritative() then we converted stat.Atime and
	// stat.Mtime to client-local timestamps above, and if
	// !d.cachedMetadataAuthoritative() then we returned after calling
	// d.file.setAttr(). For the same reason, now must have been initialized.
	if stat.Mask&linux.STATX_ATIME != 0 {
		atomic.StoreInt64(&d.atime, stat.Atime.ToNsec())
		atomic.StoreUint32(&d.atimeDirty, 0)
	}
	if stat.Mask&linux.STATX_MTIME != 0 {
		atomic.StoreInt64(&d.mtime, stat.Mtime.ToNsec())
		atomic.StoreUint32(&d.mtimeDirty, 0)
	}
	atomic.StoreInt64(&d.ctime, now)
	return nil
}

// doAllocate performs an allocate operation on d. Note that d.metadataMu will
// be held when allocate is called.
//
// If allocate succeeds, d's size is set to offset+length and, when cached
// metadata is authoritative, d's ctime/mtime are touched. Any error from
// allocate is returned unchanged.
func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error {
	d.metadataMu.Lock()
	defer d.metadataMu.Unlock()

	// Allocating a smaller size is a noop.
	size := offset + length
	if d.cachedMetadataAuthoritative() && size <= d.size {
		return nil
	}

	err := allocate()
	if err != nil {
		return err
	}
	d.updateSizeLocked(size)
	if d.cachedMetadataAuthoritative() {
		d.touchCMtimeLocked()
	}
	return nil
}

// updateSizeLocked sets d's cached size to newSize. If the size shrank, it
// also invalidates mappings of, and drops cached data for, the truncated
// range.
//
// Preconditions: d.metadataMu must be locked.
1250 func (d *dentry) updateSizeLocked(newSize uint64) { 1251 d.dataMu.Lock() 1252 oldSize := d.size 1253 atomic.StoreUint64(&d.size, newSize) 1254 // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings 1255 // below. This allows concurrent calls to Read/Translate/etc. These 1256 // functions synchronize with truncation by refusing to use cache 1257 // contents beyond the new d.size. (We are still holding d.metadataMu, 1258 // so we can't race with Write or another truncate.) 1259 d.dataMu.Unlock() 1260 if d.size < oldSize { 1261 oldpgend, _ := hostarch.PageRoundUp(oldSize) 1262 newpgend, _ := hostarch.PageRoundUp(d.size) 1263 if oldpgend != newpgend { 1264 d.mapsMu.Lock() 1265 d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ 1266 // Compare Linux's mm/truncate.c:truncate_setsize() => 1267 // truncate_pagecache() => 1268 // mm/memory.c:unmap_mapping_range(evencows=1). 1269 InvalidatePrivate: true, 1270 }) 1271 d.mapsMu.Unlock() 1272 } 1273 // We are now guaranteed that there are no translations of 1274 // truncated pages, and can remove them from the cache. Since 1275 // truncated pages have been removed from the remote file, they 1276 // should be dropped without being written back. 1277 d.dataMu.Lock() 1278 d.cache.Truncate(d.size, d.fs.mfp.MemoryFile()) 1279 d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend}) 1280 d.dataMu.Unlock() 1281 } 1282 } 1283 1284 func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { 1285 return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) 1286 } 1287 1288 func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { 1289 // Deny access to the "security" and "system" namespaces since applications 1290 // may expect these to affect kernel behavior in unimplemented ways 1291 // (b/148380782). 
Allow all other extended attributes to be passed through 1292 // to the remote filesystem. This is inconsistent with Linux's 9p client, 1293 // but consistent with other filesystems (e.g. FUSE). 1294 if strings.HasPrefix(name, linux.XATTR_SECURITY_PREFIX) || strings.HasPrefix(name, linux.XATTR_SYSTEM_PREFIX) { 1295 return syserror.EOPNOTSUPP 1296 } 1297 mode := linux.FileMode(atomic.LoadUint32(&d.mode)) 1298 kuid := auth.KUID(atomic.LoadUint32(&d.uid)) 1299 kgid := auth.KGID(atomic.LoadUint32(&d.gid)) 1300 if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { 1301 return err 1302 } 1303 return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) 1304 } 1305 1306 func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error { 1307 return vfs.CheckDeleteSticky( 1308 creds, 1309 linux.FileMode(atomic.LoadUint32(&d.mode)), 1310 auth.KUID(atomic.LoadUint32(&d.uid)), 1311 auth.KUID(atomic.LoadUint32(&child.uid)), 1312 auth.KGID(atomic.LoadUint32(&child.gid)), 1313 ) 1314 } 1315 1316 func dentryUIDFromP9UID(uid p9.UID) uint32 { 1317 if !uid.Ok() { 1318 return uint32(auth.OverflowUID) 1319 } 1320 return uint32(uid) 1321 } 1322 1323 func dentryGIDFromP9GID(gid p9.GID) uint32 { 1324 if !gid.Ok() { 1325 return uint32(auth.OverflowGID) 1326 } 1327 return uint32(gid) 1328 } 1329 1330 // IncRef implements vfs.DentryImpl.IncRef. 1331 func (d *dentry) IncRef() { 1332 // d.refs may be 0 if d.fs.renameMu is locked, which serializes against 1333 // d.checkCachingLocked(). 1334 r := atomic.AddInt64(&d.refs, 1) 1335 if d.LogRefs() { 1336 refsvfs2.LogIncRef(d, r) 1337 } 1338 } 1339 1340 // TryIncRef implements vfs.DentryImpl.TryIncRef. 
1341 func (d *dentry) TryIncRef() bool { 1342 for { 1343 r := atomic.LoadInt64(&d.refs) 1344 if r <= 0 { 1345 return false 1346 } 1347 if atomic.CompareAndSwapInt64(&d.refs, r, r+1) { 1348 if d.LogRefs() { 1349 refsvfs2.LogTryIncRef(d, r+1) 1350 } 1351 return true 1352 } 1353 } 1354 } 1355 1356 // DecRef implements vfs.DentryImpl.DecRef. 1357 func (d *dentry) DecRef(ctx context.Context) { 1358 if d.decRefNoCaching() == 0 { 1359 d.checkCachingLocked(ctx, false /* renameMuWriteLocked */) 1360 } 1361 } 1362 1363 // decRefNoCaching decrements d's reference count without calling 1364 // d.checkCachingLocked, even if d's reference count reaches 0; callers are 1365 // responsible for ensuring that d.checkCachingLocked will be called later. 1366 func (d *dentry) decRefNoCaching() int64 { 1367 r := atomic.AddInt64(&d.refs, -1) 1368 if d.LogRefs() { 1369 refsvfs2.LogDecRef(d, r) 1370 } 1371 if r < 0 { 1372 panic("gofer.dentry.decRefNoCaching() called without holding a reference") 1373 } 1374 return r 1375 } 1376 1377 // RefType implements refsvfs2.CheckedObject.Type. 1378 func (d *dentry) RefType() string { 1379 return "gofer.dentry" 1380 } 1381 1382 // LeakMessage implements refsvfs2.CheckedObject.LeakMessage. 1383 func (d *dentry) LeakMessage() string { 1384 return fmt.Sprintf("[gofer.dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs)) 1385 } 1386 1387 // LogRefs implements refsvfs2.CheckedObject.LogRefs. 1388 // 1389 // This should only be set to true for debugging purposes, as it can generate an 1390 // extremely large amount of output and drastically degrade performance. 1391 func (d *dentry) LogRefs() bool { 1392 return false 1393 } 1394 1395 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. 
1396 func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { 1397 if d.isDir() { 1398 events |= linux.IN_ISDIR 1399 } 1400 1401 d.fs.renameMu.RLock() 1402 // The ordering below is important, Linux always notifies the parent first. 1403 if d.parent != nil { 1404 d.parent.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted()) 1405 } 1406 d.watches.Notify(ctx, "", events, cookie, et, d.isDeleted()) 1407 d.fs.renameMu.RUnlock() 1408 } 1409 1410 // Watches implements vfs.DentryImpl.Watches. 1411 func (d *dentry) Watches() *vfs.Watches { 1412 return &d.watches 1413 } 1414 1415 // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. 1416 // 1417 // If no watches are left on this dentry and it has no references, cache it. 1418 func (d *dentry) OnZeroWatches(ctx context.Context) { 1419 d.checkCachingLocked(ctx, false /* renameMuWriteLocked */) 1420 } 1421 1422 // checkCachingLocked should be called after d's reference count becomes 0 or 1423 // it becomes disowned. 1424 // 1425 // For performance, checkCachingLocked can also be called after d's reference 1426 // count becomes non-zero, so that d can be removed from the LRU cache. This 1427 // may help in reducing the size of the cache and hence reduce evictions. Note 1428 // that this is not necessary for correctness. 1429 // 1430 // It may be called on a destroyed dentry. For example, 1431 // renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times 1432 // for the same dentry when the dentry is visited more than once in the same 1433 // operation. One of the calls may destroy the dentry, so subsequent calls will 1434 // do nothing. 1435 // 1436 // Preconditions: d.fs.renameMu must be locked for writing if 1437 // renameMuWriteLocked is true; it may be temporarily unlocked. 
func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked bool) {
	// d.cachingMu is held while deciding what to do with d; every branch below
	// releases it before returning or before taking d.fs.renameMu.
	d.cachingMu.Lock()
	refs := atomic.LoadInt64(&d.refs)
	if refs == -1 {
		// Dentry has already been destroyed.
		d.cachingMu.Unlock()
		return
	}
	if refs > 0 {
		// fs.cachedDentries is permitted to contain dentries with non-zero refs,
		// which are skipped by fs.evictCachedDentryLocked() upon reaching the end
		// of the LRU. But it is still beneficial to remove d from the cache as we
		// are already holding d.cachingMu. Keeping a cleaner cache also reduces
		// the number of evictions (which is expensive as it acquires fs.renameMu).
		d.removeFromCacheLocked()
		d.cachingMu.Unlock()
		return
	}
	// Deleted and invalidated dentries with zero references are no longer
	// reachable by path resolution and should be dropped immediately.
	if d.vfsd.IsDead() {
		d.removeFromCacheLocked()
		d.cachingMu.Unlock()
		if !renameMuWriteLocked {
			// Need to lock d.fs.renameMu for writing as needed by d.destroyLocked().
			d.fs.renameMu.Lock()
			defer d.fs.renameMu.Unlock()
			// Now that renameMu is locked for writing, no more refs can be taken on
			// d because path resolution requires renameMu for reading at least.
			if atomic.LoadInt64(&d.refs) != 0 {
				// Destroy d only if its ref is still 0. If not, either someone took a
				// ref on it or it got destroyed before fs.renameMu could be acquired.
				return
			}
		}
		if d.isDeleted() {
			d.watches.HandleDeletion(ctx)
		}
		d.destroyLocked(ctx) // +checklocksforce: renameMu must be acquired at this point.
		return
	}
	// If d still has inotify watches and it is not deleted or invalidated, it
	// can't be evicted. Otherwise, we will lose its watches, even if a new
	// dentry is created for the same file in the future. Note that the size of
	// d.watches cannot concurrently transition from zero to non-zero, because
	// adding a watch requires holding a reference on d.
	if d.watches.Size() > 0 {
		// As in the refs > 0 case, removing d is beneficial.
		d.removeFromCacheLocked()
		d.cachingMu.Unlock()
		return
	}

	// If the filesystem has been released, d is not cached: it is unlinked
	// from its parent's children map and destroyed right away.
	if atomic.LoadInt32(&d.fs.released) != 0 {
		d.cachingMu.Unlock()
		if !renameMuWriteLocked {
			// Need to lock d.fs.renameMu to access d.parent. Lock it for writing as
			// needed by d.destroyLocked() later.
			d.fs.renameMu.Lock()
			defer d.fs.renameMu.Unlock()
		}
		if d.parent != nil {
			d.parent.dirMu.Lock()
			delete(d.parent.children, d.name)
			d.parent.dirMu.Unlock()
		}
		d.destroyLocked(ctx) // +checklocksforce: see above.
		return
	}

	d.fs.cacheMu.Lock()
	// If d is already cached, just move it to the front of the LRU.
	if d.cached {
		d.fs.cachedDentries.Remove(d)
		d.fs.cachedDentries.PushFront(d)
		d.fs.cacheMu.Unlock()
		d.cachingMu.Unlock()
		return
	}
	// Cache the dentry, then evict the least recently used cached dentry if
	// the cache becomes over-full.
	d.fs.cachedDentries.PushFront(d)
	d.fs.cachedDentriesLen++
	d.cached = true
	// shouldEvict is computed under cacheMu; the eviction itself happens after
	// both cacheMu and cachingMu have been released.
	shouldEvict := d.fs.cachedDentriesLen > d.fs.opts.maxCachedDentries
	d.fs.cacheMu.Unlock()
	d.cachingMu.Unlock()

	if shouldEvict {
		if !renameMuWriteLocked {
			// Need to lock d.fs.renameMu for writing as needed by
			// d.evictCachedDentryLocked().
			d.fs.renameMu.Lock()
			defer d.fs.renameMu.Unlock()
		}
		d.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above.
	}
}

// removeFromCacheLocked removes d from fs.cachedDentries if d is currently
// cached; otherwise it does nothing.
//
// Preconditions: d.cachingMu must be locked.
func (d *dentry) removeFromCacheLocked() {
	if d.cached {
		// The LRU list and its length are only touched with fs.cacheMu held.
		d.fs.cacheMu.Lock()
		d.fs.cachedDentries.Remove(d)
		d.fs.cachedDentriesLen--
		d.fs.cacheMu.Unlock()
		d.cached = false
	}
}

// evictAllCachedDentriesLocked calls fs.evictCachedDentryLocked() until
// fs.cachedDentriesLen reaches 0.
//
// Precondition: fs.renameMu must be locked for writing; it may be temporarily
// unlocked.
// +checklocks:fs.renameMu
func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) {
	for fs.cachedDentriesLen != 0 {
		fs.evictCachedDentryLocked(ctx)
	}
}

// evictCachedDentryLocked removes the dentry at the back of fs.cachedDentries
// from the cache and, if it has no references and no inotify watches,
// invalidates and destroys it.
//
// Preconditions:
// * fs.renameMu must be locked for writing; it may be temporarily unlocked.
// +checklocks:fs.renameMu
func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) {
	fs.cacheMu.Lock()
	victim := fs.cachedDentries.Back()
	fs.cacheMu.Unlock()
	if victim == nil {
		// fs.cachedDentries may have become empty between when it was checked and
		// when we locked fs.cacheMu.
		return
	}

	victim.cachingMu.Lock()
	victim.removeFromCacheLocked()
	// victim.refs or victim.watches.Size() may have become non-zero from an
	// earlier path resolution since it was inserted into fs.cachedDentries.
	if atomic.LoadInt64(&victim.refs) != 0 || victim.watches.Size() != 0 {
		victim.cachingMu.Unlock()
		return
	}
	if victim.parent != nil {
		victim.parent.dirMu.Lock()
		if !victim.vfsd.IsDead() {
			// Note that victim can't be a mount point (in any mount
			// namespace), since VFS holds references on mount points.
			fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &victim.vfsd)
			delete(victim.parent.children, victim.name)
			// We're only deleting the dentry, not the file it
			// represents, so we don't need to update
			// victimParent.dirents etc.
		}
		victim.parent.dirMu.Unlock()
	}
	// Safe to unlock cachingMu now that victim.vfsd.IsDead(). Henceforth any
	// concurrent caching attempts on victim will attempt to destroy it and so
	// will try to acquire fs.renameMu (which we have already acquired). Hence,
	// fs.renameMu will synchronize the destroy attempts.
	victim.cachingMu.Unlock()
	victim.destroyLocked(ctx) // +checklocksforce: owned as precondition, victim.fs == fs.
}

// destroyLocked destroys the dentry.
//
// It marks d destroyed by setting d.refs to -1, writes back dirty cached data
// and drops the cache, closes d's p9 files and host FDs, removes d from
// fs.syncableDentries, and finally drops d's reference on its parent. It
// panics if d.refs is not 0 on entry.
//
// Preconditions:
// * d.fs.renameMu must be locked for writing; it may be temporarily unlocked.
// * d.refs == 0.
// * d.parent.children[d.name] != d, i.e. d is not reachable by path traversal
// from its former parent dentry.
// +checklocks:d.fs.renameMu
func (d *dentry) destroyLocked(ctx context.Context) {
	switch atomic.LoadInt64(&d.refs) {
	case 0:
		// Mark the dentry destroyed.
		atomic.StoreInt64(&d.refs, -1)
	case -1:
		panic("dentry.destroyLocked() called on already destroyed dentry")
	default:
		panic("dentry.destroyLocked() called with references on the dentry")
	}

	// Allow the following to proceed without renameMu locked to improve
	// scalability.
	d.fs.renameMu.Unlock()

	mf := d.fs.mfp.MemoryFile()
	d.handleMu.Lock()
	d.dataMu.Lock()
	if h := d.writeHandleLocked(); h.isOpen() {
		// Write dirty pages back to the remote filesystem.
		if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
			log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err)
		}
	}
	// Discard cached data.
	if !d.cache.IsEmpty() {
		mf.MarkAllUnevictable(d)
		d.cache.DropAll(mf)
		d.dirty.RemoveAll()
	}
	d.dataMu.Unlock()
	// Clunk open fids and close open host FDs.
	if !d.readFile.isNil() {
		d.readFile.close(ctx)
	}
	// readFile and writeFile may be the same p9 file; close it only once.
	if !d.writeFile.isNil() && d.readFile != d.writeFile {
		d.writeFile.close(ctx)
	}
	d.readFile = p9file{}
	d.writeFile = p9file{}
	if d.readFD >= 0 {
		unix.Close(int(d.readFD))
	}
	// Likewise, readFD and writeFD may be the same descriptor.
	if d.writeFD >= 0 && d.readFD != d.writeFD {
		unix.Close(int(d.writeFD))
	}
	d.readFD = -1
	d.writeFD = -1
	d.mmapFD = -1
	d.handleMu.Unlock()

	if !d.file.isNil() {
		// Note that it's possible that d.atimeDirty or d.mtimeDirty are true,
		// i.e. client and server timestamps may differ (because e.g. a client
		// write was serviced by the page cache, and only written back to the
		// remote file later). Ideally, we'd write client timestamps back to
		// the remote filesystem so that timestamps for a new dentry
		// instantiated for the same file would remain coherent. Unfortunately,
		// this turns out to be too expensive in many cases, so for now we
		// don't do this.
		if err := d.file.close(ctx); err != nil {
			log.Warningf("gofer.dentry.destroyLocked: failed to close file: %v", err)
		}
		d.file = p9file{}

		// Remove d from the set of syncable dentries.
		d.fs.syncMu.Lock()
		delete(d.fs.syncableDentries, d)
		d.fs.syncMu.Unlock()
	}

	// Re-acquire renameMu, which callers expect to still hold on return.
	d.fs.renameMu.Lock()

	// Drop the reference held by d on its parent without recursively locking
	// d.fs.renameMu.
	if d.parent != nil && d.parent.decRefNoCaching() == 0 {
		d.parent.checkCachingLocked(ctx, true /* renameMuWriteLocked */)
	}
	refsvfs2.Unregister(d)
}

// isDeleted returns true if d.deleted has been set by setDeleted().
func (d *dentry) isDeleted() bool {
	return atomic.LoadUint32(&d.deleted) != 0
}

// setDeleted atomically marks d as deleted.
func (d *dentry) setDeleted() {
	atomic.StoreUint32(&d.deleted, 1)
}

// listXattr returns the names of d's extended attributes as reported by the
// remote file. It returns (nil, nil) if d has no backing p9 file. creds is not
// consulted here.
func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
	if d.file.isNil() {
		return nil, nil
	}
	xattrMap, err := d.file.listXattr(ctx, size)
	if err != nil {
		return nil, err
	}
	xattrs := make([]string, 0, len(xattrMap))
	for x := range xattrMap {
		xattrs = append(xattrs, x)
	}
	return xattrs, nil
}

// getXattr returns the value of the extended attribute opts.Name from the
// remote file, after checking that creds may read it. It returns ENODATA if d
// has no backing p9 file.
func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
	if d.file.isNil() {
		return "", linuxerr.ENODATA
	}
	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
		return "", err
	}
	return d.file.getXattr(ctx, opts.Name, opts.Size)
}

// setXattr sets the extended attribute opts.Name on the remote file, after
// checking that creds may write it. It returns EPERM if d has no backing p9
// file.
func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
	if d.file.isNil() {
		return linuxerr.EPERM
	}
	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
		return err
	}
	return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags)
}

// removeXattr removes the extended attribute name from the remote file, after
// checking that creds may write it. It returns EPERM if d has no backing p9
// file.
func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error {
	if d.file.isNil() {
		return linuxerr.EPERM
	}
	if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
		return err
	}
	return d.file.removeXattr(ctx, name)
}

// Preconditions:
// * !d.isSynthetic().
// * d.isRegularFile() || d.isDir().
//
// ensureSharedHandle makes sure that d holds shared handles satisfying the
// requested access: a non-nil d.readFile if read is set and a non-nil
// d.writeFile if write is set. If trunc is set, a new handle is always opened
// (with trunc passed through to openHandle), even if the existing handles
// would otherwise suffice.
//
// When a new handle is opened, d.readFD, d.writeFD and d.mmapFD are updated
// to reflect the host FD (if any) that came with it, the old fids are closed
// before d.handleMu is released, and any host FDs that became redundant are
// closed after d.handleMu is released.
//
// It returns the error from openHandle, from regenerating sentry mappings, or
// from dup3, whichever fails first; on those paths d.handleMu is released
// before returning.
func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
	// O_TRUNC unconditionally requires us to obtain a new handle (opened with
	// O_TRUNC).
	if !trunc {
		// Fast path: check under the read lock whether the handles we already
		// have cover the requested access.
		d.handleMu.RLock()
		if (!read || !d.readFile.isNil()) && (!write || !d.writeFile.isNil()) {
			// Current handles are sufficient.
			d.handleMu.RUnlock()
			return nil
		}
		d.handleMu.RUnlock()
	}

	// At most two host FDs are queued for closing on any path below, so a
	// fixed-size backing array avoids a heap allocation for the slice.
	var fdsToCloseArr [2]int32
	fdsToClose := fdsToCloseArr[:0]
	invalidateTranslations := false
	d.handleMu.Lock()
	// Re-check under the write lock: another caller may have installed
	// sufficient handles between the RUnlock above and this Lock.
	if (read && d.readFile.isNil()) || (write && d.writeFile.isNil()) || trunc {
		// Get a new handle. If this file has been opened for both reading and
		// writing, try to get a single handle that is usable for both:
		//
		// - Writable memory mappings of a host FD require that the host FD is
		// opened for both reading and writing.
		//
		// - NOTE(b/141991141): Some filesystems may not ensure coherence
		// between multiple handles for the same file.
		openReadable := !d.readFile.isNil() || read
		openWritable := !d.writeFile.isNil() || write
		h, err := openHandle(ctx, d.file, openReadable, openWritable, trunc)
		if linuxerr.Equals(linuxerr.EACCES, err) && (openReadable != read || openWritable != write) {
			// It may not be possible to use a single handle for both
			// reading and writing, since permissions on the file may have
			// changed to e.g. disallow reading after previously being
			// opened for reading. In this case, we have no choice but to
			// use separate handles for reading and writing.
			ctx.Debugf("gofer.dentry.ensureSharedHandle: bifurcating read/write handles for dentry %p", d)
			openReadable = read
			openWritable = write
			h, err = openHandle(ctx, d.file, openReadable, openWritable, trunc)
		}
		if err != nil {
			d.handleMu.Unlock()
			return err
		}

		// Update d.readFD and d.writeFD.
		if h.fd >= 0 {
			if openReadable && openWritable && (d.readFD < 0 || d.writeFD < 0 || d.readFD != d.writeFD) {
				// Replace existing FDs with this one.
				if d.readFD >= 0 {
					// We already have a readable FD that may be in use by
					// concurrent callers of d.pf.FD().
					if d.fs.opts.overlayfsStaleRead {
						// If overlayfsStaleRead is in effect, then the new FD
						// may not be coherent with the existing one, so we
						// have no choice but to switch to mappings of the new
						// FD in both the application and sentry.
						if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil {
							d.handleMu.Unlock()
							ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err)
							h.close(ctx)
							return err
						}
						// The old read FD is closed only after d.handleMu is
						// released (see the loop at the end of the function).
						fdsToClose = append(fdsToClose, d.readFD)
						invalidateTranslations = true
						atomic.StoreInt32(&d.readFD, h.fd)
					} else {
						// Otherwise, we want to avoid invalidating existing
						// memmap.Translations (which is expensive); instead, use
						// dup3 to make the old file descriptor refer to the new
						// file description, then close the new file descriptor
						// (which is no longer needed). Racing callers of d.pf.FD()
						// may use the old or new file description, but this
						// doesn't matter since they refer to the same file, and
						// any racing mappings must be read-only.
						if err := unix.Dup3(int(h.fd), int(d.readFD), unix.O_CLOEXEC); err != nil {
							// Capture the FD number for the log message before
							// dropping the lock that protects d.readFD.
							oldFD := d.readFD
							d.handleMu.Unlock()
							ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldFD, err)
							h.close(ctx)
							return err
						}
						fdsToClose = append(fdsToClose, h.fd)
						// From here on the handle is tracked under the old FD
						// number, which now refers to the new file description.
						h.fd = d.readFD
					}
				} else {
					atomic.StoreInt32(&d.readFD, h.fd)
				}
				// A distinct pre-existing write FD is superseded by the new
				// read/write FD.
				if d.writeFD != h.fd && d.writeFD >= 0 {
					fdsToClose = append(fdsToClose, d.writeFD)
				}
				atomic.StoreInt32(&d.writeFD, h.fd)
				atomic.StoreInt32(&d.mmapFD, h.fd)
			} else if openReadable && d.readFD < 0 {
				atomic.StoreInt32(&d.readFD, h.fd)
				// If the file has not been opened for writing, the new FD may
				// be used for read-only memory mappings. If the file was
				// previously opened for reading (without an FD), then existing
				// translations of the file may use the internal page cache;
				// invalidate those mappings.
				if d.writeFile.isNil() {
					invalidateTranslations = !d.readFile.isNil()
					atomic.StoreInt32(&d.mmapFD, h.fd)
				}
			} else if openWritable && d.writeFD < 0 {
				atomic.StoreInt32(&d.writeFD, h.fd)
				if d.readFD >= 0 {
					// We have an existing read-only FD, but the file has just
					// been opened for writing, so we need to start supporting
					// writable memory mappings. However, the new FD is not
					// readable, so we have no FD that can be used to create
					// writable memory mappings. Switch to using the internal
					// page cache.
					invalidateTranslations = true
					atomic.StoreInt32(&d.mmapFD, -1)
				}
			} else {
				// The new FD is not useful.
				fdsToClose = append(fdsToClose, h.fd)
			}
		} else if openWritable && d.writeFD < 0 && d.mmapFD >= 0 {
			// We have an existing read-only FD, but the file has just been
			// opened for writing, so we need to start supporting writable
			// memory mappings. However, we have no writable host FD. Switch to
			// using the internal page cache.
			invalidateTranslations = true
			atomic.StoreInt32(&d.mmapFD, -1)
		}

		// Switch to new fids.
		var oldReadFile p9file
		if openReadable {
			oldReadFile = d.readFile
			d.readFile = h.file
		}
		var oldWriteFile p9file
		if openWritable {
			oldWriteFile = d.writeFile
			d.writeFile = h.file
		}
		// NOTE(b/141991141): Clunk old fids before making new fids visible (by
		// unlocking d.handleMu).
		if !oldReadFile.isNil() {
			oldReadFile.close(ctx)
		}
		// Avoid closing the same fid twice when the old read and write files
		// were one and the same.
		if !oldWriteFile.isNil() && oldReadFile != oldWriteFile {
			oldWriteFile.close(ctx)
		}
	}
	d.handleMu.Unlock()

	if invalidateTranslations {
		// Invalidate application mappings that may be using an old FD; they
		// will be replaced with mappings using the new FD after future calls
		// to d.Translate(). This requires holding d.mapsMu, which precedes
		// d.handleMu in the lock order.
		d.mapsMu.Lock()
		d.mappings.InvalidateAll(memmap.InvalidateOpts{})
		d.mapsMu.Unlock()
	}
	// Close the superseded host FDs last; errors from close are ignored.
	for _, fd := range fdsToClose {
		unix.Close(int(fd))
	}

	return nil
}

// readHandleLocked returns a handle made up of d's current read fid
// (d.readFile) and read host FD (d.readFD).
//
// Preconditions: d.handleMu must be locked.
func (d *dentry) readHandleLocked() handle {
	return handle{
		file: d.readFile,
		fd:   d.readFD,
	}
}

// writeHandleLocked returns a handle made up of d's current write fid
// (d.writeFile) and write host FD (d.writeFD).
//
// Preconditions: d.handleMu must be locked.
func (d *dentry) writeHandleLocked() handle {
	return handle{
		file: d.writeFile,
		fd:   d.writeFD,
	}
}

// syncRemoteFile is a convenience wrapper that takes d.handleMu for reading
// and calls d.syncRemoteFileLocked.
func (d *dentry) syncRemoteFile(ctx context.Context) error {
	d.handleMu.RLock()
	defer d.handleMu.RUnlock()
	return d.syncRemoteFileLocked(ctx)
}

// syncRemoteFileLocked syncs the file through the first available of, in
// order: the write host FD, the write fid, the read host FD, the read fid.
// It returns nil without doing anything if d has none of these.
//
// Preconditions: d.handleMu must be locked.
func (d *dentry) syncRemoteFileLocked(ctx context.Context) error {
	// If we have a host FD, fsyncing it is likely to be faster than an fsync
	// RPC. Prefer syncing write handles over read handles, since some remote
	// filesystem implementations may not sync changes made through write
	// handles otherwise.
	if d.writeFD >= 0 {
		// NOTE(review): the host fsync is bracketed by
		// UninterruptibleSleepStart/Finish, presumably to account for the
		// calling task blocking in a host syscall — confirm against the
		// context package.
		ctx.UninterruptibleSleepStart(false)
		err := unix.Fsync(int(d.writeFD))
		ctx.UninterruptibleSleepFinish(false)
		return err
	}
	if !d.writeFile.isNil() {
		return d.writeFile.fsync(ctx)
	}
	if d.readFD >= 0 {
		ctx.UninterruptibleSleepStart(false)
		err := unix.Fsync(int(d.readFD))
		ctx.UninterruptibleSleepFinish(false)
		return err
	}
	if !d.readFile.isNil() {
		return d.readFile.fsync(ctx)
	}
	return nil
}

// syncCachedFile writes back d's dirty cached pages through the write handle
// (if one is open) and then syncs the file via d.syncRemoteFileLocked.
//
// Write-back errors are always returned. If forFilesystemSync is false, sync
// errors are returned as well. If forFilesystemSync is true, a sync error is
// returned only when d is a regular file with an open write handle; otherwise
// it is logged at debug level and nil is returned.
func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) error {
	d.handleMu.RLock()
	defer d.handleMu.RUnlock()
	h := d.writeHandleLocked()
	if h.isOpen() {
		// Write back dirty pages to the remote file.
		d.dataMu.Lock()
		err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt)
		d.dataMu.Unlock()
		if err != nil {
			return err
		}
	}
	if err := d.syncRemoteFileLocked(ctx); err != nil {
		if !forFilesystemSync {
			return err
		}
		// Only return err if we can reasonably have expected sync to succeed
		// (d is a regular file and was opened for writing).
		if d.isRegularFile() && h.isOpen() {
			return err
		}
		ctx.Debugf("gofer.dentry.syncCachedFile: syncing non-writable or non-regular-file dentry failed: %v", err)
	}
	return nil
}

// incLinks increments link count. A link count of zero is taken to mean that
// the remote filesystem does not report link counts, in which case the count
// is left untouched.
func (d *dentry) incLinks() {
	if atomic.LoadUint32(&d.nlink) == 0 {
		// The remote filesystem doesn't support link count.
		return
	}
	atomic.AddUint32(&d.nlink, 1)
}

// decLinks decrements link count.
1997 func (d *dentry) decLinks() { 1998 if atomic.LoadUint32(&d.nlink) == 0 { 1999 // The remote filesystem doesn't support link count. 2000 return 2001 } 2002 atomic.AddUint32(&d.nlink, ^uint32(0)) 2003 } 2004 2005 // fileDescription is embedded by gofer implementations of 2006 // vfs.FileDescriptionImpl. 2007 // 2008 // +stateify savable 2009 type fileDescription struct { 2010 vfsfd vfs.FileDescription 2011 vfs.FileDescriptionDefaultImpl 2012 vfs.LockFD 2013 2014 lockLogging sync.Once `state:"nosave"` 2015 } 2016 2017 func (fd *fileDescription) filesystem() *filesystem { 2018 return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) 2019 } 2020 2021 func (fd *fileDescription) dentry() *dentry { 2022 return fd.vfsfd.Dentry().Impl().(*dentry) 2023 } 2024 2025 // Stat implements vfs.FileDescriptionImpl.Stat. 2026 func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 2027 d := fd.dentry() 2028 const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME) 2029 if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { 2030 // TODO(jamieliu): Use specialFileFD.handle.file for the getattr if 2031 // available? 2032 if err := d.updateFromGetattr(ctx); err != nil { 2033 return linux.Statx{}, err 2034 } 2035 } 2036 var stat linux.Statx 2037 d.statTo(&stat) 2038 return stat, nil 2039 } 2040 2041 // SetStat implements vfs.FileDescriptionImpl.SetStat. 
2042 func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { 2043 if err := fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts, fd.vfsfd.Mount()); err != nil { 2044 return err 2045 } 2046 if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { 2047 fd.dentry().InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) 2048 } 2049 return nil 2050 } 2051 2052 // ListXattr implements vfs.FileDescriptionImpl.ListXattr. 2053 func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { 2054 return fd.dentry().listXattr(ctx, auth.CredentialsFromContext(ctx), size) 2055 } 2056 2057 // GetXattr implements vfs.FileDescriptionImpl.GetXattr. 2058 func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { 2059 return fd.dentry().getXattr(ctx, auth.CredentialsFromContext(ctx), &opts) 2060 } 2061 2062 // SetXattr implements vfs.FileDescriptionImpl.SetXattr. 2063 func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { 2064 d := fd.dentry() 2065 if err := d.setXattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil { 2066 return err 2067 } 2068 d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) 2069 return nil 2070 } 2071 2072 // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. 2073 func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { 2074 d := fd.dentry() 2075 if err := d.removeXattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil { 2076 return err 2077 } 2078 d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) 2079 return nil 2080 } 2081 2082 // LockBSD implements vfs.FileDescriptionImpl.LockBSD. 
2083 func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { 2084 fd.lockLogging.Do(func() { 2085 log.Infof("File lock using gofer file handled internally.") 2086 }) 2087 return fd.LockFD.LockBSD(ctx, uid, ownerPID, t, block) 2088 } 2089 2090 // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. 2091 func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { 2092 fd.lockLogging.Do(func() { 2093 log.Infof("Range lock using gofer file handled internally.") 2094 }) 2095 return fd.Locks().LockPOSIX(ctx, uid, ownerPID, t, r, block) 2096 } 2097 2098 // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 2099 func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { 2100 return fd.Locks().UnlockPOSIX(ctx, uid, r) 2101 }