github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/gofer/gofer.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package gofer provides a filesystem implementation that is backed by a 9p 16 // server, interchangeably referred to as "gofers" throughout this package. 17 // 18 // Lock order: 19 // 20 // regularFileFD/directoryFD.mu 21 // filesystem.renameMu 22 // dentry.cachingMu 23 // dentryCache.mu 24 // dentry.opMu 25 // dentry.childrenMu 26 // filesystem.syncMu 27 // dentry.metadataMu 28 // *** "memmap.Mappable locks" below this point 29 // dentry.mapsMu 30 // *** "memmap.Mappable locks taken by Translate" below this point 31 // dentry.handleMu 32 // dentry.dataMu 33 // filesystem.inoMu 34 // specialFileFD.mu 35 // specialFileFD.bufMu 36 // 37 // Locking dentry.opMu and dentry.metadataMu in multiple dentries requires that 38 // either ancestor dentries are locked before descendant dentries, or that 39 // filesystem.renameMu is locked for writing. 
40 package gofer 41 42 import ( 43 "fmt" 44 "path" 45 "strconv" 46 "strings" 47 "sync/atomic" 48 49 "golang.org/x/sys/unix" 50 "github.com/metacubex/gvisor/pkg/abi/linux" 51 "github.com/metacubex/gvisor/pkg/atomicbitops" 52 "github.com/metacubex/gvisor/pkg/cleanup" 53 "github.com/metacubex/gvisor/pkg/context" 54 "github.com/metacubex/gvisor/pkg/errors/linuxerr" 55 "github.com/metacubex/gvisor/pkg/hostarch" 56 "github.com/metacubex/gvisor/pkg/lisafs" 57 "github.com/metacubex/gvisor/pkg/log" 58 "github.com/metacubex/gvisor/pkg/refs" 59 fslock "github.com/metacubex/gvisor/pkg/sentry/fsimpl/lock" 60 "github.com/metacubex/gvisor/pkg/sentry/fsutil" 61 "github.com/metacubex/gvisor/pkg/sentry/kernel/auth" 62 "github.com/metacubex/gvisor/pkg/sentry/kernel/pipe" 63 ktime "github.com/metacubex/gvisor/pkg/sentry/kernel/time" 64 "github.com/metacubex/gvisor/pkg/sentry/memmap" 65 "github.com/metacubex/gvisor/pkg/sentry/pgalloc" 66 "github.com/metacubex/gvisor/pkg/sentry/socket/unix/transport" 67 "github.com/metacubex/gvisor/pkg/sentry/vfs" 68 "github.com/metacubex/gvisor/pkg/sync" 69 "github.com/metacubex/gvisor/pkg/unet" 70 ) 71 72 // Name is the default filesystem name. 73 const Name = "9p" 74 75 // Mount option names for goferfs. 76 const ( 77 moptTransport = "trans" 78 moptReadFD = "rfdno" 79 moptWriteFD = "wfdno" 80 moptAname = "aname" 81 moptDfltUID = "dfltuid" 82 moptDfltGID = "dfltgid" 83 moptCache = "cache" 84 moptForcePageCache = "force_page_cache" 85 moptLimitHostFDTranslation = "limit_host_fd_translation" 86 moptOverlayfsStaleRead = "overlayfs_stale_read" 87 moptDisableFileHandleSharing = "disable_file_handle_sharing" 88 moptDisableFifoOpen = "disable_fifo_open" 89 90 // Directfs options. 91 moptDirectfs = "directfs" 92 ) 93 94 // Valid values for the "cache" mount option. 
95 const ( 96 cacheFSCache = "fscache" 97 cacheFSCacheWritethrough = "fscache_writethrough" 98 cacheRemoteRevalidating = "remote_revalidating" 99 ) 100 101 // SupportedMountOptions is the set of mount options that can be set externally. 102 var SupportedMountOptions = []string{moptOverlayfsStaleRead, moptDisableFileHandleSharing} 103 104 const ( 105 defaultMaxCachedDentries = 1000 106 maxCachedNegativeChildren = 1000 107 ) 108 109 // stringFixedCache is a fixed sized cache, once initialized, 110 // its size never changes. 111 // 112 // +stateify savable 113 type stringFixedCache struct { 114 // namesList stores negative names with fifo list. 115 // name stored in namesList only means it used to be negative 116 // at the moment you pushed it to the list. 117 namesList stringList 118 size uint64 119 } 120 121 func (cache *stringFixedCache) isInited() bool { 122 return cache.size != 0 123 } 124 125 func (cache *stringFixedCache) init(size uint64) { 126 elements := make([]stringListElem, size) 127 for i := uint64(0); i < size; i++ { 128 cache.namesList.PushFront(&elements[i]) 129 } 130 cache.size = size 131 } 132 133 // Update will push name to the front of the list, 134 // and pop the tail value. 135 func (cache *stringFixedCache) add(name string) string { 136 tail := cache.namesList.Back() 137 victimName := tail.str 138 tail.str = name 139 cache.namesList.Remove(tail) 140 cache.namesList.PushFront(tail) 141 return victimName 142 } 143 144 // +stateify savable 145 type dentryCache struct { 146 // mu protects the below fields. 147 mu sync.Mutex `state:"nosave"` 148 // dentries contains all dentries with 0 references. Due to race conditions, 149 // it may also contain dentries with non-zero references. 150 dentries dentryList 151 // dentriesLen is the number of dentries in dentries. 152 dentriesLen uint64 153 // maxCachedDentries is the maximum number of cacheable dentries. 
154 maxCachedDentries uint64 155 } 156 157 // SetDentryCacheSize sets the size of the global gofer dentry cache. 158 func SetDentryCacheSize(size int) { 159 if size < 0 { 160 return 161 } 162 if globalDentryCache != nil { 163 log.Warningf("Global dentry cache has already been initialized. Ignoring subsequent attempt.") 164 return 165 } 166 globalDentryCache = &dentryCache{maxCachedDentries: uint64(size)} 167 } 168 169 // globalDentryCache is a global cache of dentries across all gofers. 170 var globalDentryCache *dentryCache 171 172 // Valid values for "trans" mount option. 173 const transportModeFD = "fd" 174 175 // FilesystemType implements vfs.FilesystemType. 176 // 177 // +stateify savable 178 type FilesystemType struct{} 179 180 // filesystem implements vfs.FilesystemImpl. 181 // 182 // +stateify savable 183 type filesystem struct { 184 vfsfs vfs.Filesystem 185 186 // mf is used to allocate memory that caches regular file contents. mf is 187 // immutable. 188 mf *pgalloc.MemoryFile `state:"nosave"` 189 190 // Immutable options. 191 opts filesystemOptions 192 iopts InternalFilesystemOptions 193 194 // client is the LISAFS client used for communicating with the server. client 195 // is immutable. 196 client *lisafs.Client `state:"nosave"` 197 198 // clock is a realtime clock used to set timestamps in file operations. 199 clock ktime.Clock 200 201 // devMinor is the filesystem's minor device number. devMinor is immutable. 202 devMinor uint32 203 204 // root is the root dentry. root is immutable. 205 root *dentry 206 207 // renameMu serves two purposes: 208 // 209 // - It synchronizes path resolution with renaming initiated by this 210 // client. 211 // 212 // - It is held by path resolution to ensure that reachable dentries remain 213 // valid. 
A dentry is reachable by path resolution if it has a non-zero 214 // reference count (such that it is usable as vfs.ResolvingPath.Start() or 215 // is reachable from its children), or if it is a child dentry (such that 216 // it is reachable from its parent). 217 renameMu sync.RWMutex `state:"nosave"` 218 219 dentryCache *dentryCache 220 221 // syncableDentries contains all non-synthetic dentries. specialFileFDs 222 // contains all open specialFileFDs. These fields are protected by syncMu. 223 syncMu sync.Mutex `state:"nosave"` 224 syncableDentries dentryList 225 specialFileFDs specialFDList 226 227 // inoByKey maps previously-observed device ID and host inode numbers to 228 // internal inode numbers assigned to those files. inoByKey is not preserved 229 // across checkpoint/restore because inode numbers may be reused between 230 // different gofer processes, so inode numbers may be repeated for different 231 // files across checkpoint/restore. inoByKey is protected by inoMu. 232 inoMu sync.Mutex `state:"nosave"` 233 inoByKey map[inoKey]uint64 `state:"nosave"` 234 235 // lastIno is the last inode number assigned to a file. lastIno is accessed 236 // using atomic memory operations. 237 lastIno atomicbitops.Uint64 238 239 // savedDentryRW records open read/write handles during save/restore. 240 savedDentryRW map[*dentry]savedDentryRW 241 242 // released is nonzero once filesystem.Release has been called. 243 released atomicbitops.Int32 244 } 245 246 // +stateify savable 247 type filesystemOptions struct { 248 fd int 249 aname string 250 interop InteropMode // derived from the "cache" mount option 251 dfltuid auth.KUID 252 dfltgid auth.KGID 253 254 // If forcePageCache is true, host FDs may not be used for application 255 // memory mappings even if available; instead, the client must perform its 256 // own caching of regular file pages. This is primarily useful for testing. 
257 forcePageCache bool 258 259 // If limitHostFDTranslation is true, apply maxFillRange() constraints to 260 // host FD mappings returned by dentry.(memmap.Mappable).Translate(). This 261 // makes memory accounting behavior more consistent between cases where 262 // host FDs are / are not available, but may increase the frequency of 263 // sentry-handled page faults on files for which a host FD is available. 264 limitHostFDTranslation bool 265 266 // If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote 267 // filesystem may not be coherent with writable host FDs opened later, so 268 // all uses of the former must be replaced by uses of the latter. This is 269 // usually only the case when the remote filesystem is a Linux overlayfs 270 // mount. (Prior to Linux 4.18, patch series centered on commit 271 // d1d04ef8572b "ovl: stack file ops", both I/O and memory mappings were 272 // incoherent between pre-copy-up and post-copy-up FDs; after that patch 273 // series, only memory mappings are incoherent.) 274 overlayfsStaleRead bool 275 276 // If regularFilesUseSpecialFileFD is true, application FDs representing 277 // regular files will use distinct file handles for each FD, in the same 278 // way that application FDs representing "special files" such as sockets 279 // do. Note that this disables client caching for regular files. This option 280 // may regress performance due to excessive Open RPCs. This option is not 281 // supported with overlayfsStaleRead for now. 282 regularFilesUseSpecialFileFD bool 283 284 // If disableFifoOpen is true, application attempts to open(2) a host FIFO 285 // are disallowed. 286 disableFifoOpen bool 287 288 // directfs holds options for directfs mode. 289 directfs directfsOpts 290 } 291 292 // +stateify savable 293 type directfsOpts struct { 294 // If directfs is enabled, the gofer client does not make RPCs to the gofer 295 // process. Instead, it makes host syscalls to perform file operations. 
296 enabled bool 297 } 298 299 // InteropMode controls the client's interaction with other remote filesystem 300 // users. 301 // 302 // +stateify savable 303 type InteropMode uint32 304 305 const ( 306 // InteropModeExclusive is appropriate when the filesystem client is the 307 // only user of the remote filesystem. 308 // 309 // - The client may cache arbitrary filesystem state (file data, metadata, 310 // filesystem structure, etc.). 311 // 312 // - Client changes to filesystem state may be sent to the remote 313 // filesystem asynchronously, except when server permission checks are 314 // necessary. 315 // 316 // - File timestamps are based on client clocks. This ensures that users of 317 // the client observe timestamps that are coherent with their own clocks 318 // and consistent with Linux's semantics (in particular, it is not always 319 // possible for clients to set arbitrary atimes and mtimes depending on the 320 // remote filesystem implementation, and never possible for clients to set 321 // arbitrary ctimes.) 322 InteropModeExclusive InteropMode = iota 323 324 // InteropModeWritethrough is appropriate when there are read-only users of 325 // the remote filesystem that expect to observe changes made by the 326 // filesystem client. 327 // 328 // - The client may cache arbitrary filesystem state. 329 // 330 // - Client changes to filesystem state must be sent to the remote 331 // filesystem synchronously. 332 // 333 // - File timestamps are based on client clocks. As a corollary, access 334 // timestamp changes from other remote filesystem users will not be visible 335 // to the client. 336 InteropModeWritethrough 337 338 // InteropModeShared is appropriate when there are users of the remote 339 // filesystem that may mutate its state other than the client. 340 // 341 // - The client must verify ("revalidate") cached filesystem state before 342 // using it. 
343 // 344 // - Client changes to filesystem state must be sent to the remote 345 // filesystem synchronously. 346 // 347 // - File timestamps are based on server clocks. This is necessary to 348 // ensure that timestamp changes are synchronized between remote filesystem 349 // users. 350 // 351 // Note that the correctness of InteropModeShared depends on the server 352 // correctly implementing 9P fids (i.e. each fid immutably represents a 353 // single filesystem object), even in the presence of remote filesystem 354 // mutations from other users. If this is violated, the behavior of the 355 // client is undefined. 356 InteropModeShared 357 ) 358 359 // InternalFilesystemOptions may be passed as 360 // vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem. 361 // 362 // +stateify savable 363 type InternalFilesystemOptions struct { 364 // If UniqueID is non-empty, it is an opaque string used to reassociate the 365 // filesystem with a new server FD during restoration from checkpoint. 366 UniqueID vfs.RestoreID 367 368 // If LeakConnection is true, do not close the connection to the server 369 // when the Filesystem is released. This is necessary for deployments in 370 // which servers can handle only a single client and report failure if that 371 // client disconnects. 372 LeakConnection bool 373 374 // If OpenSocketsByConnecting is true, silently translate attempts to open 375 // files identifying as sockets to connect RPCs. 376 OpenSocketsByConnecting bool 377 } 378 379 // _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default 380 // UIDs and GIDs used for files that do not provide a specific owner or group 381 // respectively. 382 const ( 383 // uint32(-2) doesn't work in Go. 384 _V9FS_DEFUID = auth.KUID(4294967294) 385 _V9FS_DEFGID = auth.KGID(4294967294) 386 ) 387 388 // Name implements vfs.FilesystemType.Name. 
389 func (FilesystemType) Name() string { 390 return Name 391 } 392 393 // Release implements vfs.FilesystemType.Release. 394 func (FilesystemType) Release(ctx context.Context) {} 395 396 // GetFilesystem implements vfs.FilesystemType.GetFilesystem. 397 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { 398 mf := pgalloc.MemoryFileFromContext(ctx) 399 if mf == nil { 400 ctx.Warningf("gofer.FilesystemType.GetFilesystem: CtxMemoryFile is nil") 401 return nil, nil, linuxerr.EINVAL 402 } 403 404 mopts := vfs.GenericParseMountOptions(opts.Data) 405 var fsopts filesystemOptions 406 407 fd, err := getFDFromMountOptionsMap(ctx, mopts) 408 if err != nil { 409 return nil, nil, err 410 } 411 fsopts.fd = fd 412 413 // Get the attach name. 414 fsopts.aname = "/" 415 if aname, ok := mopts[moptAname]; ok { 416 delete(mopts, moptAname) 417 if !path.IsAbs(aname) { 418 ctx.Warningf("gofer.FilesystemType.GetFilesystem: aname is not absolute: %s=%s", moptAname, aname) 419 return nil, nil, linuxerr.EINVAL 420 } 421 fsopts.aname = path.Clean(aname) 422 } 423 424 // Parse the cache policy. For historical reasons, this defaults to the 425 // least generally-applicable option, InteropModeExclusive. 426 fsopts.interop = InteropModeExclusive 427 if cache, ok := mopts[moptCache]; ok { 428 delete(mopts, moptCache) 429 switch cache { 430 case cacheFSCache: 431 fsopts.interop = InteropModeExclusive 432 case cacheFSCacheWritethrough: 433 fsopts.interop = InteropModeWritethrough 434 case cacheRemoteRevalidating: 435 fsopts.interop = InteropModeShared 436 default: 437 ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: %s=%s", moptCache, cache) 438 return nil, nil, linuxerr.EINVAL 439 } 440 } 441 442 // Parse the default UID and GID. 
443 fsopts.dfltuid = _V9FS_DEFUID 444 if dfltuidstr, ok := mopts[moptDfltUID]; ok { 445 delete(mopts, moptDfltUID) 446 dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32) 447 if err != nil { 448 ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: %s=%s", moptDfltUID, dfltuidstr) 449 return nil, nil, linuxerr.EINVAL 450 } 451 // In Linux, dfltuid is interpreted as a UID and is converted to a KUID 452 // in the caller's user namespace, but goferfs isn't 453 // application-mountable. 454 fsopts.dfltuid = auth.KUID(dfltuid) 455 } 456 fsopts.dfltgid = _V9FS_DEFGID 457 if dfltgidstr, ok := mopts[moptDfltGID]; ok { 458 delete(mopts, moptDfltGID) 459 dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32) 460 if err != nil { 461 ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: %s=%s", moptDfltGID, dfltgidstr) 462 return nil, nil, linuxerr.EINVAL 463 } 464 fsopts.dfltgid = auth.KGID(dfltgid) 465 } 466 467 // Handle simple flags. 468 if _, ok := mopts[moptDisableFileHandleSharing]; ok { 469 delete(mopts, moptDisableFileHandleSharing) 470 fsopts.regularFilesUseSpecialFileFD = true 471 } 472 if _, ok := mopts[moptDisableFifoOpen]; ok { 473 delete(mopts, moptDisableFifoOpen) 474 fsopts.disableFifoOpen = true 475 } 476 if _, ok := mopts[moptForcePageCache]; ok { 477 delete(mopts, moptForcePageCache) 478 fsopts.forcePageCache = true 479 } 480 if _, ok := mopts[moptLimitHostFDTranslation]; ok { 481 delete(mopts, moptLimitHostFDTranslation) 482 fsopts.limitHostFDTranslation = true 483 } 484 if _, ok := mopts[moptOverlayfsStaleRead]; ok { 485 delete(mopts, moptOverlayfsStaleRead) 486 fsopts.overlayfsStaleRead = true 487 } 488 if _, ok := mopts[moptDirectfs]; ok { 489 delete(mopts, moptDirectfs) 490 fsopts.directfs.enabled = true 491 } 492 // fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying 493 // "cache=none". 494 495 // Check for unparsed options. 
496 if len(mopts) != 0 { 497 ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts) 498 return nil, nil, linuxerr.EINVAL 499 } 500 501 // Validation. 502 if fsopts.regularFilesUseSpecialFileFD && fsopts.overlayfsStaleRead { 503 // These options are not supported together. To support this, when a dentry 504 // is opened writably for the first time, we need to iterate over all the 505 // specialFileFDs of that dentry that represent a regular file and call 506 // fd.hostFileMapper.RegenerateMappings(writable_fd). 507 ctx.Warningf("gofer.FilesystemType.GetFilesystem: regularFilesUseSpecialFileFD and overlayfsStaleRead options are not supported together.") 508 return nil, nil, linuxerr.EINVAL 509 } 510 511 // Handle internal options. 512 iopts, ok := opts.InternalData.(InternalFilesystemOptions) 513 if opts.InternalData != nil && !ok { 514 ctx.Warningf("gofer.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted gofer.InternalFilesystemOptions", opts.InternalData) 515 return nil, nil, linuxerr.EINVAL 516 } 517 // If !ok, iopts being the zero value is correct. 518 519 // Construct the filesystem object. 520 devMinor, err := vfsObj.GetAnonBlockDevMinor() 521 if err != nil { 522 return nil, nil, err 523 } 524 fs := &filesystem{ 525 mf: mf, 526 opts: fsopts, 527 iopts: iopts, 528 clock: ktime.RealtimeClockFromContext(ctx), 529 devMinor: devMinor, 530 inoByKey: make(map[inoKey]uint64), 531 } 532 533 // Did the user configure a global dentry cache? 
534 if globalDentryCache != nil { 535 fs.dentryCache = globalDentryCache 536 } else { 537 fs.dentryCache = &dentryCache{maxCachedDentries: defaultMaxCachedDentries} 538 } 539 540 fs.vfsfs.Init(vfsObj, &fstype, fs) 541 542 rootInode, rootHostFD, err := fs.initClientAndGetRoot(ctx) 543 if err != nil { 544 fs.vfsfs.DecRef(ctx) 545 return nil, nil, err 546 } 547 if fs.opts.directfs.enabled { 548 fs.root, err = fs.getDirectfsRootDentry(ctx, rootHostFD, fs.client.NewFD(rootInode.ControlFD)) 549 } else { 550 fs.root, err = fs.newLisafsDentry(ctx, &rootInode) 551 } 552 if err != nil { 553 fs.vfsfs.DecRef(ctx) 554 return nil, nil, err 555 } 556 // Set the root's reference count to 2. One reference is returned to the 557 // caller, and the other is held by fs to prevent the root from being "cached" 558 // and subsequently evicted. 559 fs.root.refs = atomicbitops.FromInt64(2) 560 return &fs.vfsfs, &fs.root.vfsd, nil 561 } 562 563 // initClientAndGetRoot initializes fs.client and returns the root inode for 564 // this mount point. It handles the attach point (fs.opts.aname) resolution. 
565 func (fs *filesystem) initClientAndGetRoot(ctx context.Context) (lisafs.Inode, int, error) { 566 sock, err := unet.NewSocket(fs.opts.fd) 567 if err != nil { 568 return lisafs.Inode{}, -1, err 569 } 570 571 ctx.UninterruptibleSleepStart(false) 572 defer ctx.UninterruptibleSleepFinish(false) 573 574 var ( 575 rootInode lisafs.Inode 576 rootHostFD int 577 ) 578 fs.client, rootInode, rootHostFD, err = lisafs.NewClient(sock) 579 if err != nil { 580 return lisafs.Inode{}, -1, err 581 } 582 583 cu := cleanup.Make(func() { 584 if rootHostFD >= 0 { 585 _ = unix.Close(rootHostFD) 586 } 587 rootControlFD := fs.client.NewFD(rootInode.ControlFD) 588 rootControlFD.Close(ctx, false /* flush */) 589 }) 590 defer cu.Clean() 591 592 if fs.opts.directfs.enabled { 593 if fs.opts.aname != "/" { 594 log.Warningf("directfs does not support aname filesystem option: aname=%q", fs.opts.aname) 595 return lisafs.Inode{}, -1, unix.EINVAL 596 } 597 if rootHostFD < 0 { 598 log.Warningf("Mount RPC did not return host FD to mount point with directfs enabled") 599 return lisafs.Inode{}, -1, unix.EINVAL 600 } 601 } else { 602 if rootHostFD >= 0 { 603 log.Warningf("Mount RPC returned a host FD to mount point without directfs, we didn't ask for it") 604 _ = unix.Close(rootHostFD) 605 rootHostFD = -1 606 } 607 // Use flipcall channels with lisafs because it makes a lot of RPCs. 608 if err := fs.client.StartChannels(); err != nil { 609 return lisafs.Inode{}, -1, err 610 } 611 rootInode, err = fs.handleAnameLisafs(ctx, rootInode) 612 if err != nil { 613 return lisafs.Inode{}, -1, err 614 } 615 } 616 cu.Release() 617 return rootInode, rootHostFD, nil 618 } 619 620 func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int, error) { 621 // Check that the transport is "fd". 
622 trans, ok := mopts[moptTransport] 623 if !ok || trans != transportModeFD { 624 ctx.Warningf("gofer.getFDFromMountOptionsMap: transport must be specified as '%s=%s'", moptTransport, transportModeFD) 625 return -1, linuxerr.EINVAL 626 } 627 delete(mopts, moptTransport) 628 629 // Check that read and write FDs are provided and identical. 630 rfdstr, ok := mopts[moptReadFD] 631 if !ok { 632 ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD must be specified as '%s=<file descriptor>'", moptReadFD) 633 return -1, linuxerr.EINVAL 634 } 635 delete(mopts, moptReadFD) 636 rfd, err := strconv.Atoi(rfdstr) 637 if err != nil { 638 ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid read FD: %s=%s", moptReadFD, rfdstr) 639 return -1, linuxerr.EINVAL 640 } 641 wfdstr, ok := mopts[moptWriteFD] 642 if !ok { 643 ctx.Warningf("gofer.getFDFromMountOptionsMap: write FD must be specified as '%s=<file descriptor>'", moptWriteFD) 644 return -1, linuxerr.EINVAL 645 } 646 delete(mopts, moptWriteFD) 647 wfd, err := strconv.Atoi(wfdstr) 648 if err != nil { 649 ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid write FD: %s=%s", moptWriteFD, wfdstr) 650 return -1, linuxerr.EINVAL 651 } 652 if rfd != wfd { 653 ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD (%d) and write FD (%d) must be equal", rfd, wfd) 654 return -1, linuxerr.EINVAL 655 } 656 return rfd, nil 657 } 658 659 // Release implements vfs.FilesystemImpl.Release. 660 func (fs *filesystem) Release(ctx context.Context) { 661 fs.released.Store(1) 662 663 mf := fs.mf 664 fs.syncMu.Lock() 665 for elem := fs.syncableDentries.Front(); elem != nil; elem = elem.Next() { 666 d := elem.d 667 d.handleMu.Lock() 668 d.dataMu.Lock() 669 if d.isWriteHandleOk() { 670 // Write dirty cached data to the remote file. 
671 h := d.writeHandle() 672 if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil { 673 log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err) 674 } 675 // TODO(jamieliu): Do we need to flushf/fsync d? 676 } 677 // Discard cached pages. 678 d.cache.DropAll(mf) 679 d.dirty.RemoveAll() 680 d.dataMu.Unlock() 681 // Close host FDs if they exist. 682 d.closeHostFDs() 683 d.handleMu.Unlock() 684 } 685 // There can't be any specialFileFDs still using fs, since each such 686 // FileDescription would hold a reference on a Mount holding a reference on 687 // fs. 688 fs.syncMu.Unlock() 689 690 // If leak checking is enabled, release all outstanding references in the 691 // filesystem. We deliberately avoid doing this outside of leak checking; we 692 // have released all external resources above rather than relying on dentry 693 // destructors. fs.root may be nil if creating the client or initializing the 694 // root dentry failed in GetFilesystem. 695 if refs.GetLeakMode() != refs.NoLeakChecking && fs.root != nil { 696 fs.renameMu.Lock() 697 fs.root.releaseSyntheticRecursiveLocked(ctx) 698 fs.evictAllCachedDentriesLocked(ctx) 699 fs.renameMu.Unlock() 700 701 // An extra reference was held by the filesystem on the root to prevent it from 702 // being cached/evicted. 703 fs.root.DecRef(ctx) 704 } 705 706 if !fs.iopts.LeakConnection { 707 // Close the connection to the server. This implicitly closes all FDs. 708 if fs.client != nil { 709 fs.client.Close() 710 } 711 } 712 713 fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) 714 } 715 716 // releaseSyntheticRecursiveLocked traverses the tree with root d and decrements 717 // the reference count on every synthetic dentry. Synthetic dentries have one 718 // reference for existence that should be dropped during filesystem.Release. 719 // 720 // Precondition: d.fs.renameMu is locked for writing. 
721 func (d *dentry) releaseSyntheticRecursiveLocked(ctx context.Context) { 722 if d.isSynthetic() { 723 d.decRefNoCaching() 724 d.checkCachingLocked(ctx, true /* renameMuWriteLocked */) 725 } 726 if d.isDir() { 727 var children []*dentry 728 d.childrenMu.Lock() 729 for _, child := range d.children { 730 children = append(children, child) 731 } 732 d.childrenMu.Unlock() 733 for _, child := range children { 734 if child != nil { 735 child.releaseSyntheticRecursiveLocked(ctx) 736 } 737 } 738 } 739 } 740 741 // inoKey is the key used to identify the inode backed by this dentry. 742 // 743 // +stateify savable 744 type inoKey struct { 745 ino uint64 746 devMinor uint32 747 devMajor uint32 748 } 749 750 func inoKeyFromStatx(stat *linux.Statx) inoKey { 751 return inoKey{ 752 ino: stat.Ino, 753 devMinor: stat.DevMinor, 754 devMajor: stat.DevMajor, 755 } 756 } 757 758 func inoKeyFromStat(stat *unix.Stat_t) inoKey { 759 return inoKey{ 760 ino: stat.Ino, 761 devMinor: unix.Minor(stat.Dev), 762 devMajor: unix.Major(stat.Dev), 763 } 764 } 765 766 // dentry implements vfs.DentryImpl. 767 // 768 // +stateify savable 769 type dentry struct { 770 vfsd vfs.Dentry 771 772 // refs is the reference count. Each dentry holds a reference on its 773 // parent, even if disowned. An additional reference is held on all 774 // synthetic dentries until they are unlinked or invalidated. When refs 775 // reaches 0, the dentry may be added to the cache or destroyed. If refs == 776 // -1, the dentry has already been destroyed. refs is accessed using atomic 777 // memory operations. 778 refs atomicbitops.Int64 779 780 // fs is the owning filesystem. fs is immutable. 781 fs *filesystem 782 783 // parent is this dentry's parent directory. Each dentry holds a reference 784 // on its parent. If this dentry is a filesystem root, parent is nil. 785 // parent is protected by filesystem.renameMu. 
786 parent atomic.Pointer[dentry] `state:".(*dentry)"` 787 788 // name is the name of this dentry in its parent. If this dentry is a 789 // filesystem root, name is the empty string. name is protected by 790 // filesystem.renameMu. 791 name string 792 793 // inoKey is used to identify this dentry's inode. 794 inoKey inoKey 795 796 // If deleted is non-zero, the file represented by this dentry has been 797 // deleted is accessed using atomic memory operations. 798 deleted atomicbitops.Uint32 799 800 // cachingMu is used to synchronize concurrent dentry caching attempts on 801 // this dentry. 802 cachingMu sync.Mutex `state:"nosave"` 803 804 // If cached is true, this dentry is part of filesystem.dentryCache. cached 805 // is protected by cachingMu. 806 cached bool 807 808 // cacheEntry links dentry into filesystem.dentryCache.dentries. It is 809 // protected by filesystem.dentryCache.mu. 810 cacheEntry dentryListElem 811 812 // syncableListEntry links dentry into filesystem.syncableDentries. It is 813 // protected by filesystem.syncMu. 814 syncableListEntry dentryListElem 815 816 // opMu synchronizes operations on this dentry. Operations that mutate 817 // the dentry tree must hold this lock for writing. Operations that 818 // only read the tree must hold for reading. 819 opMu sync.RWMutex `state:"nosave"` 820 821 // childrenMu protects the cached children data for this dentry. 822 childrenMu sync.Mutex `state:"nosave"` 823 824 // If this dentry represents a directory, children contains: 825 // 826 // - Mappings of child filenames to dentries representing those children. 827 // 828 // - Mappings of child filenames that are known not to exist to nil 829 // dentries (only if InteropModeShared is not in effect and the directory 830 // is not synthetic). 831 // 832 // +checklocks:childrenMu 833 children map[string]*dentry 834 835 // If this dentry represents a directory, negativeChildrenCache cache 836 // names of negative children. 
negativeChildrenCache is not saved since 837 // dentry.prepareSaveRecursive() drops all negative children. 838 // 839 // +checklocks:childrenMu 840 negativeChildrenCache stringFixedCache `state:"nosave"` 841 // If this dentry represents a directory, negativeChildren is the number of 842 // negative children cached in dentry.children. negativeChildren is not 843 // saved since dentry.prepareSaveRecursive() drops all negative children. 844 // 845 // +checklocks:childrenMu 846 negativeChildren int `state:"nosave"` 847 848 // If this dentry represents a directory, syntheticChildren is the number 849 // of child dentries for which dentry.isSynthetic() == true. 850 // 851 // +checklocks:childrenMu 852 syntheticChildren int 853 854 // If this dentry represents a directory, 855 // dentry.cachedMetadataAuthoritative() == true, and dirents is not 856 // nil, then dirents is a cache of all entries in the directory, in the 857 // order they were returned by the server. childrenSet just stores the 858 // `Name` field of all dirents in a set for fast query. dirents and 859 // childrenSet share the same lifecycle. 860 // 861 // +checklocks:childrenMu 862 dirents []vfs.Dirent `state:"nosave"` 863 // +checklocks:childrenMu 864 childrenSet map[string]struct{} `state:"nosave"` 865 866 // Cached metadata; protected by metadataMu. 867 // To access: 868 // - In situations where consistency is not required (like stat), these 869 // can be accessed using atomic operations only (without locking). 870 // - Lock metadataMu and can access without atomic operations. 871 // To mutate: 872 // - Lock metadataMu and use atomic operations to update because we might 873 // have atomic readers that don't hold the lock. 874 metadataMu sync.Mutex `state:"nosave"` 875 ino uint64 // immutable 876 mode atomicbitops.Uint32 // type is immutable, perms are mutable 877 uid atomicbitops.Uint32 // auth.KUID, but stored as raw uint32 for sync/atomic 878 gid atomicbitops.Uint32 // auth.KGID, but ... 
879 blockSize atomicbitops.Uint32 // 0 if unknown 880 // Timestamps, all nsecs from the Unix epoch. 881 atime atomicbitops.Int64 882 mtime atomicbitops.Int64 883 ctime atomicbitops.Int64 884 btime atomicbitops.Int64 885 // File size, which differs from other metadata in two ways: 886 // 887 // - We make a best-effort attempt to keep it up to date even if 888 // !dentry.cachedMetadataAuthoritative() for the sake of O_APPEND writes. 889 // 890 // - size is protected by both metadataMu and dataMu (i.e. both must be 891 // locked to mutate it; locking either is sufficient to access it). 892 size atomicbitops.Uint64 893 // If this dentry does not represent a synthetic file, deleted is 0, and 894 // atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the 895 // remote file's timestamps, which should be updated when this dentry is 896 // evicted. 897 atimeDirty atomicbitops.Uint32 898 mtimeDirty atomicbitops.Uint32 899 900 // nlink counts the number of hard links to this dentry. It's updated and 901 // accessed using atomic operations. It's not protected by metadataMu like the 902 // other metadata fields. 903 nlink atomicbitops.Uint32 904 905 mapsMu sync.Mutex `state:"nosave"` 906 907 // If this dentry represents a regular file, mappings tracks mappings of 908 // the file into memmap.MappingSpaces. mappings is protected by mapsMu. 909 mappings memmap.MappingSet 910 911 // - If this dentry represents a regular file or directory, readFD (if not 912 // -1) is a host FD used for reads by all regularFileFDs/directoryFDs 913 // representing this dentry. 914 // 915 // - If this dentry represents a regular file, writeFD (if not -1) is a host 916 // FD used for writes by all regularFileFDs representing this dentry. 917 // 918 // - If this dentry represents a regular file, mmapFD is the host FD used 919 // for memory mappings. If mmapFD is -1, no such FD is available, and the 920 // internal page cache implementation is used for memory mappings instead. 
921 // 922 // These fields are protected by handleMu. readFD, writeFD, and mmapFD are 923 // additionally written using atomic memory operations, allowing them to be 924 // read (albeit racily) with atomic.LoadInt32() without locking handleMu. 925 // 926 // readFD and writeFD may or may not be the same file descriptor. Once either 927 // transitions from closed (-1) to open, it may be mutated with handleMu 928 // locked, but cannot be closed until the dentry is destroyed. 929 // 930 // readFD and writeFD may or may not be the same file descriptor. mmapFD is 931 // always either -1 or equal to readFD; if the file has been opened for 932 // writing, it is additionally either -1 or equal to writeFD. 933 handleMu sync.RWMutex `state:"nosave"` 934 readFD atomicbitops.Int32 `state:"nosave"` 935 writeFD atomicbitops.Int32 `state:"nosave"` 936 mmapFD atomicbitops.Int32 `state:"nosave"` 937 938 dataMu sync.RWMutex `state:"nosave"` 939 940 // If this dentry represents a regular file that is client-cached, cache 941 // maps offsets into the cached file to offsets into 942 // filesystem.mfp.MemoryFile() that store the file's data. cache is 943 // protected by dataMu. 944 cache fsutil.FileRangeSet 945 946 // If this dentry represents a regular file that is client-cached, dirty 947 // tracks dirty segments in cache. dirty is protected by dataMu. 948 dirty fsutil.DirtySet 949 950 // pf implements memmap.File for mappings of hostFD. 951 pf dentryPlatformFile 952 953 // If this dentry represents a symbolic link, InteropModeShared is not in 954 // effect, and haveTarget is true, target is the symlink target. haveTarget 955 // and target are protected by dataMu. 956 haveTarget bool 957 target string 958 959 // If this dentry represents a synthetic socket file, endpoint is the 960 // transport endpoint bound to this file. 961 endpoint transport.BoundEndpoint 962 963 // If this dentry represents a synthetic named pipe, pipe is the pipe 964 // endpoint bound to this file. 
965 pipe *pipe.VFSPipe 966 967 locks vfs.FileLocks 968 969 // Inotify watches for this dentry. 970 // 971 // Note that inotify may behave unexpectedly in the presence of hard links, 972 // because dentries corresponding to the same file have separate inotify 973 // watches when they should share the same set. This is the case because it is 974 // impossible for us to know for sure whether two dentries correspond to the 975 // same underlying file (see the gofer filesystem section fo vfs/inotify.md for 976 // a more in-depth discussion on this matter). 977 watches vfs.Watches 978 979 // impl is the specific dentry implementation for non-synthetic dentries. 980 // impl is immutable. 981 // 982 // If impl is nil, this dentry represents a synthetic file, i.e. a 983 // file that does not exist on the host filesystem. As of this writing, the 984 // only files that can be synthetic are sockets, pipes, and directories. 985 impl any 986 } 987 988 // +stateify savable 989 type stringListElem struct { 990 // str is the string that this elem represents. 991 str string 992 stringEntry 993 } 994 995 // +stateify savable 996 type dentryListElem struct { 997 // d is the dentry that this elem represents. 998 d *dentry 999 dentryEntry 1000 } 1001 1002 func (fs *filesystem) inoFromKey(key inoKey) uint64 { 1003 fs.inoMu.Lock() 1004 defer fs.inoMu.Unlock() 1005 1006 if ino, ok := fs.inoByKey[key]; ok { 1007 return ino 1008 } 1009 ino := fs.nextIno() 1010 fs.inoByKey[key] = ino 1011 return ino 1012 } 1013 1014 func (fs *filesystem) nextIno() uint64 { 1015 return fs.lastIno.Add(1) 1016 } 1017 1018 // init must be called before first use of d. 1019 func (d *dentry) init(impl any) { 1020 d.pf.dentry = d 1021 d.cacheEntry.d = d 1022 d.syncableListEntry.d = d 1023 // Nested impl-inheritance pattern. In memory it looks like: 1024 // [[[ vfs.Dentry ] dentry ] dentryImpl ] 1025 // All 3 abstractions are allocated in one allocation. 
We achieve this by 1026 // making each outer dentry implementation hold the inner dentry by value. 1027 // Then the outer most dentry is allocated and we initialize fields inward. 1028 // Each inner dentry has a pointer to the next level of implementation. 1029 d.impl = impl 1030 d.vfsd.Init(d) 1031 refs.Register(d) 1032 } 1033 1034 func (d *dentry) isSynthetic() bool { 1035 return d.impl == nil 1036 } 1037 1038 func (d *dentry) cachedMetadataAuthoritative() bool { 1039 return d.fs.opts.interop != InteropModeShared || d.isSynthetic() 1040 } 1041 1042 // updateMetadataFromStatxLocked is called to update d's metadata after an update 1043 // from the remote filesystem. 1044 // Precondition: d.metadataMu must be locked. 1045 // +checklocks:d.metadataMu 1046 func (d *lisafsDentry) updateMetadataFromStatxLocked(stat *linux.Statx) { 1047 if stat.Mask&linux.STATX_TYPE != 0 { 1048 if got, want := stat.Mode&linux.FileTypeMask, d.fileType(); uint32(got) != want { 1049 panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got)) 1050 } 1051 } 1052 if stat.Mask&linux.STATX_MODE != 0 { 1053 d.mode.Store(uint32(stat.Mode)) 1054 } 1055 if stat.Mask&linux.STATX_UID != 0 { 1056 d.uid.Store(dentryUID(lisafs.UID(stat.UID))) 1057 } 1058 if stat.Mask&linux.STATX_GID != 0 { 1059 d.gid.Store(dentryGID(lisafs.GID(stat.GID))) 1060 } 1061 if stat.Blksize != 0 { 1062 d.blockSize.Store(stat.Blksize) 1063 } 1064 // Don't override newer client-defined timestamps with old server-defined 1065 // ones. 
1066 if stat.Mask&linux.STATX_ATIME != 0 && d.atimeDirty.Load() == 0 { 1067 d.atime.Store(dentryTimestamp(stat.Atime)) 1068 } 1069 if stat.Mask&linux.STATX_MTIME != 0 && d.mtimeDirty.Load() == 0 { 1070 d.mtime.Store(dentryTimestamp(stat.Mtime)) 1071 } 1072 if stat.Mask&linux.STATX_CTIME != 0 { 1073 d.ctime.Store(dentryTimestamp(stat.Ctime)) 1074 } 1075 if stat.Mask&linux.STATX_BTIME != 0 { 1076 d.btime.Store(dentryTimestamp(stat.Btime)) 1077 } 1078 if stat.Mask&linux.STATX_NLINK != 0 { 1079 d.nlink.Store(stat.Nlink) 1080 } 1081 if stat.Mask&linux.STATX_SIZE != 0 { 1082 d.updateSizeLocked(stat.Size) 1083 } 1084 } 1085 1086 // updateMetadataFromStatLocked is similar to updateMetadataFromStatxLocked, 1087 // except that it takes a unix.Stat_t argument. 1088 // Precondition: d.metadataMu must be locked. 1089 // +checklocks:d.metadataMu 1090 func (d *directfsDentry) updateMetadataFromStatLocked(stat *unix.Stat_t) error { 1091 if got, want := stat.Mode&unix.S_IFMT, d.fileType(); got != want { 1092 panic(fmt.Sprintf("direct.dentry file type changed from %#o to %#o", want, got)) 1093 } 1094 d.mode.Store(stat.Mode) 1095 d.uid.Store(stat.Uid) 1096 d.gid.Store(stat.Gid) 1097 d.blockSize.Store(uint32(stat.Blksize)) 1098 // Don't override newer client-defined timestamps with old host-defined 1099 // ones. 1100 if d.atimeDirty.Load() == 0 { 1101 d.atime.Store(dentryTimestampFromUnix(stat.Atim)) 1102 } 1103 if d.mtimeDirty.Load() == 0 { 1104 d.mtime.Store(dentryTimestampFromUnix(stat.Mtim)) 1105 } 1106 d.ctime.Store(dentryTimestampFromUnix(stat.Ctim)) 1107 d.nlink.Store(uint32(stat.Nlink)) 1108 d.updateSizeLocked(uint64(stat.Size)) 1109 return nil 1110 } 1111 1112 // Preconditions: !d.isSynthetic(). 1113 // Preconditions: d.metadataMu is locked. 1114 // +checklocks:d.metadataMu 1115 func (d *dentry) refreshSizeLocked(ctx context.Context) error { 1116 d.handleMu.RLock() 1117 1118 // Can use RacyLoad() because handleMu is locked. 
1119 if d.writeFD.RacyLoad() < 0 { 1120 d.handleMu.RUnlock() 1121 // Use a suitable FD if we don't have a writable host FD. 1122 return d.updateMetadataLocked(ctx, noHandle) 1123 } 1124 1125 // Using statx(2) with a minimal mask is faster than fstat(2). 1126 var stat unix.Statx_t 1127 // Can use RacyLoad() because handleMu is locked. 1128 err := unix.Statx(int(d.writeFD.RacyLoad()), "", unix.AT_EMPTY_PATH, unix.STATX_SIZE, &stat) 1129 d.handleMu.RUnlock() // must be released before updateSizeLocked() 1130 if err != nil { 1131 return err 1132 } 1133 d.updateSizeLocked(stat.Size) 1134 return nil 1135 } 1136 1137 // Preconditions: !d.isSynthetic(). 1138 func (d *dentry) updateMetadata(ctx context.Context) error { 1139 // d.metadataMu must be locked *before* we stat so that we do not end up 1140 // updating stale attributes in d.updateMetadataFromStatLocked(). 1141 d.metadataMu.Lock() 1142 defer d.metadataMu.Unlock() 1143 return d.updateMetadataLocked(ctx, noHandle) 1144 } 1145 1146 func (d *dentry) fileType() uint32 { 1147 return d.mode.Load() & linux.S_IFMT 1148 } 1149 1150 func (d *dentry) statTo(stat *linux.Statx) { 1151 stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME 1152 stat.Blksize = d.blockSize.Load() 1153 stat.Nlink = d.nlink.Load() 1154 if stat.Nlink == 0 { 1155 // The remote filesystem doesn't support link count; just make 1156 // something up. This is consistent with Linux, where 1157 // fs/inode.c:inode_init_always() initializes link count to 1, and 1158 // fs/9p/vfs_inode_dotl.c:v9fs_stat2inode_dotl() doesn't touch it if 1159 // it's not provided by the remote filesystem. 
1160 stat.Nlink = 1 1161 } 1162 stat.UID = d.uid.Load() 1163 stat.GID = d.gid.Load() 1164 stat.Mode = uint16(d.mode.Load()) 1165 stat.Ino = uint64(d.ino) 1166 stat.Size = d.size.Load() 1167 // This is consistent with regularFileFD.Seek(), which treats regular files 1168 // as having no holes. 1169 stat.Blocks = (stat.Size + 511) / 512 1170 stat.Atime = linux.NsecToStatxTimestamp(d.atime.Load()) 1171 stat.Btime = linux.NsecToStatxTimestamp(d.btime.Load()) 1172 stat.Ctime = linux.NsecToStatxTimestamp(d.ctime.Load()) 1173 stat.Mtime = linux.NsecToStatxTimestamp(d.mtime.Load()) 1174 stat.DevMajor = linux.UNNAMED_MAJOR 1175 stat.DevMinor = d.fs.devMinor 1176 } 1177 1178 // Precondition: fs.renameMu is locked. 1179 func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions, mnt *vfs.Mount) error { 1180 stat := &opts.Stat 1181 if stat.Mask == 0 { 1182 return nil 1183 } 1184 if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { 1185 return linuxerr.EPERM 1186 } 1187 mode := linux.FileMode(d.mode.Load()) 1188 if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())); err != nil { 1189 return err 1190 } 1191 if err := mnt.CheckBeginWrite(); err != nil { 1192 return err 1193 } 1194 defer mnt.EndWrite() 1195 1196 if stat.Mask&linux.STATX_SIZE != 0 { 1197 // Reject attempts to truncate files other than regular files, since 1198 // filesystem implementations may return the wrong errno. 1199 switch mode.FileType() { 1200 case linux.S_IFREG: 1201 // ok 1202 case linux.S_IFDIR: 1203 return linuxerr.EISDIR 1204 default: 1205 return linuxerr.EINVAL 1206 } 1207 } 1208 1209 var now int64 1210 if d.cachedMetadataAuthoritative() { 1211 // Truncate updates mtime. 
1212 if stat.Mask&(linux.STATX_SIZE|linux.STATX_MTIME) == linux.STATX_SIZE { 1213 stat.Mask |= linux.STATX_MTIME 1214 stat.Mtime = linux.StatxTimestamp{ 1215 Nsec: linux.UTIME_NOW, 1216 } 1217 } 1218 1219 // Use client clocks for timestamps. 1220 now = d.fs.clock.Now().Nanoseconds() 1221 if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW { 1222 stat.Atime = linux.NsecToStatxTimestamp(now) 1223 } 1224 if stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec == linux.UTIME_NOW { 1225 stat.Mtime = linux.NsecToStatxTimestamp(now) 1226 } 1227 } 1228 1229 d.metadataMu.Lock() 1230 defer d.metadataMu.Unlock() 1231 1232 // As with Linux, if the UID, GID, or file size is changing, we have to 1233 // clear permission bits. Note that when set, clearSGID may cause 1234 // permissions to be updated. 1235 clearSGID := (stat.Mask&linux.STATX_UID != 0 && stat.UID != d.uid.Load()) || 1236 (stat.Mask&linux.STATX_GID != 0 && stat.GID != d.gid.Load()) || 1237 stat.Mask&linux.STATX_SIZE != 0 1238 if clearSGID { 1239 if stat.Mask&linux.STATX_MODE != 0 { 1240 stat.Mode = uint16(vfs.ClearSUIDAndSGID(uint32(stat.Mode))) 1241 } else { 1242 oldMode := d.mode.Load() 1243 if updatedMode := vfs.ClearSUIDAndSGID(oldMode); updatedMode != oldMode { 1244 stat.Mode = uint16(updatedMode) 1245 stat.Mask |= linux.STATX_MODE 1246 } 1247 } 1248 } 1249 1250 // failureMask indicates which attributes could not be set on the remote 1251 // filesystem. p9 returns an error if any of the attributes could not be set 1252 // but that leads to inconsistency as the server could have set a few 1253 // attributes successfully but a later failure will cause the successful ones 1254 // to not be updated in the dentry cache. 
1255 var failureMask uint32 1256 var failureErr error 1257 if !d.isSynthetic() { 1258 if stat.Mask != 0 { 1259 if err := d.prepareSetStat(ctx, stat); err != nil { 1260 return err 1261 } 1262 d.handleMu.RLock() 1263 if stat.Mask&linux.STATX_SIZE != 0 { 1264 // d.dataMu must be held around the update to both the remote 1265 // file's size and d.size to serialize with writeback (which 1266 // might otherwise write data back up to the old d.size after 1267 // the remote file has been truncated). 1268 d.dataMu.Lock() 1269 } 1270 var err error 1271 failureMask, failureErr, err = d.setStatLocked(ctx, stat) 1272 d.handleMu.RUnlock() 1273 if err != nil { 1274 if stat.Mask&linux.STATX_SIZE != 0 { 1275 d.dataMu.Unlock() // +checklocksforce: locked conditionally above 1276 } 1277 return err 1278 } 1279 if stat.Mask&linux.STATX_SIZE != 0 { 1280 if failureMask&linux.STATX_SIZE == 0 { 1281 // d.size should be kept up to date, and privatized 1282 // copy-on-write mappings of truncated pages need to be 1283 // invalidated, even if InteropModeShared is in effect. 1284 d.updateSizeAndUnlockDataMuLocked(stat.Size) // +checklocksforce: locked conditionally above 1285 } else { 1286 d.dataMu.Unlock() // +checklocksforce: locked conditionally above 1287 } 1288 } 1289 } 1290 if d.fs.opts.interop == InteropModeShared { 1291 // There's no point to updating d's metadata in this case since 1292 // it'll be overwritten by revalidation before the next time it's 1293 // used anyway. (InteropModeShared inhibits client caching of 1294 // regular file data, so there's no cache to truncate either.) 
1295 return nil 1296 } 1297 } 1298 if stat.Mask&linux.STATX_MODE != 0 && failureMask&linux.STATX_MODE == 0 { 1299 d.mode.Store(d.fileType() | uint32(stat.Mode)) 1300 } 1301 if stat.Mask&linux.STATX_UID != 0 && failureMask&linux.STATX_UID == 0 { 1302 d.uid.Store(stat.UID) 1303 } 1304 if stat.Mask&linux.STATX_GID != 0 && failureMask&linux.STATX_GID == 0 { 1305 d.gid.Store(stat.GID) 1306 } 1307 // Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because 1308 // if d.cachedMetadataAuthoritative() then we converted stat.Atime and 1309 // stat.Mtime to client-local timestamps above, and if 1310 // !d.cachedMetadataAuthoritative() then we returned after calling 1311 // d.file.setAttr(). For the same reason, now must have been initialized. 1312 if stat.Mask&linux.STATX_ATIME != 0 && failureMask&linux.STATX_ATIME == 0 { 1313 d.atime.Store(stat.Atime.ToNsec()) 1314 d.atimeDirty.Store(0) 1315 } 1316 if stat.Mask&linux.STATX_MTIME != 0 && failureMask&linux.STATX_MTIME == 0 { 1317 d.mtime.Store(stat.Mtime.ToNsec()) 1318 d.mtimeDirty.Store(0) 1319 } 1320 d.ctime.Store(now) 1321 if failureMask != 0 { 1322 // Setting some attribute failed on the remote filesystem. 1323 return failureErr 1324 } 1325 return nil 1326 } 1327 1328 // doAllocate performs an allocate operation on d. Note that d.metadataMu will 1329 // be held when allocate is called. 1330 func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error { 1331 d.metadataMu.Lock() 1332 defer d.metadataMu.Unlock() 1333 1334 // Allocating a smaller size is a noop. 1335 size := offset + length 1336 if d.cachedMetadataAuthoritative() && size <= d.size.RacyLoad() { 1337 return nil 1338 } 1339 1340 err := allocate() 1341 if err != nil { 1342 return err 1343 } 1344 d.updateSizeLocked(size) 1345 if d.cachedMetadataAuthoritative() { 1346 d.touchCMtimeLocked() 1347 } 1348 return nil 1349 } 1350 1351 // Preconditions: d.metadataMu must be locked. 
1352 func (d *dentry) updateSizeLocked(newSize uint64) { 1353 d.dataMu.Lock() 1354 d.updateSizeAndUnlockDataMuLocked(newSize) 1355 } 1356 1357 // Preconditions: d.metadataMu and d.dataMu must be locked. 1358 // 1359 // Postconditions: d.dataMu is unlocked. 1360 // +checklocksrelease:d.dataMu 1361 func (d *dentry) updateSizeAndUnlockDataMuLocked(newSize uint64) { 1362 oldSize := d.size.RacyLoad() 1363 d.size.Store(newSize) 1364 // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings 1365 // below. This allows concurrent calls to Read/Translate/etc. These 1366 // functions synchronize with truncation by refusing to use cache 1367 // contents beyond the new d.size. (We are still holding d.metadataMu, 1368 // so we can't race with Write or another truncate.) 1369 d.dataMu.Unlock() 1370 if newSize < oldSize { 1371 oldpgend, _ := hostarch.PageRoundUp(oldSize) 1372 newpgend, _ := hostarch.PageRoundUp(newSize) 1373 if oldpgend != newpgend { 1374 d.mapsMu.Lock() 1375 d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ 1376 // Compare Linux's mm/truncate.c:truncate_setsize() => 1377 // truncate_pagecache() => 1378 // mm/memory.c:unmap_mapping_range(evencows=1). 1379 InvalidatePrivate: true, 1380 }) 1381 d.mapsMu.Unlock() 1382 } 1383 // We are now guaranteed that there are no translations of 1384 // truncated pages, and can remove them from the cache. Since 1385 // truncated pages have been removed from the remote file, they 1386 // should be dropped without being written back. 
1387 d.dataMu.Lock() 1388 d.cache.Truncate(newSize, d.fs.mf) 1389 d.dirty.KeepClean(memmap.MappableRange{newSize, oldpgend}) 1390 d.dataMu.Unlock() 1391 } 1392 } 1393 1394 func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { 1395 return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(d.mode.Load()), auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())) 1396 } 1397 1398 func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { 1399 // Deny access to the "system" namespaces since applications 1400 // may expect these to affect kernel behavior in unimplemented ways 1401 // (b/148380782). Allow all other extended attributes to be passed through 1402 // to the remote filesystem. This is inconsistent with Linux's 9p client, 1403 // but consistent with other filesystems (e.g. FUSE). 1404 // 1405 // NOTE(b/202533394): Also disallow "trusted" namespace for now. This is 1406 // consistent with the VFS1 gofer client. 
1407 if strings.HasPrefix(name, linux.XATTR_SYSTEM_PREFIX) || strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) { 1408 return linuxerr.EOPNOTSUPP 1409 } 1410 mode := linux.FileMode(d.mode.Load()) 1411 kuid := auth.KUID(d.uid.Load()) 1412 kgid := auth.KGID(d.gid.Load()) 1413 if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { 1414 return err 1415 } 1416 return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) 1417 } 1418 1419 func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error { 1420 return vfs.CheckDeleteSticky( 1421 creds, 1422 linux.FileMode(d.mode.Load()), 1423 auth.KUID(d.uid.Load()), 1424 auth.KUID(child.uid.Load()), 1425 auth.KGID(child.gid.Load()), 1426 ) 1427 } 1428 1429 func dentryUID(uid lisafs.UID) uint32 { 1430 if !uid.Ok() { 1431 return uint32(auth.OverflowUID) 1432 } 1433 return uint32(uid) 1434 } 1435 1436 func dentryGID(gid lisafs.GID) uint32 { 1437 if !gid.Ok() { 1438 return uint32(auth.OverflowGID) 1439 } 1440 return uint32(gid) 1441 } 1442 1443 // IncRef implements vfs.DentryImpl.IncRef. 1444 func (d *dentry) IncRef() { 1445 // d.refs may be 0 if d.fs.renameMu is locked, which serializes against 1446 // d.checkCachingLocked(). 1447 r := d.refs.Add(1) 1448 if d.LogRefs() { 1449 refs.LogIncRef(d, r) 1450 } 1451 } 1452 1453 // TryIncRef implements vfs.DentryImpl.TryIncRef. 1454 func (d *dentry) TryIncRef() bool { 1455 for { 1456 r := d.refs.Load() 1457 if r <= 0 { 1458 return false 1459 } 1460 if d.refs.CompareAndSwap(r, r+1) { 1461 if d.LogRefs() { 1462 refs.LogTryIncRef(d, r+1) 1463 } 1464 return true 1465 } 1466 } 1467 } 1468 1469 // DecRef implements vfs.DentryImpl.DecRef. 
1470 func (d *dentry) DecRef(ctx context.Context) { 1471 if d.decRefNoCaching() == 0 { 1472 d.checkCachingLocked(ctx, false /* renameMuWriteLocked */) 1473 } 1474 } 1475 1476 // decRefNoCaching decrements d's reference count without calling 1477 // d.checkCachingLocked, even if d's reference count reaches 0; callers are 1478 // responsible for ensuring that d.checkCachingLocked will be called later. 1479 func (d *dentry) decRefNoCaching() int64 { 1480 r := d.refs.Add(-1) 1481 if d.LogRefs() { 1482 refs.LogDecRef(d, r) 1483 } 1484 if r < 0 { 1485 panic("gofer.dentry.decRefNoCaching() called without holding a reference") 1486 } 1487 return r 1488 } 1489 1490 // RefType implements refs.CheckedObject.Type. 1491 func (d *dentry) RefType() string { 1492 return "gofer.dentry" 1493 } 1494 1495 // LeakMessage implements refs.CheckedObject.LeakMessage. 1496 func (d *dentry) LeakMessage() string { 1497 return fmt.Sprintf("[gofer.dentry %p] reference count of %d instead of -1", d, d.refs.Load()) 1498 } 1499 1500 // LogRefs implements refs.CheckedObject.LogRefs. 1501 // 1502 // This should only be set to true for debugging purposes, as it can generate an 1503 // extremely large amount of output and drastically degrade performance. 1504 func (d *dentry) LogRefs() bool { 1505 return false 1506 } 1507 1508 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. 1509 func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { 1510 if d.isDir() { 1511 events |= linux.IN_ISDIR 1512 } 1513 1514 d.fs.renameMu.RLock() 1515 // The ordering below is important, Linux always notifies the parent first. 1516 if parent := d.parent.Load(); parent != nil { 1517 parent.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted()) 1518 } 1519 d.watches.Notify(ctx, "", events, cookie, et, d.isDeleted()) 1520 d.fs.renameMu.RUnlock() 1521 } 1522 1523 // Watches implements vfs.DentryImpl.Watches. 
1524 func (d *dentry) Watches() *vfs.Watches { 1525 return &d.watches 1526 } 1527 1528 // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. 1529 // 1530 // If no watches are left on this dentry and it has no references, cache it. 1531 func (d *dentry) OnZeroWatches(ctx context.Context) { 1532 d.checkCachingLocked(ctx, false /* renameMuWriteLocked */) 1533 } 1534 1535 // checkCachingLocked should be called after d's reference count becomes 0 or 1536 // it becomes disowned. 1537 // 1538 // For performance, checkCachingLocked can also be called after d's reference 1539 // count becomes non-zero, so that d can be removed from the LRU cache. This 1540 // may help in reducing the size of the cache and hence reduce evictions. Note 1541 // that this is not necessary for correctness. 1542 // 1543 // It may be called on a destroyed dentry. For example, 1544 // renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times 1545 // for the same dentry when the dentry is visited more than once in the same 1546 // operation. One of the calls may destroy the dentry, so subsequent calls will 1547 // do nothing. 1548 // 1549 // Preconditions: d.fs.renameMu must be locked for writing if 1550 // renameMuWriteLocked is true; it may be temporarily unlocked. 1551 func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked bool) { 1552 d.cachingMu.Lock() 1553 refs := d.refs.Load() 1554 if refs == -1 { 1555 // Dentry has already been destroyed. 1556 d.cachingMu.Unlock() 1557 return 1558 } 1559 if refs > 0 { 1560 // fs.dentryCache.dentries is permitted to contain dentries with non-zero 1561 // refs, which are skipped by fs.evictCachedDentryLocked() upon reaching 1562 // the end of the LRU. But it is still beneficial to remove d from the 1563 // cache as we are already holding d.cachingMu. Keeping a cleaner cache 1564 // also reduces the number of evictions (which is expensive as it acquires 1565 // fs.renameMu). 
1566 d.removeFromCacheLocked() 1567 d.cachingMu.Unlock() 1568 return 1569 } 1570 // Deleted and invalidated dentries with zero references are no longer 1571 // reachable by path resolution and should be dropped immediately. 1572 if d.vfsd.IsDead() { 1573 d.removeFromCacheLocked() 1574 d.cachingMu.Unlock() 1575 if !renameMuWriteLocked { 1576 // Need to lock d.fs.renameMu for writing as needed by d.destroyLocked(). 1577 d.fs.renameMu.Lock() 1578 defer d.fs.renameMu.Unlock() 1579 // Now that renameMu is locked for writing, no more refs can be taken on 1580 // d because path resolution requires renameMu for reading at least. 1581 if d.refs.Load() != 0 { 1582 // Destroy d only if its ref is still 0. If not, either someone took a 1583 // ref on it or it got destroyed before fs.renameMu could be acquired. 1584 return 1585 } 1586 } 1587 if d.isDeleted() { 1588 d.watches.HandleDeletion(ctx) 1589 } 1590 d.destroyLocked(ctx) // +checklocksforce: renameMu must be acquired at this point. 1591 return 1592 } 1593 if d.vfsd.IsEvictable() { 1594 d.cachingMu.Unlock() 1595 // Attempt to evict. 1596 if renameMuWriteLocked { 1597 d.evictLocked(ctx) // +checklocksforce: renameMu is locked in this case. 1598 return 1599 } 1600 d.evict(ctx) 1601 return 1602 } 1603 // If d still has inotify watches and it is not deleted or invalidated, it 1604 // can't be evicted. Otherwise, we will lose its watches, even if a new 1605 // dentry is created for the same file in the future. Note that the size of 1606 // d.watches cannot concurrently transition from zero to non-zero, because 1607 // adding a watch requires holding a reference on d. 1608 if d.watches.Size() > 0 { 1609 // As in the refs > 0 case, removing d is beneficial. 1610 d.removeFromCacheLocked() 1611 d.cachingMu.Unlock() 1612 return 1613 } 1614 1615 if d.fs.released.Load() != 0 { 1616 d.cachingMu.Unlock() 1617 if !renameMuWriteLocked { 1618 // Need to lock d.fs.renameMu to access d.parent. 
Lock it for writing as 1619 // needed by d.destroyLocked() later. 1620 d.fs.renameMu.Lock() 1621 defer d.fs.renameMu.Unlock() 1622 } 1623 if parent := d.parent.Load(); parent != nil { 1624 parent.childrenMu.Lock() 1625 delete(parent.children, d.name) 1626 parent.childrenMu.Unlock() 1627 } 1628 d.destroyLocked(ctx) // +checklocksforce: see above. 1629 return 1630 } 1631 1632 d.fs.dentryCache.mu.Lock() 1633 // If d is already cached, just move it to the front of the LRU. 1634 if d.cached { 1635 d.fs.dentryCache.dentries.Remove(&d.cacheEntry) 1636 d.fs.dentryCache.dentries.PushFront(&d.cacheEntry) 1637 d.fs.dentryCache.mu.Unlock() 1638 d.cachingMu.Unlock() 1639 return 1640 } 1641 // Cache the dentry, then evict the least recently used cached dentry if 1642 // the cache becomes over-full. 1643 d.fs.dentryCache.dentries.PushFront(&d.cacheEntry) 1644 d.fs.dentryCache.dentriesLen++ 1645 d.cached = true 1646 shouldEvict := d.fs.dentryCache.dentriesLen > d.fs.dentryCache.maxCachedDentries 1647 d.fs.dentryCache.mu.Unlock() 1648 d.cachingMu.Unlock() 1649 1650 if shouldEvict { 1651 if !renameMuWriteLocked { 1652 // Need to lock d.fs.renameMu for writing as needed by 1653 // d.evictCachedDentryLocked(). 1654 d.fs.renameMu.Lock() 1655 defer d.fs.renameMu.Unlock() 1656 } 1657 d.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above. 1658 } 1659 } 1660 1661 // Preconditions: d.cachingMu must be locked. 1662 func (d *dentry) removeFromCacheLocked() { 1663 if d.cached { 1664 d.fs.dentryCache.mu.Lock() 1665 d.fs.dentryCache.dentries.Remove(&d.cacheEntry) 1666 d.fs.dentryCache.dentriesLen-- 1667 d.fs.dentryCache.mu.Unlock() 1668 d.cached = false 1669 } 1670 } 1671 1672 // Precondition: fs.renameMu must be locked for writing; it may be temporarily 1673 // unlocked. 
1674 // +checklocks:fs.renameMu 1675 func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) { 1676 for fs.dentryCache.dentriesLen != 0 { 1677 fs.evictCachedDentryLocked(ctx) 1678 } 1679 } 1680 1681 // Preconditions: 1682 // - fs.renameMu must be locked for writing; it may be temporarily unlocked. 1683 // 1684 // +checklocks:fs.renameMu 1685 func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { 1686 fs.dentryCache.mu.Lock() 1687 victim := fs.dentryCache.dentries.Back() 1688 fs.dentryCache.mu.Unlock() 1689 if victim == nil { 1690 // fs.dentryCache.dentries may have become empty between when it was 1691 // checked and when we locked fs.dentryCache.mu. 1692 return 1693 } 1694 1695 if victim.d.fs == fs { 1696 victim.d.evictLocked(ctx) // +checklocksforce: owned as precondition, victim.fs == fs 1697 return 1698 } 1699 1700 // The dentry cache is shared between all gofer filesystems and the victim is 1701 // from another filesystem. Have that filesystem do the work. We unlock 1702 // fs.renameMu to prevent deadlock: two filesystems could otherwise wait on 1703 // each others' renameMu. 1704 fs.renameMu.Unlock() 1705 defer fs.renameMu.Lock() 1706 victim.d.evict(ctx) 1707 } 1708 1709 // Preconditions: 1710 // - d.fs.renameMu must not be locked for writing. 1711 func (d *dentry) evict(ctx context.Context) { 1712 d.fs.renameMu.Lock() 1713 defer d.fs.renameMu.Unlock() 1714 d.evictLocked(ctx) 1715 } 1716 1717 // Preconditions: 1718 // - d.fs.renameMu must be locked for writing; it may be temporarily unlocked. 1719 // 1720 // +checklocks:d.fs.renameMu 1721 func (d *dentry) evictLocked(ctx context.Context) { 1722 d.cachingMu.Lock() 1723 d.removeFromCacheLocked() 1724 // d.refs or d.watches.Size() may have become non-zero from an earlier path 1725 // resolution since it was inserted into fs.dentryCache.dentries. 
1726 if d.refs.Load() != 0 || d.watches.Size() != 0 { 1727 d.cachingMu.Unlock() 1728 return 1729 } 1730 if parent := d.parent.Load(); parent != nil { 1731 parent.opMu.Lock() 1732 if !d.vfsd.IsDead() { 1733 // Note that d can't be a mount point (in any mount namespace), since VFS 1734 // holds references on mount points. 1735 rcs := d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &d.vfsd) 1736 for _, rc := range rcs { 1737 rc.DecRef(ctx) 1738 } 1739 1740 parent.childrenMu.Lock() 1741 delete(parent.children, d.name) 1742 parent.childrenMu.Unlock() 1743 1744 // We're only deleting the dentry, not the file it 1745 // represents, so we don't need to update 1746 // victim parent.dirents etc. 1747 } 1748 parent.opMu.Unlock() 1749 } 1750 // Safe to unlock cachingMu now that d.vfsd.IsDead(). Henceforth any 1751 // concurrent caching attempts on d will attempt to destroy it and so will 1752 // try to acquire fs.renameMu (which we have already acquired). Hence, 1753 // fs.renameMu will synchronize the destroy attempts. 1754 d.cachingMu.Unlock() 1755 d.destroyLocked(ctx) // +checklocksforce: owned as precondition. 1756 } 1757 1758 // destroyDisconnected destroys an uncached, unparented dentry. There are no 1759 // locking preconditions. 1760 func (d *dentry) destroyDisconnected(ctx context.Context) { 1761 mf := d.fs.mf 1762 1763 d.handleMu.Lock() 1764 d.dataMu.Lock() 1765 1766 if d.isWriteHandleOk() { 1767 // Write dirty pages back to the remote filesystem. 1768 h := d.writeHandle() 1769 if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil { 1770 log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err) 1771 } 1772 } 1773 // Discard cached data. 1774 if !d.cache.IsEmpty() { 1775 mf.MarkAllUnevictable(d) 1776 d.cache.DropAll(mf) 1777 d.dirty.RemoveAll() 1778 } 1779 d.dataMu.Unlock() 1780 1781 // Close any resources held by the implementation.
1782 d.destroyImpl(ctx) 1783 1784 // Can use RacyLoad() because handleMu is locked. 1785 if d.readFD.RacyLoad() >= 0 { 1786 _ = unix.Close(int(d.readFD.RacyLoad())) 1787 } 1788 if d.writeFD.RacyLoad() >= 0 && d.readFD.RacyLoad() != d.writeFD.RacyLoad() { 1789 _ = unix.Close(int(d.writeFD.RacyLoad())) 1790 } 1791 d.readFD = atomicbitops.FromInt32(-1) 1792 d.writeFD = atomicbitops.FromInt32(-1) 1793 d.mmapFD = atomicbitops.FromInt32(-1) 1794 d.handleMu.Unlock() 1795 1796 if !d.isSynthetic() { 1797 // Note that it's possible that d.atimeDirty or d.mtimeDirty are true, 1798 // i.e. client and server timestamps may differ (because e.g. a client 1799 // write was serviced by the page cache, and only written back to the 1800 // remote file later). Ideally, we'd write client timestamps back to 1801 // the remote filesystem so that timestamps for a new dentry 1802 // instantiated for the same file would remain coherent. Unfortunately, 1803 // this turns out to be too expensive in many cases, so for now we 1804 // don't do this. 1805 1806 // Remove d from the set of syncable dentries. 1807 d.fs.syncMu.Lock() 1808 d.fs.syncableDentries.Remove(&d.syncableListEntry) 1809 d.fs.syncMu.Unlock() 1810 } 1811 1812 // Drop references and stop tracking this child. 1813 d.refs.Store(-1) 1814 refs.Unregister(d) 1815 } 1816 1817 // destroyLocked destroys the dentry. 1818 // 1819 // Preconditions: 1820 // - d.fs.renameMu must be locked for writing; it may be temporarily unlocked. 1821 // - d.refs == 0. 1822 // - d.parent.children[d.name] != d, i.e. d is not reachable by path traversal 1823 // from its former parent dentry. 1824 // 1825 // +checklocks:d.fs.renameMu 1826 func (d *dentry) destroyLocked(ctx context.Context) { 1827 switch d.refs.Load() { 1828 case 0: 1829 // Mark the dentry destroyed. 
1830 d.refs.Store(-1) 1831 case -1: 1832 panic("dentry.destroyLocked() called on already destroyed dentry") 1833 default: 1834 panic("dentry.destroyLocked() called with references on the dentry") 1835 } 1836 1837 // Allow the following to proceed without renameMu locked to improve 1838 // scalability. 1839 d.fs.renameMu.Unlock() 1840 1841 // No locks need to be held during destroyDisconnected. 1842 d.destroyDisconnected(ctx) 1843 1844 d.fs.renameMu.Lock() 1845 1846 // Drop the reference held by d on its parent without recursively locking 1847 // d.fs.renameMu. 1848 1849 if parent := d.parent.Load(); parent != nil && parent.decRefNoCaching() == 0 { 1850 parent.checkCachingLocked(ctx, true /* renameMuWriteLocked */) 1851 } 1852 } 1853 1854 func (d *dentry) isDeleted() bool { 1855 return d.deleted.Load() != 0 1856 } 1857 1858 func (d *dentry) setDeleted() { 1859 d.deleted.Store(1) 1860 } 1861 1862 func (d *dentry) listXattr(ctx context.Context, size uint64) ([]string, error) { 1863 if d.isSynthetic() { 1864 return nil, nil 1865 } 1866 1867 return d.listXattrImpl(ctx, size) 1868 } 1869 1870 func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { 1871 if d.isSynthetic() { 1872 return "", linuxerr.ENODATA 1873 } 1874 if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil { 1875 return "", err 1876 } 1877 return d.getXattrImpl(ctx, opts) 1878 } 1879 1880 func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error { 1881 if d.isSynthetic() { 1882 return linuxerr.EPERM 1883 } 1884 if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil { 1885 return err 1886 } 1887 return d.setXattrImpl(ctx, opts) 1888 } 1889 1890 func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error { 1891 if d.isSynthetic() { 1892 return linuxerr.EPERM 1893 } 1894 if err := d.checkXattrPermissions(creds, name,
vfs.MayWrite); err != nil { 1895 return err 1896 } 1897 return d.removeXattrImpl(ctx, name) 1898 } 1899 1900 // Preconditions: 1901 // - !d.isSynthetic(). 1902 // - d.isRegularFile() || d.isDir(). 1903 // - fs.renameMu is locked. 1904 func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error { 1905 // O_TRUNC unconditionally requires us to obtain a new handle (opened with 1906 // O_TRUNC). 1907 if !trunc { 1908 d.handleMu.RLock() 1909 canReuseCurHandle := (!read || d.isReadHandleOk()) && (!write || d.isWriteHandleOk()) 1910 d.handleMu.RUnlock() 1911 if canReuseCurHandle { 1912 // Current handles are sufficient. 1913 return nil 1914 } 1915 } 1916 1917 d.handleMu.Lock() 1918 needNewHandle := (read && !d.isReadHandleOk()) || (write && !d.isWriteHandleOk()) || trunc 1919 if !needNewHandle { 1920 d.handleMu.Unlock() 1921 return nil 1922 } 1923 1924 var fdsToCloseArr [2]int32 1925 fdsToClose := fdsToCloseArr[:0] 1926 invalidateTranslations := false 1927 // Get a new handle. If this file has been opened for both reading and 1928 // writing, try to get a single handle that is usable for both: 1929 // 1930 // - Writable memory mappings of a host FD require that the host FD is 1931 // opened for both reading and writing. 1932 // 1933 // - NOTE(b/141991141): Some filesystems may not ensure coherence 1934 // between multiple handles for the same file. 1935 openReadable := d.isReadHandleOk() || read 1936 openWritable := d.isWriteHandleOk() || write 1937 h, err := d.openHandle(ctx, openReadable, openWritable, trunc) 1938 if linuxerr.Equals(linuxerr.EACCES, err) && (openReadable != read || openWritable != write) { 1939 // It may not be possible to use a single handle for both 1940 // reading and writing, since permissions on the file may have 1941 // changed to e.g. disallow reading after previously being 1942 // opened for reading. In this case, we have no choice but to 1943 // use separate handles for reading and writing. 
1944 ctx.Debugf("gofer.dentry.ensureSharedHandle: bifurcating read/write handles for dentry %p", d) 1945 openReadable = read 1946 openWritable = write 1947 h, err = d.openHandle(ctx, openReadable, openWritable, trunc) 1948 } 1949 if err != nil { 1950 d.handleMu.Unlock() 1951 return err 1952 } 1953 1954 // Update d.readFD and d.writeFD 1955 if h.fd >= 0 { 1956 if openReadable && openWritable && (d.readFD.RacyLoad() < 0 || d.writeFD.RacyLoad() < 0 || d.readFD.RacyLoad() != d.writeFD.RacyLoad()) { 1957 // Replace existing FDs with this one. 1958 if d.readFD.RacyLoad() >= 0 { 1959 // We already have a readable FD that may be in use by 1960 // concurrent callers of d.pf.FD(). 1961 if d.fs.opts.overlayfsStaleRead { 1962 // If overlayfsStaleRead is in effect, then the new FD 1963 // may not be coherent with the existing one, so we 1964 // have no choice but to switch to mappings of the new 1965 // FD in both the application and sentry. 1966 if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil { 1967 d.handleMu.Unlock() 1968 ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err) 1969 h.close(ctx) 1970 return err 1971 } 1972 fdsToClose = append(fdsToClose, d.readFD.RacyLoad()) 1973 invalidateTranslations = true 1974 d.readFD.Store(h.fd) 1975 } else { 1976 // Otherwise, we want to avoid invalidating existing 1977 // memmap.Translations (which is expensive); instead, use 1978 // dup3 to make the old file descriptor refer to the new 1979 // file description, then close the new file descriptor 1980 // (which is no longer needed). Racing callers of d.pf.FD() 1981 // may use the old or new file description, but this 1982 // doesn't matter since they refer to the same file, and 1983 // any racing mappings must be read-only. 
1984 if err := unix.Dup3(int(h.fd), int(d.readFD.RacyLoad()), unix.O_CLOEXEC); err != nil { 1985 oldFD := d.readFD.RacyLoad() 1986 d.handleMu.Unlock() 1987 ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldFD, err) 1988 h.close(ctx) 1989 return err 1990 } 1991 fdsToClose = append(fdsToClose, h.fd) 1992 h.fd = d.readFD.RacyLoad() 1993 } 1994 } else { 1995 d.readFD.Store(h.fd) 1996 } 1997 if d.writeFD.RacyLoad() != h.fd && d.writeFD.RacyLoad() >= 0 { 1998 fdsToClose = append(fdsToClose, d.writeFD.RacyLoad()) 1999 } 2000 d.writeFD.Store(h.fd) 2001 d.mmapFD.Store(h.fd) 2002 } else if openReadable && d.readFD.RacyLoad() < 0 { 2003 readHandleWasOk := d.isReadHandleOk() 2004 d.readFD.Store(h.fd) 2005 // If the file has not been opened for writing, the new FD may 2006 // be used for read-only memory mappings. If the file was 2007 // previously opened for reading (without an FD), then existing 2008 // translations of the file may use the internal page cache; 2009 // invalidate those mappings. 2010 if !d.isWriteHandleOk() { 2011 invalidateTranslations = readHandleWasOk 2012 d.mmapFD.Store(h.fd) 2013 } 2014 } else if openWritable && d.writeFD.RacyLoad() < 0 { 2015 d.writeFD.Store(h.fd) 2016 if d.readFD.RacyLoad() >= 0 { 2017 // We have an existing read-only FD, but the file has just 2018 // been opened for writing, so we need to start supporting 2019 // writable memory mappings. However, the new FD is not 2020 // readable, so we have no FD that can be used to create 2021 // writable memory mappings. Switch to using the internal 2022 // page cache. 2023 invalidateTranslations = true 2024 d.mmapFD.Store(-1) 2025 } 2026 } else { 2027 // The new FD is not useful. 
2028 fdsToClose = append(fdsToClose, h.fd) 2029 } 2030 } else if openWritable && d.writeFD.RacyLoad() < 0 && d.mmapFD.RacyLoad() >= 0 { 2031 // We have an existing read-only FD, but the file has just been 2032 // opened for writing, so we need to start supporting writable 2033 // memory mappings. However, we have no writable host FD. Switch to 2034 // using the internal page cache. 2035 invalidateTranslations = true 2036 d.mmapFD.Store(-1) 2037 } 2038 2039 d.updateHandles(ctx, h, openReadable, openWritable) 2040 d.handleMu.Unlock() 2041 2042 if invalidateTranslations { 2043 // Invalidate application mappings that may be using an old FD; they 2044 // will be replaced with mappings using the new FD after future calls 2045 // to d.Translate(). This requires holding d.mapsMu, which precedes 2046 // d.handleMu in the lock order. 2047 d.mapsMu.Lock() 2048 d.mappings.InvalidateAll(memmap.InvalidateOpts{}) 2049 d.mapsMu.Unlock() 2050 } 2051 for _, fd := range fdsToClose { 2052 unix.Close(int(fd)) 2053 } 2054 2055 return nil 2056 } 2057 2058 func (d *dentry) syncRemoteFile(ctx context.Context) error { 2059 d.handleMu.RLock() 2060 defer d.handleMu.RUnlock() 2061 return d.syncRemoteFileLocked(ctx) 2062 } 2063 2064 // Preconditions: d.handleMu must be locked. 2065 func (d *dentry) syncRemoteFileLocked(ctx context.Context) error { 2066 // Prefer syncing write handles over read handles, since some remote 2067 // filesystem implementations may not sync changes made through write 2068 // handles otherwise. 2069 wh := d.writeHandle() 2070 wh.sync(ctx) 2071 rh := d.readHandle() 2072 rh.sync(ctx) 2073 return nil 2074 } 2075 2076 func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) error { 2077 d.handleMu.RLock() 2078 defer d.handleMu.RUnlock() 2079 if d.isWriteHandleOk() { 2080 // Write back dirty pages to the remote file. 
2081 d.dataMu.Lock() 2082 h := d.writeHandle() 2083 err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), d.fs.mf, h.writeFromBlocksAt) 2084 d.dataMu.Unlock() 2085 if err != nil { 2086 return err 2087 } 2088 } 2089 if err := d.syncRemoteFileLocked(ctx); err != nil { 2090 if !forFilesystemSync { 2091 return err 2092 } 2093 // Only return err if we can reasonably have expected sync to succeed 2094 // (d is a regular file and was opened for writing). 2095 if d.isRegularFile() && d.isWriteHandleOk() { 2096 return err 2097 } 2098 ctx.Debugf("gofer.dentry.syncCachedFile: syncing non-writable or non-regular-file dentry failed: %v", err) 2099 } 2100 return nil 2101 } 2102 2103 // incLinks increments link count. 2104 func (d *dentry) incLinks() { 2105 if d.nlink.Load() == 0 { 2106 // The remote filesystem doesn't support link count. 2107 return 2108 } 2109 d.nlink.Add(1) 2110 } 2111 2112 // decLinks decrements link count. 2113 func (d *dentry) decLinks() { 2114 if d.nlink.Load() == 0 { 2115 // The remote filesystem doesn't support link count. 2116 return 2117 } 2118 d.nlink.Add(^uint32(0)) 2119 } 2120 2121 // fileDescription is embedded by gofer implementations of 2122 // vfs.FileDescriptionImpl. 2123 // 2124 // +stateify savable 2125 type fileDescription struct { 2126 vfsfd vfs.FileDescription 2127 vfs.FileDescriptionDefaultImpl 2128 vfs.LockFD 2129 2130 lockLogging sync.Once `state:"nosave"` 2131 } 2132 2133 func (fd *fileDescription) filesystem() *filesystem { 2134 return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) 2135 } 2136 2137 func (fd *fileDescription) dentry() *dentry { 2138 return fd.vfsfd.Dentry().Impl().(*dentry) 2139 } 2140 2141 // Stat implements vfs.FileDescriptionImpl.Stat. 
2142 func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 2143 d := fd.dentry() 2144 const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME) 2145 if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { 2146 // Use specialFileFD.handle.fileLisa for the Stat if available, for the 2147 // same reason that we try to use open FD in updateMetadataLocked(). 2148 var err error 2149 if sffd, ok := fd.vfsfd.Impl().(*specialFileFD); ok { 2150 err = sffd.updateMetadata(ctx) 2151 } else { 2152 err = d.updateMetadata(ctx) 2153 } 2154 if err != nil { 2155 return linux.Statx{}, err 2156 } 2157 } 2158 var stat linux.Statx 2159 d.statTo(&stat) 2160 return stat, nil 2161 } 2162 2163 // SetStat implements vfs.FileDescriptionImpl.SetStat. 2164 func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { 2165 fs := fd.filesystem() 2166 fs.renameMu.RLock() 2167 defer fs.renameMu.RUnlock() 2168 return fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts, fd.vfsfd.Mount()) 2169 } 2170 2171 // ListXattr implements vfs.FileDescriptionImpl.ListXattr. 2172 func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { 2173 return fd.dentry().listXattr(ctx, size) 2174 } 2175 2176 // GetXattr implements vfs.FileDescriptionImpl.GetXattr. 2177 func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { 2178 return fd.dentry().getXattr(ctx, auth.CredentialsFromContext(ctx), &opts) 2179 } 2180 2181 // SetXattr implements vfs.FileDescriptionImpl.SetXattr. 
2182 func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { 2183 return fd.dentry().setXattr(ctx, auth.CredentialsFromContext(ctx), &opts) 2184 } 2185 2186 // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. 2187 func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { 2188 return fd.dentry().removeXattr(ctx, auth.CredentialsFromContext(ctx), name) 2189 } 2190 2191 // LockBSD implements vfs.FileDescriptionImpl.LockBSD. 2192 func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block bool) error { 2193 fd.lockLogging.Do(func() { 2194 log.Infof("File lock using gofer file handled internally.") 2195 }) 2196 return fd.LockFD.LockBSD(ctx, uid, ownerPID, t, block) 2197 } 2198 2199 // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. 2200 func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block bool) error { 2201 fd.lockLogging.Do(func() { 2202 log.Infof("Range lock using gofer file handled internally.") 2203 }) 2204 return fd.Locks().LockPOSIX(ctx, uid, ownerPID, t, r, block) 2205 } 2206 2207 // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 2208 func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { 2209 return fd.Locks().UnlockPOSIX(ctx, uid, r) 2210 } 2211 2212 // resolvingPath is just a wrapper around *vfs.ResolvingPath. It additionally 2213 // holds some information around the intent behind resolving the path. 2214 type resolvingPath struct { 2215 *vfs.ResolvingPath 2216 2217 // excludeLast indicates whether the intent is to resolve until the last path 2218 // component. If true, the last path component should remain unresolved. 
2219 excludeLast bool 2220 } 2221 2222 func resolvingPathFull(rp *vfs.ResolvingPath) resolvingPath { 2223 return resolvingPath{ResolvingPath: rp, excludeLast: false} 2224 } 2225 2226 func resolvingPathParent(rp *vfs.ResolvingPath) resolvingPath { 2227 return resolvingPath{ResolvingPath: rp, excludeLast: true} 2228 } 2229 2230 func (rp *resolvingPath) done() bool { 2231 if rp.excludeLast { 2232 return rp.Final() 2233 } 2234 return rp.Done() 2235 } 2236 2237 func (rp *resolvingPath) copy() resolvingPath { 2238 return resolvingPath{ 2239 ResolvingPath: rp.ResolvingPath.Copy(), 2240 excludeLast: rp.excludeLast, 2241 } 2242 } 2243 2244 // Precondition: !rp.done() && rp.Component() is not "." or "..". 2245 func (rp *resolvingPath) getComponents(emit func(string) bool) { 2246 rp.GetComponents(rp.excludeLast, emit) 2247 }