// Source: github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/fsimpl/gofer/gofer.go

// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package gofer provides a filesystem implementation that is backed by a 9p
// server, interchangeably referred to as "gofers" throughout this package.
//
// Lock order:
//
//	regularFileFD/directoryFD.mu
//	filesystem.renameMu
//	dentry.cachingMu
//	dentryCache.mu
//	dentry.opMu
//	dentry.childrenMu
//	filesystem.syncMu
//	dentry.metadataMu
//	*** "memmap.Mappable locks" below this point
//	dentry.mapsMu
//	*** "memmap.Mappable locks taken by Translate" below this point
//	dentry.handleMu
//	dentry.dataMu
//	filesystem.inoMu
//	specialFileFD.mu
//	specialFileFD.bufMu
//
// Locking dentry.opMu and dentry.metadataMu in multiple dentries requires that
// either ancestor dentries are locked before descendant dentries, or that
// filesystem.renameMu is locked for writing.
40 package gofer 41 42 import ( 43 "fmt" 44 "path" 45 "strconv" 46 "strings" 47 48 "golang.org/x/sys/unix" 49 "github.com/nicocha30/gvisor-ligolo/pkg/abi/linux" 50 "github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops" 51 "github.com/nicocha30/gvisor-ligolo/pkg/cleanup" 52 "github.com/nicocha30/gvisor-ligolo/pkg/context" 53 "github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr" 54 "github.com/nicocha30/gvisor-ligolo/pkg/hostarch" 55 "github.com/nicocha30/gvisor-ligolo/pkg/lisafs" 56 "github.com/nicocha30/gvisor-ligolo/pkg/log" 57 "github.com/nicocha30/gvisor-ligolo/pkg/refs" 58 fslock "github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/lock" 59 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsutil" 60 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth" 61 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/pipe" 62 ktime "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/time" 63 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/memmap" 64 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/pgalloc" 65 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/unix/transport" 66 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs" 67 "github.com/nicocha30/gvisor-ligolo/pkg/sync" 68 "github.com/nicocha30/gvisor-ligolo/pkg/unet" 69 ) 70 71 // Name is the default filesystem name. 72 const Name = "9p" 73 74 // Mount option names for goferfs. 75 const ( 76 moptTransport = "trans" 77 moptReadFD = "rfdno" 78 moptWriteFD = "wfdno" 79 moptAname = "aname" 80 moptDfltUID = "dfltuid" 81 moptDfltGID = "dfltgid" 82 moptCache = "cache" 83 moptForcePageCache = "force_page_cache" 84 moptLimitHostFDTranslation = "limit_host_fd_translation" 85 moptOverlayfsStaleRead = "overlayfs_stale_read" 86 moptDisableFileHandleSharing = "disable_file_handle_sharing" 87 moptDisableFifoOpen = "disable_fifo_open" 88 89 // Directfs options. 90 moptDirectfs = "directfs" 91 ) 92 93 // Valid values for the "cache" mount option. 
94 const ( 95 cacheFSCache = "fscache" 96 cacheFSCacheWritethrough = "fscache_writethrough" 97 cacheRemoteRevalidating = "remote_revalidating" 98 ) 99 100 const ( 101 defaultMaxCachedDentries = 1000 102 maxCachedNegativeChildren = 1000 103 ) 104 105 // stringFixedCache is a fixed sized cache, once initialized, 106 // its size never changes. 107 // 108 // +stateify savable 109 type stringFixedCache struct { 110 // namesList stores negative names with fifo list. 111 // name stored in namesList only means it used to be negative 112 // at the moment you pushed it to the list. 113 namesList stringList 114 size uint64 115 } 116 117 func (cache *stringFixedCache) isInited() bool { 118 return cache.size != 0 119 } 120 121 func (cache *stringFixedCache) init(size uint64) { 122 elements := make([]stringListElem, size) 123 for i := uint64(0); i < size; i++ { 124 cache.namesList.PushFront(&elements[i]) 125 } 126 cache.size = size 127 } 128 129 // Update will push name to the front of the list, 130 // and pop the tail value. 131 func (cache *stringFixedCache) add(name string) string { 132 tail := cache.namesList.Back() 133 victimName := tail.str 134 tail.str = name 135 cache.namesList.Remove(tail) 136 cache.namesList.PushFront(tail) 137 return victimName 138 } 139 140 // +stateify savable 141 type dentryCache struct { 142 // mu protects the below fields. 143 mu sync.Mutex `state:"nosave"` 144 // dentries contains all dentries with 0 references. Due to race conditions, 145 // it may also contain dentries with non-zero references. 146 dentries dentryList 147 // dentriesLen is the number of dentries in dentries. 148 dentriesLen uint64 149 // maxCachedDentries is the maximum number of cachable dentries. 150 maxCachedDentries uint64 151 } 152 153 // SetDentryCacheSize sets the size of the global gofer dentry cache. 
//
// Negative sizes are ignored. Only the first accepted call takes effect: once
// the global cache exists, later calls log a warning and leave it unchanged.
func SetDentryCacheSize(size int) {
	if size < 0 {
		return
	}
	if globalDentryCache != nil {
		log.Warningf("Global dentry cache has already been initialized. Ignoring subsequent attempt.")
		return
	}
	globalDentryCache = &dentryCache{maxCachedDentries: uint64(size)}
}

// globalDentryCache is a global cache of dentries across all gofers. It is nil
// until SetDentryCacheSize has been called with a non-negative size.
var globalDentryCache *dentryCache

// Valid values for "trans" mount option.
const transportModeFD = "fd"

// FilesystemType implements vfs.FilesystemType.
//
// +stateify savable
type FilesystemType struct{}

// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	vfsfs vfs.Filesystem

	// mfp is used to allocate memory that caches regular file contents. mfp is
	// immutable.
	mfp pgalloc.MemoryFileProvider

	// Immutable options.
	opts  filesystemOptions
	iopts InternalFilesystemOptions

	// client is the LISAFS client used for communicating with the server. client
	// is immutable.
	client *lisafs.Client `state:"nosave"`

	// clock is a realtime clock used to set timestamps in file operations.
	clock ktime.Clock

	// devMinor is the filesystem's minor device number. devMinor is immutable.
	devMinor uint32

	// root is the root dentry. root is immutable.
	root *dentry

	// renameMu serves two purposes:
	//
	//   - It synchronizes path resolution with renaming initiated by this
	//     client.
	//
	//   - It is held by path resolution to ensure that reachable dentries remain
	//     valid. A dentry is reachable by path resolution if it has a non-zero
	//     reference count (such that it is usable as vfs.ResolvingPath.Start() or
	//     is reachable from its children), or if it is a child dentry (such that
	//     it is reachable from its parent).
	renameMu sync.RWMutex `state:"nosave"`

	// dentryCache is the dentry cache used by this filesystem.
	dentryCache *dentryCache

	// syncableDentries contains all non-synthetic dentries. specialFileFDs
	// contains all open specialFileFDs. These fields are protected by syncMu.
	syncMu           sync.Mutex `state:"nosave"`
	syncableDentries dentryList
	specialFileFDs   specialFDList

	// inoByKey maps previously-observed device ID and host inode numbers to
	// internal inode numbers assigned to those files. inoByKey is not preserved
	// across checkpoint/restore because inode numbers may be reused between
	// different gofer processes, so inode numbers may be repeated for different
	// files across checkpoint/restore. inoByKey is protected by inoMu.
	inoMu    sync.Mutex        `state:"nosave"`
	inoByKey map[inoKey]uint64 `state:"nosave"`

	// lastIno is the last inode number assigned to a file. lastIno is accessed
	// using atomic memory operations.
	lastIno atomicbitops.Uint64

	// savedDentryRW records open read/write handles during save/restore.
	savedDentryRW map[*dentry]savedDentryRW

	// released is nonzero once filesystem.Release has been called.
	released atomicbitops.Int32
}

// filesystemOptions holds the options that GetFilesystem derives from the
// mount option string.
//
// +stateify savable
type filesystemOptions struct {
	// fd is the file descriptor of the connection to the server, taken from
	// the "rfdno"/"wfdno" mount options.
	fd int
	// aname is the cleaned, absolute attach name from the "aname" mount
	// option; it defaults to "/".
	aname   string
	interop InteropMode // derived from the "cache" mount option
	// dfltuid and dfltgid come from the "dfltuid"/"dfltgid" mount options and
	// default to _V9FS_DEFUID and _V9FS_DEFGID.
	dfltuid auth.KUID
	dfltgid auth.KGID

	// If forcePageCache is true, host FDs may not be used for application
	// memory mappings even if available; instead, the client must perform its
	// own caching of regular file pages. This is primarily useful for testing.
	forcePageCache bool

	// If limitHostFDTranslation is true, apply maxFillRange() constraints to
	// host FD mappings returned by dentry.(memmap.Mappable).Translate(). This
	// makes memory accounting behavior more consistent between cases where
	// host FDs are / are not available, but may increase the frequency of
	// sentry-handled page faults on files for which a host FD is available.
	limitHostFDTranslation bool

	// If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote
	// filesystem may not be coherent with writable host FDs opened later, so
	// all uses of the former must be replaced by uses of the latter. This is
	// usually only the case when the remote filesystem is a Linux overlayfs
	// mount. (Prior to Linux 4.18, patch series centered on commit
	// d1d04ef8572b "ovl: stack file ops", both I/O and memory mappings were
	// incoherent between pre-copy-up and post-copy-up FDs; after that patch
	// series, only memory mappings are incoherent.)
	overlayfsStaleRead bool

	// If regularFilesUseSpecialFileFD is true, application FDs representing
	// regular files will use distinct file handles for each FD, in the same
	// way that application FDs representing "special files" such as sockets
	// do. Note that this disables client caching for regular files. This option
	// may regress performance due to excessive Open RPCs. This option is not
	// supported with overlayfsStaleRead for now.
	regularFilesUseSpecialFileFD bool

	// If disableFifoOpen is true, application attempts to open(2) a host FIFO
	// are disallowed.
	disableFifoOpen bool

	// directfs holds options for directfs mode.
	directfs directfsOpts
}

// directfsOpts holds the options for directfs mode.
//
// +stateify savable
type directfsOpts struct {
	// If directfs is enabled, the gofer client does not make RPCs to the gofer
	// process. Instead, it makes host syscalls to perform file operations.
	enabled bool
}

// InteropMode controls the client's interaction with other remote filesystem
// users.
//
// +stateify savable
type InteropMode uint32

const (
	// InteropModeExclusive is appropriate when the filesystem client is the
	// only user of the remote filesystem.
	//
	//   - The client may cache arbitrary filesystem state (file data, metadata,
	//     filesystem structure, etc.).
	//
	//   - Client changes to filesystem state may be sent to the remote
	//     filesystem asynchronously, except when server permission checks are
	//     necessary.
	//
	//   - File timestamps are based on client clocks. This ensures that users of
	//     the client observe timestamps that are coherent with their own clocks
	//     and consistent with Linux's semantics (in particular, it is not always
	//     possible for clients to set arbitrary atimes and mtimes depending on the
	//     remote filesystem implementation, and never possible for clients to set
	//     arbitrary ctimes.)
	InteropModeExclusive InteropMode = iota

	// InteropModeWritethrough is appropriate when there are read-only users of
	// the remote filesystem that expect to observe changes made by the
	// filesystem client.
	//
	//   - The client may cache arbitrary filesystem state.
	//
	//   - Client changes to filesystem state must be sent to the remote
	//     filesystem synchronously.
	//
	//   - File timestamps are based on client clocks. As a corollary, access
	//     timestamp changes from other remote filesystem users will not be visible
	//     to the client.
	InteropModeWritethrough

	// InteropModeShared is appropriate when there are users of the remote
	// filesystem that may mutate its state other than the client.
	//
	//   - The client must verify ("revalidate") cached filesystem state before
	//     using it.
	//
	//   - Client changes to filesystem state must be sent to the remote
	//     filesystem synchronously.
	//
	//   - File timestamps are based on server clocks. This is necessary to
	//     ensure that timestamp changes are synchronized between remote filesystem
	//     users.
	//
	// Note that the correctness of InteropModeShared depends on the server
	// correctly implementing 9P fids (i.e. each fid immutably represents a
	// single filesystem object), even in the presence of remote filesystem
	// mutations from other users. If this is violated, the behavior of the
	// client is undefined.
	InteropModeShared
)

// InternalFilesystemOptions may be passed as
// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
//
// +stateify savable
type InternalFilesystemOptions struct {
	// If UniqueID is non-empty, it is an opaque string used to reassociate the
	// filesystem with a new server FD during restoration from checkpoint.
	UniqueID string

	// If LeakConnection is true, do not close the connection to the server
	// when the Filesystem is released. This is necessary for deployments in
	// which servers can handle only a single client and report failure if that
	// client disconnects.
	LeakConnection bool

	// If OpenSocketsByConnecting is true, silently translate attempts to open
	// files identifying as sockets to connect RPCs.
	OpenSocketsByConnecting bool
}

// _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default
// UIDs and GIDs used for files that do not provide a specific owner or group
// respectively.
const (
	// uint32(-2) doesn't work in Go.
	_V9FS_DEFUID = auth.KUID(4294967294)
	_V9FS_DEFGID = auth.KGID(4294967294)
)

// Name implements vfs.FilesystemType.Name. It returns the package-level Name
// constant.
func (FilesystemType) Name() string {
	return Name
}

// Release implements vfs.FilesystemType.Release. It does nothing.
func (FilesystemType) Release(ctx context.Context) {}

// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
393 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { 394 mfp := pgalloc.MemoryFileProviderFromContext(ctx) 395 if mfp == nil { 396 ctx.Warningf("gofer.FilesystemType.GetFilesystem: context does not provide a pgalloc.MemoryFileProvider") 397 return nil, nil, linuxerr.EINVAL 398 } 399 400 mopts := vfs.GenericParseMountOptions(opts.Data) 401 var fsopts filesystemOptions 402 403 fd, err := getFDFromMountOptionsMap(ctx, mopts) 404 if err != nil { 405 return nil, nil, err 406 } 407 fsopts.fd = fd 408 409 // Get the attach name. 410 fsopts.aname = "/" 411 if aname, ok := mopts[moptAname]; ok { 412 delete(mopts, moptAname) 413 if !path.IsAbs(aname) { 414 ctx.Warningf("gofer.FilesystemType.GetFilesystem: aname is not absolute: %s=%s", moptAname, aname) 415 return nil, nil, linuxerr.EINVAL 416 } 417 fsopts.aname = path.Clean(aname) 418 } 419 420 // Parse the cache policy. For historical reasons, this defaults to the 421 // least generally-applicable option, InteropModeExclusive. 422 fsopts.interop = InteropModeExclusive 423 if cache, ok := mopts[moptCache]; ok { 424 delete(mopts, moptCache) 425 switch cache { 426 case cacheFSCache: 427 fsopts.interop = InteropModeExclusive 428 case cacheFSCacheWritethrough: 429 fsopts.interop = InteropModeWritethrough 430 case cacheRemoteRevalidating: 431 fsopts.interop = InteropModeShared 432 default: 433 ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: %s=%s", moptCache, cache) 434 return nil, nil, linuxerr.EINVAL 435 } 436 } 437 438 // Parse the default UID and GID. 
439 fsopts.dfltuid = _V9FS_DEFUID 440 if dfltuidstr, ok := mopts[moptDfltUID]; ok { 441 delete(mopts, moptDfltUID) 442 dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32) 443 if err != nil { 444 ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: %s=%s", moptDfltUID, dfltuidstr) 445 return nil, nil, linuxerr.EINVAL 446 } 447 // In Linux, dfltuid is interpreted as a UID and is converted to a KUID 448 // in the caller's user namespace, but goferfs isn't 449 // application-mountable. 450 fsopts.dfltuid = auth.KUID(dfltuid) 451 } 452 fsopts.dfltgid = _V9FS_DEFGID 453 if dfltgidstr, ok := mopts[moptDfltGID]; ok { 454 delete(mopts, moptDfltGID) 455 dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32) 456 if err != nil { 457 ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: %s=%s", moptDfltGID, dfltgidstr) 458 return nil, nil, linuxerr.EINVAL 459 } 460 fsopts.dfltgid = auth.KGID(dfltgid) 461 } 462 463 // Handle simple flags. 464 if _, ok := mopts[moptDisableFileHandleSharing]; ok { 465 delete(mopts, moptDisableFileHandleSharing) 466 fsopts.regularFilesUseSpecialFileFD = true 467 } 468 if _, ok := mopts[moptDisableFifoOpen]; ok { 469 delete(mopts, moptDisableFifoOpen) 470 fsopts.disableFifoOpen = true 471 } 472 if _, ok := mopts[moptForcePageCache]; ok { 473 delete(mopts, moptForcePageCache) 474 fsopts.forcePageCache = true 475 } 476 if _, ok := mopts[moptLimitHostFDTranslation]; ok { 477 delete(mopts, moptLimitHostFDTranslation) 478 fsopts.limitHostFDTranslation = true 479 } 480 if _, ok := mopts[moptOverlayfsStaleRead]; ok { 481 delete(mopts, moptOverlayfsStaleRead) 482 fsopts.overlayfsStaleRead = true 483 } 484 if _, ok := mopts[moptDirectfs]; ok { 485 delete(mopts, moptDirectfs) 486 fsopts.directfs.enabled = true 487 } 488 // fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying 489 // "cache=none". 490 491 // Check for unparsed options. 
492 if len(mopts) != 0 { 493 ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts) 494 return nil, nil, linuxerr.EINVAL 495 } 496 497 // Validation. 498 if fsopts.regularFilesUseSpecialFileFD && fsopts.overlayfsStaleRead { 499 // These options are not supported together. To support this, when a dentry 500 // is opened writably for the first time, we need to iterate over all the 501 // specialFileFDs of that dentry that represent a regular file and call 502 // fd.hostFileMapper.RegenerateMappings(writable_fd). 503 ctx.Warningf("gofer.FilesystemType.GetFilesystem: regularFilesUseSpecialFileFD and overlayfsStaleRead options are not supported together.") 504 return nil, nil, linuxerr.EINVAL 505 } 506 507 // Handle internal options. 508 iopts, ok := opts.InternalData.(InternalFilesystemOptions) 509 if opts.InternalData != nil && !ok { 510 ctx.Warningf("gofer.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted gofer.InternalFilesystemOptions", opts.InternalData) 511 return nil, nil, linuxerr.EINVAL 512 } 513 // If !ok, iopts being the zero value is correct. 514 515 // Construct the filesystem object. 516 devMinor, err := vfsObj.GetAnonBlockDevMinor() 517 if err != nil { 518 return nil, nil, err 519 } 520 fs := &filesystem{ 521 mfp: mfp, 522 opts: fsopts, 523 iopts: iopts, 524 clock: ktime.RealtimeClockFromContext(ctx), 525 devMinor: devMinor, 526 inoByKey: make(map[inoKey]uint64), 527 } 528 529 // Did the user configure a global dentry cache? 
530 if globalDentryCache != nil { 531 fs.dentryCache = globalDentryCache 532 } else { 533 fs.dentryCache = &dentryCache{maxCachedDentries: defaultMaxCachedDentries} 534 } 535 536 fs.vfsfs.Init(vfsObj, &fstype, fs) 537 538 rootInode, rootHostFD, err := fs.initClientAndGetRoot(ctx) 539 if err != nil { 540 fs.vfsfs.DecRef(ctx) 541 return nil, nil, err 542 } 543 if fs.opts.directfs.enabled { 544 fs.root, err = fs.getDirectfsRootDentry(ctx, rootHostFD, fs.client.NewFD(rootInode.ControlFD)) 545 } else { 546 fs.root, err = fs.newLisafsDentry(ctx, &rootInode) 547 } 548 if err != nil { 549 fs.vfsfs.DecRef(ctx) 550 return nil, nil, err 551 } 552 // Set the root's reference count to 2. One reference is returned to the 553 // caller, and the other is held by fs to prevent the root from being "cached" 554 // and subsequently evicted. 555 fs.root.refs = atomicbitops.FromInt64(2) 556 return &fs.vfsfs, &fs.root.vfsd, nil 557 } 558 559 // initClientAndGetRoot initializes fs.client and returns the root inode for 560 // this mount point. It handles the attach point (fs.opts.aname) resolution. 
561 func (fs *filesystem) initClientAndGetRoot(ctx context.Context) (lisafs.Inode, int, error) { 562 sock, err := unet.NewSocket(fs.opts.fd) 563 if err != nil { 564 return lisafs.Inode{}, -1, err 565 } 566 567 ctx.UninterruptibleSleepStart(false) 568 defer ctx.UninterruptibleSleepFinish(false) 569 570 var ( 571 rootInode lisafs.Inode 572 rootHostFD int 573 ) 574 fs.client, rootInode, rootHostFD, err = lisafs.NewClient(sock) 575 if err != nil { 576 return lisafs.Inode{}, -1, err 577 } 578 579 cu := cleanup.Make(func() { 580 if rootHostFD >= 0 { 581 _ = unix.Close(rootHostFD) 582 } 583 rootControlFD := fs.client.NewFD(rootInode.ControlFD) 584 rootControlFD.Close(ctx, false /* flush */) 585 }) 586 defer cu.Clean() 587 588 if fs.opts.directfs.enabled { 589 if fs.opts.aname != "/" { 590 log.Warningf("directfs does not support aname filesystem option: aname=%q", fs.opts.aname) 591 return lisafs.Inode{}, -1, unix.EINVAL 592 } 593 if rootHostFD < 0 { 594 log.Warningf("Mount RPC did not return host FD to mount point with directfs enabled") 595 return lisafs.Inode{}, -1, unix.EINVAL 596 } 597 } else { 598 if rootHostFD >= 0 { 599 log.Warningf("Mount RPC returned a host FD to mount point without directfs, we didn't ask for it") 600 _ = unix.Close(rootHostFD) 601 rootHostFD = -1 602 } 603 // Use flipcall channels with lisafs because it makes a lot of RPCs. 604 if err := fs.client.StartChannels(); err != nil { 605 return lisafs.Inode{}, -1, err 606 } 607 rootInode, err = fs.handleAnameLisafs(ctx, rootInode) 608 if err != nil { 609 return lisafs.Inode{}, -1, err 610 } 611 } 612 cu.Release() 613 return rootInode, rootHostFD, nil 614 } 615 616 func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int, error) { 617 // Check that the transport is "fd". 
618 trans, ok := mopts[moptTransport] 619 if !ok || trans != transportModeFD { 620 ctx.Warningf("gofer.getFDFromMountOptionsMap: transport must be specified as '%s=%s'", moptTransport, transportModeFD) 621 return -1, linuxerr.EINVAL 622 } 623 delete(mopts, moptTransport) 624 625 // Check that read and write FDs are provided and identical. 626 rfdstr, ok := mopts[moptReadFD] 627 if !ok { 628 ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD must be specified as '%s=<file descriptor>'", moptReadFD) 629 return -1, linuxerr.EINVAL 630 } 631 delete(mopts, moptReadFD) 632 rfd, err := strconv.Atoi(rfdstr) 633 if err != nil { 634 ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid read FD: %s=%s", moptReadFD, rfdstr) 635 return -1, linuxerr.EINVAL 636 } 637 wfdstr, ok := mopts[moptWriteFD] 638 if !ok { 639 ctx.Warningf("gofer.getFDFromMountOptionsMap: write FD must be specified as '%s=<file descriptor>'", moptWriteFD) 640 return -1, linuxerr.EINVAL 641 } 642 delete(mopts, moptWriteFD) 643 wfd, err := strconv.Atoi(wfdstr) 644 if err != nil { 645 ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid write FD: %s=%s", moptWriteFD, wfdstr) 646 return -1, linuxerr.EINVAL 647 } 648 if rfd != wfd { 649 ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD (%d) and write FD (%d) must be equal", rfd, wfd) 650 return -1, linuxerr.EINVAL 651 } 652 return rfd, nil 653 } 654 655 // Release implements vfs.FilesystemImpl.Release. 656 func (fs *filesystem) Release(ctx context.Context) { 657 fs.released.Store(1) 658 659 mf := fs.mfp.MemoryFile() 660 fs.syncMu.Lock() 661 for elem := fs.syncableDentries.Front(); elem != nil; elem = elem.Next() { 662 d := elem.d 663 d.handleMu.Lock() 664 d.dataMu.Lock() 665 if d.isWriteHandleOk() { 666 // Write dirty cached data to the remote file. 
667 h := d.writeHandle() 668 if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil { 669 log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err) 670 } 671 // TODO(jamieliu): Do we need to flushf/fsync d? 672 } 673 // Discard cached pages. 674 d.cache.DropAll(mf) 675 d.dirty.RemoveAll() 676 d.dataMu.Unlock() 677 // Close host FDs if they exist. We can use RacyLoad() because d.handleMu 678 // is locked. 679 if d.readFD.RacyLoad() >= 0 { 680 _ = unix.Close(int(d.readFD.RacyLoad())) 681 } 682 if d.writeFD.RacyLoad() >= 0 && d.readFD.RacyLoad() != d.writeFD.RacyLoad() { 683 _ = unix.Close(int(d.writeFD.RacyLoad())) 684 } 685 d.readFD = atomicbitops.FromInt32(-1) 686 d.writeFD = atomicbitops.FromInt32(-1) 687 d.mmapFD = atomicbitops.FromInt32(-1) 688 d.handleMu.Unlock() 689 } 690 // There can't be any specialFileFDs still using fs, since each such 691 // FileDescription would hold a reference on a Mount holding a reference on 692 // fs. 693 fs.syncMu.Unlock() 694 695 // If leak checking is enabled, release all outstanding references in the 696 // filesystem. We deliberately avoid doing this outside of leak checking; we 697 // have released all external resources above rather than relying on dentry 698 // destructors. fs.root may be nil if creating the client or initializing the 699 // root dentry failed in GetFilesystem. 700 if refs.GetLeakMode() != refs.NoLeakChecking && fs.root != nil { 701 fs.renameMu.Lock() 702 fs.root.releaseSyntheticRecursiveLocked(ctx) 703 fs.evictAllCachedDentriesLocked(ctx) 704 fs.renameMu.Unlock() 705 706 // An extra reference was held by the filesystem on the root to prevent it from 707 // being cached/evicted. 708 fs.root.DecRef(ctx) 709 } 710 711 if !fs.iopts.LeakConnection { 712 // Close the connection to the server. This implicitly closes all FDs. 
713 if fs.client != nil { 714 fs.client.Close() 715 } 716 } 717 718 fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) 719 } 720 721 // releaseSyntheticRecursiveLocked traverses the tree with root d and decrements 722 // the reference count on every synthetic dentry. Synthetic dentries have one 723 // reference for existence that should be dropped during filesystem.Release. 724 // 725 // Precondition: d.fs.renameMu is locked for writing. 726 func (d *dentry) releaseSyntheticRecursiveLocked(ctx context.Context) { 727 if d.isSynthetic() { 728 d.decRefNoCaching() 729 d.checkCachingLocked(ctx, true /* renameMuWriteLocked */) 730 } 731 if d.isDir() { 732 var children []*dentry 733 d.childrenMu.Lock() 734 for _, child := range d.children { 735 children = append(children, child) 736 } 737 d.childrenMu.Unlock() 738 for _, child := range children { 739 if child != nil { 740 child.releaseSyntheticRecursiveLocked(ctx) 741 } 742 } 743 } 744 } 745 746 // inoKey is the key used to identify the inode backed by this dentry. 747 // 748 // +stateify savable 749 type inoKey struct { 750 ino uint64 751 devMinor uint32 752 devMajor uint32 753 } 754 755 func inoKeyFromStatx(stat *linux.Statx) inoKey { 756 return inoKey{ 757 ino: stat.Ino, 758 devMinor: stat.DevMinor, 759 devMajor: stat.DevMajor, 760 } 761 } 762 763 func inoKeyFromStat(stat *unix.Stat_t) inoKey { 764 return inoKey{ 765 ino: stat.Ino, 766 devMinor: unix.Minor(stat.Dev), 767 devMajor: unix.Major(stat.Dev), 768 } 769 } 770 771 // dentry implements vfs.DentryImpl. 772 // 773 // +stateify savable 774 type dentry struct { 775 vfsd vfs.Dentry 776 777 // refs is the reference count. Each dentry holds a reference on its 778 // parent, even if disowned. An additional reference is held on all 779 // synthetic dentries until they are unlinked or invalidated. When refs 780 // reaches 0, the dentry may be added to the cache or destroyed. If refs == 781 // -1, the dentry has already been destroyed. 
refs is accessed using atomic 782 // memory operations. 783 refs atomicbitops.Int64 784 785 // fs is the owning filesystem. fs is immutable. 786 fs *filesystem 787 788 // parent is this dentry's parent directory. Each dentry holds a reference 789 // on its parent. If this dentry is a filesystem root, parent is nil. 790 // parent is protected by filesystem.renameMu. 791 parent *dentry 792 793 // name is the name of this dentry in its parent. If this dentry is a 794 // filesystem root, name is the empty string. name is protected by 795 // filesystem.renameMu. 796 name string 797 798 // inoKey is used to identify this dentry's inode. 799 inoKey inoKey 800 801 // If deleted is non-zero, the file represented by this dentry has been 802 // deleted is accessed using atomic memory operations. 803 deleted atomicbitops.Uint32 804 805 // cachingMu is used to synchronize concurrent dentry caching attempts on 806 // this dentry. 807 cachingMu sync.Mutex `state:"nosave"` 808 809 // If cached is true, this dentry is part of filesystem.dentryCache. cached 810 // is protected by cachingMu. 811 cached bool 812 813 // cacheEntry links dentry into filesystem.dentryCache.dentries. It is 814 // protected by filesystem.dentryCache.mu. 815 cacheEntry dentryListElem 816 817 // syncableListEntry links dentry into filesystem.syncableDentries. It is 818 // protected by filesystem.syncMu. 819 syncableListEntry dentryListElem 820 821 // opMu synchronizes operations on this dentry. Operations that mutate 822 // the dentry tree must hold this lock for writing. Operations that 823 // only read the tree must hold for reading. 824 opMu sync.RWMutex `state:"nosave"` 825 826 // childrenMu protects the cached children data for this dentry. 827 childrenMu sync.Mutex `state:"nosave"` 828 829 // If this dentry represents a directory, children contains: 830 // 831 // - Mappings of child filenames to dentries representing those children. 
832 // 833 // - Mappings of child filenames that are known not to exist to nil 834 // dentries (only if InteropModeShared is not in effect and the directory 835 // is not synthetic). 836 // 837 // +checklocks:childrenMu 838 children map[string]*dentry 839 840 // If this dentry represents a directory, negativeChildrenCache cache 841 // names of negative children. 842 // 843 // +checklocks:childrenMu 844 negativeChildrenCache stringFixedCache 845 // If this dentry represents a directory, negativeChildren is the number 846 // of negative children cached in dentry.children 847 // 848 // +checklocks:childrenMu 849 negativeChildren int 850 851 // If this dentry represents a directory, syntheticChildren is the number 852 // of child dentries for which dentry.isSynthetic() == true. 853 // 854 // +checklocks:childrenMu 855 syntheticChildren int 856 857 // If this dentry represents a directory, 858 // dentry.cachedMetadataAuthoritative() == true, and dirents is not 859 // nil, then dirents is a cache of all entries in the directory, in the 860 // order they were returned by the server. childrenSet just stores the 861 // `Name` field of all dirents in a set for fast query. dirents and 862 // childrenSet share the same lifecycle. 863 // 864 // +checklocks:childrenMu 865 dirents []vfs.Dirent 866 // +checklocks:childrenMu 867 childrenSet map[string]struct{} 868 869 // Cached metadata; protected by metadataMu. 870 // To access: 871 // - In situations where consistency is not required (like stat), these 872 // can be accessed using atomic operations only (without locking). 873 // - Lock metadataMu and can access without atomic operations. 874 // To mutate: 875 // - Lock metadataMu and use atomic operations to update because we might 876 // have atomic readers that don't hold the lock. 
877 metadataMu sync.Mutex `state:"nosave"` 878 ino uint64 // immutable 879 mode atomicbitops.Uint32 // type is immutable, perms are mutable 880 uid atomicbitops.Uint32 // auth.KUID, but stored as raw uint32 for sync/atomic 881 gid atomicbitops.Uint32 // auth.KGID, but ... 882 blockSize atomicbitops.Uint32 // 0 if unknown 883 // Timestamps, all nsecs from the Unix epoch. 884 atime atomicbitops.Int64 885 mtime atomicbitops.Int64 886 ctime atomicbitops.Int64 887 btime atomicbitops.Int64 888 // File size, which differs from other metadata in two ways: 889 // 890 // - We make a best-effort attempt to keep it up to date even if 891 // !dentry.cachedMetadataAuthoritative() for the sake of O_APPEND writes. 892 // 893 // - size is protected by both metadataMu and dataMu (i.e. both must be 894 // locked to mutate it; locking either is sufficient to access it). 895 size atomicbitops.Uint64 896 // If this dentry does not represent a synthetic file, deleted is 0, and 897 // atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the 898 // remote file's timestamps, which should be updated when this dentry is 899 // evicted. 900 atimeDirty atomicbitops.Uint32 901 mtimeDirty atomicbitops.Uint32 902 903 // nlink counts the number of hard links to this dentry. It's updated and 904 // accessed using atomic operations. It's not protected by metadataMu like the 905 // other metadata fields. 906 nlink atomicbitops.Uint32 907 908 mapsMu sync.Mutex `state:"nosave"` 909 910 // If this dentry represents a regular file, mappings tracks mappings of 911 // the file into memmap.MappingSpaces. mappings is protected by mapsMu. 912 mappings memmap.MappingSet 913 914 // - If this dentry represents a regular file or directory, readFD (if not 915 // -1) is a host FD used for reads by all regularFileFDs/directoryFDs 916 // representing this dentry. 
917 // 918 // - If this dentry represents a regular file, writeFD (if not -1) is a host 919 // FD used for writes by all regularFileFDs representing this dentry. 920 // 921 // - If this dentry represents a regular file, mmapFD is the host FD used 922 // for memory mappings. If mmapFD is -1, no such FD is available, and the 923 // internal page cache implementation is used for memory mappings instead. 924 // 925 // These fields are protected by handleMu. readFD, writeFD, and mmapFD are 926 // additionally written using atomic memory operations, allowing them to be 927 // read (albeit racily) with atomic.LoadInt32() without locking handleMu. 928 // 929 // readFD and writeFD may or may not be the same file descriptor. Once either 930 // transitions from closed (-1) to open, it may be mutated with handleMu 931 // locked, but cannot be closed until the dentry is destroyed. 932 // 933 // readFD and writeFD may or may not be the same file descriptor. mmapFD is 934 // always either -1 or equal to readFD; if the file has been opened for 935 // writing, it is additionally either -1 or equal to writeFD. 936 handleMu sync.RWMutex `state:"nosave"` 937 readFD atomicbitops.Int32 `state:"nosave"` 938 writeFD atomicbitops.Int32 `state:"nosave"` 939 mmapFD atomicbitops.Int32 `state:"nosave"` 940 941 dataMu sync.RWMutex `state:"nosave"` 942 943 // If this dentry represents a regular file that is client-cached, cache 944 // maps offsets into the cached file to offsets into 945 // filesystem.mfp.MemoryFile() that store the file's data. cache is 946 // protected by dataMu. 947 cache fsutil.FileRangeSet 948 949 // If this dentry represents a regular file that is client-cached, dirty 950 // tracks dirty segments in cache. dirty is protected by dataMu. 951 dirty fsutil.DirtySet 952 953 // pf implements platform.File for mappings of hostFD. 
pf dentryPlatformFile

	// If this dentry represents a symbolic link, InteropModeShared is not in
	// effect, and haveTarget is true, target is the symlink target. haveTarget
	// and target are protected by dataMu.
	haveTarget bool
	target     string

	// If this dentry represents a synthetic socket file, endpoint is the
	// transport endpoint bound to this file.
	endpoint transport.BoundEndpoint

	// If this dentry represents a synthetic named pipe, pipe is the pipe
	// endpoint bound to this file.
	pipe *pipe.VFSPipe

	// locks is the vfs.FileLocks state kept for this dentry.
	// NOTE(review): presumably shared by all FDs opened on this dentry;
	// confirm against the FD implementations.
	locks vfs.FileLocks

	// Inotify watches for this dentry.
	//
	// Note that inotify may behave unexpectedly in the presence of hard links,
	// because dentries corresponding to the same file have separate inotify
	// watches when they should share the same set. This is the case because it is
	// impossible for us to know for sure whether two dentries correspond to the
	// same underlying file (see the gofer filesystem section of vfs/inotify.md for
	// a more in-depth discussion on this matter).
	watches vfs.Watches

	// impl is the specific dentry implementation for non-synthetic dentries.
	// impl is immutable.
	//
	// If impl is nil, this dentry represents a synthetic file, i.e. a
	// file that does not exist on the host filesystem. As of this writing, the
	// only files that can be synthetic are sockets, pipes, and directories.
	impl any
}

// stringListElem is a list node that carries a string. It embeds stringEntry,
// which is declared elsewhere.
//
// +stateify savable
type stringListElem struct {
	// str is the string that this elem represents.
	str string
	stringEntry
}

// dentryListElem is a list node that points back at the dentry it belongs to.
// It embeds dentryEntry, which is declared elsewhere.
//
// +stateify savable
type dentryListElem struct {
	// d is the dentry that this elem represents.
	d *dentry
	dentryEntry
}

// inoFromKey returns the inode number recorded for key in fs.inoByKey. If key
// has no entry yet, a new number is obtained from fs.nextIno, stored under
// key, and returned. fs.inoMu is held for the whole lookup-or-insert.
func (fs *filesystem) inoFromKey(key inoKey) uint64 {
	fs.inoMu.Lock()
	defer fs.inoMu.Unlock()

	if ino, ok := fs.inoByKey[key]; ok {
		return ino
	}
	ino := fs.nextIno()
	fs.inoByKey[key] = ino
	return ino
}

// nextIno returns the value produced by fs.lastIno.Add(1), i.e. the next
// unused inode number.
func (fs *filesystem) nextIno() uint64 {
	return fs.lastIno.Add(1)
}

// init must be called before first use of d.
//
// It wires the back-pointers that embedded members keep to d (the platform
// file and both list elements), records impl, initializes the embedded
// vfs.Dentry, and registers d with the refs package.
func (d *dentry) init(impl any) {
	d.pf.dentry = d
	d.cacheEntry.d = d
	d.syncableListEntry.d = d
	// Nested impl-inheritance pattern. In memory it looks like:
	// [[[ vfs.Dentry ] dentry ] dentryImpl ]
	// All 3 abstractions are allocated in one allocation. We achieve this by
	// making each outer dentry implementation hold the inner dentry by value.
	// Then the outer most dentry is allocated and we initialize fields inward.
	// Each inner dentry has a pointer to the next level of implementation.
	d.impl = impl
	d.vfsd.Init(d)
	refs.Register(d)
}

// isSynthetic reports whether d is a synthetic dentry, which is the case
// exactly when d.impl is nil.
func (d *dentry) isSynthetic() bool {
	return d.impl == nil
}

// cachedMetadataAuthoritative reports whether d's cached metadata can be
// trusted without consulting the remote filesystem: true unless the
// filesystem is in InteropModeShared, and always true for synthetic dentries.
func (d *dentry) cachedMetadataAuthoritative() bool {
	return d.fs.opts.interop != InteropModeShared || d.isSynthetic()
}

// updateMetadataFromStatxLocked is called to update d's metadata after an update
// from the remote filesystem.
// Precondition: d.metadataMu must be locked.
1048 // +checklocks:d.metadataMu 1049 func (d *lisafsDentry) updateMetadataFromStatxLocked(stat *linux.Statx) { 1050 if stat.Mask&linux.STATX_TYPE != 0 { 1051 if got, want := stat.Mode&linux.FileTypeMask, d.fileType(); uint32(got) != want { 1052 panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got)) 1053 } 1054 } 1055 if stat.Mask&linux.STATX_MODE != 0 { 1056 d.mode.Store(uint32(stat.Mode)) 1057 } 1058 if stat.Mask&linux.STATX_UID != 0 { 1059 d.uid.Store(dentryUID(lisafs.UID(stat.UID))) 1060 } 1061 if stat.Mask&linux.STATX_GID != 0 { 1062 d.gid.Store(dentryGID(lisafs.GID(stat.GID))) 1063 } 1064 if stat.Blksize != 0 { 1065 d.blockSize.Store(stat.Blksize) 1066 } 1067 // Don't override newer client-defined timestamps with old server-defined 1068 // ones. 1069 if stat.Mask&linux.STATX_ATIME != 0 && d.atimeDirty.Load() == 0 { 1070 d.atime.Store(dentryTimestamp(stat.Atime)) 1071 } 1072 if stat.Mask&linux.STATX_MTIME != 0 && d.mtimeDirty.Load() == 0 { 1073 d.mtime.Store(dentryTimestamp(stat.Mtime)) 1074 } 1075 if stat.Mask&linux.STATX_CTIME != 0 { 1076 d.ctime.Store(dentryTimestamp(stat.Ctime)) 1077 } 1078 if stat.Mask&linux.STATX_BTIME != 0 { 1079 d.btime.Store(dentryTimestamp(stat.Btime)) 1080 } 1081 if stat.Mask&linux.STATX_NLINK != 0 { 1082 d.nlink.Store(stat.Nlink) 1083 } 1084 if stat.Mask&linux.STATX_SIZE != 0 { 1085 d.updateSizeLocked(stat.Size) 1086 } 1087 } 1088 1089 // updateMetadataFromStatLocked is similar to updateMetadataFromStatxLocked, 1090 // except that it takes a unix.Stat_t argument. 1091 // Precondition: d.metadataMu must be locked. 
1092 // +checklocks:d.metadataMu 1093 func (d *directfsDentry) updateMetadataFromStatLocked(stat *unix.Stat_t) error { 1094 if got, want := stat.Mode&unix.S_IFMT, d.fileType(); got != want { 1095 panic(fmt.Sprintf("direct.dentry file type changed from %#o to %#o", want, got)) 1096 } 1097 d.mode.Store(stat.Mode) 1098 d.uid.Store(stat.Uid) 1099 d.gid.Store(stat.Gid) 1100 d.blockSize.Store(uint32(stat.Blksize)) 1101 // Don't override newer client-defined timestamps with old host-defined 1102 // ones. 1103 if d.atimeDirty.Load() == 0 { 1104 d.atime.Store(dentryTimestampFromUnix(stat.Atim)) 1105 } 1106 if d.mtimeDirty.Load() == 0 { 1107 d.mtime.Store(dentryTimestampFromUnix(stat.Mtim)) 1108 } 1109 d.ctime.Store(dentryTimestampFromUnix(stat.Ctim)) 1110 d.nlink.Store(uint32(stat.Nlink)) 1111 d.updateSizeLocked(uint64(stat.Size)) 1112 return nil 1113 } 1114 1115 // Preconditions: !d.isSynthetic(). 1116 // Preconditions: d.metadataMu is locked. 1117 // +checklocks:d.metadataMu 1118 func (d *dentry) refreshSizeLocked(ctx context.Context) error { 1119 d.handleMu.RLock() 1120 1121 // Can use RacyLoad() because handleMu is locked. 1122 if d.writeFD.RacyLoad() < 0 { 1123 d.handleMu.RUnlock() 1124 // Use a suitable FD if we don't have a writable host FD. 1125 return d.updateMetadataLocked(ctx, noHandle) 1126 } 1127 1128 // Using statx(2) with a minimal mask is faster than fstat(2). 1129 var stat unix.Statx_t 1130 // Can use RacyLoad() because handleMu is locked. 1131 err := unix.Statx(int(d.writeFD.RacyLoad()), "", unix.AT_EMPTY_PATH, unix.STATX_SIZE, &stat) 1132 d.handleMu.RUnlock() // must be released before updateSizeLocked() 1133 if err != nil { 1134 return err 1135 } 1136 d.updateSizeLocked(stat.Size) 1137 return nil 1138 } 1139 1140 // Preconditions: !d.isSynthetic(). 
1141 func (d *dentry) updateMetadata(ctx context.Context) error { 1142 // d.metadataMu must be locked *before* we stat so that we do not end up 1143 // updating stale attributes in d.updateMetadataFromStatLocked(). 1144 d.metadataMu.Lock() 1145 defer d.metadataMu.Unlock() 1146 return d.updateMetadataLocked(ctx, noHandle) 1147 } 1148 1149 func (d *dentry) fileType() uint32 { 1150 return d.mode.Load() & linux.S_IFMT 1151 } 1152 1153 func (d *dentry) statTo(stat *linux.Statx) { 1154 stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME 1155 stat.Blksize = d.blockSize.Load() 1156 stat.Nlink = d.nlink.Load() 1157 if stat.Nlink == 0 { 1158 // The remote filesystem doesn't support link count; just make 1159 // something up. This is consistent with Linux, where 1160 // fs/inode.c:inode_init_always() initializes link count to 1, and 1161 // fs/9p/vfs_inode_dotl.c:v9fs_stat2inode_dotl() doesn't touch it if 1162 // it's not provided by the remote filesystem. 1163 stat.Nlink = 1 1164 } 1165 stat.UID = d.uid.Load() 1166 stat.GID = d.gid.Load() 1167 stat.Mode = uint16(d.mode.Load()) 1168 stat.Ino = uint64(d.ino) 1169 stat.Size = d.size.Load() 1170 // This is consistent with regularFileFD.Seek(), which treats regular files 1171 // as having no holes. 1172 stat.Blocks = (stat.Size + 511) / 512 1173 stat.Atime = linux.NsecToStatxTimestamp(d.atime.Load()) 1174 stat.Btime = linux.NsecToStatxTimestamp(d.btime.Load()) 1175 stat.Ctime = linux.NsecToStatxTimestamp(d.ctime.Load()) 1176 stat.Mtime = linux.NsecToStatxTimestamp(d.mtime.Load()) 1177 stat.DevMajor = linux.UNNAMED_MAJOR 1178 stat.DevMinor = d.fs.devMinor 1179 } 1180 1181 // Precondition: fs.renameMu is locked. 
func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions, mnt *vfs.Mount) error {
	// setStat applies the attribute changes requested in opts.Stat to d:
	// it validates the request, forwards it to the remote filesystem for
	// non-synthetic dentries, and then mirrors the attributes that were
	// successfully set into d's cached metadata.
	//
	// Note that opts.Stat is modified in place (Mask, Mode, Atime and Mtime
	// may all be rewritten below).
	stat := &opts.Stat
	if stat.Mask == 0 {
		// Nothing was requested.
		return nil
	}
	// Only mode, uid, gid, atime, mtime and size may be set.
	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 {
		return linuxerr.EPERM
	}
	mode := linux.FileMode(d.mode.Load())
	if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())); err != nil {
		return err
	}
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()

	if stat.Mask&linux.STATX_SIZE != 0 {
		// Reject attempts to truncate files other than regular files, since
		// filesystem implementations may return the wrong errno.
		switch mode.FileType() {
		case linux.S_IFREG:
			// ok
		case linux.S_IFDIR:
			return linuxerr.EISDIR
		default:
			return linuxerr.EINVAL
		}
	}

	// now is only assigned when the cached metadata is authoritative; every
	// path that reaches the d.ctime.Store(now) at the bottom satisfies that
	// (see the comment above the timestamp stores).
	var now int64
	if d.cachedMetadataAuthoritative() {
		// Truncate updates mtime.
		if stat.Mask&(linux.STATX_SIZE|linux.STATX_MTIME) == linux.STATX_SIZE {
			stat.Mask |= linux.STATX_MTIME
			stat.Mtime = linux.StatxTimestamp{
				Nsec: linux.UTIME_NOW,
			}
		}

		// Use client clocks for timestamps.
		now = d.fs.clock.Now().Nanoseconds()
		if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW {
			stat.Atime = linux.NsecToStatxTimestamp(now)
		}
		if stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec == linux.UTIME_NOW {
			stat.Mtime = linux.NsecToStatxTimestamp(now)
		}
	}

	d.metadataMu.Lock()
	defer d.metadataMu.Unlock()

	// As with Linux, if the UID, GID, or file size is changing, we have to
	// clear permission bits. Note that when set, clearSGID may cause
	// permissions to be updated.
	clearSGID := (stat.Mask&linux.STATX_UID != 0 && stat.UID != d.uid.Load()) ||
		(stat.Mask&linux.STATX_GID != 0 && stat.GID != d.gid.Load()) ||
		stat.Mask&linux.STATX_SIZE != 0
	if clearSGID {
		if stat.Mask&linux.STATX_MODE != 0 {
			// The caller is setting the mode anyway; strip the bits from
			// the requested value.
			stat.Mode = uint16(vfs.ClearSUIDAndSGID(uint32(stat.Mode)))
		} else {
			// The caller is not setting the mode; add a mode change to the
			// request only if clearing the bits actually alters it.
			oldMode := d.mode.Load()
			if updatedMode := vfs.ClearSUIDAndSGID(oldMode); updatedMode != oldMode {
				stat.Mode = uint16(updatedMode)
				stat.Mask |= linux.STATX_MODE
			}
		}
	}

	// failureMask indicates which attributes could not be set on the remote
	// filesystem. p9 returns an error if any of the attributes could not be set
	// but that leads to inconsistency as the server could have set a few
	// attributes successfully but a later failure will cause the successful ones
	// to not be updated in the dentry cache.
	var failureMask uint32
	var failureErr error
	if !d.isSynthetic() {
		if stat.Mask != 0 {
			if err := d.prepareSetStat(ctx, stat); err != nil {
				return err
			}
			d.handleMu.RLock()
			if stat.Mask&linux.STATX_SIZE != 0 {
				// d.dataMu must be held around the update to both the remote
				// file's size and d.size to serialize with writeback (which
				// might otherwise write data back up to the old d.size after
				// the remote file has been truncated).
				d.dataMu.Lock()
			}
			var err error
			failureMask, failureErr, err = d.setStatLocked(ctx, stat)
			d.handleMu.RUnlock()
			if err != nil {
				// The request failed as a whole; release dataMu if it was
				// taken above and bail out without touching the cache.
				if stat.Mask&linux.STATX_SIZE != 0 {
					d.dataMu.Unlock() // +checklocksforce: locked conditionally above
				}
				return err
			}
			if stat.Mask&linux.STATX_SIZE != 0 {
				if failureMask&linux.STATX_SIZE == 0 {
					// d.size should be kept up to date, and privatized
					// copy-on-write mappings of truncated pages need to be
					// invalidated, even if InteropModeShared is in effect.
					d.updateSizeAndUnlockDataMuLocked(stat.Size) // +checklocksforce: locked conditionally above
				} else {
					// The size could not be set remotely; leave d.size alone.
					d.dataMu.Unlock() // +checklocksforce: locked conditionally above
				}
			}
		}
		if d.fs.opts.interop == InteropModeShared {
			// There's no point to updating d's metadata in this case since
			// it'll be overwritten by revalidation before the next time it's
			// used anyway. (InteropModeShared inhibits client caching of
			// regular file data, so there's no cache to truncate either.)
			//
			// NOTE(review): this returns nil even when failureMask is
			// non-zero, so a partial failure reported by setStatLocked is
			// not surfaced to the caller on this path. Confirm whether
			// failureErr should be returned here instead.
			return nil
		}
	}
	// Mirror into the cache only those attributes that were requested and
	// were not reported as failed by the remote filesystem.
	if stat.Mask&linux.STATX_MODE != 0 && failureMask&linux.STATX_MODE == 0 {
		d.mode.Store(d.fileType() | uint32(stat.Mode))
	}
	if stat.Mask&linux.STATX_UID != 0 && failureMask&linux.STATX_UID == 0 {
		d.uid.Store(stat.UID)
	}
	if stat.Mask&linux.STATX_GID != 0 && failureMask&linux.STATX_GID == 0 {
		d.gid.Store(stat.GID)
	}
	// Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because
	// if d.cachedMetadataAuthoritative() then we converted stat.Atime and
	// stat.Mtime to client-local timestamps above, and if
	// !d.cachedMetadataAuthoritative() then we returned after calling
	// d.setStatLocked(). For the same reason, now must have been initialized.
	if stat.Mask&linux.STATX_ATIME != 0 && failureMask&linux.STATX_ATIME == 0 {
		d.atime.Store(stat.Atime.ToNsec())
		d.atimeDirty.Store(0)
	}
	if stat.Mask&linux.STATX_MTIME != 0 && failureMask&linux.STATX_MTIME == 0 {
		d.mtime.Store(stat.Mtime.ToNsec())
		d.mtimeDirty.Store(0)
	}
	// ctime is stamped unconditionally, regardless of which attributes were
	// requested or failed.
	d.ctime.Store(now)
	if failureMask != 0 {
		// Setting some attribute failed on the remote filesystem.
		return failureErr
	}
	return nil
}

// doAllocate performs an allocate operation on d. Note that d.metadataMu will
// be held when allocate is called.
1333 func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error { 1334 d.metadataMu.Lock() 1335 defer d.metadataMu.Unlock() 1336 1337 // Allocating a smaller size is a noop. 1338 size := offset + length 1339 if d.cachedMetadataAuthoritative() && size <= d.size.RacyLoad() { 1340 return nil 1341 } 1342 1343 err := allocate() 1344 if err != nil { 1345 return err 1346 } 1347 d.updateSizeLocked(size) 1348 if d.cachedMetadataAuthoritative() { 1349 d.touchCMtimeLocked() 1350 } 1351 return nil 1352 } 1353 1354 // Preconditions: d.metadataMu must be locked. 1355 func (d *dentry) updateSizeLocked(newSize uint64) { 1356 d.dataMu.Lock() 1357 d.updateSizeAndUnlockDataMuLocked(newSize) 1358 } 1359 1360 // Preconditions: d.metadataMu and d.dataMu must be locked. 1361 // 1362 // Postconditions: d.dataMu is unlocked. 1363 // +checklocksrelease:d.dataMu 1364 func (d *dentry) updateSizeAndUnlockDataMuLocked(newSize uint64) { 1365 oldSize := d.size.RacyLoad() 1366 d.size.Store(newSize) 1367 // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings 1368 // below. This allows concurrent calls to Read/Translate/etc. These 1369 // functions synchronize with truncation by refusing to use cache 1370 // contents beyond the new d.size. (We are still holding d.metadataMu, 1371 // so we can't race with Write or another truncate.) 1372 d.dataMu.Unlock() 1373 if newSize < oldSize { 1374 oldpgend, _ := hostarch.PageRoundUp(oldSize) 1375 newpgend, _ := hostarch.PageRoundUp(newSize) 1376 if oldpgend != newpgend { 1377 d.mapsMu.Lock() 1378 d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ 1379 // Compare Linux's mm/truncate.c:truncate_setsize() => 1380 // truncate_pagecache() => 1381 // mm/memory.c:unmap_mapping_range(evencows=1). 
1382 InvalidatePrivate: true, 1383 }) 1384 d.mapsMu.Unlock() 1385 } 1386 // We are now guaranteed that there are no translations of 1387 // truncated pages, and can remove them from the cache. Since 1388 // truncated pages have been removed from the remote file, they 1389 // should be dropped without being written back. 1390 d.dataMu.Lock() 1391 d.cache.Truncate(newSize, d.fs.mfp.MemoryFile()) 1392 d.dirty.KeepClean(memmap.MappableRange{newSize, oldpgend}) 1393 d.dataMu.Unlock() 1394 } 1395 } 1396 1397 func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { 1398 return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(d.mode.Load()), auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())) 1399 } 1400 1401 func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { 1402 // Deny access to the "security" and "system" namespaces since applications 1403 // may expect these to affect kernel behavior in unimplemented ways 1404 // (b/148380782). Allow all other extended attributes to be passed through 1405 // to the remote filesystem. This is inconsistent with Linux's 9p client, 1406 // but consistent with other filesystems (e.g. FUSE). 1407 // 1408 // NOTE(b/202533394): Also disallow "trusted" namespace for now. This is 1409 // consistent with the VFS1 gofer client. 
1410 if strings.HasPrefix(name, linux.XATTR_SECURITY_PREFIX) || strings.HasPrefix(name, linux.XATTR_SYSTEM_PREFIX) || strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) { 1411 return linuxerr.EOPNOTSUPP 1412 } 1413 mode := linux.FileMode(d.mode.Load()) 1414 kuid := auth.KUID(d.uid.Load()) 1415 kgid := auth.KGID(d.gid.Load()) 1416 if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { 1417 return err 1418 } 1419 return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) 1420 } 1421 1422 func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error { 1423 return vfs.CheckDeleteSticky( 1424 creds, 1425 linux.FileMode(d.mode.Load()), 1426 auth.KUID(d.uid.Load()), 1427 auth.KUID(child.uid.Load()), 1428 auth.KGID(child.gid.Load()), 1429 ) 1430 } 1431 1432 func dentryUID(uid lisafs.UID) uint32 { 1433 if !uid.Ok() { 1434 return uint32(auth.OverflowUID) 1435 } 1436 return uint32(uid) 1437 } 1438 1439 func dentryGID(gid lisafs.GID) uint32 { 1440 if !gid.Ok() { 1441 return uint32(auth.OverflowGID) 1442 } 1443 return uint32(gid) 1444 } 1445 1446 // IncRef implements vfs.DentryImpl.IncRef. 1447 func (d *dentry) IncRef() { 1448 // d.refs may be 0 if d.fs.renameMu is locked, which serializes against 1449 // d.checkCachingLocked(). 1450 r := d.refs.Add(1) 1451 if d.LogRefs() { 1452 refs.LogIncRef(d, r) 1453 } 1454 } 1455 1456 // TryIncRef implements vfs.DentryImpl.TryIncRef. 1457 func (d *dentry) TryIncRef() bool { 1458 for { 1459 r := d.refs.Load() 1460 if r <= 0 { 1461 return false 1462 } 1463 if d.refs.CompareAndSwap(r, r+1) { 1464 if d.LogRefs() { 1465 refs.LogTryIncRef(d, r+1) 1466 } 1467 return true 1468 } 1469 } 1470 } 1471 1472 // DecRef implements vfs.DentryImpl.DecRef. 
1473 func (d *dentry) DecRef(ctx context.Context) { 1474 if d.decRefNoCaching() == 0 { 1475 d.checkCachingLocked(ctx, false /* renameMuWriteLocked */) 1476 } 1477 } 1478 1479 // decRefNoCaching decrements d's reference count without calling 1480 // d.checkCachingLocked, even if d's reference count reaches 0; callers are 1481 // responsible for ensuring that d.checkCachingLocked will be called later. 1482 func (d *dentry) decRefNoCaching() int64 { 1483 r := d.refs.Add(-1) 1484 if d.LogRefs() { 1485 refs.LogDecRef(d, r) 1486 } 1487 if r < 0 { 1488 panic("gofer.dentry.decRefNoCaching() called without holding a reference") 1489 } 1490 return r 1491 } 1492 1493 // RefType implements refs.CheckedObject.Type. 1494 func (d *dentry) RefType() string { 1495 return "gofer.dentry" 1496 } 1497 1498 // LeakMessage implements refs.CheckedObject.LeakMessage. 1499 func (d *dentry) LeakMessage() string { 1500 return fmt.Sprintf("[gofer.dentry %p] reference count of %d instead of -1", d, d.refs.Load()) 1501 } 1502 1503 // LogRefs implements refs.CheckedObject.LogRefs. 1504 // 1505 // This should only be set to true for debugging purposes, as it can generate an 1506 // extremely large amount of output and drastically degrade performance. 1507 func (d *dentry) LogRefs() bool { 1508 return false 1509 } 1510 1511 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. 1512 func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { 1513 if d.isDir() { 1514 events |= linux.IN_ISDIR 1515 } 1516 1517 d.fs.renameMu.RLock() 1518 // The ordering below is important, Linux always notifies the parent first. 1519 if d.parent != nil { 1520 d.parent.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted()) 1521 } 1522 d.watches.Notify(ctx, "", events, cookie, et, d.isDeleted()) 1523 d.fs.renameMu.RUnlock() 1524 } 1525 1526 // Watches implements vfs.DentryImpl.Watches. 
func (d *dentry) Watches() *vfs.Watches {
	// The returned pointer aliases d's own watch set, not a copy.
	return &d.watches
}

// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
//
// If no watches are left on this dentry and it has no references, cache it.
func (d *dentry) OnZeroWatches(ctx context.Context) {
	d.checkCachingLocked(ctx, false /* renameMuWriteLocked */)
}

// checkCachingLocked should be called after d's reference count becomes 0 or
// it becomes disowned.
//
// For performance, checkCachingLocked can also be called after d's reference
// count becomes non-zero, so that d can be removed from the LRU cache. This
// may help in reducing the size of the cache and hence reduce evictions. Note
// that this is not necessary for correctness.
//
// It may be called on a destroyed dentry. For example,
// renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times
// for the same dentry when the dentry is visited more than once in the same
// operation. One of the calls may destroy the dentry, so subsequent calls will
// do nothing.
//
// In order, the cases handled are: already destroyed (refs == -1); still
// referenced; dead (deleted or invalidated); evictable; still watched;
// filesystem released; and finally insertion into (or promotion within) the
// LRU dentry cache, evicting one cached dentry if the cache became over-full.
//
// Preconditions: d.fs.renameMu must be locked for writing if
// renameMuWriteLocked is true; it may be temporarily unlocked.
func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked bool) {
	d.cachingMu.Lock()
	// Note: this local shadows the imported refs package for the rest of the
	// function.
	refs := d.refs.Load()
	if refs == -1 {
		// Dentry has already been destroyed.
		d.cachingMu.Unlock()
		return
	}
	if refs > 0 {
		// fs.dentryCache.dentries is permitted to contain dentries with non-zero
		// refs, which are skipped by fs.evictCachedDentryLocked() upon reaching
		// the end of the LRU. But it is still beneficial to remove d from the
		// cache as we are already holding d.cachingMu. Keeping a cleaner cache
		// also reduces the number of evictions (which is expensive as it acquires
		// fs.renameMu).
		d.removeFromCacheLocked()
		d.cachingMu.Unlock()
		return
	}
	// Deleted and invalidated dentries with zero references are no longer
	// reachable by path resolution and should be dropped immediately.
	if d.vfsd.IsDead() {
		d.removeFromCacheLocked()
		d.cachingMu.Unlock()
		if !renameMuWriteLocked {
			// Need to lock d.fs.renameMu for writing as needed by d.destroyLocked().
			d.fs.renameMu.Lock()
			defer d.fs.renameMu.Unlock()
			// Now that renameMu is locked for writing, no more refs can be taken on
			// d because path resolution requires renameMu for reading at least.
			if d.refs.Load() != 0 {
				// Destroy d only if its ref is still 0. If not, either someone took a
				// ref on it or it got destroyed before fs.renameMu could be acquired.
				return
			}
		}
		if d.isDeleted() {
			d.watches.HandleDeletion(ctx)
		}
		d.destroyLocked(ctx) // +checklocksforce: renameMu must be acquired at this point.
		return
	}
	if d.vfsd.IsEvictable() {
		d.cachingMu.Unlock()
		// Attempt to evict. evict() acquires renameMu itself, so it is only
		// used when the caller does not already hold it for writing.
		if renameMuWriteLocked {
			d.evictLocked(ctx) // +checklocksforce: renameMu is locked in this case.
			return
		}
		d.evict(ctx)
		return
	}
	// If d still has inotify watches and it is not deleted or invalidated, it
	// can't be evicted. Otherwise, we will lose its watches, even if a new
	// dentry is created for the same file in the future. Note that the size of
	// d.watches cannot concurrently transition from zero to non-zero, because
	// adding a watch requires holding a reference on d.
	if d.watches.Size() > 0 {
		// As in the refs > 0 case, removing d is beneficial.
		d.removeFromCacheLocked()
		d.cachingMu.Unlock()
		return
	}

	// The filesystem has been released: do not cache d, just unlink it from
	// its parent's children map and destroy it.
	if d.fs.released.Load() != 0 {
		d.cachingMu.Unlock()
		if !renameMuWriteLocked {
			// Need to lock d.fs.renameMu to access d.parent. Lock it for writing as
			// needed by d.destroyLocked() later.
			d.fs.renameMu.Lock()
			defer d.fs.renameMu.Unlock()
		}
		if d.parent != nil {
			d.parent.childrenMu.Lock()
			delete(d.parent.children, d.name)
			d.parent.childrenMu.Unlock()
		}
		d.destroyLocked(ctx) // +checklocksforce: see above.
		return
	}

	d.fs.dentryCache.mu.Lock()
	// If d is already cached, just move it to the front of the LRU.
	if d.cached {
		d.fs.dentryCache.dentries.Remove(&d.cacheEntry)
		d.fs.dentryCache.dentries.PushFront(&d.cacheEntry)
		d.fs.dentryCache.mu.Unlock()
		d.cachingMu.Unlock()
		return
	}
	// Cache the dentry, then evict the least recently used cached dentry if
	// the cache becomes over-full.
	d.fs.dentryCache.dentries.PushFront(&d.cacheEntry)
	d.fs.dentryCache.dentriesLen++
	d.cached = true
	// Decide while dentryCache.mu is still held; the eviction itself happens
	// after both locks are released.
	shouldEvict := d.fs.dentryCache.dentriesLen > d.fs.dentryCache.maxCachedDentries
	d.fs.dentryCache.mu.Unlock()
	d.cachingMu.Unlock()

	if shouldEvict {
		if !renameMuWriteLocked {
			// Need to lock d.fs.renameMu for writing as needed by
			// d.fs.evictCachedDentryLocked().
			d.fs.renameMu.Lock()
			defer d.fs.renameMu.Unlock()
		}
		d.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above.
	}
}

// removeFromCacheLocked unlinks d from fs.dentryCache.dentries and clears
// d.cached. It does nothing if d is not currently cached.
//
// Preconditions: d.cachingMu must be locked.
func (d *dentry) removeFromCacheLocked() {
	if d.cached {
		// The list and its length counter are guarded by dentryCache.mu;
		// d.cached itself is guarded by d.cachingMu, held by the caller.
		d.fs.dentryCache.mu.Lock()
		d.fs.dentryCache.dentries.Remove(&d.cacheEntry)
		d.fs.dentryCache.dentriesLen--
		d.fs.dentryCache.mu.Unlock()
		d.cached = false
	}
}

// Precondition: fs.renameMu must be locked for writing; it may be temporarily
// unlocked.
1677 // +checklocks:fs.renameMu 1678 func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) { 1679 for fs.dentryCache.dentriesLen != 0 { 1680 fs.evictCachedDentryLocked(ctx) 1681 } 1682 } 1683 1684 // Preconditions: 1685 // - fs.renameMu must be locked for writing; it may be temporarily unlocked. 1686 // 1687 // +checklocks:fs.renameMu 1688 func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { 1689 fs.dentryCache.mu.Lock() 1690 victim := fs.dentryCache.dentries.Back() 1691 fs.dentryCache.mu.Unlock() 1692 if victim == nil { 1693 // fs.dentryCache.dentries may have become empty between when it was 1694 // checked and when we locked fs.dentryCache.mu. 1695 return 1696 } 1697 1698 if victim.d.fs == fs { 1699 victim.d.evictLocked(ctx) // +checklocksforce: owned as precondition, victim.fs == fs 1700 return 1701 } 1702 1703 // The dentry cache is shared between all gofer filesystems and the victim is 1704 // from another filesystem. Have that filesystem do the work. We unlock 1705 // fs.renameMu to prevent deadlock: two filesystems could otherwise wait on 1706 // each others' renameMu. 1707 fs.renameMu.Unlock() 1708 defer fs.renameMu.Lock() 1709 victim.d.evict(ctx) 1710 } 1711 1712 // Preconditions: 1713 // - d.fs.renameMu must not be locked for writing. 1714 func (d *dentry) evict(ctx context.Context) { 1715 d.fs.renameMu.Lock() 1716 defer d.fs.renameMu.Unlock() 1717 d.evictLocked(ctx) 1718 } 1719 1720 // Preconditions: 1721 // - d.fs.renameMu must be locked for writing; it may be temporarily unlocked. 1722 // 1723 // +checklocks:d.fs.renameMu 1724 func (d *dentry) evictLocked(ctx context.Context) { 1725 d.cachingMu.Lock() 1726 d.removeFromCacheLocked() 1727 // d.refs or d.watches.Size() may have become non-zero from an earlier path 1728 // resolution since it was inserted into fs.dentryCache.dentries. 
1729 if d.refs.Load() != 0 || d.watches.Size() != 0 { 1730 d.cachingMu.Unlock() 1731 return 1732 } 1733 if d.parent != nil { 1734 d.parent.opMu.Lock() 1735 if !d.vfsd.IsDead() { 1736 // Note that d can't be a mount point (in any mount namespace), since VFS 1737 // holds references on mount points. 1738 d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &d.vfsd) 1739 1740 d.parent.childrenMu.Lock() 1741 delete(d.parent.children, d.name) 1742 d.parent.childrenMu.Unlock() 1743 1744 // We're only deleting the dentry, not the file it 1745 // represents, so we don't need to update 1746 // victim parent.dirents etc. 1747 } 1748 d.parent.opMu.Unlock() 1749 } 1750 // Safe to unlock cachingMu now that d.vfsd.IsDead(). Henceforth any 1751 // concurrent caching attempts on d will attempt to destroy it and so will 1752 // try to acquire fs.renameMu (which we have already acquiredd). Hence, 1753 // fs.renameMu will synchronize the destroy attempts. 1754 d.cachingMu.Unlock() 1755 d.destroyLocked(ctx) // +checklocksforce: owned as precondition. 1756 } 1757 1758 // destroyDisconnected destroys an uncached, unparented dentry. There are no 1759 // locking preconditions. 1760 func (d *dentry) destroyDisconnected(ctx context.Context) { 1761 mf := d.fs.mfp.MemoryFile() 1762 1763 d.handleMu.Lock() 1764 d.dataMu.Lock() 1765 1766 if d.isWriteHandleOk() { 1767 // Write dirty pages back to the remote filesystem. 1768 h := d.writeHandle() 1769 if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil { 1770 log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err) 1771 } 1772 } 1773 // Discard cached data. 1774 if !d.cache.IsEmpty() { 1775 mf.MarkAllUnevictable(d) 1776 d.cache.DropAll(mf) 1777 d.dirty.RemoveAll() 1778 } 1779 d.dataMu.Unlock() 1780 1781 // Close any resources held by the implementation. 1782 d.destroyImpl(ctx) 1783 1784 // Can use RacyLoad() because handleMu is locked. 
1785 if d.readFD.RacyLoad() >= 0 { 1786 _ = unix.Close(int(d.readFD.RacyLoad())) 1787 } 1788 if d.writeFD.RacyLoad() >= 0 && d.readFD.RacyLoad() != d.writeFD.RacyLoad() { 1789 _ = unix.Close(int(d.writeFD.RacyLoad())) 1790 } 1791 d.readFD = atomicbitops.FromInt32(-1) 1792 d.writeFD = atomicbitops.FromInt32(-1) 1793 d.mmapFD = atomicbitops.FromInt32(-1) 1794 d.handleMu.Unlock() 1795 1796 if !d.isSynthetic() { 1797 // Note that it's possible that d.atimeDirty or d.mtimeDirty are true, 1798 // i.e. client and server timestamps may differ (because e.g. a client 1799 // write was serviced by the page cache, and only written back to the 1800 // remote file later). Ideally, we'd write client timestamps back to 1801 // the remote filesystem so that timestamps for a new dentry 1802 // instantiated for the same file would remain coherent. Unfortunately, 1803 // this turns out to be too expensive in many cases, so for now we 1804 // don't do this. 1805 1806 // Remove d from the set of syncable dentries. 1807 d.fs.syncMu.Lock() 1808 d.fs.syncableDentries.Remove(&d.syncableListEntry) 1809 d.fs.syncMu.Unlock() 1810 } 1811 1812 // Drop references and stop tracking this child. 1813 d.refs.Store(-1) 1814 refs.Unregister(d) 1815 } 1816 1817 // destroyLocked destroys the dentry. 1818 // 1819 // Preconditions: 1820 // - d.fs.renameMu must be locked for writing; it may be temporarily unlocked. 1821 // - d.refs == 0. 1822 // - d.parent.children[d.name] != d, i.e. d is not reachable by path traversal 1823 // from its former parent dentry. 1824 // 1825 // +checklocks:d.fs.renameMu 1826 func (d *dentry) destroyLocked(ctx context.Context) { 1827 switch d.refs.Load() { 1828 case 0: 1829 // Mark the dentry destroyed. 
1830 d.refs.Store(-1) 1831 case -1: 1832 panic("dentry.destroyLocked() called on already destroyed dentry") 1833 default: 1834 panic("dentry.destroyLocked() called with references on the dentry") 1835 } 1836 1837 // Allow the following to proceed without renameMu locked to improve 1838 // scalability. 1839 d.fs.renameMu.Unlock() 1840 1841 // No locks need to be held during destoryDisconnected. 1842 d.destroyDisconnected(ctx) 1843 1844 d.fs.renameMu.Lock() 1845 1846 // Drop the reference held by d on its parent without recursively locking 1847 // d.fs.renameMu. 1848 if d.parent != nil && d.parent.decRefNoCaching() == 0 { 1849 d.parent.checkCachingLocked(ctx, true /* renameMuWriteLocked */) 1850 } 1851 } 1852 1853 func (d *dentry) isDeleted() bool { 1854 return d.deleted.Load() != 0 1855 } 1856 1857 func (d *dentry) setDeleted() { 1858 d.deleted.Store(1) 1859 } 1860 1861 func (d *dentry) listXattr(ctx context.Context, size uint64) ([]string, error) { 1862 if d.isSynthetic() { 1863 return nil, nil 1864 } 1865 1866 return d.listXattrImpl(ctx, size) 1867 } 1868 1869 func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { 1870 if d.isSynthetic() { 1871 return "", linuxerr.ENODATA 1872 } 1873 if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil { 1874 return "", err 1875 } 1876 return d.getXattrImpl(ctx, opts) 1877 } 1878 1879 func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error { 1880 if d.isSynthetic() { 1881 return linuxerr.EPERM 1882 } 1883 if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil { 1884 return err 1885 } 1886 return d.setXattrImpl(ctx, opts) 1887 } 1888 1889 func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error { 1890 if d.isSynthetic() { 1891 return linuxerr.EPERM 1892 } 1893 if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil { 1894 
return err 1895 } 1896 return d.removeXattrImpl(ctx, name) 1897 } 1898 1899 // Preconditions: 1900 // - !d.isSynthetic(). 1901 // - d.isRegularFile() || d.isDir(). 1902 // - fs.renameMu is locked. 1903 func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error { 1904 // O_TRUNC unconditionally requires us to obtain a new handle (opened with 1905 // O_TRUNC). 1906 if !trunc { 1907 d.handleMu.RLock() 1908 canReuseCurHandle := (!read || d.isReadHandleOk()) && (!write || d.isWriteHandleOk()) 1909 d.handleMu.RUnlock() 1910 if canReuseCurHandle { 1911 // Current handles are sufficient. 1912 return nil 1913 } 1914 } 1915 1916 d.handleMu.Lock() 1917 needNewHandle := (read && !d.isReadHandleOk()) || (write && !d.isWriteHandleOk()) || trunc 1918 if !needNewHandle { 1919 d.handleMu.Unlock() 1920 return nil 1921 } 1922 1923 var fdsToCloseArr [2]int32 1924 fdsToClose := fdsToCloseArr[:0] 1925 invalidateTranslations := false 1926 // Get a new handle. If this file has been opened for both reading and 1927 // writing, try to get a single handle that is usable for both: 1928 // 1929 // - Writable memory mappings of a host FD require that the host FD is 1930 // opened for both reading and writing. 1931 // 1932 // - NOTE(b/141991141): Some filesystems may not ensure coherence 1933 // between multiple handles for the same file. 1934 openReadable := d.isReadHandleOk() || read 1935 openWritable := d.isWriteHandleOk() || write 1936 h, err := d.openHandle(ctx, openReadable, openWritable, trunc) 1937 if linuxerr.Equals(linuxerr.EACCES, err) && (openReadable != read || openWritable != write) { 1938 // It may not be possible to use a single handle for both 1939 // reading and writing, since permissions on the file may have 1940 // changed to e.g. disallow reading after previously being 1941 // opened for reading. In this case, we have no choice but to 1942 // use separate handles for reading and writing. 
1943 ctx.Debugf("gofer.dentry.ensureSharedHandle: bifurcating read/write handles for dentry %p", d) 1944 openReadable = read 1945 openWritable = write 1946 h, err = d.openHandle(ctx, openReadable, openWritable, trunc) 1947 } 1948 if err != nil { 1949 d.handleMu.Unlock() 1950 return err 1951 } 1952 1953 // Update d.readFD and d.writeFD 1954 if h.fd >= 0 { 1955 if openReadable && openWritable && (d.readFD.RacyLoad() < 0 || d.writeFD.RacyLoad() < 0 || d.readFD.RacyLoad() != d.writeFD.RacyLoad()) { 1956 // Replace existing FDs with this one. 1957 if d.readFD.RacyLoad() >= 0 { 1958 // We already have a readable FD that may be in use by 1959 // concurrent callers of d.pf.FD(). 1960 if d.fs.opts.overlayfsStaleRead { 1961 // If overlayfsStaleRead is in effect, then the new FD 1962 // may not be coherent with the existing one, so we 1963 // have no choice but to switch to mappings of the new 1964 // FD in both the application and sentry. 1965 if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil { 1966 d.handleMu.Unlock() 1967 ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err) 1968 h.close(ctx) 1969 return err 1970 } 1971 fdsToClose = append(fdsToClose, d.readFD.RacyLoad()) 1972 invalidateTranslations = true 1973 d.readFD.Store(h.fd) 1974 } else { 1975 // Otherwise, we want to avoid invalidating existing 1976 // memmap.Translations (which is expensive); instead, use 1977 // dup3 to make the old file descriptor refer to the new 1978 // file description, then close the new file descriptor 1979 // (which is no longer needed). Racing callers of d.pf.FD() 1980 // may use the old or new file description, but this 1981 // doesn't matter since they refer to the same file, and 1982 // any racing mappings must be read-only. 
1983 if err := unix.Dup3(int(h.fd), int(d.readFD.RacyLoad()), unix.O_CLOEXEC); err != nil { 1984 oldFD := d.readFD.RacyLoad() 1985 d.handleMu.Unlock() 1986 ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldFD, err) 1987 h.close(ctx) 1988 return err 1989 } 1990 fdsToClose = append(fdsToClose, h.fd) 1991 h.fd = d.readFD.RacyLoad() 1992 } 1993 } else { 1994 d.readFD.Store(h.fd) 1995 } 1996 if d.writeFD.RacyLoad() != h.fd && d.writeFD.RacyLoad() >= 0 { 1997 fdsToClose = append(fdsToClose, d.writeFD.RacyLoad()) 1998 } 1999 d.writeFD.Store(h.fd) 2000 d.mmapFD.Store(h.fd) 2001 } else if openReadable && d.readFD.RacyLoad() < 0 { 2002 readHandleWasOk := d.isReadHandleOk() 2003 d.readFD.Store(h.fd) 2004 // If the file has not been opened for writing, the new FD may 2005 // be used for read-only memory mappings. If the file was 2006 // previously opened for reading (without an FD), then existing 2007 // translations of the file may use the internal page cache; 2008 // invalidate those mappings. 2009 if !d.isWriteHandleOk() { 2010 invalidateTranslations = readHandleWasOk 2011 d.mmapFD.Store(h.fd) 2012 } 2013 } else if openWritable && d.writeFD.RacyLoad() < 0 { 2014 d.writeFD.Store(h.fd) 2015 if d.readFD.RacyLoad() >= 0 { 2016 // We have an existing read-only FD, but the file has just 2017 // been opened for writing, so we need to start supporting 2018 // writable memory mappings. However, the new FD is not 2019 // readable, so we have no FD that can be used to create 2020 // writable memory mappings. Switch to using the internal 2021 // page cache. 2022 invalidateTranslations = true 2023 d.mmapFD.Store(-1) 2024 } 2025 } else { 2026 // The new FD is not useful. 
2027 fdsToClose = append(fdsToClose, h.fd) 2028 } 2029 } else if openWritable && d.writeFD.RacyLoad() < 0 && d.mmapFD.RacyLoad() >= 0 { 2030 // We have an existing read-only FD, but the file has just been 2031 // opened for writing, so we need to start supporting writable 2032 // memory mappings. However, we have no writable host FD. Switch to 2033 // using the internal page cache. 2034 invalidateTranslations = true 2035 d.mmapFD.Store(-1) 2036 } 2037 2038 d.updateHandles(ctx, h, openReadable, openWritable) 2039 d.handleMu.Unlock() 2040 2041 if invalidateTranslations { 2042 // Invalidate application mappings that may be using an old FD; they 2043 // will be replaced with mappings using the new FD after future calls 2044 // to d.Translate(). This requires holding d.mapsMu, which precedes 2045 // d.handleMu in the lock order. 2046 d.mapsMu.Lock() 2047 d.mappings.InvalidateAll(memmap.InvalidateOpts{}) 2048 d.mapsMu.Unlock() 2049 } 2050 for _, fd := range fdsToClose { 2051 unix.Close(int(fd)) 2052 } 2053 2054 return nil 2055 } 2056 2057 func (d *dentry) syncRemoteFile(ctx context.Context) error { 2058 d.handleMu.RLock() 2059 defer d.handleMu.RUnlock() 2060 return d.syncRemoteFileLocked(ctx) 2061 } 2062 2063 // Preconditions: d.handleMu must be locked. 2064 func (d *dentry) syncRemoteFileLocked(ctx context.Context) error { 2065 // Prefer syncing write handles over read handles, since some remote 2066 // filesystem implementations may not sync changes made through write 2067 // handles otherwise. 2068 wh := d.writeHandle() 2069 wh.sync(ctx) 2070 rh := d.readHandle() 2071 rh.sync(ctx) 2072 return nil 2073 } 2074 2075 func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) error { 2076 d.handleMu.RLock() 2077 defer d.handleMu.RUnlock() 2078 if d.isWriteHandleOk() { 2079 // Write back dirty pages to the remote file. 
2080 d.dataMu.Lock() 2081 h := d.writeHandle() 2082 err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), d.fs.mfp.MemoryFile(), h.writeFromBlocksAt) 2083 d.dataMu.Unlock() 2084 if err != nil { 2085 return err 2086 } 2087 } 2088 if err := d.syncRemoteFileLocked(ctx); err != nil { 2089 if !forFilesystemSync { 2090 return err 2091 } 2092 // Only return err if we can reasonably have expected sync to succeed 2093 // (d is a regular file and was opened for writing). 2094 if d.isRegularFile() && d.isWriteHandleOk() { 2095 return err 2096 } 2097 ctx.Debugf("gofer.dentry.syncCachedFile: syncing non-writable or non-regular-file dentry failed: %v", err) 2098 } 2099 return nil 2100 } 2101 2102 // incLinks increments link count. 2103 func (d *dentry) incLinks() { 2104 if d.nlink.Load() == 0 { 2105 // The remote filesystem doesn't support link count. 2106 return 2107 } 2108 d.nlink.Add(1) 2109 } 2110 2111 // decLinks decrements link count. 2112 func (d *dentry) decLinks() { 2113 if d.nlink.Load() == 0 { 2114 // The remote filesystem doesn't support link count. 2115 return 2116 } 2117 d.nlink.Add(^uint32(0)) 2118 } 2119 2120 // fileDescription is embedded by gofer implementations of 2121 // vfs.FileDescriptionImpl. 2122 // 2123 // +stateify savable 2124 type fileDescription struct { 2125 vfsfd vfs.FileDescription 2126 vfs.FileDescriptionDefaultImpl 2127 vfs.LockFD 2128 2129 lockLogging sync.Once `state:"nosave"` 2130 } 2131 2132 func (fd *fileDescription) filesystem() *filesystem { 2133 return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) 2134 } 2135 2136 func (fd *fileDescription) dentry() *dentry { 2137 return fd.vfsfd.Dentry().Impl().(*dentry) 2138 } 2139 2140 // Stat implements vfs.FileDescriptionImpl.Stat. 
2141 func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 2142 d := fd.dentry() 2143 const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME) 2144 if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { 2145 // Use specialFileFD.handle.fileLisa for the Stat if available, for the 2146 // same reason that we try to use open FD in updateMetadataLocked(). 2147 var err error 2148 if sffd, ok := fd.vfsfd.Impl().(*specialFileFD); ok { 2149 err = sffd.updateMetadata(ctx) 2150 } else { 2151 err = d.updateMetadata(ctx) 2152 } 2153 if err != nil { 2154 return linux.Statx{}, err 2155 } 2156 } 2157 var stat linux.Statx 2158 d.statTo(&stat) 2159 return stat, nil 2160 } 2161 2162 // SetStat implements vfs.FileDescriptionImpl.SetStat. 2163 func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { 2164 fs := fd.filesystem() 2165 fs.renameMu.RLock() 2166 defer fs.renameMu.RUnlock() 2167 return fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts, fd.vfsfd.Mount()) 2168 } 2169 2170 // ListXattr implements vfs.FileDescriptionImpl.ListXattr. 2171 func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { 2172 return fd.dentry().listXattr(ctx, size) 2173 } 2174 2175 // GetXattr implements vfs.FileDescriptionImpl.GetXattr. 2176 func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { 2177 return fd.dentry().getXattr(ctx, auth.CredentialsFromContext(ctx), &opts) 2178 } 2179 2180 // SetXattr implements vfs.FileDescriptionImpl.SetXattr. 
2181 func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { 2182 return fd.dentry().setXattr(ctx, auth.CredentialsFromContext(ctx), &opts) 2183 } 2184 2185 // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. 2186 func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { 2187 return fd.dentry().removeXattr(ctx, auth.CredentialsFromContext(ctx), name) 2188 } 2189 2190 // LockBSD implements vfs.FileDescriptionImpl.LockBSD. 2191 func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block bool) error { 2192 fd.lockLogging.Do(func() { 2193 log.Infof("File lock using gofer file handled internally.") 2194 }) 2195 return fd.LockFD.LockBSD(ctx, uid, ownerPID, t, block) 2196 } 2197 2198 // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. 2199 func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block bool) error { 2200 fd.lockLogging.Do(func() { 2201 log.Infof("Range lock using gofer file handled internally.") 2202 }) 2203 return fd.Locks().LockPOSIX(ctx, uid, ownerPID, t, r, block) 2204 } 2205 2206 // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. 2207 func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { 2208 return fd.Locks().UnlockPOSIX(ctx, uid, r) 2209 } 2210 2211 // resolvingPath is just a wrapper around *vfs.ResolvingPath. It additionally 2212 // holds some information around the intent behind resolving the path. 2213 type resolvingPath struct { 2214 *vfs.ResolvingPath 2215 2216 // excludeLast indicates whether the intent is to resolve until the last path 2217 // component. If true, the last path component should remain unresolved. 
2218 excludeLast bool 2219 } 2220 2221 func resolvingPathFull(rp *vfs.ResolvingPath) resolvingPath { 2222 return resolvingPath{ResolvingPath: rp, excludeLast: false} 2223 } 2224 2225 func resolvingPathParent(rp *vfs.ResolvingPath) resolvingPath { 2226 return resolvingPath{ResolvingPath: rp, excludeLast: true} 2227 } 2228 2229 func (rp *resolvingPath) done() bool { 2230 if rp.excludeLast { 2231 return rp.Final() 2232 } 2233 return rp.Done() 2234 } 2235 2236 func (rp *resolvingPath) copy() resolvingPath { 2237 return resolvingPath{ 2238 ResolvingPath: rp.ResolvingPath.Copy(), 2239 excludeLast: rp.excludeLast, 2240 } 2241 } 2242 2243 // Precondition: !rp.done() && rp.Component() is not "." or "..". 2244 func (rp *resolvingPath) getComponents(emit func(string) bool) { 2245 rp.GetComponents(rp.excludeLast, emit) 2246 }