github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fs/fsutil/inode_cached.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fsutil

import (
	"fmt"
	"io"

	"github.com/SagerNet/gvisor/pkg/context"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/log"
	"github.com/SagerNet/gvisor/pkg/safemem"
	"github.com/SagerNet/gvisor/pkg/sentry/fs"
	ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
	"github.com/SagerNet/gvisor/pkg/sentry/pgalloc"
	"github.com/SagerNet/gvisor/pkg/sentry/usage"
	"github.com/SagerNet/gvisor/pkg/sync"
	"github.com/SagerNet/gvisor/pkg/usermem"
)

// Lock order (compare the lock order model in mm/mm.go):
//
// CachingInodeOperations.attrMu ("fs locks")
//   CachingInodeOperations.mapsMu ("memmap.Mappable locks not taken by Translate")
//     CachingInodeOperations.dataMu ("memmap.Mappable locks taken by Translate")
//       CachedFileObject locks

// CachingInodeOperations caches the metadata and content of a CachedFileObject.
// It implements a subset of InodeOperations. As a utility it can be used to
// implement the full set of InodeOperations. Generally it should not be
// embedded to avoid unexpected inherited behavior.
//
// CachingInodeOperations implements Mappable for the CachedFileObject:
//
// - If CachedFileObject.FD returns a value >= 0 then the file descriptor
//   will be memory mapped on the host.
//
// - Otherwise, the contents of CachedFileObject are buffered into memory
//   managed by the CachingInodeOperations.
//
// Implementations of FileOperations for a CachedFileObject must read and
// write through CachingInodeOperations using Read and Write respectively.
//
// Implementations of InodeOperations.WriteOut must call Sync to write out
// in-memory modifications of data and metadata to the CachedFileObject.
//
// +stateify savable
type CachingInodeOperations struct {
	// backingFile is a handle to a cached file object.
	backingFile CachedFileObject

	// mfp is used to allocate memory that caches backingFile's contents.
	mfp pgalloc.MemoryFileProvider

	// opts contains options. opts is immutable.
	opts CachingInodeOperationsOptions

	attrMu sync.Mutex `state:"nosave"`

	// attr is unstable cached metadata.
	//
	// attr is protected by attrMu. attr.Size is protected by both attrMu and
	// dataMu; reading it requires locking either mutex, while mutating it
	// requires locking both.
	attr fs.UnstableAttr

	// dirtyAttr is metadata that was updated in-place but hasn't yet
	// been successfully written out.
	//
	// dirtyAttr is protected by attrMu.
	dirtyAttr fs.AttrMask

	mapsMu sync.Mutex `state:"nosave"`

	// mappings tracks mappings of the cached file object into
	// memmap.MappingSpaces.
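	// (In practice, each memmap.MappingSpace is an application address
	// space, e.g. an mm.MemoryManager.)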
	//
	// mappings is protected by mapsMu.
	mappings memmap.MappingSet

	dataMu sync.RWMutex `state:"nosave"`

	// cache maps offsets into the cached file to offsets into
	// mfp.MemoryFile() that store the file's data.
	//
	// cache is protected by dataMu.
	cache FileRangeSet

	// dirty tracks dirty segments in cache.
	//
	// dirty is protected by dataMu.
	dirty DirtySet

	// hostFileMapper caches internal mappings of backingFile.FD().
	hostFileMapper *HostFileMapper

	// refs tracks active references to data in the cache.
	//
	// refs is protected by dataMu.
	refs FrameRefSet
}

// CachingInodeOperationsOptions configures a CachingInodeOperations.
//
// +stateify savable
type CachingInodeOperationsOptions struct {
	// If ForcePageCache is true, use the sentry page cache even if a host file
	// descriptor is available.
	ForcePageCache bool

	// If LimitHostFDTranslation is true, apply maxFillRange() constraints to
	// host file descriptor mappings returned by
	// CachingInodeOperations.Translate().
	LimitHostFDTranslation bool
}

// CachedFileObject is a file that may require caching.
type CachedFileObject interface {
	// ReadToBlocksAt reads up to dsts.NumBytes() bytes from the file to dsts,
	// starting at offset, and returns the number of bytes read. ReadToBlocksAt
	// may return a partial read without an error.
	ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)

	// WriteFromBlocksAt writes up to srcs.NumBytes() bytes from srcs to the
	// file, starting at offset, and returns the number of bytes written.
	// WriteFromBlocksAt may return a partial write without an error.
	WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)

	// SetMaskedAttributes sets the attributes in attr that are true in
	// mask on the backing file. If the mask contains only ATime or MTime
	// and the CachedFileObject has an FD to the file, then this operation
	// is a noop unless forceSetTimestamps is true. This avoids an extra
	// RPC to the gofer in the open-read/write-close case, when the
	// timestamps on the file will be updated by the host kernel for us.
	//
	// SetMaskedAttributes may be called at any point, regardless of whether
	// the file was opened.
	SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr, forceSetTimestamps bool) error

	// Allocate allows the caller to reserve disk space for the inode.
	// It's equivalent to fallocate(2) with 'mode=0'.
	Allocate(ctx context.Context, offset int64, length int64) error

	// Sync instructs the remote filesystem to sync the file to stable storage.
	Sync(ctx context.Context) error

	// FD returns a host file descriptor. If it is possible for
	// CachingInodeOperations.AddMapping to have ever been called with writable
	// = true, the FD must have been opened O_RDWR; otherwise, it may have been
	// opened O_RDONLY or O_RDWR. (mmap unconditionally requires that mapped
	// files are readable.) If no host file descriptor is available, FD returns
	// a negative number.
	//
	// For any given CachedFileObject, if FD() ever succeeds (returns a
	// non-negative number), it must always succeed.
	//
	// FD is called iff the file has been memory mapped. This implies that
	// the file was opened (see fs.InodeOperations.GetFile).
	FD() int
}

// NewCachingInodeOperations returns a new CachingInodeOperations backed by
// a CachedFileObject and its initial unstable attributes.
func NewCachingInodeOperations(ctx context.Context, backingFile CachedFileObject, uattr fs.UnstableAttr, opts CachingInodeOperationsOptions) *CachingInodeOperations {
	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
	if mfp == nil {
		panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider))
	}
	return &CachingInodeOperations{
		backingFile:    backingFile,
		mfp:            mfp,
		opts:           opts,
		attr:           uattr,
		hostFileMapper: NewHostFileMapper(),
	}
}

// Release implements fs.InodeOperations.Release.
func (c *CachingInodeOperations) Release() {
	c.mapsMu.Lock()
	defer c.mapsMu.Unlock()
	c.dataMu.Lock()
	defer c.dataMu.Unlock()

	// Something has gone terribly wrong if we're releasing an inode that is
	// still memory-mapped.
	if !c.mappings.IsEmpty() {
		panic(fmt.Sprintf("Releasing CachingInodeOperations with mappings:\n%s", &c.mappings))
	}

	// Drop any cached pages that are still awaiting MemoryFile eviction. (This
	// means that MemoryFile no longer needs to evict them.)
	mf := c.mfp.MemoryFile()
	mf.MarkAllUnevictable(c)
	if err := SyncDirtyAll(context.Background(), &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil {
		panic(fmt.Sprintf("Failed to writeback cached data: %v", err))
	}
	c.cache.DropAll(mf)
	c.dirty.RemoveAll()
}

// UnstableAttr implements fs.InodeOperations.UnstableAttr.
func (c *CachingInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) {
	c.attrMu.Lock()
	attr := c.attr
	c.attrMu.Unlock()
	return attr, nil
}

// SetPermissions implements fs.InodeOperations.SetPermissions.
func (c *CachingInodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, perms fs.FilePermissions) bool {
	c.attrMu.Lock()
	defer c.attrMu.Unlock()

	now := ktime.NowFromContext(ctx)
	masked := fs.AttrMask{Perms: true}
	if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Perms: perms}, false); err != nil {
		return false
	}
	c.attr.Perms = perms
	c.touchStatusChangeTimeLocked(now)
	return true
}

// SetOwner implements fs.InodeOperations.SetOwner.
func (c *CachingInodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error {
	if !owner.UID.Ok() && !owner.GID.Ok() {
		return nil
	}

	c.attrMu.Lock()
	defer c.attrMu.Unlock()

	now := ktime.NowFromContext(ctx)
	masked := fs.AttrMask{
		UID: owner.UID.Ok(),
		GID: owner.GID.Ok(),
	}
	if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Owner: owner}, false); err != nil {
		return err
	}
	if owner.UID.Ok() {
		c.attr.Owner.UID = owner.UID
	}
	if owner.GID.Ok() {
		c.attr.Owner.GID = owner.GID
	}
	c.touchStatusChangeTimeLocked(now)
	return nil
}

// SetTimestamps implements fs.InodeOperations.SetTimestamps.
func (c *CachingInodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error {
	if ts.ATimeOmit && ts.MTimeOmit {
		return nil
	}

	c.attrMu.Lock()
	defer c.attrMu.Unlock()

	// Replace requests to use the "system time" with the current time to
	// ensure that cached timestamps remain consistent with the remote
	// filesystem.
	now := ktime.NowFromContext(ctx)
	if ts.ATimeSetSystemTime {
		ts.ATime = now
	}
	if ts.MTimeSetSystemTime {
		ts.MTime = now
	}
	masked := fs.AttrMask{
		AccessTime:       !ts.ATimeOmit,
		ModificationTime: !ts.MTimeOmit,
	}
	// Call SetMaskedAttributes with forceSetTimestamps = true to make sure
	// the timestamp is updated.
	if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{AccessTime: ts.ATime, ModificationTime: ts.MTime}, true); err != nil {
		return err
	}
	if !ts.ATimeOmit {
		c.attr.AccessTime = ts.ATime
	}
	if !ts.MTimeOmit {
		c.attr.ModificationTime = ts.MTime
	}
	c.touchStatusChangeTimeLocked(now)
	return nil
}

// Truncate implements fs.InodeOperations.Truncate.
func (c *CachingInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error {
	c.attrMu.Lock()
	defer c.attrMu.Unlock()

	// c.attr.Size is protected by both c.attrMu and c.dataMu.
	c.dataMu.Lock()
	now := ktime.NowFromContext(ctx)
	masked := fs.AttrMask{Size: true}
	attr := fs.UnstableAttr{Size: size}
	if c.attr.Perms.HasSetUIDOrGID() {
		masked.Perms = true
		attr.Perms = c.attr.Perms
		attr.Perms.DropSetUIDAndMaybeGID()
		c.attr.Perms = attr.Perms
	}
	if err := c.backingFile.SetMaskedAttributes(ctx, masked, attr, false); err != nil {
		c.dataMu.Unlock()
		return err
	}
	oldSize := c.attr.Size
	c.attr.Size = size
	c.touchModificationAndStatusChangeTimeLocked(now)

	// We drop c.dataMu here so that we can lock c.mapsMu and invalidate
	// mappings below. This allows concurrent calls to Read/Translate/etc.
	// These functions synchronize with an in-progress Truncate by refusing to
	// use cache contents beyond the new c.attr.Size. (We are still holding
	// c.attrMu, so we can't race with Truncate/Write.)
	c.dataMu.Unlock()

	// Nothing left to do unless shrinking the file.
	if size >= oldSize {
		return nil
	}

	oldpgend := fs.OffsetPageEnd(oldSize)
	newpgend := fs.OffsetPageEnd(size)

	// Invalidate past translations of truncated pages.
	if newpgend != oldpgend {
		c.mapsMu.Lock()
		c.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
			// Compare Linux's mm/truncate.c:truncate_setsize() =>
			// truncate_pagecache() =>
			// mm/memory.c:unmap_mapping_range(evencows=1).
			InvalidatePrivate: true,
		})
		c.mapsMu.Unlock()
	}

	// We are now guaranteed that there are no translations of truncated pages,
	// and can remove them from the cache. Since truncated pages have been
	// removed from the backing file, they should be dropped without being
	// written back.
	c.dataMu.Lock()
	defer c.dataMu.Unlock()
	c.cache.Truncate(uint64(size), c.mfp.MemoryFile())
	c.dirty.KeepClean(memmap.MappableRange{uint64(size), oldpgend})

	return nil
}

// Allocate implements fs.InodeOperations.Allocate.
func (c *CachingInodeOperations) Allocate(ctx context.Context, offset, length int64) error {
	newSize := offset + length

	// c.attr.Size is protected by both c.attrMu and c.dataMu.
	c.attrMu.Lock()
	defer c.attrMu.Unlock()
	c.dataMu.Lock()
	defer c.dataMu.Unlock()

	if newSize <= c.attr.Size {
		return nil
	}

	now := ktime.NowFromContext(ctx)
	if err := c.backingFile.Allocate(ctx, offset, length); err != nil {
		return err
	}

	c.attr.Size = newSize
	c.touchModificationAndStatusChangeTimeLocked(now)
	return nil
}

// WriteDirtyPagesAndAttrs will write the dirty pages and attributes to the
// gofer without calling Fsync on the remote file.
func (c *CachingInodeOperations) WriteDirtyPagesAndAttrs(ctx context.Context, inode *fs.Inode) error {
	c.attrMu.Lock()
	defer c.attrMu.Unlock()
	c.dataMu.Lock()
	defer c.dataMu.Unlock()

	// Write dirty pages back.
	err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), c.mfp.MemoryFile(), c.backingFile.WriteFromBlocksAt)
	if err != nil {
		return err
	}

	// SyncDirtyAll above would have grown the file if needed. On shrinks, the
	// backing file is called directly, so the size never needs to be updated
	// here.
	c.dirtyAttr.Size = false

	// Write out cached attributes.
	if err := c.backingFile.SetMaskedAttributes(ctx, c.dirtyAttr, c.attr, false); err != nil {
		return err
	}
	c.dirtyAttr = fs.AttrMask{}

	return nil
}

// WriteOut implements fs.InodeOperations.WriteOut.
func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error {
	if err := c.WriteDirtyPagesAndAttrs(ctx, inode); err != nil {
		return err
	}

	// Fsync the remote file.
	return c.backingFile.Sync(ctx)
}

// IncLinks increases the link count and updates cached modification time.
func (c *CachingInodeOperations) IncLinks(ctx context.Context) {
	c.attrMu.Lock()
	c.attr.Links++
	c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx))
	c.attrMu.Unlock()
}

// DecLinks decreases the link count and updates cached modification time.
func (c *CachingInodeOperations) DecLinks(ctx context.Context) {
	c.attrMu.Lock()
	c.attr.Links--
	c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx))
	c.attrMu.Unlock()
}

// TouchAccessTime updates the cached access time in-place to the
// current time. It does not update status change time in-place. See
// mm/filemap.c:do_generic_file_read -> include/linux/fs.h:file_accessed.
func (c *CachingInodeOperations) TouchAccessTime(ctx context.Context, inode *fs.Inode) {
	if inode.MountSource.Flags.NoAtime {
		return
	}

	c.attrMu.Lock()
	c.touchAccessTimeLocked(ktime.NowFromContext(ctx))
	c.attrMu.Unlock()
}

// touchAccessTimeLocked updates the cached access time in-place to the current
// time.
//
// Preconditions: c.attrMu is locked for writing.
func (c *CachingInodeOperations) touchAccessTimeLocked(now ktime.Time) {
	c.attr.AccessTime = now
	c.dirtyAttr.AccessTime = true
}

// TouchModificationAndStatusChangeTime updates the cached modification and
// status change times in-place to the current time.
func (c *CachingInodeOperations) TouchModificationAndStatusChangeTime(ctx context.Context) {
	c.attrMu.Lock()
	c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx))
	c.attrMu.Unlock()
}

// touchModificationAndStatusChangeTimeLocked updates the cached modification
// and status change times in-place to the current time.
//
// Preconditions: c.attrMu is locked for writing.
func (c *CachingInodeOperations) touchModificationAndStatusChangeTimeLocked(now ktime.Time) {
	c.attr.ModificationTime = now
	c.dirtyAttr.ModificationTime = true
	c.attr.StatusChangeTime = now
	c.dirtyAttr.StatusChangeTime = true
}

// TouchStatusChangeTime updates the cached status change time in-place to the
// current time.
func (c *CachingInodeOperations) TouchStatusChangeTime(ctx context.Context) {
	c.attrMu.Lock()
	c.touchStatusChangeTimeLocked(ktime.NowFromContext(ctx))
	c.attrMu.Unlock()
}

// touchStatusChangeTimeLocked updates the cached status change time
// in-place to the current time.
//
// Preconditions: c.attrMu is locked for writing.
func (c *CachingInodeOperations) touchStatusChangeTimeLocked(now ktime.Time) {
	c.attr.StatusChangeTime = now
	c.dirtyAttr.StatusChangeTime = true
}

// UpdateUnstable updates the cached unstable attributes. Only non-dirty
// attributes are updated.
func (c *CachingInodeOperations) UpdateUnstable(attr fs.UnstableAttr) {
	// All attributes are protected by attrMu.
	c.attrMu.Lock()

	if !c.dirtyAttr.Usage {
		c.attr.Usage = attr.Usage
	}
	if !c.dirtyAttr.Perms {
		c.attr.Perms = attr.Perms
	}
	if !c.dirtyAttr.UID {
		c.attr.Owner.UID = attr.Owner.UID
	}
	if !c.dirtyAttr.GID {
		c.attr.Owner.GID = attr.Owner.GID
	}
	if !c.dirtyAttr.AccessTime {
		c.attr.AccessTime = attr.AccessTime
	}
	if !c.dirtyAttr.ModificationTime {
		c.attr.ModificationTime = attr.ModificationTime
	}
	if !c.dirtyAttr.StatusChangeTime {
		c.attr.StatusChangeTime = attr.StatusChangeTime
	}
	if !c.dirtyAttr.Links {
		c.attr.Links = attr.Links
	}

	// Size requires holding attrMu and dataMu.
	c.dataMu.Lock()
	if !c.dirtyAttr.Size {
		c.attr.Size = attr.Size
	}
	c.dataMu.Unlock()

	c.attrMu.Unlock()
}

// Read reads from frames and otherwise directly from the backing file
// into dst starting at offset until dst is full, EOF is reached, or an
// error is encountered.
//
// Read may partially fill dst and return a nil error.
func (c *CachingInodeOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
	if dst.NumBytes() == 0 {
		return 0, nil
	}

	// Have we reached EOF? We check for this again in
	// inodeReadWriter.ReadToBlocks to avoid holding c.attrMu (which would
	// serialize reads) or c.dataMu (which would violate lock ordering), but
	// check here first (before calling into MM) since reading at EOF is
	// common: getting a return value of 0 from a read syscall is the only way
	// to detect EOF.
	//
	// TODO(jamieliu): Separate out c.attr.Size and use atomics instead of
	// c.dataMu.
	c.dataMu.RLock()
	size := c.attr.Size
	c.dataMu.RUnlock()
	if offset >= size {
		return 0, io.EOF
	}

	n, err := dst.CopyOutFrom(ctx, &inodeReadWriter{ctx, c, offset})
	// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
	c.TouchAccessTime(ctx, file.Dirent.Inode)
	return n, err
}

// Write writes to frames and otherwise directly to the backing file
// from src starting at offset and until src is empty or an error is
// encountered.
//
// If Write does not consume all of src, it returns a non-nil error.
func (c *CachingInodeOperations) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
	// Hot path. Avoid defers.
	if src.NumBytes() == 0 {
		return 0, nil
	}

	c.attrMu.Lock()
	// Compare Linux's mm/filemap.c:__generic_file_write_iter() => file_update_time().
	c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx))
	n, err := src.CopyInTo(ctx, &inodeReadWriter{ctx, c, offset})
	c.attrMu.Unlock()
	return n, err
}

type inodeReadWriter struct {
	ctx    context.Context
	c      *CachingInodeOperations
	offset int64
}

// ReadToBlocks implements safemem.Reader.ReadToBlocks.
func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
	mem := rw.c.mfp.MemoryFile()
	fillCache := !rw.c.useHostPageCache() && mem.ShouldCacheEvictable()

	// Hot path. Avoid defers.
	var unlock func()
	if fillCache {
		rw.c.dataMu.Lock()
		unlock = rw.c.dataMu.Unlock
	} else {
		rw.c.dataMu.RLock()
		unlock = rw.c.dataMu.RUnlock
	}

	// Compute the range to read.
	if rw.offset >= rw.c.attr.Size {
		unlock()
		return 0, io.EOF
	}
	end := fs.ReadEndOffset(rw.offset, int64(dsts.NumBytes()), rw.c.attr.Size)
	if end == rw.offset { // dsts.NumBytes() == 0?
		unlock()
		return 0, nil
	}

	var done uint64
	seg, gap := rw.c.cache.Find(uint64(rw.offset))
	for rw.offset < end {
		mr := memmap.MappableRange{uint64(rw.offset), uint64(end)}
		switch {
		case seg.Ok():
			// Get internal mappings from the cache.
			ims, err := mem.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Read)
			if err != nil {
				unlock()
				return done, err
			}

			// Copy from internal mappings.
			n, err := safemem.CopySeq(dsts, ims)
			done += n
			rw.offset += int64(n)
			dsts = dsts.DropFirst64(n)
			if err != nil {
				unlock()
				return done, err
			}

			// Continue.
			seg, gap = seg.NextNonEmpty()

		case gap.Ok():
			gapMR := gap.Range().Intersect(mr)
			if fillCache {
				// Read into the cache, then re-enter the loop to read from the
				// cache.
				reqMR := memmap.MappableRange{
					Start: uint64(hostarch.Addr(gapMR.Start).RoundDown()),
					End:   fs.OffsetPageEnd(int64(gapMR.End)),
				}
				optMR := gap.Range()
				err := rw.c.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), uint64(rw.c.attr.Size), mem, usage.PageCache, rw.c.backingFile.ReadToBlocksAt)
				mem.MarkEvictable(rw.c, pgalloc.EvictableRange{optMR.Start, optMR.End})
				seg, gap = rw.c.cache.Find(uint64(rw.offset))
				if !seg.Ok() {
					unlock()
					return done, err
				}
				// err might have occurred in part of gap.Range() outside
				// gapMR. Forget about it for now; if the error matters and
				// persists, we'll run into it again in a later iteration of
				// this loop.
			} else {
				// Read directly from the backing file.
				dst := dsts.TakeFirst64(gapMR.Length())
				n, err := rw.c.backingFile.ReadToBlocksAt(rw.ctx, dst, gapMR.Start)
				done += n
				rw.offset += int64(n)
				dsts = dsts.DropFirst64(n)
				// Partial reads are fine. But we must stop reading.
				if n != dst.NumBytes() || err != nil {
					unlock()
					return done, err
				}

				// Continue.
				seg, gap = gap.NextSegment(), FileRangeGapIterator{}
			}
		}
	}
	unlock()
	return done, nil
}

// maybeUpdateAttrs updates the file's attributes after a write. It updates
// size if data has been written past the old size, and setuid/setgid if any
// bytes were written.
//
// Preconditions:
// * rw.c.attrMu must be locked.
// * rw.c.dataMu must be locked.
func (rw *inodeReadWriter) maybeUpdateAttrs(nwritten uint64) {
	// If the write ends beyond the file's previous size, it causes the
	// file to grow.
	if rw.offset > rw.c.attr.Size {
		rw.c.attr.Size = rw.offset
		rw.c.dirtyAttr.Size = true
	}
	if rw.offset > rw.c.attr.Usage {
		// This is incorrect if CachingInodeOperations is caching a sparse
		// file. (In Linux, keeping inode::i_blocks up to date is the
		// filesystem's responsibility.)
		rw.c.attr.Usage = rw.offset
		rw.c.dirtyAttr.Usage = true
	}

	// If bytes were written, ensure setuid and setgid are cleared.
	if nwritten > 0 && rw.c.attr.Perms.HasSetUIDOrGID() {
		rw.c.dirtyAttr.Perms = true
		rw.c.attr.Perms.DropSetUIDAndMaybeGID()
	}
}

// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
//
// Preconditions: rw.c.attrMu must be locked.
func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
	// Hot path. Avoid defers.
	rw.c.dataMu.Lock()

	// Compute the range to write.
	end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes()))
	if end == rw.offset { // srcs.NumBytes() == 0?
		rw.c.dataMu.Unlock()
		return 0, nil
	}

	mf := rw.c.mfp.MemoryFile()
	var done uint64
	seg, gap := rw.c.cache.Find(uint64(rw.offset))
	for rw.offset < end {
		mr := memmap.MappableRange{uint64(rw.offset), uint64(end)}
		switch {
		case seg.Ok() && seg.Start() < mr.End:
			// Get internal mappings from the cache.
			segMR := seg.Range().Intersect(mr)
			ims, err := mf.MapInternal(seg.FileRangeOf(segMR), hostarch.Write)
			if err != nil {
				rw.maybeUpdateAttrs(done)
				rw.c.dataMu.Unlock()
				return done, err
			}

			// Copy to internal mappings.
			n, err := safemem.CopySeq(ims, srcs)
			done += n
			rw.offset += int64(n)
			srcs = srcs.DropFirst64(n)
			rw.c.dirty.MarkDirty(segMR)
			if err != nil {
				rw.maybeUpdateAttrs(done)
				rw.c.dataMu.Unlock()
				return done, err
			}

			// Continue.
			seg, gap = seg.NextNonEmpty()

		case gap.Ok() && gap.Start() < mr.End:
			// Write directly to the backing file. At present, we never fill
			// the cache when writing, since doing so can convert small writes
			// into inefficient read-modify-write cycles, and we have no
			// mechanism for detecting or avoiding this.
			gapmr := gap.Range().Intersect(mr)
			src := srcs.TakeFirst64(gapmr.Length())
			n, err := rw.c.backingFile.WriteFromBlocksAt(rw.ctx, src, gapmr.Start)
			done += n
			rw.offset += int64(n)
			srcs = srcs.DropFirst64(n)
			// Partial writes are fine. But we must stop writing.
			if n != src.NumBytes() || err != nil {
				rw.maybeUpdateAttrs(done)
				rw.c.dataMu.Unlock()
				return done, err
			}

			// Continue.
			seg, gap = gap.NextSegment(), FileRangeGapIterator{}
		}
	}
	rw.maybeUpdateAttrs(done)
	rw.c.dataMu.Unlock()
	return done, nil
}

// useHostPageCache returns true if c uses c.backingFile.FD() for all file I/O
// and memory mappings, and false if c.cache may contain data cached from
// c.backingFile.
func (c *CachingInodeOperations) useHostPageCache() bool {
	return !c.opts.ForcePageCache && c.backingFile.FD() >= 0
}

// AddMapping implements memmap.Mappable.AddMapping.
func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error {
	// Hot path. Avoid defers.
	c.mapsMu.Lock()
	mapped := c.mappings.AddMapping(ms, ar, offset, writable)
	// Do this unconditionally since whether we have c.backingFile.FD() >= 0
	// can change across save/restore.
	for _, r := range mapped {
		c.hostFileMapper.IncRefOn(r)
	}
	if !c.useHostPageCache() {
		// c.Evict() will refuse to evict memory-mapped pages, so tell the
		// MemoryFile to not bother trying.
		mf := c.mfp.MemoryFile()
		for _, r := range mapped {
			mf.MarkUnevictable(c, pgalloc.EvictableRange{r.Start, r.End})
		}
	}
	c.mapsMu.Unlock()
	return nil
}

// RemoveMapping implements memmap.Mappable.RemoveMapping.
func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) {
	// Hot path. Avoid defers.
	c.mapsMu.Lock()
	unmapped := c.mappings.RemoveMapping(ms, ar, offset, writable)
	for _, r := range unmapped {
		c.hostFileMapper.DecRefOn(r)
	}
	if c.useHostPageCache() {
		c.mapsMu.Unlock()
		return
	}

	// Pages that are no longer referenced by any application memory mappings
	// are now considered unused; allow MemoryFile to evict them when
	// necessary.
	mf := c.mfp.MemoryFile()
	c.dataMu.Lock()
	for _, r := range unmapped {
		// Since these pages are no longer mapped, they are no longer
		// concurrently dirtyable by a writable memory mapping.
		c.dirty.AllowClean(r)
		mf.MarkEvictable(c, pgalloc.EvictableRange{r.Start, r.End})
	}
	c.dataMu.Unlock()
	c.mapsMu.Unlock()
}

// CopyMapping implements memmap.Mappable.CopyMapping.
func (c *CachingInodeOperations) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error {
	return c.AddMapping(ctx, ms, dstAR, offset, writable)
}

// Translate implements memmap.Mappable.Translate.
func (c *CachingInodeOperations) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
	// Hot path. Avoid defer.
	if c.useHostPageCache() {
		mr := optional
		if c.opts.LimitHostFDTranslation {
			mr = maxFillRange(required, optional)
		}
		return []memmap.Translation{
			{
				Source: mr,
				File:   c,
				Offset: mr.Start,
				Perms:  hostarch.AnyAccess,
			},
		}, nil
	}

	c.dataMu.Lock()

	// Constrain translations to c.attr.Size (rounded up) to prevent
	// translation to pages that may be concurrently truncated.
	pgend := fs.OffsetPageEnd(c.attr.Size)
	var beyondEOF bool
	if required.End > pgend {
		if required.Start >= pgend {
			c.dataMu.Unlock()
			return nil, &memmap.BusError{io.EOF}
		}
		beyondEOF = true
		required.End = pgend
	}
	if optional.End > pgend {
		optional.End = pgend
	}

	mf := c.mfp.MemoryFile()
	cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), uint64(c.attr.Size), mf, usage.PageCache, c.backingFile.ReadToBlocksAt)

	var ts []memmap.Translation
	var translatedEnd uint64
	for seg := c.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
		segMR := seg.Range().Intersect(optional)
		// TODO(jamieliu): Make Translations writable even if writability is
		// not required if already kept-dirty by another writable translation.
		perms := hostarch.AccessType{
			Read:    true,
			Execute: true,
		}
		if at.Write {
			// From this point forward, this memory can be dirtied through the
			// mapping at any time.
			c.dirty.KeepDirty(segMR)
			perms.Write = true
		}
		ts = append(ts, memmap.Translation{
			Source: segMR,
			File:   mf,
			Offset: seg.FileRangeOf(segMR).Start,
			Perms:  perms,
		})
		translatedEnd = segMR.End
	}

	c.dataMu.Unlock()

	// Don't return the error returned by c.cache.Fill if it occurred outside
	// of required.
	if translatedEnd < required.End && cerr != nil {
		return ts, &memmap.BusError{cerr}
	}
	if beyondEOF {
		return ts, &memmap.BusError{io.EOF}
	}
	return ts, nil
}

func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange {
	const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily
	if required.Length() >= maxReadahead {
		return required
	}
	if optional.Length() <= maxReadahead {
		return optional
	}
	optional.Start = required.Start
	if optional.Length() <= maxReadahead {
		return optional
	}
	optional.End = optional.Start + maxReadahead
	return optional
}

// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error {
	// Whether we have a host fd (and consequently what memmap.File is
	// mapped) can change across save/restore, so invalidate all translations
	// unconditionally.
	c.mapsMu.Lock()
	defer c.mapsMu.Unlock()
	c.mappings.InvalidateAll(memmap.InvalidateOpts{})

	// Sync the cache's contents so that if we have a host fd after restore,
	// the remote file's contents are coherent.
	mf := c.mfp.MemoryFile()
	c.dataMu.Lock()
	defer c.dataMu.Unlock()
	if err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil {
		return err
	}

	// Discard the cache so that it's not stored in saved state. This is safe
	// because per InvalidateUnsavable invariants, no new translations can have
	// been returned after we invalidated all existing translations above.
	c.cache.DropAll(mf)
	c.dirty.RemoveAll()

	return nil
}

// NotifyChangeFD must be called after the file description represented by
// CachedFileObject.FD() changes.
func (c *CachingInodeOperations) NotifyChangeFD() error {
	// Update existing sentry mappings to refer to the new file description.
	if err := c.hostFileMapper.RegenerateMappings(c.backingFile.FD()); err != nil {
		return err
	}

	// Shoot down existing application mappings of the old file description;
	// they will be remapped with the new file description on demand.
	c.mapsMu.Lock()
	defer c.mapsMu.Unlock()

	c.mappings.InvalidateAll(memmap.InvalidateOpts{})
	return nil
}

// Evict implements pgalloc.EvictableMemoryUser.Evict.
func (c *CachingInodeOperations) Evict(ctx context.Context, er pgalloc.EvictableRange) {
	c.mapsMu.Lock()
	defer c.mapsMu.Unlock()
	c.dataMu.Lock()
	defer c.dataMu.Unlock()

	mr := memmap.MappableRange{er.Start, er.End}
	mf := c.mfp.MemoryFile()
	// Only allow pages that are no longer memory-mapped to be evicted.
	for mgap := c.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() {
		mgapMR := mgap.Range().Intersect(mr)
		if mgapMR.Length() == 0 {
			continue
		}
		if err := SyncDirty(ctx, mgapMR, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil {
			log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err)
		}
		c.cache.Drop(mgapMR, mf)
		c.dirty.KeepClean(mgapMR)
	}
}

// IncRef implements memmap.File.IncRef. This is used when we directly map an
// underlying host fd and CachingInodeOperations is used as the memmap.File
// during translation.
func (c *CachingInodeOperations) IncRef(fr memmap.FileRange) {
	// Hot path. Avoid defers.
	c.dataMu.Lock()
	seg, gap := c.refs.Find(fr.Start)
	for {
		switch {
		case seg.Ok() && seg.Start() < fr.End:
			seg = c.refs.Isolate(seg, fr)
			seg.SetValue(seg.Value() + 1)
			seg, gap = seg.NextNonEmpty()
		case gap.Ok() && gap.Start() < fr.End:
			newRange := gap.Range().Intersect(fr)
			usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped)
			seg, gap = c.refs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty()
		default:
			c.refs.MergeAdjacent(fr)
			c.dataMu.Unlock()
			return
		}
	}
}

// DecRef implements memmap.File.DecRef. This is used when we directly map an
// underlying host fd and CachingInodeOperations is used as the memmap.File
// during translation.
func (c *CachingInodeOperations) DecRef(fr memmap.FileRange) {
	// Hot path. Avoid defers.
	c.dataMu.Lock()
	seg := c.refs.FindSegment(fr.Start)

	for seg.Ok() && seg.Start() < fr.End {
		seg = c.refs.Isolate(seg, fr)
		if old := seg.Value(); old == 1 {
			usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped)
			seg = c.refs.Remove(seg).NextSegment()
		} else {
			seg.SetValue(old - 1)
			seg = seg.NextSegment()
		}
	}
	c.refs.MergeAdjacent(fr)
	c.dataMu.Unlock()
}

// MapInternal implements memmap.File.MapInternal. This is used when we
// directly map an underlying host fd and CachingInodeOperations is used as the
// memmap.File during translation.
func (c *CachingInodeOperations) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) {
	return c.hostFileMapper.MapInternal(fr, c.backingFile.FD(), at.Write)
}

// FD implements memmap.File.FD. This is used when we directly map an
// underlying host fd and CachingInodeOperations is used as the memmap.File
// during translation.
func (c *CachingInodeOperations) FD() int {
	return c.backingFile.FD()
}
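
// Compile-time interface checks (added here for clarity; not in the original
// file). CachingInodeOperations serves as the memmap.Mappable for the cached
// file and, when a host fd is mapped directly, as the memmap.File returned by
// Translate above.
var (
	_ memmap.Mappable = (*CachingInodeOperations)(nil)
	_ memmap.File     = (*CachingInodeOperations)(nil)
)

// exampleReadWriteOut is an illustrative sketch, not part of the original
// file, of the call pattern described in the CachingInodeOperations
// documentation: FileOperations read and write through the cache, and
// InodeOperations.WriteOut flushes cached data and metadata before syncing
// the backing file. The function name and its arguments are hypothetical and
// assumed to be supplied by a caller that owns the inode and file.
func exampleReadWriteOut(ctx context.Context, inode *fs.Inode, file *fs.File, c *CachingInodeOperations, dst, src usermem.IOSequence) error {
	// Reads are served from cached frames where possible and otherwise fall
	// back to CachedFileObject.ReadToBlocksAt; reading at or past EOF
	// returns io.EOF.
	if _, err := c.Read(ctx, file, dst, 0); err != nil && err != io.EOF {
		return err
	}
	// Writes go to cached frames if present, otherwise directly to the
	// backing file, and mark mtime/ctime dirty.
	if _, err := c.Write(ctx, src, 0); err != nil {
		return err
	}
	// WriteOut writes back dirty pages and attributes, then calls
	// CachedFileObject.Sync.
	return c.WriteOut(ctx, inode)
}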