github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/fsimpl/gofer/regular_file.go

// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package gofer

import (
	"fmt"
	"io"
	"math"

	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
	"github.com/MerlinKodo/gvisor/pkg/context"
	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
	"github.com/MerlinKodo/gvisor/pkg/hostarch"
	"github.com/MerlinKodo/gvisor/pkg/log"
	"github.com/MerlinKodo/gvisor/pkg/metric"
	"github.com/MerlinKodo/gvisor/pkg/safemem"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsmetric"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsutil"
	"github.com/MerlinKodo/gvisor/pkg/sentry/memmap"
	"github.com/MerlinKodo/gvisor/pkg/sentry/pgalloc"
	"github.com/MerlinKodo/gvisor/pkg/sentry/usage"
	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
	"github.com/MerlinKodo/gvisor/pkg/sync"
	"github.com/MerlinKodo/gvisor/pkg/usermem"
)

func (d *dentry) isRegularFile() bool {
	return d.fileType() == linux.S_IFREG
}

// +stateify savable
type regularFileFD struct {
	fileDescription

	// off is the file offset. off is protected by mu.
	mu  sync.Mutex `state:"nosave"`
	off int64
}

func newRegularFileFD(mnt *vfs.Mount, d *dentry, flags uint32) (*regularFileFD, error) {
	fd := &regularFileFD{}
	fd.LockFD.Init(&d.locks)
	if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
		AllowDirectIO: true,
	}); err != nil {
		return nil, err
	}
	if fd.vfsfd.IsWritable() && (d.mode.Load()&0111 != 0) {
		metric.SuspiciousOperationsMetric.Increment(&metric.SuspiciousOperationsTypeOpenedWriteExecuteFile)
	}
	if d.mmapFD.Load() >= 0 {
		fsmetric.GoferOpensHost.Increment()
	} else {
		fsmetric.GoferOpens9P.Increment()
	}
	return fd, nil
}

// Release implements vfs.FileDescriptionImpl.Release.
func (fd *regularFileFD) Release(context.Context) {
}

// OnClose implements vfs.FileDescriptionImpl.OnClose.
func (fd *regularFileFD) OnClose(ctx context.Context) error {
	if !fd.vfsfd.IsWritable() {
		return nil
	}
	d := fd.dentry()
	if d.fs.opts.interop == InteropModeExclusive {
		// d may have dirty pages that we won't write back now (and wouldn't
		// have in VFS1), making a flushf RPC ineffective. If this is the case,
		// skip the flushf.
		//
		// Note that it's also possible to have dirty pages under other interop
		// modes if forcePageCache is in effect; we conservatively assume that
		// applications have some way of tolerating this and still want the
		// flushf.
		d.dataMu.RLock()
		haveDirtyPages := !d.dirty.IsEmpty()
		d.dataMu.RUnlock()
		if haveDirtyPages {
			return nil
		}
	}
	return d.flush(ctx)
}

// Allocate implements vfs.FileDescriptionImpl.Allocate.
func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
	d := fd.dentry()
	return d.doAllocate(ctx, offset, length, func() error {
		return d.allocate(ctx, mode, offset, length)
	})
}

// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
	start := fsmetric.StartReadWait()
	d := fd.dentry()
	defer func() {
		if d.readFD.Load() >= 0 {
			fsmetric.GoferReadsHost.Increment()
			fsmetric.FinishReadWait(fsmetric.GoferReadWaitHost, start)
		} else {
			fsmetric.GoferReads9P.Increment()
			fsmetric.FinishReadWait(fsmetric.GoferReadWait9P, start)
		}
	}()

	if offset < 0 {
		return 0, linuxerr.EINVAL
	}

	// Check that flags are supported.
	//
	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
	if opts.Flags&^linux.RWF_HIPRI != 0 {
		return 0, linuxerr.EOPNOTSUPP
	}

	// Check for reading at EOF before calling into MM (but not under
	// InteropModeShared, which makes d.size unreliable).
	if d.cachedMetadataAuthoritative() && uint64(offset) >= d.size.Load() {
		return 0, io.EOF
	}

	var (
		n       int64
		readErr error
	)
	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
		// Write dirty cached pages that will be touched by the read back to
		// the remote file.
		if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil {
			return 0, err
		}
		rw := getDentryReadWriter(ctx, d, offset)
		// Require the read to go to the remote file.
		rw.direct = true
		n, readErr = dst.CopyOutFrom(ctx, rw)
		putDentryReadWriter(rw)
		if d.fs.opts.interop != InteropModeShared {
			// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
			d.touchAtimeLocked(fd.vfsfd.Mount())
		}
	} else {
		rw := getDentryReadWriter(ctx, d, offset)
		n, readErr = dst.CopyOutFrom(ctx, rw)
		putDentryReadWriter(rw)
		if d.fs.opts.interop != InteropModeShared {
			// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
			d.touchAtime(fd.vfsfd.Mount())
		}
	}
	return n, readErr
}

// Read implements vfs.FileDescriptionImpl.Read.
func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	fd.mu.Lock()
	n, err := fd.PRead(ctx, dst, fd.off, opts)
	fd.off += n
	fd.mu.Unlock()
	return n, err
}

// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
	n, _, err := fd.pwrite(ctx, src, offset, opts)
	return n, err
}

// pwrite returns the number of bytes written, final offset, error. The final
// offset should be ignored by PWrite.
func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
	if offset < 0 {
		return 0, offset, linuxerr.EINVAL
	}

	// Check that flags are supported.
	//
	// TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
	if opts.Flags&^linux.RWF_HIPRI != 0 {
		return 0, offset, linuxerr.EOPNOTSUPP
	}

	d := fd.dentry()

	d.metadataMu.Lock()
	defer d.metadataMu.Unlock()
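
	// Note: the deferred unlock above keeps d.metadataMu held for the rest of
	// pwrite, so the O_APPEND offset read, the timestamp update, and the write
	// itself below all happen under the same critical section.
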
	// If the fd was opened with O_APPEND, make sure the file size is updated.
	// There is a possible race here if size is modified externally after
	// metadata cache is updated.
	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
		if err := d.refreshSizeLocked(ctx); err != nil {
			return 0, offset, err
		}
	}

	// Set offset to file size if the fd was opened with O_APPEND.
	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
		// Holding d.metadataMu is sufficient for reading d.size.
		offset = int64(d.size.RacyLoad())
	}
	limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
	if err != nil {
		return 0, offset, err
	}
	src = src.TakeFirst64(limit)

	if d.fs.opts.interop != InteropModeShared {
		// Compare Linux's mm/filemap.c:__generic_file_write_iter() =>
		// file_update_time(). This is d.touchCMtime(), but without locking
		// d.metadataMu (recursively).
		d.touchCMtimeLocked()
	}

	rw := getDentryReadWriter(ctx, d, offset)
	defer putDentryReadWriter(rw)

	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
		if err := fd.writeCache(ctx, d, offset, src); err != nil {
			return 0, offset, err
		}

		// Require the write to go to the remote file.
		rw.direct = true
	}

	n, err := src.CopyInTo(ctx, rw)
	if err != nil {
		return n, offset + n, err
	}
	if n > 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
		// Note that if any of the following fail, then we can't guarantee that
		// any data was actually written with the semantics of O_DSYNC or
		// O_SYNC, so we return zero bytes written. Compare Linux's
		// mm/filemap.c:generic_file_write_iter() =>
		// include/linux/fs.h:generic_write_sync().
		//
		// Write dirty cached pages touched by the write back to the remote
		// file.
		if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
			return 0, offset, err
		}
		// Request the remote filesystem to sync the remote file.
		if err := d.syncRemoteFile(ctx); err != nil {
			return 0, offset, err
		}
	}

	// As with Linux, writing clears the setuid and setgid bits.
	if n > 0 {
		oldMode := d.mode.Load()
		// If setuid or setgid were set, update d.mode and propagate
		// changes to the host.
		if newMode := vfs.ClearSUIDAndSGID(oldMode); newMode != oldMode {
			if err := d.chmod(ctx, uint16(newMode)); err != nil {
				return 0, offset, err
			}
			d.mode.Store(newMode)
		}
	}

	return n, offset + n, nil
}

// writeCache prepares for an O_DIRECT write by writing dirty cached pages in
// the written range back to the remote file, then dropping them from the
// cache and invalidating any application mappings of those pages.
func (fd *regularFileFD) writeCache(ctx context.Context, d *dentry, offset int64, src usermem.IOSequence) error {
	// Write dirty cached pages that will be touched by the write back to
	// the remote file.
	if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
		return err
	}

	// Remove touched pages from the cache.
	pgstart := hostarch.PageRoundDown(uint64(offset))
	pgend, ok := hostarch.PageRoundUp(uint64(offset + src.NumBytes()))
	if !ok {
		return linuxerr.EINVAL
	}
	mr := memmap.MappableRange{pgstart, pgend}
	var freed []memmap.FileRange

	d.dataMu.Lock()
	cseg := d.cache.LowerBoundSegment(mr.Start)
	for cseg.Ok() && cseg.Start() < mr.End {
		cseg = d.cache.Isolate(cseg, mr)
		freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()})
		cseg = d.cache.Remove(cseg).NextSegment()
	}
	d.dataMu.Unlock()
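
	// d.mapsMu is acquired only after d.dataMu has been released above,
	// consistent with the mapsMu -> dataMu lock ordering seen in
	// RemoveMapping below.
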
	// Invalidate mappings of removed pages.
	d.mapsMu.Lock()
	d.mappings.Invalidate(mr, memmap.InvalidateOpts{})
	d.mapsMu.Unlock()

	// Finally free pages removed from the cache.
	mf := d.fs.mfp.MemoryFile()
	for _, freedFR := range freed {
		mf.DecRef(freedFR)
	}
	return nil
}

// Write implements vfs.FileDescriptionImpl.Write.
func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
	fd.mu.Lock()
	n, off, err := fd.pwrite(ctx, src, fd.off, opts)
	fd.off = off
	fd.mu.Unlock()
	return n, err
}

// dentryReadWriter implements safemem.Reader and safemem.Writer for I/O that
// goes through a dentry's page cache or, when caching is not possible,
// directly to its read/write handles.
type dentryReadWriter struct {
	ctx    context.Context
	d      *dentry
	off    uint64
	direct bool
}

var dentryReadWriterPool = sync.Pool{
	New: func() any {
		return &dentryReadWriter{}
	},
}

// getDentryReadWriter returns a pooled dentryReadWriter initialized for d at
// the given offset.
func getDentryReadWriter(ctx context.Context, d *dentry, offset int64) *dentryReadWriter {
	rw := dentryReadWriterPool.Get().(*dentryReadWriter)
	rw.ctx = ctx
	rw.d = d
	rw.off = uint64(offset)
	rw.direct = false
	return rw
}

// putDentryReadWriter clears rw's references and returns it to the pool.
func putDentryReadWriter(rw *dentryReadWriter) {
	rw.ctx = nil
	rw.d = nil
	dentryReadWriterPool.Put(rw)
}

// ReadToBlocks implements safemem.Reader.ReadToBlocks.
func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
	if dsts.IsEmpty() {
		return 0, nil
	}

	// If we have a mmappable host FD (which must be used here to ensure
	// coherence with memory-mapped I/O), or if InteropModeShared is in effect
	// (which prevents us from caching file contents and makes dentry.size
	// unreliable), or if the file was opened O_DIRECT, read directly from
	// readHandle() without locking dentry.dataMu.
	rw.d.handleMu.RLock()
	h := rw.d.readHandle()
	if (rw.d.mmapFD.RacyLoad() >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
		n, err := h.readToBlocksAt(rw.ctx, dsts, rw.off)
		rw.d.handleMu.RUnlock()
		rw.off += n
		return n, err
	}

	// Otherwise read from/through the cache.
	mf := rw.d.fs.mfp.MemoryFile()
	fillCache := mf.ShouldCacheEvictable()
	var dataMuUnlock func()
	if fillCache {
		rw.d.dataMu.Lock()
		dataMuUnlock = rw.d.dataMu.Unlock
	} else {
		rw.d.dataMu.RLock()
		dataMuUnlock = rw.d.dataMu.RUnlock
	}

	// Compute the range to read (limited by file size and overflow-checked).
	end := rw.d.size.Load()
	if rw.off >= end {
		dataMuUnlock()
		rw.d.handleMu.RUnlock()
		return 0, io.EOF
	}
	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
		end = rend
	}

	var done uint64
	seg, gap := rw.d.cache.Find(rw.off)
	for rw.off < end {
		mr := memmap.MappableRange{rw.off, end}
		switch {
		case seg.Ok():
			// Get internal mappings from the cache.
			ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Read)
			if err != nil {
				dataMuUnlock()
				rw.d.handleMu.RUnlock()
				return done, err
			}

			// Copy from internal mappings.
			n, err := safemem.CopySeq(dsts, ims)
			done += n
			rw.off += n
			dsts = dsts.DropFirst64(n)
			if err != nil {
				dataMuUnlock()
				rw.d.handleMu.RUnlock()
				return done, err
			}

			// Continue.
			seg, gap = seg.NextNonEmpty()

		case gap.Ok():
			gapMR := gap.Range().Intersect(mr)
			if fillCache {
				// Read into the cache, then re-enter the loop to read from the
				// cache.
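				//
				// The fill request is page-aligned: gapMR.Start is rounded
				// down and gapMR.End is rounded up so that whole pages are
				// brought into the cache.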
				gapEnd, _ := hostarch.PageRoundUp(gapMR.End)
				reqMR := memmap.MappableRange{
					Start: hostarch.PageRoundDown(gapMR.Start),
					End:   gapEnd,
				}
				optMR := gap.Range()
				_, err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), rw.d.size.Load(), mf, usage.PageCache, pgalloc.AllocateAndWritePopulate, h.readToBlocksAt)
				mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End})
				seg, gap = rw.d.cache.Find(rw.off)
				if !seg.Ok() {
					dataMuUnlock()
					rw.d.handleMu.RUnlock()
					return done, err
				}
				// err might have occurred in part of gap.Range() outside gapMR
				// (in particular, gap.End() might be beyond EOF). Forget about
				// it for now; if the error matters and persists, we'll run
				// into it again in a later iteration of this loop.
			} else {
				// Read directly from the file.
				gapDsts := dsts.TakeFirst64(gapMR.Length())
				n, err := h.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start)
				done += n
				rw.off += n
				dsts = dsts.DropFirst64(n)
				// Partial reads are fine. But we must stop reading.
				if n != gapDsts.NumBytes() || err != nil {
					dataMuUnlock()
					rw.d.handleMu.RUnlock()
					return done, err
				}

				// Continue.
				seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
			}
		}
	}
	dataMuUnlock()
	rw.d.handleMu.RUnlock()
	return done, nil
}

// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
//
// Preconditions: rw.d.metadataMu must be locked.
func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
	if srcs.IsEmpty() {
		return 0, nil
	}

	// If we have a mmappable host FD (which must be used here to ensure
	// coherence with memory-mapped I/O), or if InteropModeShared is in effect
	// (which prevents us from caching file contents), or if the file was
	// opened with O_DIRECT, write directly to dentry.writeHandle()
	// without locking dentry.dataMu.
	rw.d.handleMu.RLock()
	h := rw.d.writeHandle()
	if (rw.d.mmapFD.RacyLoad() >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
		n, err := h.writeFromBlocksAt(rw.ctx, srcs, rw.off)
		rw.off += n
		rw.d.dataMu.Lock()
		if rw.off > rw.d.size.Load() {
			rw.d.size.Store(rw.off)
			// The remote file's size will implicitly be extended to the correct
			// value when we write back to it.
		}
		rw.d.dataMu.Unlock()
		rw.d.handleMu.RUnlock()
		return n, err
	}

	// Otherwise write to/through the cache.
	mf := rw.d.fs.mfp.MemoryFile()
	rw.d.dataMu.Lock()

	// Compute the range to write (overflow-checked).
	start := rw.off
	end := rw.off + srcs.NumBytes()
	if end <= rw.off {
		end = math.MaxInt64
	}

	var (
		done   uint64
		retErr error
	)
	seg, gap := rw.d.cache.Find(rw.off)
	for rw.off < end {
		mr := memmap.MappableRange{rw.off, end}
		switch {
		case seg.Ok():
			// Get internal mappings from the cache.
			segMR := seg.Range().Intersect(mr)
			ims, err := mf.MapInternal(seg.FileRangeOf(segMR), hostarch.Write)
			if err != nil {
				retErr = err
				goto exitLoop
			}

			// Copy to internal mappings.
			n, err := safemem.CopySeq(ims, srcs)
			done += n
			rw.off += n
			srcs = srcs.DropFirst64(n)
			rw.d.dirty.MarkDirty(segMR)
			if err != nil {
				retErr = err
				goto exitLoop
			}

			// Continue.
			seg, gap = seg.NextNonEmpty()

		case gap.Ok():
			// Write directly to the file. At present, we never fill the cache
			// when writing, since doing so can convert small writes into
			// inefficient read-modify-write cycles, and we have no mechanism
			// for detecting or avoiding this.
			gapMR := gap.Range().Intersect(mr)
			gapSrcs := srcs.TakeFirst64(gapMR.Length())
			n, err := h.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start)
			done += n
			rw.off += n
			srcs = srcs.DropFirst64(n)
			// Partial writes are fine. But we must stop writing.
			if n != gapSrcs.NumBytes() || err != nil {
				retErr = err
				goto exitLoop
			}

			// Continue.
			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
		}
	}
exitLoop:
	if rw.off > rw.d.size.Load() {
		rw.d.size.Store(rw.off)
		// The remote file's size will implicitly be extended to the correct
		// value when we write back to it.
	}
	// If InteropModeWritethrough is in effect, flush written data back to the
	// remote filesystem.
	if rw.d.fs.opts.interop == InteropModeWritethrough && done != 0 {
		if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{
			Start: start,
			End:   rw.off,
		}, &rw.d.cache, &rw.d.dirty, rw.d.size.Load(), mf, h.writeFromBlocksAt); err != nil {
			// We have no idea how many bytes were actually flushed.
			rw.off = start
			done = 0
			retErr = err
		}
	}
	rw.d.dataMu.Unlock()
	rw.d.handleMu.RUnlock()
	return done, retErr
}

// writeback writes dirty cached pages in the range [offset, offset+size),
// clipped to the current file size, back to the remote file.
func (d *dentry) writeback(ctx context.Context, offset, size int64) error {
	if size == 0 {
		return nil
	}
	d.handleMu.RLock()
	defer d.handleMu.RUnlock()
	h := d.writeHandle()
	d.dataMu.Lock()
	defer d.dataMu.Unlock()
	// Compute the range of valid bytes (overflow-checked).
	dentrySize := d.size.Load()
	if uint64(offset) >= dentrySize {
		return nil
	}
	end := int64(dentrySize)
	if rend := offset + size; rend > offset && rend < end {
		end = rend
	}
	return fsutil.SyncDirty(ctx, memmap.MappableRange{
		Start: uint64(offset),
		End:   uint64(end),
	}, &d.cache, &d.dirty, dentrySize, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt)
}

// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
	fd.mu.Lock()
	defer fd.mu.Unlock()
	newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence)
	if err != nil {
		return 0, err
	}
	fd.off = newOffset
	return newOffset, nil
}

// regularFileSeekLocked calculates the new offset for a seek operation on a
// regular file.
func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int64, whence int32) (int64, error) {
	switch whence {
	case linux.SEEK_SET:
		// Use offset as specified.
	case linux.SEEK_CUR:
		offset += fdOffset
	case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE:
		// Ensure file size is up to date.
		if !d.cachedMetadataAuthoritative() {
			if err := d.updateMetadata(ctx); err != nil {
				return 0, err
			}
		}
		size := int64(d.size.Load())
		// For SEEK_DATA and SEEK_HOLE, treat the file as a single contiguous
		// block of data.
		switch whence {
		case linux.SEEK_END:
			offset += size
		case linux.SEEK_DATA:
			if offset >= size {
				return 0, linuxerr.ENXIO
			}
			// Use offset as specified.
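		// Since the file is treated as a single contiguous extent of data,
		// the only hole is the implicit one at EOF.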
		case linux.SEEK_HOLE:
			if offset >= size {
				return 0, linuxerr.ENXIO
			}
			offset = size
		}
	default:
		return 0, linuxerr.EINVAL
	}
	if offset < 0 {
		return 0, linuxerr.EINVAL
	}
	return offset, nil
}

// Sync implements vfs.FileDescriptionImpl.Sync.
func (fd *regularFileFD) Sync(ctx context.Context) error {
	return fd.dentry().syncCachedFile(ctx, false /* forFilesystemSync */)
}

// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
	d := fd.dentry()
	// Force sentry page caching at your own risk.
	if !d.fs.opts.forcePageCache {
		switch d.fs.opts.interop {
		case InteropModeExclusive:
			// Any mapping is fine.
		case InteropModeWritethrough:
			// Shared writable mappings require a host FD, since otherwise we
			// can't synchronously flush memory-mapped writes to the remote
			// file.
			if opts.Private || !opts.MaxPerms.Write {
				break
			}
			fallthrough
		case InteropModeShared:
			// All mappings require a host FD to be coherent with other
			// filesystem users.
			if d.mmapFD.Load() < 0 {
				return linuxerr.ENODEV
			}
		default:
			panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop))
		}
	}
	// After this point, d may be used as a memmap.Mappable.
	d.pf.hostFileMapperInitOnce.Do(d.pf.hostFileMapper.Init)
	opts.SentryOwnedContent = d.fs.opts.forcePageCache
	return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts)
}

// mayCachePagesInMemoryFile returns true if the filesystem may cache pages of
// regular files in the sentry's MemoryFile.
func (fs *filesystem) mayCachePagesInMemoryFile() bool {
	return fs.opts.forcePageCache || fs.opts.interop != InteropModeShared
}

// AddMapping implements memmap.Mappable.AddMapping.
func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error {
	d.mapsMu.Lock()
	mapped := d.mappings.AddMapping(ms, ar, offset, writable)
	// Do this unconditionally since whether we have a host FD can change
	// across save/restore.
	for _, r := range mapped {
		d.pf.hostFileMapper.IncRefOn(r)
	}
	if d.fs.mayCachePagesInMemoryFile() {
		// d.Evict() will refuse to evict memory-mapped pages, so tell the
		// MemoryFile to not bother trying.
		mf := d.fs.mfp.MemoryFile()
		for _, r := range mapped {
			mf.MarkUnevictable(d, pgalloc.EvictableRange{r.Start, r.End})
		}
	}
	d.mapsMu.Unlock()
	return nil
}

// RemoveMapping implements memmap.Mappable.RemoveMapping.
func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) {
	d.mapsMu.Lock()
	unmapped := d.mappings.RemoveMapping(ms, ar, offset, writable)
	for _, r := range unmapped {
		d.pf.hostFileMapper.DecRefOn(r)
	}
	if d.fs.mayCachePagesInMemoryFile() {
		// Pages that are no longer referenced by any application memory
		// mappings are now considered unused; allow MemoryFile to evict them
		// when necessary.
		mf := d.fs.mfp.MemoryFile()
		d.dataMu.Lock()
		for _, r := range unmapped {
			// Since these pages are no longer mapped, they are no longer
			// concurrently dirtyable by a writable memory mapping.
			d.dirty.AllowClean(r)
			mf.MarkEvictable(d, pgalloc.EvictableRange{r.Start, r.End})
		}
		d.dataMu.Unlock()
	}
	d.mapsMu.Unlock()
}

// CopyMapping implements memmap.Mappable.CopyMapping.
func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error {
	return d.AddMapping(ctx, ms, dstAR, offset, writable)
}

// Translate implements memmap.Mappable.Translate.
func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
	d.handleMu.RLock()
	if d.mmapFD.RacyLoad() >= 0 && !d.fs.opts.forcePageCache {
		d.handleMu.RUnlock()
		mr := optional
		if d.fs.opts.limitHostFDTranslation {
			mr = maxFillRange(required, optional)
		}
		return []memmap.Translation{
			{
				Source: mr,
				File:   &d.pf,
				Offset: mr.Start,
				Perms:  hostarch.AnyAccess,
			},
		}, nil
	}

	d.dataMu.Lock()

	// Constrain translations to d.size (rounded up) to prevent translation to
	// pages that may be concurrently truncated.
	pgend, _ := hostarch.PageRoundUp(d.size.Load())
	var beyondEOF bool
	if required.End > pgend {
		if required.Start >= pgend {
			d.dataMu.Unlock()
			d.handleMu.RUnlock()
			return nil, &memmap.BusError{io.EOF}
		}
		beyondEOF = true
		required.End = pgend
	}
	if optional.End > pgend {
		optional.End = pgend
	}

	mf := d.fs.mfp.MemoryFile()
	h := d.readHandle()
	_, cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), d.size.Load(), mf, usage.PageCache, pgalloc.AllocateAndWritePopulate, h.readToBlocksAt)

	var ts []memmap.Translation
	var translatedEnd uint64
	for seg := d.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
		segMR := seg.Range().Intersect(optional)
		// TODO(jamieliu): Make Translations writable even if writability is
		// not required if already kept-dirty by another writable translation.
		perms := hostarch.AccessType{
			Read:    true,
			Execute: true,
		}
		if at.Write {
			// From this point forward, this memory can be dirtied through the
			// mapping at any time.
			d.dirty.KeepDirty(segMR)
			perms.Write = true
		}
		ts = append(ts, memmap.Translation{
			Source: segMR,
			File:   mf,
			Offset: seg.FileRangeOf(segMR).Start,
			Perms:  perms,
		})
		translatedEnd = segMR.End
	}

	d.dataMu.Unlock()
	d.handleMu.RUnlock()

	// Don't return the error returned by d.cache.Fill if it occurred outside
	// of required.
	if translatedEnd < required.End && cerr != nil {
		return ts, &memmap.BusError{cerr}
	}
	if beyondEOF {
		return ts, &memmap.BusError{io.EOF}
	}
	return ts, nil
}

// maxFillRange returns a range to fill that covers required and extends
// toward optional, limiting readahead to at most maxReadahead bytes.
func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange {
	const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily
	if required.Length() >= maxReadahead {
		return required
	}
	if optional.Length() <= maxReadahead {
		return optional
	}
	optional.Start = required.Start
	if optional.Length() <= maxReadahead {
		return optional
	}
	optional.End = optional.Start + maxReadahead
	return optional
}

// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
	// Whether we have a host fd (and consequently what memmap.File is
	// mapped) can change across save/restore, so invalidate all translations
	// unconditionally.
	d.mapsMu.Lock()
	defer d.mapsMu.Unlock()
	d.mappings.InvalidateAll(memmap.InvalidateOpts{})

	// Write the cache's contents back to the remote file so that if we have a
	// host fd after restore, the remote file's contents are coherent.
	mf := d.fs.mfp.MemoryFile()
	d.handleMu.RLock()
	defer d.handleMu.RUnlock()
	h := d.writeHandle()
	d.dataMu.Lock()
	defer d.dataMu.Unlock()
	if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil {
		return err
	}

	// Discard the cache so that it's not stored in saved state. This is safe
	// because per InvalidateUnsavable invariants, no new translations can have
	// been returned after we invalidated all existing translations above.
	d.cache.DropAll(mf)
	d.dirty.RemoveAll()

	return nil
}

// Evict implements pgalloc.EvictableMemoryUser.Evict.
func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
	mr := memmap.MappableRange{er.Start, er.End}
	mf := d.fs.mfp.MemoryFile()
	d.mapsMu.Lock()
	defer d.mapsMu.Unlock()
	d.handleMu.RLock()
	defer d.handleMu.RUnlock()
	h := d.writeHandle()
	d.dataMu.Lock()
	defer d.dataMu.Unlock()

	// Only allow pages that are no longer memory-mapped to be evicted.
	for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() {
		mgapMR := mgap.Range().Intersect(mr)
		if mgapMR.Length() == 0 {
			continue
		}
		if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil {
			log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err)
		}
		d.cache.Drop(mgapMR, mf)
		d.dirty.KeepClean(mgapMR)
	}
}

// dentryPlatformFile implements memmap.File. It exists solely because dentry
// cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef.
//
// dentryPlatformFile is only used when a host FD representing the remote file
// is available (i.e. dentry.mmapFD >= 0), and that FD is used for application
// memory mappings (i.e. !filesystem.opts.forcePageCache).
//
// +stateify savable
type dentryPlatformFile struct {
	*dentry

	// fdRefs counts references on memmap.File offsets. fdRefs is protected
	// by dentry.dataMu.
	fdRefs fsutil.FrameRefSet

	// If this dentry represents a regular file, and dentry.mmapFD >= 0,
	// hostFileMapper caches mappings of dentry.mmapFD.
	hostFileMapper fsutil.HostFileMapper

	// hostFileMapperInitOnce is used to lazily initialize hostFileMapper.
	hostFileMapperInitOnce sync.Once `state:"nosave"`
}

// IncRef implements memmap.File.IncRef.
func (d *dentryPlatformFile) IncRef(fr memmap.FileRange, memCgID uint32) {
	d.dataMu.Lock()
	d.fdRefs.IncRefAndAccount(fr, memCgID)
	d.dataMu.Unlock()
}

// DecRef implements memmap.File.DecRef.
func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) {
	d.dataMu.Lock()
	d.fdRefs.DecRefAndAccount(fr)
	d.dataMu.Unlock()
}

// MapInternal implements memmap.File.MapInternal.
func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) {
	d.handleMu.RLock()
	defer d.handleMu.RUnlock()
	return d.hostFileMapper.MapInternal(fr, int(d.mmapFD.RacyLoad()), at.Write)
}

// FD implements memmap.File.FD.
func (d *dentryPlatformFile) FD() int {
	d.handleMu.RLock()
	defer d.handleMu.RUnlock()
	return int(d.mmapFD.RacyLoad())
}