github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fsimpl/gofer/regular_file.go

// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package gofer

import (
	"fmt"
	"io"
	"math"
	"sync/atomic"

	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/context"
	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/log"
	"github.com/SagerNet/gvisor/pkg/metric"
	"github.com/SagerNet/gvisor/pkg/p9"
	"github.com/SagerNet/gvisor/pkg/safemem"
	"github.com/SagerNet/gvisor/pkg/sentry/fs/fsutil"
	"github.com/SagerNet/gvisor/pkg/sentry/fsmetric"
	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
	"github.com/SagerNet/gvisor/pkg/sentry/pgalloc"
	"github.com/SagerNet/gvisor/pkg/sentry/usage"
	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
	"github.com/SagerNet/gvisor/pkg/sync"
	"github.com/SagerNet/gvisor/pkg/syserror"
	"github.com/SagerNet/gvisor/pkg/usermem"
)

func (d *dentry) isRegularFile() bool {
	return d.fileType() == linux.S_IFREG
}

// +stateify savable
type regularFileFD struct {
	fileDescription

	// off is the file offset. off is protected by mu.
	mu  sync.Mutex `state:"nosave"`
	off int64
}

func newRegularFileFD(mnt *vfs.Mount, d *dentry, flags uint32) (*regularFileFD, error) {
	fd := &regularFileFD{}
	fd.LockFD.Init(&d.locks)
	if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
		AllowDirectIO: true,
	}); err != nil {
		return nil, err
	}
	if fd.vfsfd.IsWritable() && (atomic.LoadUint32(&d.mode)&0111 != 0) {
		metric.SuspiciousOperationsMetric.Increment("opened_write_execute_file")
	}
	if atomic.LoadInt32(&d.mmapFD) >= 0 {
		fsmetric.GoferOpensHost.Increment()
	} else {
		fsmetric.GoferOpens9P.Increment()
	}
	return fd, nil
}

// Release implements vfs.FileDescriptionImpl.Release.
func (fd *regularFileFD) Release(context.Context) {
}

// OnClose implements vfs.FileDescriptionImpl.OnClose.
func (fd *regularFileFD) OnClose(ctx context.Context) error {
	if !fd.vfsfd.IsWritable() {
		return nil
	}
	// Skip flushing if there are client-buffered writes, since (as with the
	// VFS1 client) we don't flush buffered writes on close anyway.
	d := fd.dentry()
	if d.fs.opts.interop != InteropModeExclusive {
		return nil
	}
	d.dataMu.RLock()
	haveDirtyPages := !d.dirty.IsEmpty()
	d.dataMu.RUnlock()
	if haveDirtyPages {
		return nil
	}
	d.handleMu.RLock()
	defer d.handleMu.RUnlock()
	if d.writeFile.isNil() {
		return nil
	}
	return d.writeFile.flush(ctx)
}

// Allocate implements vfs.FileDescriptionImpl.Allocate.
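//
// The request is forwarded to the remote file: mode is translated to a 9P
// allocation mode via p9.ToAllocateMode and sent over the dentry's write
// handle while handleMu is held for reading.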
func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
	d := fd.dentry()
	return d.doAllocate(ctx, offset, length, func() error {
		d.handleMu.RLock()
		defer d.handleMu.RUnlock()
		return d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
	})
}

// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
	start := fsmetric.StartReadWait()
	d := fd.dentry()
	defer func() {
		if atomic.LoadInt32(&d.readFD) >= 0 {
			fsmetric.GoferReadsHost.Increment()
			fsmetric.FinishReadWait(fsmetric.GoferReadWaitHost, start)
		} else {
			fsmetric.GoferReads9P.Increment()
			fsmetric.FinishReadWait(fsmetric.GoferReadWait9P, start)
		}
	}()

	if offset < 0 {
		return 0, linuxerr.EINVAL
	}

	// Check that flags are supported.
	//
	// TODO(github.com/SagerNet/issue/2601): Support select preadv2 flags.
	if opts.Flags&^linux.RWF_HIPRI != 0 {
		return 0, syserror.EOPNOTSUPP
	}

	// Check for reading at EOF before calling into MM (but not under
	// InteropModeShared, which makes d.size unreliable).
	if d.cachedMetadataAuthoritative() && uint64(offset) >= atomic.LoadUint64(&d.size) {
		return 0, io.EOF
	}

	var (
		n       int64
		readErr error
	)
	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
		// Lock d.metadataMu for the rest of the read to prevent d.size from
		// changing.
		d.metadataMu.Lock()
		defer d.metadataMu.Unlock()
		// Write dirty cached pages that will be touched by the read back to
		// the remote file.
		if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil {
			return 0, err
		}
		rw := getDentryReadWriter(ctx, d, offset)
		// Require the read to go to the remote file.
		rw.direct = true
		n, readErr = dst.CopyOutFrom(ctx, rw)
		putDentryReadWriter(rw)
		if d.fs.opts.interop != InteropModeShared {
			// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
			d.touchAtimeLocked(fd.vfsfd.Mount())
		}
	} else {
		rw := getDentryReadWriter(ctx, d, offset)
		n, readErr = dst.CopyOutFrom(ctx, rw)
		putDentryReadWriter(rw)
		if d.fs.opts.interop != InteropModeShared {
			// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
			d.touchAtime(fd.vfsfd.Mount())
		}
	}
	return n, readErr
}

// Read implements vfs.FileDescriptionImpl.Read.
func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	fd.mu.Lock()
	n, err := fd.PRead(ctx, dst, fd.off, opts)
	fd.off += n
	fd.mu.Unlock()
	return n, err
}

// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
	n, _, err := fd.pwrite(ctx, src, offset, opts)
	return n, err
}

// pwrite returns the number of bytes written, final offset, error. The final
// offset should be ignored by PWrite.
func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
	if offset < 0 {
		return 0, offset, linuxerr.EINVAL
	}

	// Check that flags are supported.
	//
	// TODO(github.com/SagerNet/issue/2601): Support select pwritev2 flags.
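	// For now, any flag other than RWF_HIPRI is rejected with EOPNOTSUPP;
	// RWF_HIPRI itself is accepted but has no effect here.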
	if opts.Flags&^linux.RWF_HIPRI != 0 {
		return 0, offset, syserror.EOPNOTSUPP
	}

	d := fd.dentry()

	d.metadataMu.Lock()
	defer d.metadataMu.Unlock()

	// If the fd was opened with O_APPEND, make sure the file size is updated.
	// There is a possible race here if size is modified externally after
	// metadata cache is updated.
	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
		if err := d.refreshSizeLocked(ctx); err != nil {
			return 0, offset, err
		}
	}

	// Set offset to file size if the fd was opened with O_APPEND.
	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
		// Holding d.metadataMu is sufficient for reading d.size.
		offset = int64(d.size)
	}
	limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
	if err != nil {
		return 0, offset, err
	}
	src = src.TakeFirst64(limit)

	if d.fs.opts.interop != InteropModeShared {
		// Compare Linux's mm/filemap.c:__generic_file_write_iter() =>
		// file_update_time(). This is d.touchCMtime(), but without locking
		// d.metadataMu (recursively).
		d.touchCMtimeLocked()
	}

	rw := getDentryReadWriter(ctx, d, offset)
	defer putDentryReadWriter(rw)

	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
		if err := fd.writeCache(ctx, d, offset, src); err != nil {
			return 0, offset, err
		}

		// Require the write to go to the remote file.
		rw.direct = true
	}

	n, err := src.CopyInTo(ctx, rw)
	if err != nil {
		return n, offset + n, err
	}
	if n > 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
		// Note that if any of the following fail, then we can't guarantee that
		// any data was actually written with the semantics of O_DSYNC or
		// O_SYNC, so we return zero bytes written. Compare Linux's
		// mm/filemap.c:generic_file_write_iter() =>
		// include/linux/fs.h:generic_write_sync().
		//
		// Write dirty cached pages touched by the write back to the remote
		// file.
		if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
			return 0, offset, err
		}
		// Request the remote filesystem to sync the remote file.
		if err := d.syncRemoteFile(ctx); err != nil {
			return 0, offset, err
		}
	}

	// As with Linux, writing clears the setuid and setgid bits.
	if n > 0 {
		oldMode := atomic.LoadUint32(&d.mode)
		// If setuid or setgid were set, update d.mode and propagate
		// changes to the host.
		if newMode := vfs.ClearSUIDAndSGID(oldMode); newMode != oldMode {
			atomic.StoreUint32(&d.mode, newMode)
			if err := d.file.setAttr(ctx, p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(newMode)}); err != nil {
				return 0, offset, err
			}
		}
	}

	return n, offset + n, nil
}

func (fd *regularFileFD) writeCache(ctx context.Context, d *dentry, offset int64, src usermem.IOSequence) error {
	// Write dirty cached pages that will be touched by the write back to
	// the remote file.
	if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
		return err
	}

	// Remove touched pages from the cache.
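	// The removed range is page-aligned; for illustration, assuming 4 KiB
	// pages, a 10-byte O_DIRECT write at offset 5000 drops the entire page
	// range [4096, 8192) from the cache.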
	pgstart := hostarch.PageRoundDown(uint64(offset))
	pgend, ok := hostarch.PageRoundUp(uint64(offset + src.NumBytes()))
	if !ok {
		return linuxerr.EINVAL
	}
	mr := memmap.MappableRange{pgstart, pgend}
	var freed []memmap.FileRange

	d.dataMu.Lock()
	cseg := d.cache.LowerBoundSegment(mr.Start)
	for cseg.Ok() && cseg.Start() < mr.End {
		cseg = d.cache.Isolate(cseg, mr)
		freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()})
		cseg = d.cache.Remove(cseg).NextSegment()
	}
	d.dataMu.Unlock()

	// Invalidate mappings of removed pages.
	d.mapsMu.Lock()
	d.mappings.Invalidate(mr, memmap.InvalidateOpts{})
	d.mapsMu.Unlock()

	// Finally free pages removed from the cache.
	mf := d.fs.mfp.MemoryFile()
	for _, freedFR := range freed {
		mf.DecRef(freedFR)
	}
	return nil
}

// Write implements vfs.FileDescriptionImpl.Write.
func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
	fd.mu.Lock()
	n, off, err := fd.pwrite(ctx, src, fd.off, opts)
	fd.off = off
	fd.mu.Unlock()
	return n, err
}

type dentryReadWriter struct {
	ctx    context.Context
	d      *dentry
	off    uint64
	direct bool
}

var dentryReadWriterPool = sync.Pool{
	New: func() interface{} {
		return &dentryReadWriter{}
	},
}

func getDentryReadWriter(ctx context.Context, d *dentry, offset int64) *dentryReadWriter {
	rw := dentryReadWriterPool.Get().(*dentryReadWriter)
	rw.ctx = ctx
	rw.d = d
	rw.off = uint64(offset)
	rw.direct = false
	return rw
}

func putDentryReadWriter(rw *dentryReadWriter) {
	rw.ctx = nil
	rw.d = nil
	dentryReadWriterPool.Put(rw)
}

// ReadToBlocks implements safemem.Reader.ReadToBlocks.
func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
	if dsts.IsEmpty() {
		return 0, nil
	}

	// If we have a mmappable host FD (which must be used here to ensure
	// coherence with memory-mapped I/O), or if InteropModeShared is in effect
	// (which prevents us from caching file contents and makes dentry.size
	// unreliable), or if the file was opened O_DIRECT, read directly from
	// dentry.readHandleLocked() without locking dentry.dataMu.
	rw.d.handleMu.RLock()
	h := rw.d.readHandleLocked()
	if (rw.d.mmapFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
		n, err := h.readToBlocksAt(rw.ctx, dsts, rw.off)
		rw.d.handleMu.RUnlock()
		rw.off += n
		return n, err
	}

	// Otherwise read from/through the cache.
	mf := rw.d.fs.mfp.MemoryFile()
	fillCache := mf.ShouldCacheEvictable()
	var dataMuUnlock func()
	if fillCache {
		rw.d.dataMu.Lock()
		dataMuUnlock = rw.d.dataMu.Unlock
	} else {
		rw.d.dataMu.RLock()
		dataMuUnlock = rw.d.dataMu.RUnlock
	}

	// Compute the range to read (limited by file size and overflow-checked).
	if rw.off >= rw.d.size {
		dataMuUnlock()
		rw.d.handleMu.RUnlock()
		return 0, io.EOF
	}
	end := rw.d.size
	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
		end = rend
	}

	var done uint64
	seg, gap := rw.d.cache.Find(rw.off)
	for rw.off < end {
		mr := memmap.MappableRange{rw.off, end}
		switch {
		case seg.Ok():
			// Get internal mappings from the cache.
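			// mf.MapInternal returns a safemem.BlockSeq addressing the cached
			// frames directly, so the CopySeq below copies straight from the
			// sentry page cache into dsts.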
			ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Read)
			if err != nil {
				dataMuUnlock()
				rw.d.handleMu.RUnlock()
				return done, err
			}

			// Copy from internal mappings.
			n, err := safemem.CopySeq(dsts, ims)
			done += n
			rw.off += n
			dsts = dsts.DropFirst64(n)
			if err != nil {
				dataMuUnlock()
				rw.d.handleMu.RUnlock()
				return done, err
			}

			// Continue.
			seg, gap = seg.NextNonEmpty()

		case gap.Ok():
			gapMR := gap.Range().Intersect(mr)
			if fillCache {
				// Read into the cache, then re-enter the loop to read from the
				// cache.
				gapEnd, _ := hostarch.PageRoundUp(gapMR.End)
				reqMR := memmap.MappableRange{
					Start: hostarch.PageRoundDown(gapMR.Start),
					End:   gapEnd,
				}
				optMR := gap.Range()
				err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), rw.d.size, mf, usage.PageCache, h.readToBlocksAt)
				mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End})
				seg, gap = rw.d.cache.Find(rw.off)
				if !seg.Ok() {
					dataMuUnlock()
					rw.d.handleMu.RUnlock()
					return done, err
				}
				// err might have occurred in part of gap.Range() outside gapMR
				// (in particular, gap.End() might be beyond EOF). Forget about
				// it for now; if the error matters and persists, we'll run
				// into it again in a later iteration of this loop.
			} else {
				// Read directly from the file.
				gapDsts := dsts.TakeFirst64(gapMR.Length())
				n, err := h.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start)
				done += n
				rw.off += n
				dsts = dsts.DropFirst64(n)
				// Partial reads are fine. But we must stop reading.
				if n != gapDsts.NumBytes() || err != nil {
					dataMuUnlock()
					rw.d.handleMu.RUnlock()
					return done, err
				}

				// Continue.
				seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
			}
		}
	}
	dataMuUnlock()
	rw.d.handleMu.RUnlock()
	return done, nil
}

// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
//
// Preconditions: rw.d.metadataMu must be locked.
func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
	if srcs.IsEmpty() {
		return 0, nil
	}

	// If we have a mmappable host FD (which must be used here to ensure
	// coherence with memory-mapped I/O), or if InteropModeShared is in effect
	// (which prevents us from caching file contents), or if the file was
	// opened with O_DIRECT, write directly to dentry.writeHandleLocked()
	// without locking dentry.dataMu.
	rw.d.handleMu.RLock()
	h := rw.d.writeHandleLocked()
	if (rw.d.mmapFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
		n, err := h.writeFromBlocksAt(rw.ctx, srcs, rw.off)
		rw.off += n
		rw.d.dataMu.Lock()
		if rw.off > rw.d.size {
			atomic.StoreUint64(&rw.d.size, rw.off)
			// The remote file's size will implicitly be extended to the correct
			// value when we write back to it.
		}
		rw.d.dataMu.Unlock()
		rw.d.handleMu.RUnlock()
		return n, err
	}

	// Otherwise write to/through the cache.
	mf := rw.d.fs.mfp.MemoryFile()
	rw.d.dataMu.Lock()

	// Compute the range to write (overflow-checked).
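	// start is remembered so that, under InteropModeWritethrough, a failed
	// flush at the end of this function can roll back and report zero bytes
	// written.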
	start := rw.off
	end := rw.off + srcs.NumBytes()
	if end <= rw.off {
		end = math.MaxInt64
	}

	var (
		done   uint64
		retErr error
	)
	seg, gap := rw.d.cache.Find(rw.off)
	for rw.off < end {
		mr := memmap.MappableRange{rw.off, end}
		switch {
		case seg.Ok():
			// Get internal mappings from the cache.
			segMR := seg.Range().Intersect(mr)
			ims, err := mf.MapInternal(seg.FileRangeOf(segMR), hostarch.Write)
			if err != nil {
				retErr = err
				goto exitLoop
			}

			// Copy to internal mappings.
			n, err := safemem.CopySeq(ims, srcs)
			done += n
			rw.off += n
			srcs = srcs.DropFirst64(n)
			rw.d.dirty.MarkDirty(segMR)
			if err != nil {
				retErr = err
				goto exitLoop
			}

			// Continue.
			seg, gap = seg.NextNonEmpty()

		case gap.Ok():
			// Write directly to the file. At present, we never fill the cache
			// when writing, since doing so can convert small writes into
			// inefficient read-modify-write cycles, and we have no mechanism
			// for detecting or avoiding this.
			gapMR := gap.Range().Intersect(mr)
			gapSrcs := srcs.TakeFirst64(gapMR.Length())
			n, err := h.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start)
			done += n
			rw.off += n
			srcs = srcs.DropFirst64(n)
			// Partial writes are fine. But we must stop writing.
			if n != gapSrcs.NumBytes() || err != nil {
				retErr = err
				goto exitLoop
			}

			// Continue.
			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
		}
	}
exitLoop:
	if rw.off > rw.d.size {
		atomic.StoreUint64(&rw.d.size, rw.off)
		// The remote file's size will implicitly be extended to the correct
		// value when we write back to it.
	}
	// If InteropModeWritethrough is in effect, flush written data back to the
	// remote filesystem.
	if rw.d.fs.opts.interop == InteropModeWritethrough && done != 0 {
		if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{
			Start: start,
			End:   rw.off,
		}, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, h.writeFromBlocksAt); err != nil {
			// We have no idea how many bytes were actually flushed.
			rw.off = start
			done = 0
			retErr = err
		}
	}
	rw.d.dataMu.Unlock()
	rw.d.handleMu.RUnlock()
	return done, retErr
}

func (d *dentry) writeback(ctx context.Context, offset, size int64) error {
	if size == 0 {
		return nil
	}
	d.handleMu.RLock()
	defer d.handleMu.RUnlock()
	h := d.writeHandleLocked()
	d.dataMu.Lock()
	defer d.dataMu.Unlock()
	// Compute the range of valid bytes (overflow-checked).
	if uint64(offset) >= d.size {
		return nil
	}
	end := int64(d.size)
	if rend := offset + size; rend > offset && rend < end {
		end = rend
	}
	return fsutil.SyncDirty(ctx, memmap.MappableRange{
		Start: uint64(offset),
		End:   uint64(end),
	}, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt)
}

// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
	fd.mu.Lock()
	defer fd.mu.Unlock()
	newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence)
	if err != nil {
		return 0, err
	}
	fd.off = newOffset
	return newOffset, nil
}

// Calculate the new offset for a seek operation on a regular file.
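//
// SEEK_DATA and SEEK_HOLE treat the file as a single contiguous extent: for
// illustration, on a 100-byte file, SEEK_DATA at offset 10 returns 10,
// SEEK_HOLE at offset 10 returns 100 (the implicit hole at EOF), and either
// returns ENXIO for an offset beyond the file size.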
func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int64, whence int32) (int64, error) {
	switch whence {
	case linux.SEEK_SET:
		// Use offset as specified.
	case linux.SEEK_CUR:
		offset += fdOffset
	case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE:
		// Ensure file size is up to date.
		if !d.cachedMetadataAuthoritative() {
			if err := d.updateFromGetattr(ctx); err != nil {
				return 0, err
			}
		}
		size := int64(atomic.LoadUint64(&d.size))
		// For SEEK_DATA and SEEK_HOLE, treat the file as a single contiguous
		// block of data.
		switch whence {
		case linux.SEEK_END:
			offset += size
		case linux.SEEK_DATA:
			if offset > size {
				return 0, linuxerr.ENXIO
			}
			// Use offset as specified.
		case linux.SEEK_HOLE:
			if offset > size {
				return 0, linuxerr.ENXIO
			}
			offset = size
		}
	default:
		return 0, linuxerr.EINVAL
	}
	if offset < 0 {
		return 0, linuxerr.EINVAL
	}
	return offset, nil
}

// Sync implements vfs.FileDescriptionImpl.Sync.
func (fd *regularFileFD) Sync(ctx context.Context) error {
	return fd.dentry().syncCachedFile(ctx, false /* lowSyncExpectations */)
}

// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
	d := fd.dentry()
	// Force sentry page caching at your own risk.
	if !d.fs.opts.forcePageCache {
		switch d.fs.opts.interop {
		case InteropModeExclusive:
			// Any mapping is fine.
		case InteropModeWritethrough:
			// Shared writable mappings require a host FD, since otherwise we
			// can't synchronously flush memory-mapped writes to the remote
			// file.
			if opts.Private || !opts.MaxPerms.Write {
				break
			}
			fallthrough
		case InteropModeShared:
			// All mappings require a host FD to be coherent with other
			// filesystem users.
			if atomic.LoadInt32(&d.mmapFD) < 0 {
				return linuxerr.ENODEV
			}
		default:
			panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop))
		}
	}
	// After this point, d may be used as a memmap.Mappable.
	d.pf.hostFileMapperInitOnce.Do(d.pf.hostFileMapper.Init)
	opts.SentryOwnedContent = d.fs.opts.forcePageCache
	return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts)
}

func (d *dentry) mayCachePages() bool {
	if d.fs.opts.forcePageCache {
		return true
	}
	if d.fs.opts.interop == InteropModeShared {
		return false
	}
	return atomic.LoadInt32(&d.mmapFD) >= 0
}

// AddMapping implements memmap.Mappable.AddMapping.
func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error {
	d.mapsMu.Lock()
	mapped := d.mappings.AddMapping(ms, ar, offset, writable)
	// Do this unconditionally since whether we have a host FD can change
	// across save/restore.
	for _, r := range mapped {
		d.pf.hostFileMapper.IncRefOn(r)
	}
	if d.mayCachePages() {
		// d.Evict() will refuse to evict memory-mapped pages, so tell the
		// MemoryFile to not bother trying.
		mf := d.fs.mfp.MemoryFile()
		for _, r := range mapped {
			mf.MarkUnevictable(d, pgalloc.EvictableRange{r.Start, r.End})
		}
	}
	d.mapsMu.Unlock()
	return nil
}

// RemoveMapping implements memmap.Mappable.RemoveMapping.
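//
// Pages that lose their last application mapping are allowed to become clean
// again and are marked as eviction candidates for the MemoryFile (see Evict
// below).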
func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) {
	d.mapsMu.Lock()
	unmapped := d.mappings.RemoveMapping(ms, ar, offset, writable)
	for _, r := range unmapped {
		d.pf.hostFileMapper.DecRefOn(r)
	}
	if d.mayCachePages() {
		// Pages that are no longer referenced by any application memory
		// mappings are now considered unused; allow MemoryFile to evict them
		// when necessary.
		mf := d.fs.mfp.MemoryFile()
		d.dataMu.Lock()
		for _, r := range unmapped {
			// Since these pages are no longer mapped, they are no longer
			// concurrently dirtyable by a writable memory mapping.
			d.dirty.AllowClean(r)
			mf.MarkEvictable(d, pgalloc.EvictableRange{r.Start, r.End})
		}
		d.dataMu.Unlock()
	}
	d.mapsMu.Unlock()
}

// CopyMapping implements memmap.Mappable.CopyMapping.
func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error {
	return d.AddMapping(ctx, ms, dstAR, offset, writable)
}

// Translate implements memmap.Mappable.Translate.
func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
	d.handleMu.RLock()
	if d.mmapFD >= 0 && !d.fs.opts.forcePageCache {
		d.handleMu.RUnlock()
		mr := optional
		if d.fs.opts.limitHostFDTranslation {
			mr = maxFillRange(required, optional)
		}
		return []memmap.Translation{
			{
				Source: mr,
				File:   &d.pf,
				Offset: mr.Start,
				Perms:  hostarch.AnyAccess,
			},
		}, nil
	}

	d.dataMu.Lock()

	// Constrain translations to d.size (rounded up) to prevent translation to
	// pages that may be concurrently truncated.
	pgend, _ := hostarch.PageRoundUp(d.size)
	var beyondEOF bool
	if required.End > pgend {
		if required.Start >= pgend {
			d.dataMu.Unlock()
			d.handleMu.RUnlock()
			return nil, &memmap.BusError{io.EOF}
		}
		beyondEOF = true
		required.End = pgend
	}
	if optional.End > pgend {
		optional.End = pgend
	}

	mf := d.fs.mfp.MemoryFile()
	h := d.readHandleLocked()
	cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), d.size, mf, usage.PageCache, h.readToBlocksAt)

	var ts []memmap.Translation
	var translatedEnd uint64
	for seg := d.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
		segMR := seg.Range().Intersect(optional)
		// TODO(jamieliu): Make Translations writable even if writability is
		// not required if already kept-dirty by another writable translation.
		perms := hostarch.AccessType{
			Read:    true,
			Execute: true,
		}
		if at.Write {
			// From this point forward, this memory can be dirtied through the
			// mapping at any time.
			d.dirty.KeepDirty(segMR)
			perms.Write = true
		}
		ts = append(ts, memmap.Translation{
			Source: segMR,
			File:   mf,
			Offset: seg.FileRangeOf(segMR).Start,
			Perms:  perms,
		})
		translatedEnd = segMR.End
	}

	d.dataMu.Unlock()
	d.handleMu.RUnlock()

	// Don't return the error returned by d.cache.Fill if it occurred outside
	// of required.
	if translatedEnd < required.End && cerr != nil {
		return ts, &memmap.BusError{cerr}
	}
	if beyondEOF {
		return ts, &memmap.BusError{io.EOF}
	}
	return ts, nil
}

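// maxFillRange limits the amount of readahead performed around required: if
// required is already at least 64 KB long it is returned as-is; otherwise the
// result is clipped to at most 64 KB of optional, anchored at required.Start
// when optional is larger than that. For illustration, required = [4096, 8192)
// and optional = [0, 1<<20) yields [4096, 69632).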
func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange {
	const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily
	if required.Length() >= maxReadahead {
		return required
	}
	if optional.Length() <= maxReadahead {
		return optional
	}
	optional.Start = required.Start
	if optional.Length() <= maxReadahead {
		return optional
	}
	optional.End = optional.Start + maxReadahead
	return optional
}

// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
	// Whether we have a host fd (and consequently what memmap.File is
	// mapped) can change across save/restore, so invalidate all translations
	// unconditionally.
	d.mapsMu.Lock()
	defer d.mapsMu.Unlock()
	d.mappings.InvalidateAll(memmap.InvalidateOpts{})

	// Write the cache's contents back to the remote file so that if we have a
	// host fd after restore, the remote file's contents are coherent.
	mf := d.fs.mfp.MemoryFile()
	d.handleMu.RLock()
	defer d.handleMu.RUnlock()
	h := d.writeHandleLocked()
	d.dataMu.Lock()
	defer d.dataMu.Unlock()
	if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
		return err
	}

	// Discard the cache so that it's not stored in saved state. This is safe
	// because per InvalidateUnsavable invariants, no new translations can have
	// been returned after we invalidated all existing translations above.
	d.cache.DropAll(mf)
	d.dirty.RemoveAll()

	return nil
}

// Evict implements pgalloc.EvictableMemoryUser.Evict.
func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
	mr := memmap.MappableRange{er.Start, er.End}
	mf := d.fs.mfp.MemoryFile()
	d.mapsMu.Lock()
	defer d.mapsMu.Unlock()
	d.handleMu.RLock()
	defer d.handleMu.RUnlock()
	h := d.writeHandleLocked()
	d.dataMu.Lock()
	defer d.dataMu.Unlock()

	// Only allow pages that are no longer memory-mapped to be evicted.
	for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() {
		mgapMR := mgap.Range().Intersect(mr)
		if mgapMR.Length() == 0 {
			continue
		}
		if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
			log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err)
		}
		d.cache.Drop(mgapMR, mf)
		d.dirty.KeepClean(mgapMR)
	}
}

// dentryPlatformFile implements memmap.File. It exists solely because dentry
// cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef.
//
// dentryPlatformFile is only used when a host FD representing the remote file
// is available (i.e. dentry.mmapFD >= 0), and that FD is used for application
// memory mappings (i.e. !filesystem.opts.forcePageCache).
//
// +stateify savable
type dentryPlatformFile struct {
	*dentry

	// fdRefs counts references on memmap.File offsets. fdRefs is protected
	// by dentry.dataMu.
	fdRefs fsutil.FrameRefSet

	// If this dentry represents a regular file, and dentry.mmapFD >= 0,
	// hostFileMapper caches mappings of dentry.mmapFD.
	hostFileMapper fsutil.HostFileMapper

	// hostFileMapperInitOnce is used to lazily initialize hostFileMapper.
	hostFileMapperInitOnce sync.Once `state:"nosave"`
}

// IncRef implements memmap.File.IncRef.
func (d *dentryPlatformFile) IncRef(fr memmap.FileRange) {
	d.dataMu.Lock()
	d.fdRefs.IncRefAndAccount(fr)
	d.dataMu.Unlock()
}

// DecRef implements memmap.File.DecRef.
func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) {
	d.dataMu.Lock()
	d.fdRefs.DecRefAndAccount(fr)
	d.dataMu.Unlock()
}

// MapInternal implements memmap.File.MapInternal.
func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) {
	d.handleMu.RLock()
	defer d.handleMu.RUnlock()
	return d.hostFileMapper.MapInternal(fr, int(d.mmapFD), at.Write)
}

// FD implements memmap.File.FD.
func (d *dentryPlatformFile) FD() int {
	d.handleMu.RLock()
	defer d.handleMu.RUnlock()
	return int(d.mmapFD)
}