github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/fsimpl/tmpfs/regular_file.go

// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tmpfs

import (
	"fmt"
	"io"
	"math"

	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
	"github.com/nicocha30/gvisor-ligolo/pkg/context"
	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
	"github.com/nicocha30/gvisor-ligolo/pkg/safemem"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsmetric"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsutil"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/hostfd"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/memmap"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/pgalloc"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/usage"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
	"github.com/nicocha30/gvisor-ligolo/pkg/usermem"
)

// regularFile is a regular (=S_IFREG) tmpfs file.
//
// +stateify savable
type regularFile struct {
	inode inode

	// memoryUsageKind is the memory accounting category under which pages
	// backing this regularFile's contents are accounted.
	memoryUsageKind usage.MemoryKind

	// mapsMu protects mappings.
	mapsMu sync.Mutex `state:"nosave"`

	// mappings tracks mappings of the file into memmap.MappingSpaces.
	//
	// Protected by mapsMu.
	mappings memmap.MappingSet

	// writableMappingPages tracks how many pages of virtual memory are mapped
	// as potentially writable from this file. If a page has multiple mappings,
	// each mapping is counted separately.
	//
	// This counter is susceptible to overflow as we can potentially count
	// mappings from many VMAs. We count pages rather than bytes to slightly
	// mitigate this.
	//
	// Protected by mapsMu.
	writableMappingPages uint64

	// dataMu protects the fields below.
	dataMu sync.RWMutex `state:"nosave"`

	// data maps offsets into the file to offsets into memFile that store
	// the file's data.
	//
	// Protected by dataMu.
	data fsutil.FileRangeSet

	// seals represents file seals on this inode.
	//
	// Protected by dataMu.
	seals uint32

	// size is the size of data.
	//
	// Protected by both dataMu and inode.mu; reading it requires holding
	// either mutex, while writing requires holding both AND using atomics.
	// Readers that do not require consistency (like Stat) may read the
	// value atomically without holding either lock.
	size atomicbitops.Uint64
}
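// Hedged sketch (not part of the original file) of the locking rules for
// size stated above; rf is an assumed *regularFile:
//
//	// Consistent read: hold either inode.mu or dataMu.
//	rf.dataMu.RLock()
//	sz := rf.size.RacyLoad()
//	rf.dataMu.RUnlock()
//
//	// Lock-free read for callers that tolerate staleness (e.g. Stat).
//	sz = rf.size.Load()
//
//	// Write: hold both mutexes and store atomically.
//	rf.inode.mu.Lock()
//	rf.dataMu.Lock()
//	rf.size.Store(sz + 1)
//	rf.dataMu.Unlock()
//	rf.inode.mu.Unlock()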
func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) *inode {
	file := &regularFile{
		memoryUsageKind: fs.usage,
		seals:           linux.F_SEAL_SEAL,
	}
	file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode, parentDir)
	file.inode.nlink = atomicbitops.FromUint32(1) // from parent directory
	return &file.inode
}

// newUnlinkedRegularFileDescription creates a regular file on the tmpfs
// filesystem represented by mount and returns an FD representing that file.
// The new file is not reachable by path traversal from any other file.
//
// newUnlinkedRegularFileDescription is analogous to Linux's
// mm/shmem.c:__shmem_file_setup().
//
// Preconditions: mount must be a tmpfs mount.
func newUnlinkedRegularFileDescription(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, name string) (*regularFileFD, error) {
	fs, ok := mount.Filesystem().Impl().(*filesystem)
	if !ok {
		panic("tmpfs.newUnlinkedRegularFileDescription() called with non-tmpfs mount")
	}

	inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777, nil /* parentDir */)
	d := fs.newDentry(inode)
	defer d.DecRef(ctx)
	d.name = name

	fd := &regularFileFD{}
	fd.Init(&inode.locks)
	flags := uint32(linux.O_RDWR)
	if err := fd.vfsfd.Init(fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
		return nil, err
	}
	return fd, nil
}

// NewZeroFile creates a new regular file and file description as for
// mmap(MAP_SHARED | MAP_ANONYMOUS). The file has the given size and is
// initially (implicitly) filled with zeroes.
//
// Preconditions: mount must be a tmpfs mount.
func NewZeroFile(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, size uint64) (*vfs.FileDescription, error) {
	// Compare mm/shmem.c:shmem_zero_setup().
	fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, "dev/zero")
	if err != nil {
		return nil, err
	}
	rf := fd.inode().impl.(*regularFile)
	rf.memoryUsageKind = usage.Anonymous
	rf.size.Store(size)
	return &fd.vfsfd, err
}

// NewMemfd creates a new regular file and file description as for
// memfd_create.
//
// Preconditions: mount must be a tmpfs mount.
func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) {
	fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, name)
	if err != nil {
		return nil, err
	}
	if allowSeals {
		fd.inode().impl.(*regularFile).seals = 0
	}
	return &fd.vfsfd, nil
}

// truncate grows or shrinks the file to the given size. It returns true if
// the file size was updated.
func (rf *regularFile) truncate(newSize uint64) (bool, error) {
	rf.inode.mu.Lock()
	defer rf.inode.mu.Unlock()
	return rf.truncateLocked(newSize)
}

// Preconditions:
//   - rf.inode.mu must be held.
//   - rf.dataMu must be locked for writing.
//   - newSize > rf.size.
func (rf *regularFile) growLocked(newSize uint64) error {
	// Can we grow the file?
	if rf.seals&linux.F_SEAL_GROW != 0 {
		return linuxerr.EPERM
	}
	rf.size.Store(newSize)
	return nil
}
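// Hedged usage sketch (hypothetical caller, not from the original file):
// the seal checks in growLocked above and truncateLocked below mean that a
// fully sealed file rejects any size change:
//
//	rf.seals = linux.F_SEAL_GROW | linux.F_SEAL_SHRINK
//	_, err := rf.truncate(rf.size.Load() + 1) // EPERM: growth is sealed
//	_, err = rf.truncate(0)                   // EPERM: shrinking is sealed
//	_, err = rf.truncate(rf.size.Load())      // no-op: returns (false, nil)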
// Preconditions: rf.inode.mu must be held.
func (rf *regularFile) truncateLocked(newSize uint64) (bool, error) {
	oldSize := rf.size.RacyLoad()
	if newSize == oldSize {
		// Nothing to do.
		return false, nil
	}

	// Need to hold inode.mu and dataMu while modifying size.
	rf.dataMu.Lock()
	if newSize > oldSize {
		err := rf.growLocked(newSize)
		rf.dataMu.Unlock()
		return err == nil, err
	}

	// We are shrinking the file. First check if this is allowed.
	if rf.seals&linux.F_SEAL_SHRINK != 0 {
		rf.dataMu.Unlock()
		return false, linuxerr.EPERM
	}

	rf.size.Store(newSize)
	rf.dataMu.Unlock()

	// Invalidate past translations of truncated pages.
	oldpgend := offsetPageEnd(int64(oldSize))
	newpgend := offsetPageEnd(int64(newSize))
	if newpgend < oldpgend {
		rf.mapsMu.Lock()
		rf.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
			// Compare Linux's mm/shmem.c:shmem_setattr() =>
			// mm/memory.c:unmap_mapping_range(evencows=1).
			InvalidatePrivate: true,
		})
		rf.mapsMu.Unlock()
	}

	// We are now guaranteed that there are no translations of truncated
	// pages, and can remove them.
	rf.dataMu.Lock()
	decPages := rf.data.Truncate(newSize, rf.inode.fs.mf)
	rf.dataMu.Unlock()
	rf.inode.fs.unaccountPages(decPages)
	return true, nil
}

// AddMapping implements memmap.Mappable.AddMapping.
func (rf *regularFile) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error {
	rf.mapsMu.Lock()
	defer rf.mapsMu.Unlock()
	rf.dataMu.RLock()
	defer rf.dataMu.RUnlock()

	// Reject writable mapping if F_SEAL_WRITE is set.
	if rf.seals&linux.F_SEAL_WRITE != 0 && writable {
		return linuxerr.EPERM
	}

	rf.mappings.AddMapping(ms, ar, offset, writable)
	if writable {
		pagesBefore := rf.writableMappingPages

		// ar is guaranteed to be page aligned per memmap.Mappable.
		rf.writableMappingPages += uint64(ar.Length() / hostarch.PageSize)

		if rf.writableMappingPages < pagesBefore {
			panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
		}
	}

	return nil
}

// RemoveMapping implements memmap.Mappable.RemoveMapping.
func (rf *regularFile) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) {
	rf.mapsMu.Lock()
	defer rf.mapsMu.Unlock()

	rf.mappings.RemoveMapping(ms, ar, offset, writable)

	if writable {
		pagesBefore := rf.writableMappingPages

		// ar is guaranteed to be page aligned per memmap.Mappable.
		rf.writableMappingPages -= uint64(ar.Length() / hostarch.PageSize)

		if rf.writableMappingPages > pagesBefore {
			panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
		}
	}
}

// CopyMapping implements memmap.Mappable.CopyMapping.
func (rf *regularFile) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error {
	return rf.AddMapping(ctx, ms, dstAR, offset, writable)
}
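// Hedged arithmetic sketch (assumed values, not from the original file) of
// the writableMappingPages accounting above, on a 4KiB-page host:
//
//	ar := hostarch.AddrRange{Start: 0x400000, End: 0x500000} // 1MiB, page-aligned
//	pages := uint64(ar.Length() / hostarch.PageSize)         // 256
//
// Mapping the same range writably from two MappingSpaces adds 512, since
// each mapping is counted separately.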
// Translate implements memmap.Mappable.Translate.
func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
	rf.dataMu.Lock()
	defer rf.dataMu.Unlock()

	// Constrain translations to f.attr.Size (rounded up) to prevent
	// translation to pages that may be concurrently truncated.
	pgend := offsetPageEnd(int64(rf.size.RacyLoad()))
	var beyondEOF bool
	if required.End > pgend {
		if required.Start >= pgend {
			return nil, &memmap.BusError{io.EOF}
		}
		beyondEOF = true
		required.End = pgend
	}
	if optional.End > pgend {
		optional.End = pgend
	}
	pagesToFill := rf.data.PagesToFill(required, optional)
	if !rf.inode.fs.accountPages(pagesToFill) {
		// If we cannot accommodate pagesToFill pages, then retry with just
		// the required range, because optional may be larger than required.
		// Only error out if even the required range cannot be allocated.
		pagesToFill = rf.data.PagesToFill(required, required)
		if !rf.inode.fs.accountPages(pagesToFill) {
			return nil, &memmap.BusError{linuxerr.ENOSPC}
		}
		optional = required
	}
	pagesAlloced, cerr := rf.data.Fill(ctx, required, optional, rf.size.RacyLoad(), rf.inode.fs.mf, rf.memoryUsageKind, pgalloc.AllocateOnly, nil /* r */)
	// rf.data.Fill() may fail mid-way. We still want to account any pages
	// that were allocated, irrespective of an error.
	rf.inode.fs.adjustPageAcct(pagesToFill, pagesAlloced)

	var ts []memmap.Translation
	var translatedEnd uint64
	for seg := rf.data.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
		segMR := seg.Range().Intersect(optional)
		ts = append(ts, memmap.Translation{
			Source: segMR,
			File:   rf.inode.fs.mf,
			Offset: seg.FileRangeOf(segMR).Start,
			Perms:  hostarch.AnyAccess,
		})
		translatedEnd = segMR.End
	}

	// Don't return the error returned by f.data.Fill if it occurred outside
	// of required.
	if translatedEnd < required.End && cerr != nil {
		return ts, &memmap.BusError{cerr}
	}
	if beyondEOF {
		return ts, &memmap.BusError{io.EOF}
	}
	return ts, nil
}
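// Hedged sketch (assumed values) of the EOF handling in Translate above, on
// a 4KiB-page host with rf.size == 5000, so pgend == 8192:
//
//	required = [4096, 12288) -> clamped to [4096, 8192); translations are
//	                            returned along with BusError{io.EOF}
//	required = [8192, 12288) -> entirely at or beyond pgend: no translations,
//	                            just BusError{io.EOF}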
// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
func (*regularFile) InvalidateUnsavable(context.Context) error {
	return nil
}

// +stateify savable
type regularFileFD struct {
	fileDescription

	// off is the file offset. off is accessed using atomic memory operations.
	// offMu serializes operations that may mutate off.
	off   int64
	offMu sync.Mutex `state:"nosave"`
}

// Release implements vfs.FileDescriptionImpl.Release.
func (fd *regularFileFD) Release(context.Context) {
	// noop
}

// Allocate implements vfs.FileDescriptionImpl.Allocate.
func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
	f := fd.inode().impl.(*regularFile)

	f.inode.mu.Lock()
	defer f.inode.mu.Unlock()
	f.dataMu.Lock()
	defer f.dataMu.Unlock()

	// We must allocate pages in the range specified by offset and length.
	// Even if newSize <= oldSize, there might not be actual memory backing
	// this range, so any gaps must be filled by calling f.data.Fill().
	// "After a successful call, subsequent writes into the range specified
	// by offset and len are guaranteed not to fail because of lack of disk
	// space." - fallocate(2)
	newSize := offset + length
	pgstartaddr := hostarch.Addr(offset).RoundDown()
	pgendaddr, ok := hostarch.Addr(newSize).RoundUp()
	if !ok {
		return linuxerr.EFBIG
	}
	required := memmap.MappableRange{Start: uint64(pgstartaddr), End: uint64(pgendaddr)}
	pagesToFill := f.data.PagesToFill(required, required)
	if !f.inode.fs.accountPages(pagesToFill) {
		return linuxerr.ENOSPC
	}
	// Given our definitions in pgalloc, fallocate(2) semantics imply that
	// pages in the MemoryFile must be committed, in addition to being
	// allocated.
	allocMode := pgalloc.AllocateAndCommit
	if !f.inode.fs.mf.IsDiskBacked() {
		// Upgrade to AllocateAndWritePopulate for memory(shmem)-backed files.
		// We take a more aggressive approach in populating pages for
		// memory-backed MemoryFiles. shmem pages are subject to swap rather
		// than disk writeback. They are not likely to be swapped before they
		// are written to. Hence it is beneficial to populate (in addition to
		// commit) shmem pages to avoid faulting page-by-page when these pages
		// are written to in the future.
		allocMode = pgalloc.AllocateAndWritePopulate
	}
	pagesAlloced, err := f.data.Fill(ctx, required, required, newSize, f.inode.fs.mf, f.memoryUsageKind, allocMode, nil /* r */)
	// f.data.Fill() may fail mid-way. We still want to account any pages
	// that were allocated, irrespective of an error.
	f.inode.fs.adjustPageAcct(pagesToFill, pagesAlloced)
	if err != nil && err != io.EOF {
		return err
	}

	oldSize := f.size.Load()
	if oldSize >= newSize {
		return nil
	}
	return f.growLocked(newSize)
}
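// Hedged usage sketch (hypothetical values): per the fallocate(2) contract
// quoted above, after
//
//	err := fd.Allocate(ctx, 0 /* mode */, 0 /* offset */, 1<<20 /* length */)
//
// succeeds, the file is at least 1MiB long and writes within [0, 1MiB)
// cannot fail with ENOSPC, since the backing pages are already allocated
// and accounted.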
// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
	start := fsmetric.StartReadWait()
	defer fsmetric.FinishReadWait(fsmetric.TmpfsReadWait, start)
	fsmetric.TmpfsReads.Increment()

	if offset < 0 {
		return 0, linuxerr.EINVAL
	}

	// Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored
	// since all state is in-memory.
	//
	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
	if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
		return 0, linuxerr.EOPNOTSUPP
	}

	if dst.NumBytes() == 0 {
		return 0, nil
	}
	f := fd.inode().impl.(*regularFile)
	rw := getRegularFileReadWriter(f, offset, 0)
	n, err := dst.CopyOutFrom(ctx, rw)
	putRegularFileReadWriter(rw)
	fd.inode().touchAtime(fd.vfsfd.Mount())
	return n, err
}

// Read implements vfs.FileDescriptionImpl.Read.
func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	fd.offMu.Lock()
	n, err := fd.PRead(ctx, dst, fd.off, opts)
	fd.off += n
	fd.offMu.Unlock()
	return n, err
}

// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
	n, _, err := fd.pwrite(ctx, src, offset, opts)
	return n, err
}

// pwrite returns the number of bytes written, final offset and error. The
// final offset should be ignored by PWrite.
func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
	if offset < 0 {
		return 0, offset, linuxerr.EINVAL
	}

	// Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored
	// since all state is in-memory.
	//
	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
	if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
		return 0, offset, linuxerr.EOPNOTSUPP
	}

	srclen := src.NumBytes()
	if srclen == 0 {
		return 0, offset, nil
	}
	f := fd.inode().impl.(*regularFile)
	f.inode.mu.Lock()
	defer f.inode.mu.Unlock()
	// If the file is opened with O_APPEND, update offset to file size.
	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
		// Locking f.inode.mu is sufficient for reading f.size.
		offset = int64(f.size.RacyLoad())
	}
	end := offset + srclen
	if end < offset {
		// Overflow.
		return 0, offset, linuxerr.EINVAL
	}

	srclen, err = vfs.CheckLimit(ctx, offset, srclen)
	if err != nil {
		return 0, offset, err
	}
	src = src.TakeFirst64(srclen)

	// Perform the write.
	rw := getRegularFileReadWriter(f, offset, pgalloc.MemoryCgroupIDFromContext(ctx))
	n, err := src.CopyInTo(ctx, rw)

	f.inode.touchCMtimeLocked()
	for {
		old := f.inode.mode.Load()
		new := vfs.ClearSUIDAndSGID(old)
		if swapped := f.inode.mode.CompareAndSwap(old, new); swapped {
			break
		}
	}
	putRegularFileReadWriter(rw)
	return n, n + offset, err
}

// Write implements vfs.FileDescriptionImpl.Write.
func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
	fd.offMu.Lock()
	n, off, err := fd.pwrite(ctx, src, fd.off, opts)
	fd.off = off
	fd.offMu.Unlock()
	return n, err
}

// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
	fd.offMu.Lock()
	defer fd.offMu.Unlock()
	switch whence {
	case linux.SEEK_SET:
		// use offset as specified
	case linux.SEEK_CUR:
		offset += fd.off
	case linux.SEEK_END:
		offset += int64(fd.inode().impl.(*regularFile).size.Load())
	default:
		return 0, linuxerr.EINVAL
	}
	if offset < 0 {
		return 0, linuxerr.EINVAL
	}
	fd.off = offset
	return offset, nil
}

// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
	file := fd.inode().impl.(*regularFile)
	opts.SentryOwnedContent = true
	return vfs.GenericConfigureMMap(&fd.vfsfd, file, opts)
}

// offsetPageEnd returns the file offset rounded up to the nearest
// page boundary. offsetPageEnd panics if rounding up causes overflow,
// which shouldn't be possible given that offset is an int64.
func offsetPageEnd(offset int64) uint64 {
	end, ok := hostarch.Addr(offset).RoundUp()
	if !ok {
		panic("impossible overflow")
	}
	return uint64(end)
}
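// Hedged arithmetic sketch for offsetPageEnd above, assuming 4KiB pages:
//
//	offsetPageEnd(0)    == 0
//	offsetPageEnd(1)    == 4096
//	offsetPageEnd(4096) == 4096
//	offsetPageEnd(4097) == 8192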
// regularFileReadWriter implements safemem.Reader and safemem.Writer.
type regularFileReadWriter struct {
	file *regularFile

	// Offset into the file to read/write at. Note that this may be
	// different from the FD offset if PRead/PWrite is used.
	off uint64

	// memCgID is the memory cgroup ID used for accounting the allocated
	// pages.
	memCgID uint32
}

var regularFileReadWriterPool = sync.Pool{
	New: func() any {
		return &regularFileReadWriter{}
	},
}

func getRegularFileReadWriter(file *regularFile, offset int64, memCgID uint32) *regularFileReadWriter {
	rw := regularFileReadWriterPool.Get().(*regularFileReadWriter)
	rw.file = file
	rw.off = uint64(offset)
	rw.memCgID = memCgID
	return rw
}

func putRegularFileReadWriter(rw *regularFileReadWriter) {
	rw.file = nil
	regularFileReadWriterPool.Put(rw)
}

// ReadToBlocks implements safemem.Reader.ReadToBlocks.
func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
	rw.file.dataMu.RLock()
	defer rw.file.dataMu.RUnlock()
	size := rw.file.size.RacyLoad()

	// Compute the range to read (limited by file size and overflow-checked).
	if rw.off >= size {
		return 0, io.EOF
	}
	end := size
	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
		end = rend
	}

	var done uint64
	seg, gap := rw.file.data.Find(uint64(rw.off))
	for rw.off < end {
		mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
		switch {
		case seg.Ok():
			// Get internal mappings.
			ims, err := rw.file.inode.fs.mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Read)
			if err != nil {
				return done, err
			}

			// Copy from internal mappings.
			n, err := safemem.CopySeq(dsts, ims)
			done += n
			rw.off += uint64(n)
			dsts = dsts.DropFirst64(n)
			if err != nil {
				return done, err
			}

			// Continue.
			seg, gap = seg.NextNonEmpty()

		case gap.Ok():
			// Tmpfs holes are zero-filled.
			gapmr := gap.Range().Intersect(mr)
			dst := dsts.TakeFirst64(gapmr.Length())
			n, err := safemem.ZeroSeq(dst)
			done += n
			rw.off += uint64(n)
			dsts = dsts.DropFirst64(n)
			if err != nil {
				return done, err
			}

			// Continue.
			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
		}
	}
	return done, nil
}
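// Hedged sketch of the hole handling in ReadToBlocks above: for a file
// created with NewZeroFile(ctx, creds, mount, 8192) and never written,
// data contains no segments, so a read of [0, 8192) falls entirely into a
// gap and is satisfied by safemem.ZeroSeq, returning 8192 bytes of zeroes.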
// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
//
// Preconditions: rw.file.inode.mu must be held.
func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
	// Hold dataMu so we can modify size.
	rw.file.dataMu.Lock()
	defer rw.file.dataMu.Unlock()

	// Compute the range to write (overflow-checked).
	end := rw.off + srcs.NumBytes()
	if end <= rw.off {
		end = math.MaxInt64
	}

	// Check if seals prevent either file growth or all writes.
	switch {
	case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed
		return 0, linuxerr.EPERM
	case end > rw.file.size.RacyLoad() && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed
		// When growth is sealed, Linux effectively allows writes which would
		// normally grow the file to partially succeed up to the current EOF,
		// rounded down to the page boundary before the EOF.
		//
		// This happens because writes (and thus the growth check) for tmpfs
		// files proceed page-by-page on Linux, and the final write to the
		// page containing EOF fails, resulting in a partial write up to the
		// start of that page.
		//
		// To emulate this behaviour, artificially truncate the write to the
		// start of the page containing the current EOF.
		//
		// See Linux, mm/filemap.c:generic_perform_write() and
		// mm/shmem.c:shmem_write_begin().
		if pgstart := uint64(hostarch.Addr(rw.file.size.RacyLoad()).RoundDown()); end > pgstart {
			end = pgstart
		}
		if end <= rw.off {
			// Truncation would result in no data being written.
			return 0, linuxerr.EPERM
		}
	}

	// Page-aligned mr for when we need to allocate memory. RoundUp can't
	// overflow since end is an int64.
	pgstartaddr := hostarch.Addr(rw.off).RoundDown()
	pgendaddr, _ := hostarch.Addr(end).RoundUp()
	pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)}

	var (
		done   uint64
		retErr error
	)
	seg, gap := rw.file.data.Find(uint64(rw.off))
	for rw.off < end {
		mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
		switch {
		case seg.Ok():
			n, err := rw.writeToMF(seg.FileRangeOf(seg.Range().Intersect(mr)), srcs)
			done += n
			rw.off += uint64(n)
			srcs = srcs.DropFirst64(n)
			if err != nil {
				retErr = err
				goto exitLoop
			}

			// Continue.
			seg, gap = seg.NextNonEmpty()

		case gap.Ok():
			// Allocate memory for the write.
			gapMR := gap.Range().Intersect(pgMR)
			pagesToFill := gapMR.Length() / hostarch.PageSize
			pagesReserved := rw.file.inode.fs.accountPagesPartial(pagesToFill)
			if pagesReserved == 0 {
				if done == 0 {
					retErr = linuxerr.ENOSPC
					goto exitLoop
				}
				retErr = nil
				goto exitLoop
			}
			gapMR.End = gapMR.Start + (hostarch.PageSize * pagesReserved)
			allocMode := pgalloc.AllocateAndWritePopulate
			if rw.file.inode.fs.mf.IsDiskBacked() {
				// Don't populate pages for disk-backed files. Benchmarking
				// showed that disk-backed pages are likely to be written back
				// to disk before we can write to them. The pages fault again
				// on write anyway. In total, prepopulating disk-backed pages
				// deteriorates performance as it fails to eliminate future
				// page faults and we also additionally incur useless disk
				// writebacks.
				allocMode = pgalloc.AllocateOnly
			}
			fr, err := rw.file.inode.fs.mf.Allocate(gapMR.Length(), pgalloc.AllocOpts{
				Kind:    rw.file.memoryUsageKind,
				Mode:    allocMode,
				MemCgID: rw.memCgID,
			})
			if err != nil {
				retErr = err
				rw.file.inode.fs.unaccountPages(pagesReserved)
				goto exitLoop
			}

			// Write to that memory as usual.
			seg, gap = rw.file.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{}
		default:
			panic("unreachable")
		}
	}
exitLoop:
	// If the write ends beyond the file's previous size, it causes the
	// file to grow.
	if rw.off > rw.file.size.RacyLoad() {
		rw.file.size.Store(rw.off)
	}

	return done, retErr
}
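// Hedged arithmetic sketch (assumed values) of the F_SEAL_GROW clamping
// above, with 4KiB pages and size == 5000 (so RoundDown(5000) == 4096):
//
//	off = 3000, 4000-byte write  -> end = 7000, clamped to 4096;
//	                                1096 bytes are written
//	off = 4500, any-length write -> end clamps to 4096 <= off: EPERM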
func (rw *regularFileReadWriter) writeToMF(fr memmap.FileRange, srcs safemem.BlockSeq) (uint64, error) {
	if rw.file.inode.fs.mf.IsDiskBacked() {
		// Disk-backed files are not prepopulated. The safemem.CopySeq()
		// approach used below incurs a lot of page faults without page
		// prepopulation, which causes a lot of context switching. Use the
		// write(2) host syscall instead, which makes one context switch and
		// faults all the pages that are touched during the write.
		return hostfd.Pwritev2(
			int32(rw.file.inode.fs.mf.FD()), // fd
			srcs.TakeFirst64(fr.Length()),   // srcs
			int64(fr.Start),                 // offset
			0,                               // flags
		)
	}
	// Get internal mappings.
	ims, err := rw.file.inode.fs.mf.MapInternal(fr, hostarch.Write)
	if err != nil {
		return 0, err
	}
	// Copy to internal mappings.
	return safemem.CopySeq(ims, srcs)
}

// GetSeals returns the current set of seals on a memfd inode.
func GetSeals(fd *vfs.FileDescription) (uint32, error) {
	f, ok := fd.Impl().(*regularFileFD)
	if !ok {
		return 0, linuxerr.EINVAL
	}
	rf := f.inode().impl.(*regularFile)
	rf.dataMu.RLock()
	defer rf.dataMu.RUnlock()
	return rf.seals, nil
}

// AddSeals adds new file seals to a memfd inode.
func AddSeals(fd *vfs.FileDescription, val uint32) error {
	f, ok := fd.Impl().(*regularFileFD)
	if !ok {
		return linuxerr.EINVAL
	}
	rf := f.inode().impl.(*regularFile)
	rf.mapsMu.Lock()
	defer rf.mapsMu.Unlock()
	rf.dataMu.Lock()
	defer rf.dataMu.Unlock()

	if rf.seals&linux.F_SEAL_SEAL != 0 {
		// Seal applied which prevents addition of any new seals.
		return linuxerr.EPERM
	}

	// F_SEAL_WRITE can only be added if there are no active writable maps.
	if rf.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 {
		if rf.writableMappingPages > 0 {
			return linuxerr.EBUSY
		}
	}

	// Seals can only be added, never removed.
	rf.seals |= val
	return nil
}
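// Hedged usage sketch (hypothetical caller) tying GetSeals/AddSeals above to
// the memfd_create(2) + fcntl(2) F_GET_SEALS/F_ADD_SEALS flow:
//
//	fd, _ := tmpfs.NewMemfd(ctx, creds, mount, true /* allowSeals */, "buf")
//	_ = tmpfs.AddSeals(fd, linux.F_SEAL_GROW)     // ok
//	seals, _ := tmpfs.GetSeals(fd)                // F_SEAL_GROW
//	_ = tmpfs.AddSeals(fd, linux.F_SEAL_SEAL)     // ok; no further seals allowed
//	err := tmpfs.AddSeals(fd, linux.F_SEAL_WRITE) // EPERM: F_SEAL_SEAL is set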