github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/gofer/special_file.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package gofer 16 17 import ( 18 "fmt" 19 20 "github.com/metacubex/gvisor/pkg/abi/linux" 21 "github.com/metacubex/gvisor/pkg/atomicbitops" 22 "github.com/metacubex/gvisor/pkg/context" 23 "github.com/metacubex/gvisor/pkg/errors/linuxerr" 24 "github.com/metacubex/gvisor/pkg/fdnotifier" 25 "github.com/metacubex/gvisor/pkg/hostarch" 26 "github.com/metacubex/gvisor/pkg/metric" 27 "github.com/metacubex/gvisor/pkg/safemem" 28 "github.com/metacubex/gvisor/pkg/sentry/fsmetric" 29 "github.com/metacubex/gvisor/pkg/sentry/fsutil" 30 "github.com/metacubex/gvisor/pkg/sentry/memmap" 31 "github.com/metacubex/gvisor/pkg/sentry/vfs" 32 "github.com/metacubex/gvisor/pkg/sync" 33 "github.com/metacubex/gvisor/pkg/usermem" 34 "github.com/metacubex/gvisor/pkg/waiter" 35 ) 36 37 // specialFileFD implements vfs.FileDescriptionImpl for pipes, sockets, device 38 // special files, and (when filesystemOptions.regularFilesUseSpecialFileFD is 39 // in effect) regular files. specialFileFD differs from regularFileFD by using 40 // per-FD handles instead of shared per-dentry handles, and never buffering I/O. 41 // 42 // +stateify savable 43 type specialFileFD struct { 44 fileDescription 45 specialFDEntry 46 47 // releaseMu synchronizes the closing of fd.handle with fd.sync(). It's safe 48 // to access fd.handle without locking for operations that require a ref to 49 // be held by the caller, e.g. vfs.FileDescriptionImpl implementations. 50 releaseMu sync.RWMutex `state:"nosave"` 51 52 // handle is used for file I/O. handle is immutable. 53 handle handle `state:"nosave"` 54 55 // isRegularFile is true if this FD represents a regular file which is only 56 // possible when filesystemOptions.regularFilesUseSpecialFileFD is in 57 // effect. isRegularFile is immutable. 58 isRegularFile bool 59 60 // seekable is true if this file description represents a file for which 61 // file offset is significant, i.e. a regular file, character device or 62 // block device. seekable is immutable. 63 seekable bool 64 65 // haveQueue is true if this file description represents a file for which 66 // queue may send I/O readiness events. haveQueue is immutable. 67 haveQueue bool `state:"nosave"` 68 queue waiter.Queue 69 70 // If seekable is true, off is the file offset. off is protected by mu. 71 mu sync.Mutex `state:"nosave"` 72 off int64 73 74 // If haveBuf is non-zero, this FD represents a pipe, and buf contains data 75 // read from the pipe from previous calls to specialFileFD.savePipeData(). 76 // haveBuf and buf are protected by bufMu. 77 bufMu sync.Mutex `state:"nosave"` 78 haveBuf atomicbitops.Uint32 79 buf []byte 80 81 // If handle.fd >= 0, hostFileMapper caches mappings of handle.fd, and 82 // hostFileMapperInitOnce is used to initialize it on first use. 83 hostFileMapperInitOnce sync.Once `state:"nosave"` 84 hostFileMapper fsutil.HostFileMapper 85 86 // If handle.fd >= 0, fileRefs counts references on memmap.File offsets. 87 // fileRefs is protected by fileRefsMu. 88 fileRefsMu sync.Mutex `state:"nosave"` 89 fileRefs fsutil.FrameRefSet 90 } 91 92 func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*specialFileFD, error) { 93 ftype := d.fileType() 94 seekable := ftype == linux.S_IFREG || ftype == linux.S_IFCHR || ftype == linux.S_IFBLK 95 haveQueue := (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK || ftype == linux.S_IFCHR) && h.fd >= 0 96 fd := &specialFileFD{ 97 handle: h, 98 isRegularFile: ftype == linux.S_IFREG, 99 seekable: seekable, 100 haveQueue: haveQueue, 101 } 102 fd.LockFD.Init(&d.locks) 103 if haveQueue { 104 if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil { 105 return nil, err 106 } 107 } 108 if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ 109 AllowDirectIO: true, 110 DenyPRead: !seekable, 111 DenyPWrite: !seekable, 112 }); err != nil { 113 if haveQueue { 114 fdnotifier.RemoveFD(h.fd) 115 } 116 return nil, err 117 } 118 d.fs.syncMu.Lock() 119 d.fs.specialFileFDs.PushBack(fd) 120 d.fs.syncMu.Unlock() 121 if fd.vfsfd.IsWritable() && (d.mode.Load()&0111 != 0) { 122 metric.SuspiciousOperationsMetric.Increment(&metric.SuspiciousOperationsTypeOpenedWriteExecuteFile) 123 } 124 if h.fd >= 0 { 125 fsmetric.GoferOpensHost.Increment() 126 } else { 127 fsmetric.GoferOpens9P.Increment() 128 } 129 return fd, nil 130 } 131 132 // Release implements vfs.FileDescriptionImpl.Release. 133 func (fd *specialFileFD) Release(ctx context.Context) { 134 if fd.haveQueue { 135 fdnotifier.RemoveFD(fd.handle.fd) 136 } 137 fd.releaseMu.Lock() 138 fd.handle.close(ctx) 139 fd.releaseMu.Unlock() 140 141 fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) 142 fs.syncMu.Lock() 143 fs.specialFileFDs.Remove(fd) 144 fs.syncMu.Unlock() 145 } 146 147 // OnClose implements vfs.FileDescriptionImpl.OnClose. 148 func (fd *specialFileFD) OnClose(ctx context.Context) error { 149 if !fd.vfsfd.IsWritable() { 150 return nil 151 } 152 return flush(ctx, fd.handle.fdLisa) 153 } 154 155 // Readiness implements waiter.Waitable.Readiness. 156 func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask { 157 if fd.haveQueue { 158 return fdnotifier.NonBlockingPoll(fd.handle.fd, mask) 159 } 160 return fd.fileDescription.Readiness(mask) 161 } 162 163 // EventRegister implements waiter.Waitable.EventRegister. 164 func (fd *specialFileFD) EventRegister(e *waiter.Entry) error { 165 if fd.haveQueue { 166 fd.queue.EventRegister(e) 167 if err := fdnotifier.UpdateFD(fd.handle.fd); err != nil { 168 fd.queue.EventUnregister(e) 169 return err 170 } 171 return nil 172 } 173 return fd.fileDescription.EventRegister(e) 174 } 175 176 // EventUnregister implements waiter.Waitable.EventUnregister. 177 func (fd *specialFileFD) EventUnregister(e *waiter.Entry) { 178 if fd.haveQueue { 179 fd.queue.EventUnregister(e) 180 if err := fdnotifier.UpdateFD(fd.handle.fd); err != nil { 181 panic(fmt.Sprint("UpdateFD:", err)) 182 } 183 return 184 } 185 fd.fileDescription.EventUnregister(e) 186 } 187 188 // Epollable implements FileDescriptionImpl.Epollable. 189 func (fd *specialFileFD) Epollable() bool { 190 if fd.haveQueue { 191 return true 192 } 193 return fd.fileDescription.Epollable() 194 } 195 196 func (fd *specialFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { 197 if fd.isRegularFile { 198 d := fd.dentry() 199 return d.doAllocate(ctx, offset, length, func() error { 200 return fd.handle.allocate(ctx, mode, offset, length) 201 }) 202 } 203 return fd.FileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length) 204 } 205 206 // PRead implements vfs.FileDescriptionImpl.PRead. 207 func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { 208 start := fsmetric.StartReadWait() 209 defer func() { 210 if fd.handle.fd >= 0 { 211 fsmetric.GoferReadsHost.Increment() 212 fsmetric.FinishReadWait(fsmetric.GoferReadWaitHost, start) 213 } else { 214 fsmetric.GoferReads9P.Increment() 215 fsmetric.FinishReadWait(fsmetric.GoferReadWait9P, start) 216 } 217 }() 218 219 if fd.seekable && offset < 0 { 220 return 0, linuxerr.EINVAL 221 } 222 223 // Check that flags are supported. 224 // 225 // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. 226 if opts.Flags&^linux.RWF_HIPRI != 0 { 227 return 0, linuxerr.EOPNOTSUPP 228 } 229 230 if d := fd.dentry(); d.cachedMetadataAuthoritative() { 231 d.touchAtime(fd.vfsfd.Mount()) 232 } 233 234 bufN := int64(0) 235 if fd.haveBuf.Load() != 0 { 236 var err error 237 fd.bufMu.Lock() 238 if len(fd.buf) != 0 { 239 var n int 240 n, err = dst.CopyOut(ctx, fd.buf) 241 dst = dst.DropFirst(n) 242 fd.buf = fd.buf[n:] 243 if len(fd.buf) == 0 { 244 fd.haveBuf.Store(0) 245 fd.buf = nil 246 } 247 bufN = int64(n) 248 if offset >= 0 { 249 offset += bufN 250 } 251 } 252 fd.bufMu.Unlock() 253 if err != nil { 254 return bufN, err 255 } 256 } 257 258 rw := getHandleReadWriter(ctx, &fd.handle, offset) 259 n, err := dst.CopyOutFrom(ctx, rw) 260 putHandleReadWriter(rw) 261 if linuxerr.Equals(linuxerr.EAGAIN, err) { 262 err = linuxerr.ErrWouldBlock 263 } 264 return bufN + n, err 265 } 266 267 // Read implements vfs.FileDescriptionImpl.Read. 268 func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { 269 if !fd.seekable { 270 return fd.PRead(ctx, dst, -1, opts) 271 } 272 273 fd.mu.Lock() 274 n, err := fd.PRead(ctx, dst, fd.off, opts) 275 fd.off += n 276 fd.mu.Unlock() 277 return n, err 278 } 279 280 // PWrite implements vfs.FileDescriptionImpl.PWrite. 281 func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { 282 n, _, err := fd.pwrite(ctx, src, offset, opts) 283 return n, err 284 } 285 286 // pwrite returns the number of bytes written, final offset, error. The final 287 // offset should be ignored by PWrite. 288 func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { 289 if fd.seekable && offset < 0 { 290 return 0, offset, linuxerr.EINVAL 291 } 292 293 // Check that flags are supported. 294 // 295 // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. 296 if opts.Flags&^linux.RWF_HIPRI != 0 { 297 return 0, offset, linuxerr.EOPNOTSUPP 298 } 299 300 d := fd.dentry() 301 if fd.isRegularFile { 302 // If the regular file fd was opened with O_APPEND, make sure the file 303 // size is updated. There is a possible race here if size is modified 304 // externally after metadata cache is updated. 305 if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() { 306 if err := d.updateMetadata(ctx); err != nil { 307 return 0, offset, err 308 } 309 } 310 311 // We need to hold the metadataMu *while* writing to a regular file. 312 d.metadataMu.Lock() 313 defer d.metadataMu.Unlock() 314 315 // Set offset to file size if the regular file was opened with O_APPEND. 316 if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { 317 // Holding d.metadataMu is sufficient for reading d.size. 318 offset = int64(d.size.RacyLoad()) 319 } 320 limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) 321 if err != nil { 322 return 0, offset, err 323 } 324 src = src.TakeFirst64(limit) 325 } 326 327 if d.cachedMetadataAuthoritative() { 328 if fd.isRegularFile { 329 d.touchCMtimeLocked() 330 } else { 331 d.touchCMtime() 332 } 333 } 334 335 // handleReadWriter always writes to the remote file. So O_DIRECT is 336 // effectively always set. Invalidate pages in d.mappings that have been 337 // written to. 338 pgstart := hostarch.PageRoundDown(uint64(offset)) 339 pgend, ok := hostarch.PageRoundUp(uint64(offset + src.NumBytes())) 340 if !ok { 341 return 0, offset, linuxerr.EINVAL 342 } 343 mr := memmap.MappableRange{pgstart, pgend} 344 d.mapsMu.Lock() 345 d.mappings.Invalidate(mr, memmap.InvalidateOpts{}) 346 d.mapsMu.Unlock() 347 348 rw := getHandleReadWriter(ctx, &fd.handle, offset) 349 n, err := src.CopyInTo(ctx, rw) 350 putHandleReadWriter(rw) 351 if n > 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { 352 // Note that if syncing the remote file fails, then we can't guarantee that 353 // any data was actually written with the semantics of O_DSYNC or 354 // O_SYNC, so we return zero bytes written. Compare Linux's 355 // mm/filemap.c:generic_file_write_iter() => 356 // include/linux/fs.h:generic_write_sync(). 357 if err := fd.sync(ctx, false /* forFilesystemSync */); err != nil { 358 return 0, offset, err 359 } 360 } 361 if linuxerr.Equals(linuxerr.EAGAIN, err) { 362 err = linuxerr.ErrWouldBlock 363 } 364 // Update offset if the offset is valid. 365 if offset >= 0 { 366 offset += n 367 } 368 // Update file size for regular files. 369 if fd.isRegularFile { 370 // d.metadataMu is already locked at this point. 371 if uint64(offset) > d.size.RacyLoad() { 372 d.dataMu.Lock() 373 defer d.dataMu.Unlock() 374 d.size.Store(uint64(offset)) 375 } 376 } 377 return int64(n), offset, err 378 } 379 380 // Write implements vfs.FileDescriptionImpl.Write. 381 func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { 382 if !fd.seekable { 383 return fd.PWrite(ctx, src, -1, opts) 384 } 385 386 fd.mu.Lock() 387 n, off, err := fd.pwrite(ctx, src, fd.off, opts) 388 fd.off = off 389 fd.mu.Unlock() 390 return n, err 391 } 392 393 // Seek implements vfs.FileDescriptionImpl.Seek. 394 func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { 395 if !fd.seekable { 396 return 0, linuxerr.ESPIPE 397 } 398 fd.mu.Lock() 399 defer fd.mu.Unlock() 400 newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence) 401 if err != nil { 402 return 0, err 403 } 404 fd.off = newOffset 405 return newOffset, nil 406 } 407 408 // Sync implements vfs.FileDescriptionImpl.Sync. 409 func (fd *specialFileFD) Sync(ctx context.Context) error { 410 return fd.sync(ctx, false /* forFilesystemSync */) 411 } 412 413 func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool) error { 414 // Locks to ensure it didn't race with fd.Release(). 415 fd.releaseMu.RLock() 416 defer fd.releaseMu.RUnlock() 417 418 if err := fd.handle.sync(ctx); err != nil { 419 if !forFilesystemSync { 420 return err 421 } 422 // Only return err if we can reasonably have expected sync to succeed 423 // (fd represents a regular file that was opened for writing). 424 if fd.isRegularFile && fd.vfsfd.IsWritable() { 425 return err 426 } 427 ctx.Debugf("gofer.specialFileFD.sync: syncing non-writable or non-regular-file FD failed: %v", err) 428 } 429 return nil 430 } 431 432 // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. 433 func (fd *specialFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { 434 if fd.handle.fd < 0 || fd.filesystem().opts.forcePageCache { 435 return linuxerr.ENODEV 436 } 437 // After this point, fd may be used as a memmap.Mappable and memmap.File. 438 fd.hostFileMapperInitOnce.Do(fd.hostFileMapper.Init) 439 return vfs.GenericConfigureMMap(&fd.vfsfd, fd, opts) 440 } 441 442 // AddMapping implements memmap.Mappable.AddMapping. 443 func (fd *specialFileFD) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { 444 d := fd.dentry() 445 d.mapsMu.Lock() 446 defer d.mapsMu.Unlock() 447 d.mappings.AddMapping(ms, ar, offset, writable) 448 fd.hostFileMapper.IncRefOn(memmap.MappableRange{offset, offset + uint64(ar.Length())}) 449 return nil 450 } 451 452 // RemoveMapping implements memmap.Mappable.RemoveMapping. 453 func (fd *specialFileFD) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { 454 d := fd.dentry() 455 d.mapsMu.Lock() 456 defer d.mapsMu.Unlock() 457 d.mappings.RemoveMapping(ms, ar, offset, writable) 458 fd.hostFileMapper.DecRefOn(memmap.MappableRange{offset, offset + uint64(ar.Length())}) 459 } 460 461 // CopyMapping implements memmap.Mappable.CopyMapping. 462 func (fd *specialFileFD) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { 463 return fd.AddMapping(ctx, ms, dstAR, offset, writable) 464 } 465 466 // Translate implements memmap.Mappable.Translate. 467 func (fd *specialFileFD) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { 468 mr := optional 469 if fd.filesystem().opts.limitHostFDTranslation { 470 mr = maxFillRange(required, optional) 471 } 472 return []memmap.Translation{ 473 { 474 Source: mr, 475 File: fd, 476 Offset: mr.Start, 477 Perms: hostarch.AnyAccess, 478 }, 479 }, nil 480 } 481 482 // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. 483 func (fd *specialFileFD) InvalidateUnsavable(ctx context.Context) error { 484 return nil 485 } 486 487 // IncRef implements memmap.File.IncRef. 488 func (fd *specialFileFD) IncRef(fr memmap.FileRange, memCgID uint32) { 489 fd.fileRefsMu.Lock() 490 defer fd.fileRefsMu.Unlock() 491 fd.fileRefs.IncRefAndAccount(fr, memCgID) 492 } 493 494 // DecRef implements memmap.File.DecRef. 495 func (fd *specialFileFD) DecRef(fr memmap.FileRange) { 496 fd.fileRefsMu.Lock() 497 defer fd.fileRefsMu.Unlock() 498 fd.fileRefs.DecRefAndAccount(fr) 499 } 500 501 // MapInternal implements memmap.File.MapInternal. 502 func (fd *specialFileFD) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { 503 fd.requireHostFD() 504 return fd.hostFileMapper.MapInternal(fr, int(fd.handle.fd), at.Write) 505 } 506 507 // FD implements memmap.File.FD. 508 func (fd *specialFileFD) FD() int { 509 fd.requireHostFD() 510 return int(fd.handle.fd) 511 } 512 513 func (fd *specialFileFD) requireHostFD() { 514 if fd.handle.fd < 0 { 515 // This is possible if fd was successfully mmapped before saving, then 516 // was restored without a host FD. This is unrecoverable: without a 517 // host FD, we can't mmap this file post-restore. 518 panic("gofer.specialFileFD can no longer be memory-mapped without a host FD") 519 } 520 } 521 522 func (fd *specialFileFD) updateMetadata(ctx context.Context) error { 523 d := fd.dentry() 524 d.metadataMu.Lock() 525 defer d.metadataMu.Unlock() 526 return d.updateMetadataLocked(ctx, fd.handle) 527 }