github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fsimpl/host/host.go (about) 1 // Copyright 2020 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package host provides a filesystem implementation for host files imported as 16 // file descriptors. 17 package host 18 19 import ( 20 "fmt" 21 "math" 22 "sync/atomic" 23 24 "golang.org/x/sys/unix" 25 "github.com/SagerNet/gvisor/pkg/abi/linux" 26 "github.com/SagerNet/gvisor/pkg/context" 27 "github.com/SagerNet/gvisor/pkg/errors/linuxerr" 28 "github.com/SagerNet/gvisor/pkg/fdnotifier" 29 "github.com/SagerNet/gvisor/pkg/fspath" 30 "github.com/SagerNet/gvisor/pkg/hostarch" 31 "github.com/SagerNet/gvisor/pkg/log" 32 "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/kernfs" 33 "github.com/SagerNet/gvisor/pkg/sentry/hostfd" 34 "github.com/SagerNet/gvisor/pkg/sentry/kernel" 35 "github.com/SagerNet/gvisor/pkg/sentry/kernel/auth" 36 "github.com/SagerNet/gvisor/pkg/sentry/memmap" 37 unixsocket "github.com/SagerNet/gvisor/pkg/sentry/socket/unix" 38 "github.com/SagerNet/gvisor/pkg/sentry/vfs" 39 "github.com/SagerNet/gvisor/pkg/sync" 40 "github.com/SagerNet/gvisor/pkg/syserror" 41 "github.com/SagerNet/gvisor/pkg/usermem" 42 "github.com/SagerNet/gvisor/pkg/waiter" 43 ) 44 45 // inode implements kernfs.Inode. 46 // 47 // +stateify savable 48 type inode struct { 49 kernfs.InodeNoStatFS 50 kernfs.InodeNotDirectory 51 kernfs.InodeNotSymlink 52 kernfs.CachedMappable 53 kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid. 54 55 locks vfs.FileLocks 56 57 // When the reference count reaches zero, the host fd is closed. 58 inodeRefs 59 60 // hostFD contains the host fd that this file was originally created from, 61 // which must be available at time of restore. 62 // 63 // This field is initialized at creation time and is immutable. 64 hostFD int 65 66 // ino is an inode number unique within this filesystem. 67 // 68 // This field is initialized at creation time and is immutable. 69 ino uint64 70 71 // ftype is the file's type (a linux.S_IFMT mask). 72 // 73 // This field is initialized at creation time and is immutable. 74 ftype uint16 75 76 // mayBlock is true if hostFD is non-blocking, and operations on it may 77 // return EAGAIN or EWOULDBLOCK instead of blocking. 78 // 79 // This field is initialized at creation time and is immutable. 80 mayBlock bool 81 82 // seekable is false if lseek(hostFD) returns ESPIPE. We assume that file 83 // offsets are meaningful iff seekable is true. 84 // 85 // This field is initialized at creation time and is immutable. 86 seekable bool 87 88 // isTTY is true if this file represents a TTY. 89 // 90 // This field is initialized at creation time and is immutable. 91 isTTY bool 92 93 // savable is true if hostFD may be saved/restored by its numeric value. 94 // 95 // This field is initialized at creation time and is immutable. 96 savable bool 97 98 // Event queue for blocking operations. 99 queue waiter.Queue 100 101 // If haveBuf is non-zero, hostFD represents a pipe, and buf contains data 102 // read from the pipe from previous calls to inode.beforeSave(). haveBuf 103 // and buf are protected by bufMu. haveBuf is accessed using atomic memory 104 // operations. 105 bufMu sync.Mutex `state:"nosave"` 106 haveBuf uint32 107 buf []byte 108 } 109 110 func newInode(ctx context.Context, fs *filesystem, hostFD int, savable bool, fileType linux.FileMode, isTTY bool) (*inode, error) { 111 // Determine if hostFD is seekable. 112 _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR) 113 seekable := !linuxerr.Equals(linuxerr.ESPIPE, err) 114 // We expect regular files to be seekable, as this is required for them to 115 // be memory-mappable. 116 if !seekable && fileType == unix.S_IFREG { 117 ctx.Infof("host.newInode: host FD %d is a non-seekable regular file", hostFD) 118 return nil, linuxerr.ESPIPE 119 } 120 121 i := &inode{ 122 hostFD: hostFD, 123 ino: fs.NextIno(), 124 ftype: uint16(fileType), 125 mayBlock: fileType != unix.S_IFREG && fileType != unix.S_IFDIR, 126 seekable: seekable, 127 isTTY: isTTY, 128 savable: savable, 129 } 130 i.InitRefs() 131 i.CachedMappable.Init(hostFD) 132 133 // If the hostFD can return EWOULDBLOCK when set to non-blocking, do so and 134 // handle blocking behavior in the sentry. 135 if i.mayBlock { 136 if err := unix.SetNonblock(i.hostFD, true); err != nil { 137 return nil, err 138 } 139 if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil { 140 return nil, err 141 } 142 } 143 return i, nil 144 } 145 146 // NewFDOptions contains options to NewFD. 147 type NewFDOptions struct { 148 // If Savable is true, the host file descriptor may be saved/restored by 149 // numeric value; the sandbox API requires a corresponding host FD with the 150 // same numeric value to be provieded at time of restore. 151 Savable bool 152 153 // If IsTTY is true, the file descriptor is a TTY. 154 IsTTY bool 155 156 // If HaveFlags is true, use Flags for the new file description. Otherwise, 157 // the new file description will inherit flags from hostFD. 158 HaveFlags bool 159 Flags uint32 160 } 161 162 // NewFD returns a vfs.FileDescription representing the given host file 163 // descriptor. mnt must be Kernel.HostMount(). 164 func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) (*vfs.FileDescription, error) { 165 fs, ok := mnt.Filesystem().Impl().(*filesystem) 166 if !ok { 167 return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl()) 168 } 169 170 // Retrieve metadata. 171 var s unix.Stat_t 172 if err := unix.Fstat(hostFD, &s); err != nil { 173 return nil, err 174 } 175 176 flags := opts.Flags 177 if !opts.HaveFlags { 178 // Get flags for the imported FD. 179 flagsInt, err := unix.FcntlInt(uintptr(hostFD), unix.F_GETFL, 0) 180 if err != nil { 181 return nil, err 182 } 183 flags = uint32(flagsInt) 184 } 185 186 d := &kernfs.Dentry{} 187 i, err := newInode(ctx, fs, hostFD, opts.Savable, linux.FileMode(s.Mode).FileType(), opts.IsTTY) 188 if err != nil { 189 return nil, err 190 } 191 d.Init(&fs.Filesystem, i) 192 193 // i.open will take a reference on d. 194 defer d.DecRef(ctx) 195 196 // For simplicity, fileDescription.offset is set to 0. Technically, we 197 // should only set to 0 on files that are not seekable (sockets, pipes, 198 // etc.), and use the offset from the host fd otherwise when importing. 199 return i.open(ctx, d, mnt, flags) 200 } 201 202 // ImportFD sets up and returns a vfs.FileDescription from a donated fd. 203 func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) { 204 return NewFD(ctx, mnt, hostFD, &NewFDOptions{ 205 Savable: true, 206 IsTTY: isTTY, 207 }) 208 } 209 210 // filesystemType implements vfs.FilesystemType. 211 // 212 // +stateify savable 213 type filesystemType struct{} 214 215 // GetFilesystem implements vfs.FilesystemType.GetFilesystem. 216 func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { 217 panic("host.filesystemType.GetFilesystem should never be called") 218 } 219 220 // Name implements vfs.FilesystemType.Name. 221 func (filesystemType) Name() string { 222 return "none" 223 } 224 225 // Release implements vfs.FilesystemType.Release. 226 func (filesystemType) Release(ctx context.Context) {} 227 228 // NewFilesystem sets up and returns a new hostfs filesystem. 229 // 230 // Note that there should only ever be one instance of host.filesystem, 231 // a global mount for host fds. 232 func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { 233 devMinor, err := vfsObj.GetAnonBlockDevMinor() 234 if err != nil { 235 return nil, err 236 } 237 fs := &filesystem{ 238 devMinor: devMinor, 239 } 240 fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) 241 return fs.VFSFilesystem(), nil 242 } 243 244 // filesystem implements vfs.FilesystemImpl. 245 // 246 // +stateify savable 247 type filesystem struct { 248 kernfs.Filesystem 249 250 devMinor uint32 251 } 252 253 func (fs *filesystem) Release(ctx context.Context) { 254 fs.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) 255 fs.Filesystem.Release(ctx) 256 } 257 258 func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { 259 d := vd.Dentry().Impl().(*kernfs.Dentry) 260 inode := d.Inode().(*inode) 261 b.PrependComponent(fmt.Sprintf("host:[%d]", inode.ino)) 262 return vfs.PrependPathSyntheticError{} 263 } 264 265 // MountOptions implements vfs.FilesystemImpl.MountOptions. 266 func (fs *filesystem) MountOptions() string { 267 return "" 268 } 269 270 // CheckPermissions implements kernfs.Inode.CheckPermissions. 271 func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { 272 var s unix.Stat_t 273 if err := unix.Fstat(i.hostFD, &s); err != nil { 274 return err 275 } 276 return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid)) 277 } 278 279 // Mode implements kernfs.Inode.Mode. 280 func (i *inode) Mode() linux.FileMode { 281 var s unix.Stat_t 282 if err := unix.Fstat(i.hostFD, &s); err != nil { 283 // Retrieving the mode from the host fd using fstat(2) should not fail. 284 // If the syscall does not succeed, something is fundamentally wrong. 285 panic(fmt.Sprintf("failed to retrieve mode from host fd %d: %v", i.hostFD, err)) 286 } 287 return linux.FileMode(s.Mode) 288 } 289 290 // Stat implements kernfs.Inode.Stat. 291 func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { 292 if opts.Mask&linux.STATX__RESERVED != 0 { 293 return linux.Statx{}, linuxerr.EINVAL 294 } 295 if opts.Sync&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE { 296 return linux.Statx{}, linuxerr.EINVAL 297 } 298 299 fs := vfsfs.Impl().(*filesystem) 300 301 // Limit our host call only to known flags. 302 mask := opts.Mask & linux.STATX_ALL 303 var s unix.Statx_t 304 err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(mask), &s) 305 if linuxerr.Equals(linuxerr.ENOSYS, err) { 306 // Fallback to fstat(2), if statx(2) is not supported on the host. 307 // 308 // TODO(b/151263641): Remove fallback. 309 return i.fstat(fs) 310 } 311 if err != nil { 312 return linux.Statx{}, err 313 } 314 315 // Unconditionally fill blksize, attributes, and device numbers, as 316 // indicated by /include/uapi/linux/stat.h. Inode number is always 317 // available, since we use our own rather than the host's. 318 ls := linux.Statx{ 319 Mask: linux.STATX_INO, 320 Blksize: s.Blksize, 321 Attributes: s.Attributes, 322 Ino: i.ino, 323 AttributesMask: s.Attributes_mask, 324 DevMajor: linux.UNNAMED_MAJOR, 325 DevMinor: fs.devMinor, 326 } 327 328 // Copy other fields that were returned by the host. RdevMajor/RdevMinor 329 // are never copied (and therefore left as zero), so as not to expose host 330 // device numbers. 331 ls.Mask |= s.Mask & linux.STATX_ALL 332 if s.Mask&linux.STATX_TYPE != 0 { 333 ls.Mode |= s.Mode & linux.S_IFMT 334 } 335 if s.Mask&linux.STATX_MODE != 0 { 336 ls.Mode |= s.Mode &^ linux.S_IFMT 337 } 338 if s.Mask&linux.STATX_NLINK != 0 { 339 ls.Nlink = s.Nlink 340 } 341 if s.Mask&linux.STATX_UID != 0 { 342 ls.UID = s.Uid 343 } 344 if s.Mask&linux.STATX_GID != 0 { 345 ls.GID = s.Gid 346 } 347 if s.Mask&linux.STATX_ATIME != 0 { 348 ls.Atime = unixToLinuxStatxTimestamp(s.Atime) 349 } 350 if s.Mask&linux.STATX_BTIME != 0 { 351 ls.Btime = unixToLinuxStatxTimestamp(s.Btime) 352 } 353 if s.Mask&linux.STATX_CTIME != 0 { 354 ls.Ctime = unixToLinuxStatxTimestamp(s.Ctime) 355 } 356 if s.Mask&linux.STATX_MTIME != 0 { 357 ls.Mtime = unixToLinuxStatxTimestamp(s.Mtime) 358 } 359 if s.Mask&linux.STATX_SIZE != 0 { 360 ls.Size = s.Size 361 } 362 if s.Mask&linux.STATX_BLOCKS != 0 { 363 ls.Blocks = s.Blocks 364 } 365 366 return ls, nil 367 } 368 369 // fstat is a best-effort fallback for inode.Stat() if the host does not 370 // support statx(2). 371 // 372 // We ignore the mask and sync flags in opts and simply supply 373 // STATX_BASIC_STATS, as fstat(2) itself does not allow the specification 374 // of a mask or sync flags. fstat(2) does not provide any metadata 375 // equivalent to Statx.Attributes, Statx.AttributesMask, or Statx.Btime, so 376 // those fields remain empty. 377 func (i *inode) fstat(fs *filesystem) (linux.Statx, error) { 378 var s unix.Stat_t 379 if err := unix.Fstat(i.hostFD, &s); err != nil { 380 return linux.Statx{}, err 381 } 382 383 // As with inode.Stat(), we always use internal device and inode numbers, 384 // and never expose the host's represented device numbers. 385 return linux.Statx{ 386 Mask: linux.STATX_BASIC_STATS, 387 Blksize: uint32(s.Blksize), 388 Nlink: uint32(s.Nlink), 389 UID: s.Uid, 390 GID: s.Gid, 391 Mode: uint16(s.Mode), 392 Ino: i.ino, 393 Size: uint64(s.Size), 394 Blocks: uint64(s.Blocks), 395 Atime: timespecToStatxTimestamp(s.Atim), 396 Ctime: timespecToStatxTimestamp(s.Ctim), 397 Mtime: timespecToStatxTimestamp(s.Mtim), 398 DevMajor: linux.UNNAMED_MAJOR, 399 DevMinor: fs.devMinor, 400 }, nil 401 } 402 403 // SetStat implements kernfs.Inode.SetStat. 404 func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { 405 s := &opts.Stat 406 407 m := s.Mask 408 if m == 0 { 409 return nil 410 } 411 if m&^(linux.STATX_MODE|linux.STATX_SIZE|linux.STATX_ATIME|linux.STATX_MTIME) != 0 { 412 return linuxerr.EPERM 413 } 414 var hostStat unix.Stat_t 415 if err := unix.Fstat(i.hostFD, &hostStat); err != nil { 416 return err 417 } 418 if err := vfs.CheckSetStat(ctx, creds, &opts, linux.FileMode(hostStat.Mode), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil { 419 return err 420 } 421 422 if m&linux.STATX_MODE != 0 { 423 if err := unix.Fchmod(i.hostFD, uint32(s.Mode)); err != nil { 424 return err 425 } 426 } 427 if m&linux.STATX_SIZE != 0 { 428 if hostStat.Mode&linux.S_IFMT != linux.S_IFREG { 429 return linuxerr.EINVAL 430 } 431 if err := unix.Ftruncate(i.hostFD, int64(s.Size)); err != nil { 432 return err 433 } 434 oldSize := uint64(hostStat.Size) 435 if s.Size < oldSize { 436 oldpgend, _ := hostarch.PageRoundUp(oldSize) 437 newpgend, _ := hostarch.PageRoundUp(s.Size) 438 if oldpgend != newpgend { 439 i.CachedMappable.InvalidateRange(memmap.MappableRange{newpgend, oldpgend}) 440 } 441 } 442 } 443 if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 { 444 ts := [2]unix.Timespec{ 445 toTimespec(s.Atime, m&linux.STATX_ATIME == 0), 446 toTimespec(s.Mtime, m&linux.STATX_MTIME == 0), 447 } 448 if err := setTimestamps(i.hostFD, &ts); err != nil { 449 return err 450 } 451 } 452 return nil 453 } 454 455 // DecRef implements kernfs.Inode.DecRef. 456 func (i *inode) DecRef(ctx context.Context) { 457 i.inodeRefs.DecRef(func() { 458 if i.mayBlock { 459 fdnotifier.RemoveFD(int32(i.hostFD)) 460 } 461 if err := unix.Close(i.hostFD); err != nil { 462 log.Warningf("failed to close host fd %d: %v", i.hostFD, err) 463 } 464 // We can't rely on fdnotifier when closing the fd, because the event may race 465 // with fdnotifier.RemoveFD. Instead, notify the queue explicitly. 466 i.queue.Notify(waiter.EventHUp | waiter.ReadableEvents | waiter.WritableEvents) 467 }) 468 } 469 470 // Open implements kernfs.Inode.Open. 471 func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 472 // Once created, we cannot re-open a socket fd through /proc/[pid]/fd/. 473 if i.Mode().FileType() == linux.S_IFSOCK { 474 return nil, linuxerr.ENXIO 475 } 476 return i.open(ctx, d, rp.Mount(), opts.Flags) 477 } 478 479 func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) { 480 var s unix.Stat_t 481 if err := unix.Fstat(i.hostFD, &s); err != nil { 482 return nil, err 483 } 484 fileType := s.Mode & linux.FileTypeMask 485 486 // Constrain flags to a subset we can handle. 487 // 488 // TODO(github.com/SagerNet/issue/2601): Support O_NONBLOCK by adding RWF_NOWAIT to pread/pwrite calls. 489 flags &= unix.O_ACCMODE | unix.O_NONBLOCK | unix.O_DSYNC | unix.O_SYNC | unix.O_APPEND 490 491 switch fileType { 492 case unix.S_IFSOCK: 493 if i.isTTY { 494 log.Warningf("cannot use host socket fd %d as TTY", i.hostFD) 495 return nil, syserror.ENOTTY 496 } 497 498 ep, err := newEndpoint(ctx, i.hostFD, &i.queue) 499 if err != nil { 500 return nil, err 501 } 502 // Currently, we only allow Unix sockets to be imported. 503 return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d.VFSDentry(), &i.locks) 504 505 case unix.S_IFREG, unix.S_IFIFO, unix.S_IFCHR: 506 if i.isTTY { 507 fd := &TTYFileDescription{ 508 fileDescription: fileDescription{inode: i}, 509 termios: linux.DefaultReplicaTermios, 510 } 511 if task := kernel.TaskFromContext(ctx); task != nil { 512 fd.fgProcessGroup = task.ThreadGroup().ProcessGroup() 513 fd.session = fd.fgProcessGroup.Session() 514 } 515 fd.LockFD.Init(&i.locks) 516 vfsfd := &fd.vfsfd 517 if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { 518 return nil, err 519 } 520 return vfsfd, nil 521 } 522 523 fd := &fileDescription{inode: i} 524 fd.LockFD.Init(&i.locks) 525 vfsfd := &fd.vfsfd 526 if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { 527 return nil, err 528 } 529 return vfsfd, nil 530 531 default: 532 log.Warningf("cannot import host fd %d with file type %o", i.hostFD, fileType) 533 return nil, linuxerr.EPERM 534 } 535 } 536 537 // fileDescription is embedded by host fd implementations of FileDescriptionImpl. 538 // 539 // +stateify savable 540 type fileDescription struct { 541 vfsfd vfs.FileDescription 542 vfs.FileDescriptionDefaultImpl 543 vfs.LockFD 544 545 // inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but 546 // cached to reduce indirections and casting. fileDescription does not hold 547 // a reference on the inode through the inode field (since one is already 548 // held via the Dentry). 549 // 550 // inode is immutable after fileDescription creation. 551 inode *inode 552 553 // offsetMu protects offset. 554 offsetMu sync.Mutex `state:"nosave"` 555 556 // offset specifies the current file offset. It is only meaningful when 557 // inode.seekable is true. 558 offset int64 559 } 560 561 // SetStat implements vfs.FileDescriptionImpl.SetStat. 562 func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { 563 creds := auth.CredentialsFromContext(ctx) 564 return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts) 565 } 566 567 // Stat implements vfs.FileDescriptionImpl.Stat. 568 func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 569 return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts) 570 } 571 572 // Release implements vfs.FileDescriptionImpl.Release. 573 func (f *fileDescription) Release(context.Context) { 574 // noop 575 } 576 577 // Allocate implements vfs.FileDescriptionImpl.Allocate. 578 func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error { 579 return unix.Fallocate(f.inode.hostFD, uint32(mode), int64(offset), int64(length)) 580 } 581 582 // PRead implements vfs.FileDescriptionImpl.PRead. 583 func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { 584 // Check that flags are supported. 585 // 586 // TODO(github.com/SagerNet/issue/2601): Support select preadv2 flags. 587 if opts.Flags&^linux.RWF_HIPRI != 0 { 588 return 0, syserror.EOPNOTSUPP 589 } 590 591 i := f.inode 592 if !i.seekable { 593 return 0, linuxerr.ESPIPE 594 } 595 596 return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags) 597 } 598 599 // Read implements vfs.FileDescriptionImpl.Read. 600 func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { 601 // Check that flags are supported. 602 // 603 // TODO(github.com/SagerNet/issue/2601): Support select preadv2 flags. 604 if opts.Flags&^linux.RWF_HIPRI != 0 { 605 return 0, syserror.EOPNOTSUPP 606 } 607 608 i := f.inode 609 if !i.seekable { 610 bufN, err := i.readFromBuf(ctx, &dst) 611 if err != nil { 612 return bufN, err 613 } 614 n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags) 615 total := bufN + n 616 if isBlockError(err) { 617 // If we got any data at all, return it as a "completed" partial read 618 // rather than retrying until complete. 619 if total != 0 { 620 err = nil 621 } else { 622 err = syserror.ErrWouldBlock 623 } 624 } 625 return total, err 626 } 627 628 f.offsetMu.Lock() 629 n, err := readFromHostFD(ctx, i.hostFD, dst, f.offset, opts.Flags) 630 f.offset += n 631 f.offsetMu.Unlock() 632 return n, err 633 } 634 635 func (i *inode) readFromBuf(ctx context.Context, dst *usermem.IOSequence) (int64, error) { 636 if atomic.LoadUint32(&i.haveBuf) == 0 { 637 return 0, nil 638 } 639 i.bufMu.Lock() 640 defer i.bufMu.Unlock() 641 if len(i.buf) == 0 { 642 return 0, nil 643 } 644 n, err := dst.CopyOut(ctx, i.buf) 645 *dst = dst.DropFirst(n) 646 i.buf = i.buf[n:] 647 if len(i.buf) == 0 { 648 atomic.StoreUint32(&i.haveBuf, 0) 649 i.buf = nil 650 } 651 return int64(n), err 652 } 653 654 func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) { 655 reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) 656 n, err := dst.CopyOutFrom(ctx, reader) 657 hostfd.PutReadWriterAt(reader) 658 return int64(n), err 659 } 660 661 // PWrite implements vfs.FileDescriptionImpl.PWrite. 662 func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { 663 if !f.inode.seekable { 664 return 0, linuxerr.ESPIPE 665 } 666 667 return f.writeToHostFD(ctx, src, offset, opts.Flags) 668 } 669 670 // Write implements vfs.FileDescriptionImpl.Write. 671 func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { 672 i := f.inode 673 if !i.seekable { 674 n, err := f.writeToHostFD(ctx, src, -1, opts.Flags) 675 if isBlockError(err) { 676 err = syserror.ErrWouldBlock 677 } 678 return n, err 679 } 680 681 f.offsetMu.Lock() 682 // NOTE(github.com/SagerNet/issue/2983): O_APPEND may cause memory corruption if 683 // another process modifies the host file between retrieving the file size 684 // and writing to the host fd. This is an unavoidable race condition because 685 // we cannot enforce synchronization on the host. 686 if f.vfsfd.StatusFlags()&linux.O_APPEND != 0 { 687 var s unix.Stat_t 688 if err := unix.Fstat(i.hostFD, &s); err != nil { 689 f.offsetMu.Unlock() 690 return 0, err 691 } 692 f.offset = s.Size 693 } 694 n, err := f.writeToHostFD(ctx, src, f.offset, opts.Flags) 695 f.offset += n 696 f.offsetMu.Unlock() 697 return n, err 698 } 699 700 func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSequence, offset int64, flags uint32) (int64, error) { 701 hostFD := f.inode.hostFD 702 // TODO(github.com/SagerNet/issue/2601): Support select pwritev2 flags. 703 if flags != 0 { 704 return 0, syserror.EOPNOTSUPP 705 } 706 writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) 707 n, err := src.CopyInTo(ctx, writer) 708 hostfd.PutReadWriterAt(writer) 709 // NOTE(github.com/SagerNet/issue/2979): We always sync everything, even for O_DSYNC. 710 if n > 0 && f.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { 711 if syncErr := unix.Fsync(hostFD); syncErr != nil { 712 return int64(n), syncErr 713 } 714 } 715 return int64(n), err 716 } 717 718 // Seek implements vfs.FileDescriptionImpl.Seek. 719 // 720 // Note that we do not support seeking on directories, since we do not even 721 // allow directory fds to be imported at all. 722 func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (int64, error) { 723 i := f.inode 724 if !i.seekable { 725 return 0, linuxerr.ESPIPE 726 } 727 728 f.offsetMu.Lock() 729 defer f.offsetMu.Unlock() 730 731 switch whence { 732 case linux.SEEK_SET: 733 if offset < 0 { 734 return f.offset, linuxerr.EINVAL 735 } 736 f.offset = offset 737 738 case linux.SEEK_CUR: 739 // Check for overflow. Note that underflow cannot occur, since f.offset >= 0. 740 if offset > math.MaxInt64-f.offset { 741 return f.offset, linuxerr.EOVERFLOW 742 } 743 if f.offset+offset < 0 { 744 return f.offset, linuxerr.EINVAL 745 } 746 f.offset += offset 747 748 case linux.SEEK_END: 749 var s unix.Stat_t 750 if err := unix.Fstat(i.hostFD, &s); err != nil { 751 return f.offset, err 752 } 753 size := s.Size 754 755 // Check for overflow. Note that underflow cannot occur, since size >= 0. 756 if offset > math.MaxInt64-size { 757 return f.offset, linuxerr.EOVERFLOW 758 } 759 if size+offset < 0 { 760 return f.offset, linuxerr.EINVAL 761 } 762 f.offset = size + offset 763 764 case linux.SEEK_DATA, linux.SEEK_HOLE: 765 // Modifying the offset in the host file table should not matter, since 766 // this is the only place where we use it. 767 // 768 // For reading and writing, we always rely on our internal offset. 769 n, err := unix.Seek(i.hostFD, offset, int(whence)) 770 if err != nil { 771 return f.offset, err 772 } 773 f.offset = n 774 775 default: 776 // Invalid whence. 777 return f.offset, linuxerr.EINVAL 778 } 779 780 return f.offset, nil 781 } 782 783 // Sync implements vfs.FileDescriptionImpl.Sync. 784 func (f *fileDescription) Sync(ctx context.Context) error { 785 // TODO(github.com/SagerNet/issue/1897): Currently, we always sync everything. 786 return unix.Fsync(f.inode.hostFD) 787 } 788 789 // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. 790 func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error { 791 // NOTE(b/38213152): Technically, some obscure char devices can be memory 792 // mapped, but we only allow regular files. 793 if f.inode.ftype != unix.S_IFREG { 794 return linuxerr.ENODEV 795 } 796 i := f.inode 797 i.CachedMappable.InitFileMapperOnce() 798 return vfs.GenericConfigureMMap(&f.vfsfd, i, opts) 799 } 800 801 // EventRegister implements waiter.Waitable.EventRegister. 802 func (f *fileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { 803 f.inode.queue.EventRegister(e, mask) 804 if f.inode.mayBlock { 805 fdnotifier.UpdateFD(int32(f.inode.hostFD)) 806 } 807 } 808 809 // EventUnregister implements waiter.Waitable.EventUnregister. 810 func (f *fileDescription) EventUnregister(e *waiter.Entry) { 811 f.inode.queue.EventUnregister(e) 812 if f.inode.mayBlock { 813 fdnotifier.UpdateFD(int32(f.inode.hostFD)) 814 } 815 } 816 817 // Readiness uses the poll() syscall to check the status of the underlying FD. 818 func (f *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { 819 return fdnotifier.NonBlockingPoll(int32(f.inode.hostFD), mask) 820 }