gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/fsimpl/host/host.go (about) 1 // Copyright 2020 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package host provides a filesystem implementation for host files imported as 16 // file descriptors. 17 package host 18 19 import ( 20 "fmt" 21 "math" 22 23 "golang.org/x/sys/unix" 24 "gvisor.dev/gvisor/pkg/abi/linux" 25 "gvisor.dev/gvisor/pkg/atomicbitops" 26 "gvisor.dev/gvisor/pkg/context" 27 "gvisor.dev/gvisor/pkg/errors/linuxerr" 28 "gvisor.dev/gvisor/pkg/fdnotifier" 29 "gvisor.dev/gvisor/pkg/fspath" 30 "gvisor.dev/gvisor/pkg/hostarch" 31 "gvisor.dev/gvisor/pkg/log" 32 "gvisor.dev/gvisor/pkg/sentry/arch" 33 "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" 34 "gvisor.dev/gvisor/pkg/sentry/hostfd" 35 "gvisor.dev/gvisor/pkg/sentry/kernel" 36 "gvisor.dev/gvisor/pkg/sentry/kernel/auth" 37 "gvisor.dev/gvisor/pkg/sentry/memmap" 38 unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix" 39 "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" 40 "gvisor.dev/gvisor/pkg/sentry/uniqueid" 41 "gvisor.dev/gvisor/pkg/sentry/vfs" 42 "gvisor.dev/gvisor/pkg/sync" 43 "gvisor.dev/gvisor/pkg/usermem" 44 "gvisor.dev/gvisor/pkg/waiter" 45 ) 46 47 // These are the modes that are stored with virtualOwner. 48 const virtualOwnerModes = linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID 49 50 // +stateify savable 51 type virtualOwner struct { 52 // This field is initialized at creation time and is immutable. 53 enabled bool 54 55 // mu protects the fields below and they can be accessed using atomic memory 56 // operations. 57 mu sync.Mutex `state:"nosave"` 58 uid atomicbitops.Uint32 59 gid atomicbitops.Uint32 60 // mode is also stored, otherwise setting the host file to `0000` could remove 61 // access to the file. 62 mode atomicbitops.Uint32 63 } 64 65 func (v *virtualOwner) atomicUID() uint32 { 66 return v.uid.Load() 67 } 68 69 func (v *virtualOwner) atomicGID() uint32 { 70 return v.gid.Load() 71 } 72 73 func (v *virtualOwner) atomicMode() uint32 { 74 return v.mode.Load() 75 } 76 77 func isEpollable(fd int) bool { 78 epollfd, err := unix.EpollCreate1(0) 79 if err != nil { 80 // This shouldn't happen. If it does, just say file doesn't support epoll. 81 return false 82 } 83 defer unix.Close(epollfd) 84 85 event := unix.EpollEvent{ 86 Fd: int32(fd), 87 Events: unix.EPOLLIN, 88 } 89 err = unix.EpollCtl(epollfd, unix.EPOLL_CTL_ADD, fd, &event) 90 return err == nil 91 } 92 93 // inode implements kernfs.Inode. 94 // 95 // +stateify savable 96 type inode struct { 97 kernfs.CachedMappable 98 kernfs.InodeNoStatFS 99 kernfs.InodeAnonymous // inode is effectively anonymous because it represents a donated FD. 100 kernfs.InodeNotDirectory 101 kernfs.InodeNotSymlink 102 kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid. 103 kernfs.InodeWatches 104 105 locks vfs.FileLocks 106 107 // When the reference count reaches zero, the host fd is closed. 108 inodeRefs 109 110 // hostFD contains the host fd that this file was originally created from. 111 // Upon restore, it must be remapped using restoreKey and vfs.CtxRestoreFilesystemFDMap 112 // from the restore context. 113 // 114 // This field is initialized at creation time and is immutable. 115 hostFD int `state:"nosave"` 116 117 // restoreKey is used to identify the `hostFD` after a restore is performed. 118 restoreKey vfs.RestoreID 119 120 // ino is an inode number unique within this filesystem. 121 // 122 // This field is initialized at creation time and is immutable. 123 ino uint64 124 125 // ftype is the file's type (a linux.S_IFMT mask). 126 // 127 // This field is initialized at creation time and is immutable. 128 ftype uint16 129 130 // epollable indicates whether the hostFD can be used with epoll_ctl(2). This 131 // also indicates that hostFD has been set to non-blocking. 132 // 133 // This field is initialized at creation time and is immutable. 134 epollable bool 135 136 // seekable is false if lseek(hostFD) returns ESPIPE. We assume that file 137 // offsets are meaningful iff seekable is true. 138 // 139 // This field is initialized at creation time and is immutable. 140 seekable bool 141 142 // isTTY is true if this file represents a TTY. 143 // 144 // This field is initialized at creation time and is immutable. 145 isTTY bool 146 147 // savable is true if hostFD may be saved/restored by its numeric value. 148 // 149 // This field is initialized at creation time and is immutable. 150 savable bool 151 152 // readonly is true if operations that can potentially change the host file 153 // are blocked. 154 // 155 // This field is initialized at creation time and is immutable. 156 readonly bool 157 158 // Event queue for blocking operations. 159 queue waiter.Queue 160 161 // virtualOwner caches ownership and permission information to override the 162 // underlying file owner and permission. This is used to allow the unstrusted 163 // application to change these fields without affecting the host. 164 virtualOwner virtualOwner 165 166 // If haveBuf is non-zero, hostFD represents a pipe, and buf contains data 167 // read from the pipe from previous calls to inode.beforeSave(). haveBuf 168 // and buf are protected by bufMu. 169 bufMu sync.Mutex `state:"nosave"` 170 haveBuf atomicbitops.Uint32 171 buf []byte 172 } 173 174 func newInode(ctx context.Context, fs *filesystem, hostFD int, savable bool, restoreKey vfs.RestoreID, fileType linux.FileMode, isTTY bool, readonly bool) (*inode, error) { 175 // Determine if hostFD is seekable. 176 _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR) 177 seekable := !linuxerr.Equals(linuxerr.ESPIPE, err) 178 // We expect regular files to be seekable, as this is required for them to 179 // be memory-mappable. 180 if !seekable && fileType == unix.S_IFREG { 181 ctx.Infof("host.newInode: host FD %d is a non-seekable regular file", hostFD) 182 return nil, linuxerr.ESPIPE 183 } 184 185 i := &inode{ 186 hostFD: hostFD, 187 ino: fs.NextIno(), 188 ftype: uint16(fileType), 189 epollable: isEpollable(hostFD), 190 seekable: seekable, 191 isTTY: isTTY, 192 savable: savable, 193 restoreKey: restoreKey, 194 readonly: readonly, 195 } 196 i.InitRefs() 197 i.CachedMappable.Init(hostFD) 198 199 // If the hostFD can return EWOULDBLOCK when set to non-blocking, do so and 200 // handle blocking behavior in the sentry. 201 if i.epollable { 202 if err := unix.SetNonblock(i.hostFD, true); err != nil { 203 return nil, err 204 } 205 if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil { 206 return nil, err 207 } 208 } 209 return i, nil 210 } 211 212 // NewFDOptions contains options to NewFD. 213 type NewFDOptions struct { 214 // If Savable is true, the host file descriptor may be saved/restored by 215 // numeric value. RestoreKey is used to map the FD after restore. 216 Savable bool 217 218 // RestoreKey is only used when Savable==true. It uniquely identifies the 219 // host FD so that a mapping to the corresponding FD can be provided during 220 // restore. 221 RestoreKey vfs.RestoreID 222 223 // If IsTTY is true, the file descriptor is a TTY. 224 IsTTY bool 225 226 // If HaveFlags is true, use Flags for the new file description. Otherwise, 227 // the new file description will inherit flags from hostFD. 228 HaveFlags bool 229 Flags uint32 230 231 // VirtualOwner allow the host file to have owner and permissions different 232 // than the underlying host file. 233 VirtualOwner bool 234 UID auth.KUID 235 GID auth.KGID 236 237 // If Readonly is true, we disallow operations that can potentially change 238 // the host file associated with the file descriptor. 239 Readonly bool 240 } 241 242 // NewFD returns a vfs.FileDescription representing the given host file 243 // descriptor. mnt must be Kernel.HostMount(). 244 func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) (*vfs.FileDescription, error) { 245 fs, ok := mnt.Filesystem().Impl().(*filesystem) 246 if !ok { 247 return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl()) 248 } 249 250 if opts.Readonly { 251 if opts.IsTTY { 252 // This is not a technical limitation, but access checks for TTYs 253 // have not been implemented yet. 254 return nil, fmt.Errorf("readonly file descriptor may currently not be a TTY") 255 } 256 257 flagsInt, err := unix.FcntlInt(uintptr(hostFD), unix.F_GETFL, 0) 258 if err != nil { 259 return nil, err 260 } 261 accessMode := uint32(flagsInt) & unix.O_ACCMODE 262 if accessMode != unix.O_RDONLY { 263 return nil, fmt.Errorf("readonly file descriptor may only be opened as O_RDONLY on the host") 264 } 265 } 266 267 // Retrieve metadata. 268 var stat unix.Stat_t 269 if err := unix.Fstat(hostFD, &stat); err != nil { 270 return nil, err 271 } 272 273 flags := opts.Flags 274 if !opts.HaveFlags { 275 // Get flags for the imported FD. 276 flagsInt, err := unix.FcntlInt(uintptr(hostFD), unix.F_GETFL, 0) 277 if err != nil { 278 return nil, err 279 } 280 flags = uint32(flagsInt) 281 } 282 283 fileType := linux.FileMode(stat.Mode).FileType() 284 i, err := newInode(ctx, fs, hostFD, opts.Savable, opts.RestoreKey, fileType, opts.IsTTY, opts.Readonly) 285 if err != nil { 286 return nil, err 287 } 288 if opts.VirtualOwner { 289 i.virtualOwner.enabled = true 290 i.virtualOwner.uid = atomicbitops.FromUint32(uint32(opts.UID)) 291 i.virtualOwner.gid = atomicbitops.FromUint32(uint32(opts.GID)) 292 i.virtualOwner.mode = atomicbitops.FromUint32(stat.Mode) 293 } 294 295 d := &kernfs.Dentry{} 296 d.Init(&fs.Filesystem, i) 297 298 // i.open will take a reference on d. 299 defer d.DecRef(ctx) 300 301 // For simplicity, fileDescription.offset is set to 0. Technically, we 302 // should only set to 0 on files that are not seekable (sockets, pipes, 303 // etc.), and use the offset from the host fd otherwise when importing. 304 return i.open(ctx, d, mnt, fileType, flags) 305 } 306 307 // filesystemType implements vfs.FilesystemType. 308 // 309 // +stateify savable 310 type filesystemType struct{} 311 312 // GetFilesystem implements vfs.FilesystemType.GetFilesystem. 313 func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { 314 panic("host.filesystemType.GetFilesystem should never be called") 315 } 316 317 // Name implements vfs.FilesystemType.Name. 318 func (filesystemType) Name() string { 319 return "none" 320 } 321 322 // Release implements vfs.FilesystemType.Release. 323 func (filesystemType) Release(ctx context.Context) {} 324 325 // NewFilesystem sets up and returns a new hostfs filesystem. 326 // 327 // Note that there should only ever be one instance of host.filesystem, 328 // a global mount for host fds. 329 func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { 330 devMinor, err := vfsObj.GetAnonBlockDevMinor() 331 if err != nil { 332 return nil, err 333 } 334 fs := &filesystem{ 335 devMinor: devMinor, 336 } 337 fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) 338 return fs.VFSFilesystem(), nil 339 } 340 341 // filesystem implements vfs.FilesystemImpl. 342 // 343 // +stateify savable 344 type filesystem struct { 345 kernfs.Filesystem 346 347 devMinor uint32 348 } 349 350 func (fs *filesystem) Release(ctx context.Context) { 351 fs.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) 352 fs.Filesystem.Release(ctx) 353 } 354 355 func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { 356 d := vd.Dentry().Impl().(*kernfs.Dentry) 357 inode := d.Inode().(*inode) 358 b.PrependComponent(fmt.Sprintf("host:[%d]", inode.ino)) 359 return vfs.PrependPathSyntheticError{} 360 } 361 362 // MountOptions implements vfs.FilesystemImpl.MountOptions. 363 func (fs *filesystem) MountOptions() string { 364 return "" 365 } 366 367 // CheckPermissions implements kernfs.Inode.CheckPermissions. 368 func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { 369 var s unix.Stat_t 370 if err := i.stat(&s); err != nil { 371 return err 372 } 373 return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid)) 374 } 375 376 // Mode implements kernfs.Inode.Mode. 377 func (i *inode) Mode() linux.FileMode { 378 var s unix.Stat_t 379 if err := i.stat(&s); err != nil { 380 // Retrieving the mode from the host fd using fstat(2) should not fail. 381 // If the syscall does not succeed, something is fundamentally wrong. 382 panic(fmt.Sprintf("failed to retrieve mode from host fd %d: %v", i.hostFD, err)) 383 } 384 return linux.FileMode(s.Mode) 385 } 386 387 // Mode implements kernfs.Inode.UID 388 func (i *inode) UID() auth.KUID { 389 return auth.KUID(i.virtualOwner.uid.Load()) 390 } 391 392 // Mode implements kernfs.Inode.GID 393 func (i *inode) GID() auth.KGID { 394 return auth.KGID(i.virtualOwner.gid.Load()) 395 } 396 397 // Stat implements kernfs.Inode.Stat. 398 func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { 399 if opts.Mask&linux.STATX__RESERVED != 0 { 400 return linux.Statx{}, linuxerr.EINVAL 401 } 402 if opts.Sync&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE { 403 return linux.Statx{}, linuxerr.EINVAL 404 } 405 406 fs := vfsfs.Impl().(*filesystem) 407 408 // Limit our host call only to known flags. 409 mask := opts.Mask & linux.STATX_ALL 410 var s unix.Statx_t 411 err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(mask), &s) 412 if linuxerr.Equals(linuxerr.ENOSYS, err) { 413 // Fallback to fstat(2), if statx(2) is not supported on the host. 414 // 415 // TODO(b/151263641): Remove fallback. 416 return i.statxFromStat(fs) 417 } 418 if err != nil { 419 return linux.Statx{}, err 420 } 421 422 // Unconditionally fill blksize, attributes, and device numbers, as 423 // indicated by /include/uapi/linux/stat.h. Inode number is always 424 // available, since we use our own rather than the host's. 425 ls := linux.Statx{ 426 Mask: linux.STATX_INO, 427 Blksize: s.Blksize, 428 Attributes: s.Attributes, 429 Ino: i.ino, 430 AttributesMask: s.Attributes_mask, 431 DevMajor: linux.UNNAMED_MAJOR, 432 DevMinor: fs.devMinor, 433 } 434 435 // Copy other fields that were returned by the host. RdevMajor/RdevMinor 436 // are never copied (and therefore left as zero), so as not to expose host 437 // device numbers. 438 ls.Mask |= s.Mask & linux.STATX_ALL 439 if s.Mask&linux.STATX_TYPE != 0 { 440 if i.virtualOwner.enabled { 441 ls.Mode |= uint16(i.virtualOwner.atomicMode()) & linux.S_IFMT 442 } else { 443 ls.Mode |= s.Mode & linux.S_IFMT 444 } 445 } 446 if s.Mask&linux.STATX_MODE != 0 { 447 if i.virtualOwner.enabled { 448 ls.Mode |= uint16(i.virtualOwner.atomicMode()) &^ linux.S_IFMT 449 } else { 450 ls.Mode |= s.Mode &^ linux.S_IFMT 451 } 452 } 453 if s.Mask&linux.STATX_NLINK != 0 { 454 ls.Nlink = s.Nlink 455 } 456 if s.Mask&linux.STATX_UID != 0 { 457 if i.virtualOwner.enabled { 458 ls.UID = i.virtualOwner.atomicUID() 459 } else { 460 ls.UID = s.Uid 461 } 462 } 463 if s.Mask&linux.STATX_GID != 0 { 464 if i.virtualOwner.enabled { 465 ls.GID = i.virtualOwner.atomicGID() 466 } else { 467 ls.GID = s.Gid 468 } 469 } 470 if s.Mask&linux.STATX_ATIME != 0 { 471 ls.Atime = unixToLinuxStatxTimestamp(s.Atime) 472 } 473 if s.Mask&linux.STATX_BTIME != 0 { 474 ls.Btime = unixToLinuxStatxTimestamp(s.Btime) 475 } 476 if s.Mask&linux.STATX_CTIME != 0 { 477 ls.Ctime = unixToLinuxStatxTimestamp(s.Ctime) 478 } 479 if s.Mask&linux.STATX_MTIME != 0 { 480 ls.Mtime = unixToLinuxStatxTimestamp(s.Mtime) 481 } 482 if s.Mask&linux.STATX_SIZE != 0 { 483 ls.Size = s.Size 484 } 485 if s.Mask&linux.STATX_BLOCKS != 0 { 486 ls.Blocks = s.Blocks 487 } 488 489 return ls, nil 490 } 491 492 // statxFromStat is a best-effort fallback for inode.Stat() if the host does not 493 // support statx(2). 494 // 495 // We ignore the mask and sync flags in opts and simply supply 496 // STATX_BASIC_STATS, as fstat(2) itself does not allow the specification 497 // of a mask or sync flags. fstat(2) does not provide any metadata 498 // equivalent to Statx.Attributes, Statx.AttributesMask, or Statx.Btime, so 499 // those fields remain empty. 500 func (i *inode) statxFromStat(fs *filesystem) (linux.Statx, error) { 501 var s unix.Stat_t 502 if err := i.stat(&s); err != nil { 503 return linux.Statx{}, err 504 } 505 506 // As with inode.Stat(), we always use internal device and inode numbers, 507 // and never expose the host's represented device numbers. 508 return linux.Statx{ 509 Mask: linux.STATX_BASIC_STATS, 510 Blksize: uint32(s.Blksize), 511 Nlink: uint32(s.Nlink), 512 UID: s.Uid, 513 GID: s.Gid, 514 Mode: uint16(s.Mode), 515 Ino: i.ino, 516 Size: uint64(s.Size), 517 Blocks: uint64(s.Blocks), 518 Atime: timespecToStatxTimestamp(s.Atim), 519 Ctime: timespecToStatxTimestamp(s.Ctim), 520 Mtime: timespecToStatxTimestamp(s.Mtim), 521 DevMajor: linux.UNNAMED_MAJOR, 522 DevMinor: fs.devMinor, 523 }, nil 524 } 525 526 func (i *inode) stat(stat *unix.Stat_t) error { 527 if err := unix.Fstat(i.hostFD, stat); err != nil { 528 return err 529 } 530 if i.virtualOwner.enabled { 531 stat.Uid = i.virtualOwner.atomicUID() 532 stat.Gid = i.virtualOwner.atomicGID() 533 stat.Mode = i.virtualOwner.atomicMode() 534 } 535 return nil 536 } 537 538 // SetStat implements kernfs.Inode.SetStat. 539 // 540 // +checklocksignore 541 func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { 542 if i.readonly { 543 return linuxerr.EPERM 544 } 545 546 s := &opts.Stat 547 548 m := s.Mask 549 if m == 0 { 550 return nil 551 } 552 supportedModes := uint32(linux.STATX_MODE | linux.STATX_SIZE | linux.STATX_ATIME | linux.STATX_MTIME) 553 if i.virtualOwner.enabled { 554 if m&virtualOwnerModes != 0 { 555 // Take lock if any of the virtual owner fields will be updated. 556 i.virtualOwner.mu.Lock() 557 defer i.virtualOwner.mu.Unlock() 558 } 559 560 supportedModes |= virtualOwnerModes 561 } 562 if m&^supportedModes != 0 { 563 return linuxerr.EPERM 564 } 565 566 var hostStat unix.Stat_t 567 if err := i.stat(&hostStat); err != nil { 568 return err 569 } 570 if err := vfs.CheckSetStat(ctx, creds, &opts, linux.FileMode(hostStat.Mode), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil { 571 return err 572 } 573 574 if m&linux.STATX_MODE != 0 { 575 if i.virtualOwner.enabled { 576 // We hold i.virtualOwner.mu. 577 i.virtualOwner.mode = atomicbitops.FromUint32(uint32(opts.Stat.Mode)) 578 } else { 579 log.Warningf("sentry seccomp filters don't allow making fchmod(2) syscall") 580 return unix.EPERM 581 } 582 } 583 if m&linux.STATX_SIZE != 0 { 584 if hostStat.Mode&linux.S_IFMT != linux.S_IFREG { 585 return linuxerr.EINVAL 586 } 587 if err := unix.Ftruncate(i.hostFD, int64(s.Size)); err != nil { 588 return err 589 } 590 oldSize := uint64(hostStat.Size) 591 if s.Size < oldSize { 592 oldpgend, _ := hostarch.PageRoundUp(oldSize) 593 newpgend, _ := hostarch.PageRoundUp(s.Size) 594 if oldpgend != newpgend { 595 i.CachedMappable.InvalidateRange(memmap.MappableRange{newpgend, oldpgend}) 596 } 597 } 598 } 599 if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 { 600 ts := [2]unix.Timespec{ 601 toTimespec(s.Atime, m&linux.STATX_ATIME == 0), 602 toTimespec(s.Mtime, m&linux.STATX_MTIME == 0), 603 } 604 if err := setTimestamps(i.hostFD, &ts); err != nil { 605 return err 606 } 607 } 608 if i.virtualOwner.enabled { 609 if m&linux.STATX_UID != 0 { 610 // We hold i.virtualOwner.mu. 611 i.virtualOwner.uid = atomicbitops.FromUint32(opts.Stat.UID) 612 } 613 if m&linux.STATX_GID != 0 { 614 // We hold i.virtualOwner.mu. 615 i.virtualOwner.gid = atomicbitops.FromUint32(opts.Stat.GID) 616 } 617 } 618 return nil 619 } 620 621 // DecRef implements kernfs.Inode.DecRef. 622 func (i *inode) DecRef(ctx context.Context) { 623 i.inodeRefs.DecRef(func() { 624 if i.epollable { 625 fdnotifier.RemoveFD(int32(i.hostFD)) 626 } 627 if err := unix.Close(i.hostFD); err != nil { 628 log.Warningf("failed to close host fd %d: %v", i.hostFD, err) 629 } 630 // We can't rely on fdnotifier when closing the fd, because the event may race 631 // with fdnotifier.RemoveFD. Instead, notify the queue explicitly. 632 i.queue.Notify(waiter.EventHUp | waiter.ReadableEvents | waiter.WritableEvents) 633 }) 634 } 635 636 // Open implements kernfs.Inode.Open. 637 func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 638 // Once created, we cannot re-open a socket fd through /proc/[pid]/fd/. 639 if i.Mode().FileType() == linux.S_IFSOCK { 640 return nil, linuxerr.ENXIO 641 } 642 var stat unix.Stat_t 643 if err := i.stat(&stat); err != nil { 644 return nil, err 645 } 646 fileType := linux.FileMode(stat.Mode).FileType() 647 return i.open(ctx, d, rp.Mount(), fileType, opts.Flags) 648 } 649 650 func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, fileType linux.FileMode, flags uint32) (*vfs.FileDescription, error) { 651 // Constrain flags to a subset we can handle. 652 // 653 // TODO(gvisor.dev/issue/2601): Support O_NONBLOCK by adding RWF_NOWAIT to pread/pwrite calls. 654 flags &= unix.O_ACCMODE | unix.O_NONBLOCK | unix.O_DSYNC | unix.O_SYNC | unix.O_APPEND 655 656 switch fileType { 657 case unix.S_IFSOCK: 658 if i.isTTY { 659 log.Warningf("cannot use host socket fd %d as TTY", i.hostFD) 660 return nil, linuxerr.ENOTTY 661 } 662 663 ep, err := newEndpoint(ctx, i.hostFD, &i.queue) 664 if err != nil { 665 return nil, err 666 } 667 // Currently, we only allow Unix sockets to be imported. 668 return unixsocket.NewFileDescription(ep, ep.Type(), flags, nil, mnt, d.VFSDentry(), &i.locks) 669 670 case unix.S_IFREG, unix.S_IFIFO, unix.S_IFCHR: 671 if i.isTTY { 672 fd := &TTYFileDescription{ 673 fileDescription: fileDescription{inode: i}, 674 termios: linux.DefaultReplicaTermios, 675 } 676 if task := kernel.TaskFromContext(ctx); task != nil { 677 fd.fgProcessGroup = task.ThreadGroup().ProcessGroup() 678 fd.session = fd.fgProcessGroup.Session() 679 } 680 fd.LockFD.Init(&i.locks) 681 vfsfd := &fd.vfsfd 682 if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { 683 return nil, err 684 } 685 return vfsfd, nil 686 } 687 688 fd := &fileDescription{inode: i} 689 fd.LockFD.Init(&i.locks) 690 vfsfd := &fd.vfsfd 691 if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { 692 return nil, err 693 } 694 return vfsfd, nil 695 696 default: 697 log.Warningf("cannot import host fd %d with file type %o", i.hostFD, fileType) 698 return nil, linuxerr.EPERM 699 } 700 } 701 702 // Create a new host-backed endpoint from the given fd and its corresponding 703 // notification queue. 704 func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transport.Endpoint, error) { 705 // Set up an external transport.Endpoint using the host fd. 706 addr := fmt.Sprintf("hostfd:[%d]", hostFD) 707 e, err := transport.NewHostConnectedEndpoint(hostFD, addr) 708 if err != nil { 709 return nil, err.ToError() 710 } 711 ep := transport.NewExternal(e.SockType(), uniqueid.GlobalProviderFromContext(ctx), queue, e, e) 712 return ep, nil 713 } 714 715 // fileDescription is embedded by host fd implementations of FileDescriptionImpl. 716 // 717 // +stateify savable 718 type fileDescription struct { 719 vfsfd vfs.FileDescription 720 vfs.FileDescriptionDefaultImpl 721 vfs.LockFD 722 723 // inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but 724 // cached to reduce indirections and casting. fileDescription does not hold 725 // a reference on the inode through the inode field (since one is already 726 // held via the Dentry). 727 // 728 // inode is immutable after fileDescription creation. 729 inode *inode 730 731 // offsetMu protects offset. 732 offsetMu sync.Mutex `state:"nosave"` 733 734 // offset specifies the current file offset. It is only meaningful when 735 // inode.seekable is true. 736 offset int64 737 } 738 739 // SetStat implements vfs.FileDescriptionImpl.SetStat. 740 func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { 741 creds := auth.CredentialsFromContext(ctx) 742 return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts) 743 } 744 745 // Stat implements vfs.FileDescriptionImpl.Stat. 746 func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 747 return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts) 748 } 749 750 // Release implements vfs.FileDescriptionImpl.Release. 751 func (f *fileDescription) Release(context.Context) { 752 // noop 753 } 754 755 // Allocate implements vfs.FileDescriptionImpl.Allocate. 756 func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error { 757 if f.inode.readonly { 758 return linuxerr.EPERM 759 } 760 return unix.Fallocate(f.inode.hostFD, uint32(mode), int64(offset), int64(length)) 761 } 762 763 // PRead implements vfs.FileDescriptionImpl.PRead. 764 func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { 765 // Check that flags are supported. 766 // 767 // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. 768 if opts.Flags&^linux.RWF_HIPRI != 0 { 769 return 0, linuxerr.EOPNOTSUPP 770 } 771 772 i := f.inode 773 if !i.seekable { 774 return 0, linuxerr.ESPIPE 775 } 776 777 return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags) 778 } 779 780 // Read implements vfs.FileDescriptionImpl.Read. 781 func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { 782 // Check that flags are supported. 783 // 784 // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. 785 if opts.Flags&^linux.RWF_HIPRI != 0 { 786 return 0, linuxerr.EOPNOTSUPP 787 } 788 789 i := f.inode 790 if !i.seekable { 791 bufN, err := i.readFromBuf(ctx, &dst) 792 if err != nil { 793 return bufN, err 794 } 795 n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags) 796 total := bufN + n 797 if isBlockError(err) { 798 // If we got any data at all, return it as a "completed" partial read 799 // rather than retrying until complete. 800 if total != 0 { 801 err = nil 802 } else { 803 err = linuxerr.ErrWouldBlock 804 } 805 } 806 return total, err 807 } 808 809 f.offsetMu.Lock() 810 n, err := readFromHostFD(ctx, i.hostFD, dst, f.offset, opts.Flags) 811 f.offset += n 812 f.offsetMu.Unlock() 813 return n, err 814 } 815 816 func (i *inode) readFromBuf(ctx context.Context, dst *usermem.IOSequence) (int64, error) { 817 if i.haveBuf.Load() == 0 { 818 return 0, nil 819 } 820 i.bufMu.Lock() 821 defer i.bufMu.Unlock() 822 if len(i.buf) == 0 { 823 return 0, nil 824 } 825 n, err := dst.CopyOut(ctx, i.buf) 826 *dst = dst.DropFirst(n) 827 i.buf = i.buf[n:] 828 if len(i.buf) == 0 { 829 i.haveBuf.Store(0) 830 i.buf = nil 831 } 832 return int64(n), err 833 } 834 835 func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) { 836 reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) 837 n, err := dst.CopyOutFrom(ctx, reader) 838 hostfd.PutReadWriterAt(reader) 839 return int64(n), err 840 } 841 842 // PWrite implements vfs.FileDescriptionImpl.PWrite. 843 func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { 844 if !f.inode.seekable { 845 return 0, linuxerr.ESPIPE 846 } 847 848 return f.writeToHostFD(ctx, src, offset, opts.Flags) 849 } 850 851 // Write implements vfs.FileDescriptionImpl.Write. 852 func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { 853 i := f.inode 854 if !i.seekable { 855 n, err := f.writeToHostFD(ctx, src, -1, opts.Flags) 856 if isBlockError(err) { 857 err = linuxerr.ErrWouldBlock 858 } 859 return n, err 860 } 861 862 f.offsetMu.Lock() 863 // NOTE(gvisor.dev/issue/2983): O_APPEND may cause memory corruption if 864 // another process modifies the host file between retrieving the file size 865 // and writing to the host fd. This is an unavoidable race condition because 866 // we cannot enforce synchronization on the host. 867 if f.vfsfd.StatusFlags()&linux.O_APPEND != 0 { 868 var s unix.Stat_t 869 if err := unix.Fstat(i.hostFD, &s); err != nil { 870 f.offsetMu.Unlock() 871 return 0, err 872 } 873 f.offset = s.Size 874 } 875 n, err := f.writeToHostFD(ctx, src, f.offset, opts.Flags) 876 f.offset += n 877 f.offsetMu.Unlock() 878 return n, err 879 } 880 881 func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSequence, offset int64, flags uint32) (int64, error) { 882 if f.inode.readonly { 883 return 0, linuxerr.EPERM 884 } 885 hostFD := f.inode.hostFD 886 // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. 887 if flags != 0 { 888 return 0, linuxerr.EOPNOTSUPP 889 } 890 writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) 891 n, err := src.CopyInTo(ctx, writer) 892 hostfd.PutReadWriterAt(writer) 893 // NOTE(gvisor.dev/issue/2979): We always sync everything, even for O_DSYNC. 894 if n > 0 && f.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { 895 if syncErr := unix.Fsync(hostFD); syncErr != nil { 896 return int64(n), syncErr 897 } 898 } 899 return int64(n), err 900 } 901 902 // Seek implements vfs.FileDescriptionImpl.Seek. 903 // 904 // Note that we do not support seeking on directories, since we do not even 905 // allow directory fds to be imported at all. 906 func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (int64, error) { 907 i := f.inode 908 if !i.seekable { 909 return 0, linuxerr.ESPIPE 910 } 911 912 f.offsetMu.Lock() 913 defer f.offsetMu.Unlock() 914 915 switch whence { 916 case linux.SEEK_SET: 917 if offset < 0 { 918 return f.offset, linuxerr.EINVAL 919 } 920 f.offset = offset 921 922 case linux.SEEK_CUR: 923 // Check for overflow. Note that underflow cannot occur, since f.offset >= 0. 924 if offset > math.MaxInt64-f.offset { 925 return f.offset, linuxerr.EOVERFLOW 926 } 927 if f.offset+offset < 0 { 928 return f.offset, linuxerr.EINVAL 929 } 930 f.offset += offset 931 932 case linux.SEEK_END: 933 var s unix.Stat_t 934 if err := unix.Fstat(i.hostFD, &s); err != nil { 935 return f.offset, err 936 } 937 size := s.Size 938 939 // Check for overflow. Note that underflow cannot occur, since size >= 0. 940 if offset > math.MaxInt64-size { 941 return f.offset, linuxerr.EOVERFLOW 942 } 943 if size+offset < 0 { 944 return f.offset, linuxerr.EINVAL 945 } 946 f.offset = size + offset 947 948 case linux.SEEK_DATA, linux.SEEK_HOLE: 949 // Modifying the offset in the host file table should not matter, since 950 // this is the only place where we use it. 951 // 952 // For reading and writing, we always rely on our internal offset. 953 n, err := unix.Seek(i.hostFD, offset, int(whence)) 954 if err != nil { 955 return f.offset, err 956 } 957 f.offset = n 958 959 default: 960 // Invalid whence. 961 return f.offset, linuxerr.EINVAL 962 } 963 964 return f.offset, nil 965 } 966 967 // Sync implements vfs.FileDescriptionImpl.Sync. 968 func (f *fileDescription) Sync(ctx context.Context) error { 969 if f.inode.readonly { 970 return linuxerr.EPERM 971 } 972 // TODO(gvisor.dev/issue/1897): Currently, we always sync everything. 973 return unix.Fsync(f.inode.hostFD) 974 } 975 976 // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. 977 func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error { 978 // NOTE(b/38213152): Technically, some obscure char devices can be memory 979 // mapped, but we only allow regular files. 980 if f.inode.ftype != unix.S_IFREG { 981 return linuxerr.ENODEV 982 } 983 i := f.inode 984 i.CachedMappable.InitFileMapperOnce() 985 return vfs.GenericConfigureMMap(&f.vfsfd, i, opts) 986 } 987 988 // EventRegister implements waiter.Waitable.EventRegister. 989 func (f *fileDescription) EventRegister(e *waiter.Entry) error { 990 f.inode.queue.EventRegister(e) 991 if f.inode.epollable { 992 if err := fdnotifier.UpdateFD(int32(f.inode.hostFD)); err != nil { 993 f.inode.queue.EventUnregister(e) 994 return err 995 } 996 } 997 return nil 998 } 999 1000 // EventUnregister implements waiter.Waitable.EventUnregister. 1001 func (f *fileDescription) EventUnregister(e *waiter.Entry) { 1002 f.inode.queue.EventUnregister(e) 1003 if f.inode.epollable { 1004 if err := fdnotifier.UpdateFD(int32(f.inode.hostFD)); err != nil { 1005 panic(fmt.Sprint("UpdateFD:", err)) 1006 } 1007 } 1008 } 1009 1010 // Readiness uses the poll() syscall to check the status of the underlying FD. 1011 func (f *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { 1012 return fdnotifier.NonBlockingPoll(int32(f.inode.hostFD), mask) 1013 } 1014 1015 // Epollable implements FileDescriptionImpl.Epollable. 1016 func (f *fileDescription) Epollable() bool { 1017 return f.inode.epollable 1018 } 1019 1020 // Ioctl queries the underlying FD for allowed ioctl commands. 1021 func (f *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { 1022 switch cmd := args[1].Int(); cmd { 1023 case linux.FIONREAD: 1024 v, err := ioctlFionread(f.inode.hostFD) 1025 if err != nil { 1026 return 0, err 1027 } 1028 1029 var buf [4]byte 1030 hostarch.ByteOrder.PutUint32(buf[:], v) 1031 _, err = uio.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{}) 1032 return 0, err 1033 } 1034 1035 return f.FileDescriptionDefaultImpl.Ioctl(ctx, uio, sysno, args) 1036 }