github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/fsimpl/host/host.go (about) 1 // Copyright 2020 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package host provides a filesystem implementation for host files imported as 16 // file descriptors. 17 package host 18 19 import ( 20 "fmt" 21 "math" 22 23 "golang.org/x/sys/unix" 24 "github.com/MerlinKodo/gvisor/pkg/abi/linux" 25 "github.com/MerlinKodo/gvisor/pkg/atomicbitops" 26 "github.com/MerlinKodo/gvisor/pkg/context" 27 "github.com/MerlinKodo/gvisor/pkg/errors/linuxerr" 28 "github.com/MerlinKodo/gvisor/pkg/fdnotifier" 29 "github.com/MerlinKodo/gvisor/pkg/fspath" 30 "github.com/MerlinKodo/gvisor/pkg/hostarch" 31 "github.com/MerlinKodo/gvisor/pkg/log" 32 "github.com/MerlinKodo/gvisor/pkg/sentry/arch" 33 "github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/kernfs" 34 "github.com/MerlinKodo/gvisor/pkg/sentry/hostfd" 35 "github.com/MerlinKodo/gvisor/pkg/sentry/kernel" 36 "github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth" 37 "github.com/MerlinKodo/gvisor/pkg/sentry/memmap" 38 unixsocket "github.com/MerlinKodo/gvisor/pkg/sentry/socket/unix" 39 "github.com/MerlinKodo/gvisor/pkg/sentry/socket/unix/transport" 40 "github.com/MerlinKodo/gvisor/pkg/sentry/uniqueid" 41 "github.com/MerlinKodo/gvisor/pkg/sentry/vfs" 42 "github.com/MerlinKodo/gvisor/pkg/sync" 43 "github.com/MerlinKodo/gvisor/pkg/usermem" 44 "github.com/MerlinKodo/gvisor/pkg/waiter" 45 ) 46 47 // These are the modes that are stored with virtualOwner. 48 const virtualOwnerModes = linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID 49 50 // +stateify savable 51 type virtualOwner struct { 52 // This field is initialized at creation time and is immutable. 53 enabled bool 54 55 // mu protects the fields below and they can be accessed using atomic memory 56 // operations. 57 mu sync.Mutex `state:"nosave"` 58 uid atomicbitops.Uint32 59 gid atomicbitops.Uint32 60 // mode is also stored, otherwise setting the host file to `0000` could remove 61 // access to the file. 62 mode atomicbitops.Uint32 63 } 64 65 func (v *virtualOwner) atomicUID() uint32 { 66 return v.uid.Load() 67 } 68 69 func (v *virtualOwner) atomicGID() uint32 { 70 return v.gid.Load() 71 } 72 73 func (v *virtualOwner) atomicMode() uint32 { 74 return v.mode.Load() 75 } 76 77 func isEpollable(fd int) bool { 78 epollfd, err := unix.EpollCreate1(0) 79 if err != nil { 80 // This shouldn't happen. If it does, just say file doesn't support epoll. 81 return false 82 } 83 defer unix.Close(epollfd) 84 85 event := unix.EpollEvent{ 86 Fd: int32(fd), 87 Events: unix.EPOLLIN, 88 } 89 err = unix.EpollCtl(epollfd, unix.EPOLL_CTL_ADD, fd, &event) 90 return err == nil 91 } 92 93 // inode implements kernfs.Inode. 94 // 95 // +stateify savable 96 type inode struct { 97 kernfs.CachedMappable 98 kernfs.InodeNoStatFS 99 kernfs.InodeAnonymous // inode is effectively anonymous because it represents a donated FD. 100 kernfs.InodeNotDirectory 101 kernfs.InodeNotSymlink 102 kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid. 103 kernfs.InodeWatches 104 105 locks vfs.FileLocks 106 107 // When the reference count reaches zero, the host fd is closed. 108 inodeRefs 109 110 // hostFD contains the host fd that this file was originally created from, 111 // which must be available at time of restore. 112 // 113 // This field is initialized at creation time and is immutable. 114 hostFD int 115 116 // ino is an inode number unique within this filesystem. 117 // 118 // This field is initialized at creation time and is immutable. 119 ino uint64 120 121 // ftype is the file's type (a linux.S_IFMT mask). 122 // 123 // This field is initialized at creation time and is immutable. 124 ftype uint16 125 126 // epollable indicates whether the hostFD can be used with epoll_ctl(2). This 127 // also indicates that hostFD has been set to non-blocking. 128 // 129 // This field is initialized at creation time and is immutable. 130 epollable bool 131 132 // seekable is false if lseek(hostFD) returns ESPIPE. We assume that file 133 // offsets are meaningful iff seekable is true. 134 // 135 // This field is initialized at creation time and is immutable. 136 seekable bool 137 138 // isTTY is true if this file represents a TTY. 139 // 140 // This field is initialized at creation time and is immutable. 141 isTTY bool 142 143 // savable is true if hostFD may be saved/restored by its numeric value. 144 // 145 // This field is initialized at creation time and is immutable. 146 savable bool 147 148 // readonly is true if operations that can potentially change the host file 149 // are blocked. 150 // 151 // This field is initialized at creation time and is immutable. 152 readonly bool 153 154 // Event queue for blocking operations. 155 queue waiter.Queue 156 157 // virtualOwner caches ownership and permission information to override the 158 // underlying file owner and permission. This is used to allow the unstrusted 159 // application to change these fields without affecting the host. 160 virtualOwner virtualOwner 161 162 // If haveBuf is non-zero, hostFD represents a pipe, and buf contains data 163 // read from the pipe from previous calls to inode.beforeSave(). haveBuf 164 // and buf are protected by bufMu. 165 bufMu sync.Mutex `state:"nosave"` 166 haveBuf atomicbitops.Uint32 167 buf []byte 168 } 169 170 func newInode(ctx context.Context, fs *filesystem, hostFD int, savable bool, fileType linux.FileMode, isTTY bool, readonly bool) (*inode, error) { 171 // Determine if hostFD is seekable. 172 _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR) 173 seekable := !linuxerr.Equals(linuxerr.ESPIPE, err) 174 // We expect regular files to be seekable, as this is required for them to 175 // be memory-mappable. 176 if !seekable && fileType == unix.S_IFREG { 177 ctx.Infof("host.newInode: host FD %d is a non-seekable regular file", hostFD) 178 return nil, linuxerr.ESPIPE 179 } 180 181 i := &inode{ 182 hostFD: hostFD, 183 ino: fs.NextIno(), 184 ftype: uint16(fileType), 185 epollable: isEpollable(hostFD), 186 seekable: seekable, 187 isTTY: isTTY, 188 savable: savable, 189 readonly: readonly, 190 } 191 i.InitRefs() 192 i.CachedMappable.Init(hostFD) 193 194 // If the hostFD can return EWOULDBLOCK when set to non-blocking, do so and 195 // handle blocking behavior in the sentry. 196 if i.epollable { 197 if err := unix.SetNonblock(i.hostFD, true); err != nil { 198 return nil, err 199 } 200 if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil { 201 return nil, err 202 } 203 } 204 return i, nil 205 } 206 207 // NewFDOptions contains options to NewFD. 208 type NewFDOptions struct { 209 // If Savable is true, the host file descriptor may be saved/restored by 210 // numeric value; the sandbox API requires a corresponding host FD with the 211 // same numeric value to be provided at time of restore. 212 Savable bool 213 214 // If IsTTY is true, the file descriptor is a TTY. 215 IsTTY bool 216 217 // If HaveFlags is true, use Flags for the new file description. Otherwise, 218 // the new file description will inherit flags from hostFD. 219 HaveFlags bool 220 Flags uint32 221 222 // VirtualOwner allow the host file to have owner and permissions different 223 // than the underlying host file. 224 VirtualOwner bool 225 UID auth.KUID 226 GID auth.KGID 227 228 // If Readonly is true, we disallow operations that can potentially change 229 // the host file associated with the file descriptor. 230 Readonly bool 231 } 232 233 // NewFD returns a vfs.FileDescription representing the given host file 234 // descriptor. mnt must be Kernel.HostMount(). 235 func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) (*vfs.FileDescription, error) { 236 fs, ok := mnt.Filesystem().Impl().(*filesystem) 237 if !ok { 238 return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl()) 239 } 240 241 if opts.Readonly { 242 if opts.IsTTY { 243 // This is not a technical limitation, but access checks for TTYs 244 // have not been implemented yet. 245 return nil, fmt.Errorf("readonly file descriptor may currently not be a TTY") 246 } 247 248 flagsInt, err := unix.FcntlInt(uintptr(hostFD), unix.F_GETFL, 0) 249 if err != nil { 250 return nil, err 251 } 252 accessMode := uint32(flagsInt) & unix.O_ACCMODE 253 if accessMode != unix.O_RDONLY { 254 return nil, fmt.Errorf("readonly file descriptor may only be opened as O_RDONLY on the host") 255 } 256 } 257 258 // Retrieve metadata. 259 var stat unix.Stat_t 260 if err := unix.Fstat(hostFD, &stat); err != nil { 261 return nil, err 262 } 263 264 flags := opts.Flags 265 if !opts.HaveFlags { 266 // Get flags for the imported FD. 267 flagsInt, err := unix.FcntlInt(uintptr(hostFD), unix.F_GETFL, 0) 268 if err != nil { 269 return nil, err 270 } 271 flags = uint32(flagsInt) 272 } 273 274 fileType := linux.FileMode(stat.Mode).FileType() 275 i, err := newInode(ctx, fs, hostFD, opts.Savable, fileType, opts.IsTTY, opts.Readonly) 276 if err != nil { 277 return nil, err 278 } 279 if opts.VirtualOwner { 280 i.virtualOwner.enabled = true 281 i.virtualOwner.uid = atomicbitops.FromUint32(uint32(opts.UID)) 282 i.virtualOwner.gid = atomicbitops.FromUint32(uint32(opts.GID)) 283 i.virtualOwner.mode = atomicbitops.FromUint32(stat.Mode) 284 } 285 286 d := &kernfs.Dentry{} 287 d.Init(&fs.Filesystem, i) 288 289 // i.open will take a reference on d. 290 defer d.DecRef(ctx) 291 292 // For simplicity, fileDescription.offset is set to 0. Technically, we 293 // should only set to 0 on files that are not seekable (sockets, pipes, 294 // etc.), and use the offset from the host fd otherwise when importing. 295 return i.open(ctx, d, mnt, fileType, flags) 296 } 297 298 // filesystemType implements vfs.FilesystemType. 299 // 300 // +stateify savable 301 type filesystemType struct{} 302 303 // GetFilesystem implements vfs.FilesystemType.GetFilesystem. 304 func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { 305 panic("host.filesystemType.GetFilesystem should never be called") 306 } 307 308 // Name implements vfs.FilesystemType.Name. 309 func (filesystemType) Name() string { 310 return "none" 311 } 312 313 // Release implements vfs.FilesystemType.Release. 314 func (filesystemType) Release(ctx context.Context) {} 315 316 // NewFilesystem sets up and returns a new hostfs filesystem. 317 // 318 // Note that there should only ever be one instance of host.filesystem, 319 // a global mount for host fds. 320 func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { 321 devMinor, err := vfsObj.GetAnonBlockDevMinor() 322 if err != nil { 323 return nil, err 324 } 325 fs := &filesystem{ 326 devMinor: devMinor, 327 } 328 fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) 329 return fs.VFSFilesystem(), nil 330 } 331 332 // filesystem implements vfs.FilesystemImpl. 333 // 334 // +stateify savable 335 type filesystem struct { 336 kernfs.Filesystem 337 338 devMinor uint32 339 } 340 341 func (fs *filesystem) Release(ctx context.Context) { 342 fs.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) 343 fs.Filesystem.Release(ctx) 344 } 345 346 func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { 347 d := vd.Dentry().Impl().(*kernfs.Dentry) 348 inode := d.Inode().(*inode) 349 b.PrependComponent(fmt.Sprintf("host:[%d]", inode.ino)) 350 return vfs.PrependPathSyntheticError{} 351 } 352 353 // MountOptions implements vfs.FilesystemImpl.MountOptions. 354 func (fs *filesystem) MountOptions() string { 355 return "" 356 } 357 358 // CheckPermissions implements kernfs.Inode.CheckPermissions. 359 func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { 360 var s unix.Stat_t 361 if err := i.stat(&s); err != nil { 362 return err 363 } 364 return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid)) 365 } 366 367 // Mode implements kernfs.Inode.Mode. 368 func (i *inode) Mode() linux.FileMode { 369 var s unix.Stat_t 370 if err := i.stat(&s); err != nil { 371 // Retrieving the mode from the host fd using fstat(2) should not fail. 372 // If the syscall does not succeed, something is fundamentally wrong. 373 panic(fmt.Sprintf("failed to retrieve mode from host fd %d: %v", i.hostFD, err)) 374 } 375 return linux.FileMode(s.Mode) 376 } 377 378 // Mode implements kernfs.Inode.UID 379 func (i *inode) UID() auth.KUID { 380 return auth.KUID(i.virtualOwner.uid.Load()) 381 } 382 383 // Mode implements kernfs.Inode.GID 384 func (i *inode) GID() auth.KGID { 385 return auth.KGID(i.virtualOwner.gid.Load()) 386 } 387 388 // Stat implements kernfs.Inode.Stat. 389 func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { 390 if opts.Mask&linux.STATX__RESERVED != 0 { 391 return linux.Statx{}, linuxerr.EINVAL 392 } 393 if opts.Sync&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE { 394 return linux.Statx{}, linuxerr.EINVAL 395 } 396 397 fs := vfsfs.Impl().(*filesystem) 398 399 // Limit our host call only to known flags. 400 mask := opts.Mask & linux.STATX_ALL 401 var s unix.Statx_t 402 err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(mask), &s) 403 if linuxerr.Equals(linuxerr.ENOSYS, err) { 404 // Fallback to fstat(2), if statx(2) is not supported on the host. 405 // 406 // TODO(b/151263641): Remove fallback. 407 return i.statxFromStat(fs) 408 } 409 if err != nil { 410 return linux.Statx{}, err 411 } 412 413 // Unconditionally fill blksize, attributes, and device numbers, as 414 // indicated by /include/uapi/linux/stat.h. Inode number is always 415 // available, since we use our own rather than the host's. 416 ls := linux.Statx{ 417 Mask: linux.STATX_INO, 418 Blksize: s.Blksize, 419 Attributes: s.Attributes, 420 Ino: i.ino, 421 AttributesMask: s.Attributes_mask, 422 DevMajor: linux.UNNAMED_MAJOR, 423 DevMinor: fs.devMinor, 424 } 425 426 // Copy other fields that were returned by the host. RdevMajor/RdevMinor 427 // are never copied (and therefore left as zero), so as not to expose host 428 // device numbers. 429 ls.Mask |= s.Mask & linux.STATX_ALL 430 if s.Mask&linux.STATX_TYPE != 0 { 431 if i.virtualOwner.enabled { 432 ls.Mode |= uint16(i.virtualOwner.atomicMode()) & linux.S_IFMT 433 } else { 434 ls.Mode |= s.Mode & linux.S_IFMT 435 } 436 } 437 if s.Mask&linux.STATX_MODE != 0 { 438 if i.virtualOwner.enabled { 439 ls.Mode |= uint16(i.virtualOwner.atomicMode()) &^ linux.S_IFMT 440 } else { 441 ls.Mode |= s.Mode &^ linux.S_IFMT 442 } 443 } 444 if s.Mask&linux.STATX_NLINK != 0 { 445 ls.Nlink = s.Nlink 446 } 447 if s.Mask&linux.STATX_UID != 0 { 448 if i.virtualOwner.enabled { 449 ls.UID = i.virtualOwner.atomicUID() 450 } else { 451 ls.UID = s.Uid 452 } 453 } 454 if s.Mask&linux.STATX_GID != 0 { 455 if i.virtualOwner.enabled { 456 ls.GID = i.virtualOwner.atomicGID() 457 } else { 458 ls.GID = s.Gid 459 } 460 } 461 if s.Mask&linux.STATX_ATIME != 0 { 462 ls.Atime = unixToLinuxStatxTimestamp(s.Atime) 463 } 464 if s.Mask&linux.STATX_BTIME != 0 { 465 ls.Btime = unixToLinuxStatxTimestamp(s.Btime) 466 } 467 if s.Mask&linux.STATX_CTIME != 0 { 468 ls.Ctime = unixToLinuxStatxTimestamp(s.Ctime) 469 } 470 if s.Mask&linux.STATX_MTIME != 0 { 471 ls.Mtime = unixToLinuxStatxTimestamp(s.Mtime) 472 } 473 if s.Mask&linux.STATX_SIZE != 0 { 474 ls.Size = s.Size 475 } 476 if s.Mask&linux.STATX_BLOCKS != 0 { 477 ls.Blocks = s.Blocks 478 } 479 480 return ls, nil 481 } 482 483 // statxFromStat is a best-effort fallback for inode.Stat() if the host does not 484 // support statx(2). 485 // 486 // We ignore the mask and sync flags in opts and simply supply 487 // STATX_BASIC_STATS, as fstat(2) itself does not allow the specification 488 // of a mask or sync flags. fstat(2) does not provide any metadata 489 // equivalent to Statx.Attributes, Statx.AttributesMask, or Statx.Btime, so 490 // those fields remain empty. 491 func (i *inode) statxFromStat(fs *filesystem) (linux.Statx, error) { 492 var s unix.Stat_t 493 if err := i.stat(&s); err != nil { 494 return linux.Statx{}, err 495 } 496 497 // As with inode.Stat(), we always use internal device and inode numbers, 498 // and never expose the host's represented device numbers. 499 return linux.Statx{ 500 Mask: linux.STATX_BASIC_STATS, 501 Blksize: uint32(s.Blksize), 502 Nlink: uint32(s.Nlink), 503 UID: s.Uid, 504 GID: s.Gid, 505 Mode: uint16(s.Mode), 506 Ino: i.ino, 507 Size: uint64(s.Size), 508 Blocks: uint64(s.Blocks), 509 Atime: timespecToStatxTimestamp(s.Atim), 510 Ctime: timespecToStatxTimestamp(s.Ctim), 511 Mtime: timespecToStatxTimestamp(s.Mtim), 512 DevMajor: linux.UNNAMED_MAJOR, 513 DevMinor: fs.devMinor, 514 }, nil 515 } 516 517 func (i *inode) stat(stat *unix.Stat_t) error { 518 if err := unix.Fstat(i.hostFD, stat); err != nil { 519 return err 520 } 521 if i.virtualOwner.enabled { 522 stat.Uid = i.virtualOwner.atomicUID() 523 stat.Gid = i.virtualOwner.atomicGID() 524 stat.Mode = i.virtualOwner.atomicMode() 525 } 526 return nil 527 } 528 529 // SetStat implements kernfs.Inode.SetStat. 530 // 531 // +checklocksignore 532 func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { 533 if i.readonly { 534 return linuxerr.EPERM 535 } 536 537 s := &opts.Stat 538 539 m := s.Mask 540 if m == 0 { 541 return nil 542 } 543 supportedModes := uint32(linux.STATX_MODE | linux.STATX_SIZE | linux.STATX_ATIME | linux.STATX_MTIME) 544 if i.virtualOwner.enabled { 545 if m&virtualOwnerModes != 0 { 546 // Take lock if any of the virtual owner fields will be updated. 547 i.virtualOwner.mu.Lock() 548 defer i.virtualOwner.mu.Unlock() 549 } 550 551 supportedModes |= virtualOwnerModes 552 } 553 if m&^supportedModes != 0 { 554 return linuxerr.EPERM 555 } 556 557 var hostStat unix.Stat_t 558 if err := i.stat(&hostStat); err != nil { 559 return err 560 } 561 if err := vfs.CheckSetStat(ctx, creds, &opts, linux.FileMode(hostStat.Mode), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil { 562 return err 563 } 564 565 if m&linux.STATX_MODE != 0 { 566 if i.virtualOwner.enabled { 567 // We hold i.virtualOwner.mu. 568 i.virtualOwner.mode = atomicbitops.FromUint32(uint32(opts.Stat.Mode)) 569 } else { 570 log.Warningf("sentry seccomp filters don't allow making fchmod(2) syscall") 571 return unix.EPERM 572 } 573 } 574 if m&linux.STATX_SIZE != 0 { 575 if hostStat.Mode&linux.S_IFMT != linux.S_IFREG { 576 return linuxerr.EINVAL 577 } 578 if err := unix.Ftruncate(i.hostFD, int64(s.Size)); err != nil { 579 return err 580 } 581 oldSize := uint64(hostStat.Size) 582 if s.Size < oldSize { 583 oldpgend, _ := hostarch.PageRoundUp(oldSize) 584 newpgend, _ := hostarch.PageRoundUp(s.Size) 585 if oldpgend != newpgend { 586 i.CachedMappable.InvalidateRange(memmap.MappableRange{newpgend, oldpgend}) 587 } 588 } 589 } 590 if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 { 591 ts := [2]unix.Timespec{ 592 toTimespec(s.Atime, m&linux.STATX_ATIME == 0), 593 toTimespec(s.Mtime, m&linux.STATX_MTIME == 0), 594 } 595 if err := setTimestamps(i.hostFD, &ts); err != nil { 596 return err 597 } 598 } 599 if i.virtualOwner.enabled { 600 if m&linux.STATX_UID != 0 { 601 // We hold i.virtualOwner.mu. 602 i.virtualOwner.uid = atomicbitops.FromUint32(opts.Stat.UID) 603 } 604 if m&linux.STATX_GID != 0 { 605 // We hold i.virtualOwner.mu. 606 i.virtualOwner.gid = atomicbitops.FromUint32(opts.Stat.GID) 607 } 608 } 609 return nil 610 } 611 612 // DecRef implements kernfs.Inode.DecRef. 613 func (i *inode) DecRef(ctx context.Context) { 614 i.inodeRefs.DecRef(func() { 615 if i.epollable { 616 fdnotifier.RemoveFD(int32(i.hostFD)) 617 } 618 if err := unix.Close(i.hostFD); err != nil { 619 log.Warningf("failed to close host fd %d: %v", i.hostFD, err) 620 } 621 // We can't rely on fdnotifier when closing the fd, because the event may race 622 // with fdnotifier.RemoveFD. Instead, notify the queue explicitly. 623 i.queue.Notify(waiter.EventHUp | waiter.ReadableEvents | waiter.WritableEvents) 624 }) 625 } 626 627 // Open implements kernfs.Inode.Open. 628 func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 629 // Once created, we cannot re-open a socket fd through /proc/[pid]/fd/. 630 if i.Mode().FileType() == linux.S_IFSOCK { 631 return nil, linuxerr.ENXIO 632 } 633 var stat unix.Stat_t 634 if err := i.stat(&stat); err != nil { 635 return nil, err 636 } 637 fileType := linux.FileMode(stat.Mode).FileType() 638 return i.open(ctx, d, rp.Mount(), fileType, opts.Flags) 639 } 640 641 func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, fileType linux.FileMode, flags uint32) (*vfs.FileDescription, error) { 642 // Constrain flags to a subset we can handle. 643 // 644 // TODO(gvisor.dev/issue/2601): Support O_NONBLOCK by adding RWF_NOWAIT to pread/pwrite calls. 645 flags &= unix.O_ACCMODE | unix.O_NONBLOCK | unix.O_DSYNC | unix.O_SYNC | unix.O_APPEND 646 647 switch fileType { 648 case unix.S_IFSOCK: 649 if i.isTTY { 650 log.Warningf("cannot use host socket fd %d as TTY", i.hostFD) 651 return nil, linuxerr.ENOTTY 652 } 653 654 ep, err := newEndpoint(ctx, i.hostFD, &i.queue) 655 if err != nil { 656 return nil, err 657 } 658 // Currently, we only allow Unix sockets to be imported. 659 return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d.VFSDentry(), &i.locks) 660 661 case unix.S_IFREG, unix.S_IFIFO, unix.S_IFCHR: 662 if i.isTTY { 663 fd := &TTYFileDescription{ 664 fileDescription: fileDescription{inode: i}, 665 termios: linux.DefaultReplicaTermios, 666 } 667 if task := kernel.TaskFromContext(ctx); task != nil { 668 fd.fgProcessGroup = task.ThreadGroup().ProcessGroup() 669 fd.session = fd.fgProcessGroup.Session() 670 } 671 fd.LockFD.Init(&i.locks) 672 vfsfd := &fd.vfsfd 673 if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { 674 return nil, err 675 } 676 return vfsfd, nil 677 } 678 679 fd := &fileDescription{inode: i} 680 fd.LockFD.Init(&i.locks) 681 vfsfd := &fd.vfsfd 682 if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { 683 return nil, err 684 } 685 return vfsfd, nil 686 687 default: 688 log.Warningf("cannot import host fd %d with file type %o", i.hostFD, fileType) 689 return nil, linuxerr.EPERM 690 } 691 } 692 693 // Create a new host-backed endpoint from the given fd and its corresponding 694 // notification queue. 695 func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transport.Endpoint, error) { 696 // Set up an external transport.Endpoint using the host fd. 697 addr := fmt.Sprintf("hostfd:[%d]", hostFD) 698 e, err := transport.NewHostConnectedEndpoint(hostFD, addr) 699 if err != nil { 700 return nil, err.ToError() 701 } 702 ep := transport.NewExternal(e.SockType(), uniqueid.GlobalProviderFromContext(ctx), queue, e, e) 703 return ep, nil 704 } 705 706 // fileDescription is embedded by host fd implementations of FileDescriptionImpl. 707 // 708 // +stateify savable 709 type fileDescription struct { 710 vfsfd vfs.FileDescription 711 vfs.FileDescriptionDefaultImpl 712 vfs.LockFD 713 714 // inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but 715 // cached to reduce indirections and casting. fileDescription does not hold 716 // a reference on the inode through the inode field (since one is already 717 // held via the Dentry). 718 // 719 // inode is immutable after fileDescription creation. 720 inode *inode 721 722 // offsetMu protects offset. 723 offsetMu sync.Mutex `state:"nosave"` 724 725 // offset specifies the current file offset. It is only meaningful when 726 // inode.seekable is true. 727 offset int64 728 } 729 730 // SetStat implements vfs.FileDescriptionImpl.SetStat. 731 func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { 732 creds := auth.CredentialsFromContext(ctx) 733 return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts) 734 } 735 736 // Stat implements vfs.FileDescriptionImpl.Stat. 737 func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 738 return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts) 739 } 740 741 // Release implements vfs.FileDescriptionImpl.Release. 742 func (f *fileDescription) Release(context.Context) { 743 // noop 744 } 745 746 // Allocate implements vfs.FileDescriptionImpl.Allocate. 747 func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error { 748 if f.inode.readonly { 749 return linuxerr.EPERM 750 } 751 return unix.Fallocate(f.inode.hostFD, uint32(mode), int64(offset), int64(length)) 752 } 753 754 // PRead implements vfs.FileDescriptionImpl.PRead. 755 func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { 756 // Check that flags are supported. 757 // 758 // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. 759 if opts.Flags&^linux.RWF_HIPRI != 0 { 760 return 0, linuxerr.EOPNOTSUPP 761 } 762 763 i := f.inode 764 if !i.seekable { 765 return 0, linuxerr.ESPIPE 766 } 767 768 return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags) 769 } 770 771 // Read implements vfs.FileDescriptionImpl.Read. 772 func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { 773 // Check that flags are supported. 774 // 775 // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. 776 if opts.Flags&^linux.RWF_HIPRI != 0 { 777 return 0, linuxerr.EOPNOTSUPP 778 } 779 780 i := f.inode 781 if !i.seekable { 782 bufN, err := i.readFromBuf(ctx, &dst) 783 if err != nil { 784 return bufN, err 785 } 786 n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags) 787 total := bufN + n 788 if isBlockError(err) { 789 // If we got any data at all, return it as a "completed" partial read 790 // rather than retrying until complete. 791 if total != 0 { 792 err = nil 793 } else { 794 err = linuxerr.ErrWouldBlock 795 } 796 } 797 return total, err 798 } 799 800 f.offsetMu.Lock() 801 n, err := readFromHostFD(ctx, i.hostFD, dst, f.offset, opts.Flags) 802 f.offset += n 803 f.offsetMu.Unlock() 804 return n, err 805 } 806 807 func (i *inode) readFromBuf(ctx context.Context, dst *usermem.IOSequence) (int64, error) { 808 if i.haveBuf.Load() == 0 { 809 return 0, nil 810 } 811 i.bufMu.Lock() 812 defer i.bufMu.Unlock() 813 if len(i.buf) == 0 { 814 return 0, nil 815 } 816 n, err := dst.CopyOut(ctx, i.buf) 817 *dst = dst.DropFirst(n) 818 i.buf = i.buf[n:] 819 if len(i.buf) == 0 { 820 i.haveBuf.Store(0) 821 i.buf = nil 822 } 823 return int64(n), err 824 } 825 826 func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) { 827 reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) 828 n, err := dst.CopyOutFrom(ctx, reader) 829 hostfd.PutReadWriterAt(reader) 830 return int64(n), err 831 } 832 833 // PWrite implements vfs.FileDescriptionImpl.PWrite. 834 func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { 835 if !f.inode.seekable { 836 return 0, linuxerr.ESPIPE 837 } 838 839 return f.writeToHostFD(ctx, src, offset, opts.Flags) 840 } 841 842 // Write implements vfs.FileDescriptionImpl.Write. 843 func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { 844 i := f.inode 845 if !i.seekable { 846 n, err := f.writeToHostFD(ctx, src, -1, opts.Flags) 847 if isBlockError(err) { 848 err = linuxerr.ErrWouldBlock 849 } 850 return n, err 851 } 852 853 f.offsetMu.Lock() 854 // NOTE(gvisor.dev/issue/2983): O_APPEND may cause memory corruption if 855 // another process modifies the host file between retrieving the file size 856 // and writing to the host fd. This is an unavoidable race condition because 857 // we cannot enforce synchronization on the host. 858 if f.vfsfd.StatusFlags()&linux.O_APPEND != 0 { 859 var s unix.Stat_t 860 if err := unix.Fstat(i.hostFD, &s); err != nil { 861 f.offsetMu.Unlock() 862 return 0, err 863 } 864 f.offset = s.Size 865 } 866 n, err := f.writeToHostFD(ctx, src, f.offset, opts.Flags) 867 f.offset += n 868 f.offsetMu.Unlock() 869 return n, err 870 } 871 872 func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSequence, offset int64, flags uint32) (int64, error) { 873 if f.inode.readonly { 874 return 0, linuxerr.EPERM 875 } 876 hostFD := f.inode.hostFD 877 // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. 878 if flags != 0 { 879 return 0, linuxerr.EOPNOTSUPP 880 } 881 writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) 882 n, err := src.CopyInTo(ctx, writer) 883 hostfd.PutReadWriterAt(writer) 884 // NOTE(gvisor.dev/issue/2979): We always sync everything, even for O_DSYNC. 885 if n > 0 && f.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { 886 if syncErr := unix.Fsync(hostFD); syncErr != nil { 887 return int64(n), syncErr 888 } 889 } 890 return int64(n), err 891 } 892 893 // Seek implements vfs.FileDescriptionImpl.Seek. 894 // 895 // Note that we do not support seeking on directories, since we do not even 896 // allow directory fds to be imported at all. 897 func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (int64, error) { 898 i := f.inode 899 if !i.seekable { 900 return 0, linuxerr.ESPIPE 901 } 902 903 f.offsetMu.Lock() 904 defer f.offsetMu.Unlock() 905 906 switch whence { 907 case linux.SEEK_SET: 908 if offset < 0 { 909 return f.offset, linuxerr.EINVAL 910 } 911 f.offset = offset 912 913 case linux.SEEK_CUR: 914 // Check for overflow. Note that underflow cannot occur, since f.offset >= 0. 915 if offset > math.MaxInt64-f.offset { 916 return f.offset, linuxerr.EOVERFLOW 917 } 918 if f.offset+offset < 0 { 919 return f.offset, linuxerr.EINVAL 920 } 921 f.offset += offset 922 923 case linux.SEEK_END: 924 var s unix.Stat_t 925 if err := unix.Fstat(i.hostFD, &s); err != nil { 926 return f.offset, err 927 } 928 size := s.Size 929 930 // Check for overflow. Note that underflow cannot occur, since size >= 0. 931 if offset > math.MaxInt64-size { 932 return f.offset, linuxerr.EOVERFLOW 933 } 934 if size+offset < 0 { 935 return f.offset, linuxerr.EINVAL 936 } 937 f.offset = size + offset 938 939 case linux.SEEK_DATA, linux.SEEK_HOLE: 940 // Modifying the offset in the host file table should not matter, since 941 // this is the only place where we use it. 942 // 943 // For reading and writing, we always rely on our internal offset. 944 n, err := unix.Seek(i.hostFD, offset, int(whence)) 945 if err != nil { 946 return f.offset, err 947 } 948 f.offset = n 949 950 default: 951 // Invalid whence. 952 return f.offset, linuxerr.EINVAL 953 } 954 955 return f.offset, nil 956 } 957 958 // Sync implements vfs.FileDescriptionImpl.Sync. 959 func (f *fileDescription) Sync(ctx context.Context) error { 960 if f.inode.readonly { 961 return linuxerr.EPERM 962 } 963 // TODO(gvisor.dev/issue/1897): Currently, we always sync everything. 964 return unix.Fsync(f.inode.hostFD) 965 } 966 967 // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. 968 func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error { 969 // NOTE(b/38213152): Technically, some obscure char devices can be memory 970 // mapped, but we only allow regular files. 971 if f.inode.ftype != unix.S_IFREG { 972 return linuxerr.ENODEV 973 } 974 i := f.inode 975 i.CachedMappable.InitFileMapperOnce() 976 return vfs.GenericConfigureMMap(&f.vfsfd, i, opts) 977 } 978 979 // EventRegister implements waiter.Waitable.EventRegister. 980 func (f *fileDescription) EventRegister(e *waiter.Entry) error { 981 f.inode.queue.EventRegister(e) 982 if f.inode.epollable { 983 if err := fdnotifier.UpdateFD(int32(f.inode.hostFD)); err != nil { 984 f.inode.queue.EventUnregister(e) 985 return err 986 } 987 } 988 return nil 989 } 990 991 // EventUnregister implements waiter.Waitable.EventUnregister. 992 func (f *fileDescription) EventUnregister(e *waiter.Entry) { 993 f.inode.queue.EventUnregister(e) 994 if f.inode.epollable { 995 if err := fdnotifier.UpdateFD(int32(f.inode.hostFD)); err != nil { 996 panic(fmt.Sprint("UpdateFD:", err)) 997 } 998 } 999 } 1000 1001 // Readiness uses the poll() syscall to check the status of the underlying FD. 1002 func (f *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { 1003 return fdnotifier.NonBlockingPoll(int32(f.inode.hostFD), mask) 1004 } 1005 1006 // Epollable implements FileDescriptionImpl.Epollable. 1007 func (f *fileDescription) Epollable() bool { 1008 return f.inode.epollable 1009 } 1010 1011 // Ioctl queries the underlying FD for allowed ioctl commands. 1012 func (f *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) { 1013 switch cmd := args[1].Int(); cmd { 1014 case linux.FIONREAD: 1015 v, err := ioctlFionread(f.inode.hostFD) 1016 if err != nil { 1017 return 0, err 1018 } 1019 1020 var buf [4]byte 1021 hostarch.ByteOrder.PutUint32(buf[:], v) 1022 _, err = uio.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{}) 1023 return 0, err 1024 } 1025 1026 return f.FileDescriptionDefaultImpl.Ioctl(ctx, uio, sysno, args) 1027 }