github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/kernel/fd_table.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 import ( 18 "fmt" 19 "math" 20 "strings" 21 "sync/atomic" 22 23 "golang.org/x/sys/unix" 24 "github.com/SagerNet/gvisor/pkg/abi/linux" 25 "github.com/SagerNet/gvisor/pkg/context" 26 "github.com/SagerNet/gvisor/pkg/errors/linuxerr" 27 "github.com/SagerNet/gvisor/pkg/sentry/fs" 28 "github.com/SagerNet/gvisor/pkg/sentry/fs/lock" 29 "github.com/SagerNet/gvisor/pkg/sentry/limits" 30 "github.com/SagerNet/gvisor/pkg/sentry/vfs" 31 "github.com/SagerNet/gvisor/pkg/sync" 32 ) 33 34 // FDFlags define flags for an individual descriptor. 35 // 36 // +stateify savable 37 type FDFlags struct { 38 // CloseOnExec indicates the descriptor should be closed on exec. 39 CloseOnExec bool 40 } 41 42 // ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags 43 // representation. 44 func (f FDFlags) ToLinuxFileFlags() (mask uint) { 45 if f.CloseOnExec { 46 mask |= linux.O_CLOEXEC 47 } 48 return 49 } 50 51 // ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags 52 // representation. 53 func (f FDFlags) ToLinuxFDFlags() (mask uint) { 54 if f.CloseOnExec { 55 mask |= linux.FD_CLOEXEC 56 } 57 return 58 } 59 60 // descriptor holds the details about a file descriptor, namely a pointer to 61 // the file itself and the descriptor flags. 62 // 63 // Note that this is immutable and can only be changed via operations on the 64 // descriptorTable. 65 // 66 // It contains both VFS1 and VFS2 file types, but only one of them can be set. 67 // 68 // +stateify savable 69 type descriptor struct { 70 // TODO(github.com/SagerNet/issue/1624): Remove fs.File. 71 file *fs.File 72 fileVFS2 *vfs.FileDescription 73 flags FDFlags 74 } 75 76 // FDTable is used to manage File references and flags. 77 // 78 // +stateify savable 79 type FDTable struct { 80 FDTableRefs 81 82 k *Kernel 83 84 // mu protects below. 85 mu sync.Mutex `state:"nosave"` 86 87 // next is start position to find fd. 88 next int32 89 90 // used contains the number of non-nil entries. It must be accessed 91 // atomically. It may be read atomically without holding mu (but not 92 // written). 93 used int32 94 95 // descriptorTable holds descriptors. 96 descriptorTable `state:".(map[int32]descriptor)"` 97 } 98 99 func (f *FDTable) saveDescriptorTable() map[int32]descriptor { 100 m := make(map[int32]descriptor) 101 f.forEach(context.Background(), func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { 102 m[fd] = descriptor{ 103 file: file, 104 fileVFS2: fileVFS2, 105 flags: flags, 106 } 107 }) 108 return m 109 } 110 111 func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { 112 ctx := context.Background() 113 f.initNoLeakCheck() // Initialize table. 114 f.used = 0 115 for fd, d := range m { 116 if file, fileVFS2 := f.setAll(ctx, fd, d.file, d.fileVFS2, d.flags); file != nil || fileVFS2 != nil { 117 panic("VFS1 or VFS2 files set") 118 } 119 120 // Note that we do _not_ need to acquire a extra table reference here. The 121 // table reference will already be accounted for in the file, so we drop the 122 // reference taken by set above. 123 switch { 124 case d.file != nil: 125 d.file.DecRef(ctx) 126 case d.fileVFS2 != nil: 127 d.fileVFS2.DecRef(ctx) 128 } 129 } 130 } 131 132 // drop drops the table reference. 133 func (f *FDTable) drop(ctx context.Context, file *fs.File) { 134 // Release locks. 135 file.Dirent.Inode.LockCtx.Posix.UnlockRegion(f, lock.LockRange{0, lock.LockEOF}) 136 137 // Send inotify events. 138 d := file.Dirent 139 var ev uint32 140 if fs.IsDir(d.Inode.StableAttr) { 141 ev |= linux.IN_ISDIR 142 } 143 if file.Flags().Write { 144 ev |= linux.IN_CLOSE_WRITE 145 } else { 146 ev |= linux.IN_CLOSE_NOWRITE 147 } 148 d.InotifyEvent(ev, 0) 149 150 // Drop the table reference. 151 file.DecRef(ctx) 152 } 153 154 // dropVFS2 drops the table reference. 155 func (f *FDTable) dropVFS2(ctx context.Context, file *vfs.FileDescription) { 156 // Release any POSIX lock possibly held by the FDTable. 157 if file.SupportsLocks() { 158 err := file.UnlockPOSIX(ctx, f, lock.LockRange{0, lock.LockEOF}) 159 if err != nil && !linuxerr.Equals(linuxerr.ENOLCK, err) { 160 panic(fmt.Sprintf("UnlockPOSIX failed: %v", err)) 161 } 162 } 163 164 // Drop the table's reference. 165 file.DecRef(ctx) 166 } 167 168 // NewFDTable allocates a new FDTable that may be used by tasks in k. 169 func (k *Kernel) NewFDTable() *FDTable { 170 f := &FDTable{k: k} 171 f.init() 172 return f 173 } 174 175 // DecRef implements RefCounter.DecRef. 176 // 177 // If f reaches zero references, all of its file descriptors are removed. 178 func (f *FDTable) DecRef(ctx context.Context) { 179 f.FDTableRefs.DecRef(func() { 180 f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool { 181 return true 182 }) 183 }) 184 } 185 186 // forEach iterates over all non-nil files in sorted order. 187 // 188 // It is the caller's responsibility to acquire an appropriate lock. 189 func (f *FDTable) forEach(ctx context.Context, fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { 190 // retries tracks the number of failed TryIncRef attempts for the same FD. 191 retries := 0 192 fd := int32(0) 193 for { 194 file, fileVFS2, flags, ok := f.getAll(fd) 195 if !ok { 196 break 197 } 198 switch { 199 case file != nil: 200 if !file.TryIncRef() { 201 retries++ 202 if retries > 1000 { 203 panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, FileOps: %+v", fd, file, file.FileOperations)) 204 } 205 continue // Race caught. 206 } 207 fn(fd, file, nil, flags) 208 file.DecRef(ctx) 209 case fileVFS2 != nil: 210 if !fileVFS2.TryIncRef() { 211 retries++ 212 if retries > 1000 { 213 panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, Impl: %+v", fd, fileVFS2, fileVFS2.Impl())) 214 } 215 continue // Race caught. 216 } 217 fn(fd, nil, fileVFS2, flags) 218 fileVFS2.DecRef(ctx) 219 } 220 retries = 0 221 fd++ 222 } 223 } 224 225 // String is a stringer for FDTable. 226 func (f *FDTable) String() string { 227 var buf strings.Builder 228 ctx := context.Background() 229 f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { 230 switch { 231 case file != nil: 232 n, _ := file.Dirent.FullName(nil /* root */) 233 fmt.Fprintf(&buf, "\tfd:%d => name %s\n", fd, n) 234 235 case fileVFS2 != nil: 236 vfsObj := fileVFS2.Mount().Filesystem().VirtualFilesystem() 237 vd := fileVFS2.VirtualDentry() 238 if vd.Dentry() == nil { 239 panic(fmt.Sprintf("fd %d (type %T) has nil dentry: %#v", fd, fileVFS2.Impl(), fileVFS2)) 240 } 241 name, err := vfsObj.PathnameWithDeleted(ctx, vfs.VirtualDentry{}, fileVFS2.VirtualDentry()) 242 if err != nil { 243 fmt.Fprintf(&buf, "<err: %v>\n", err) 244 return 245 } 246 fmt.Fprintf(&buf, "\tfd:%d => name %s\n", fd, name) 247 } 248 }) 249 return buf.String() 250 } 251 252 // NewFDs allocates new FDs guaranteed to be the lowest number available 253 // greater than or equal to the fd parameter. All files will share the set 254 // flags. Success is guaranteed to be all or none. 255 func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags FDFlags) (fds []int32, err error) { 256 if fd < 0 { 257 // Don't accept negative FDs. 258 return nil, unix.EINVAL 259 } 260 261 // Default limit. 262 end := int32(math.MaxInt32) 263 264 // Ensure we don't get past the provided limit. 265 if limitSet := limits.FromContext(ctx); limitSet != nil { 266 lim := limitSet.Get(limits.NumberOfFiles) 267 if lim.Cur != limits.Infinity { 268 end = int32(lim.Cur) 269 } 270 if fd >= end { 271 return nil, unix.EMFILE 272 } 273 } 274 275 f.mu.Lock() 276 277 // From f.next to find available fd. 278 if fd < f.next { 279 fd = f.next 280 } 281 282 // Install all entries. 283 for i := fd; i < end && len(fds) < len(files); i++ { 284 if d, _, _ := f.get(i); d == nil { 285 // Set the descriptor. 286 f.set(ctx, i, files[len(fds)], flags) 287 fds = append(fds, i) // Record the file descriptor. 288 } 289 } 290 291 // Failure? Unwind existing FDs. 292 if len(fds) < len(files) { 293 for _, i := range fds { 294 f.set(ctx, i, nil, FDFlags{}) 295 } 296 f.mu.Unlock() 297 298 // Drop the reference taken by the call to f.set() that 299 // originally installed the file. Don't call f.drop() 300 // (generating inotify events, etc.) since the file should 301 // appear to have never been inserted into f. 302 for _, file := range files[:len(fds)] { 303 file.DecRef(ctx) 304 } 305 return nil, unix.EMFILE 306 } 307 308 if fd == f.next { 309 // Update next search start position. 310 f.next = fds[len(fds)-1] + 1 311 } 312 313 f.mu.Unlock() 314 return fds, nil 315 } 316 317 // NewFDsVFS2 allocates new FDs guaranteed to be the lowest number available 318 // greater than or equal to the fd parameter. All files will share the set 319 // flags. Success is guaranteed to be all or none. 320 func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDescription, flags FDFlags) (fds []int32, err error) { 321 if fd < 0 { 322 // Don't accept negative FDs. 323 return nil, unix.EINVAL 324 } 325 326 // Default limit. 327 end := int32(math.MaxInt32) 328 329 // Ensure we don't get past the provided limit. 330 if limitSet := limits.FromContext(ctx); limitSet != nil { 331 lim := limitSet.Get(limits.NumberOfFiles) 332 if lim.Cur != limits.Infinity { 333 end = int32(lim.Cur) 334 } 335 if fd >= end { 336 return nil, unix.EMFILE 337 } 338 } 339 340 f.mu.Lock() 341 342 // From f.next to find available fd. 343 if fd < f.next { 344 fd = f.next 345 } 346 347 // Install all entries. 348 for i := fd; i < end && len(fds) < len(files); i++ { 349 if d, _, _ := f.getVFS2(i); d == nil { 350 // Set the descriptor. 351 f.setVFS2(ctx, i, files[len(fds)], flags) 352 fds = append(fds, i) // Record the file descriptor. 353 } 354 } 355 356 // Failure? Unwind existing FDs. 357 if len(fds) < len(files) { 358 for _, i := range fds { 359 f.setVFS2(ctx, i, nil, FDFlags{}) 360 } 361 f.mu.Unlock() 362 363 // Drop the reference taken by the call to f.setVFS2() that 364 // originally installed the file. Don't call f.dropVFS2() 365 // (generating inotify events, etc.) since the file should 366 // appear to have never been inserted into f. 367 for _, file := range files[:len(fds)] { 368 file.DecRef(ctx) 369 } 370 return nil, unix.EMFILE 371 } 372 373 if fd == f.next { 374 // Update next search start position. 375 f.next = fds[len(fds)-1] + 1 376 } 377 378 f.mu.Unlock() 379 return fds, nil 380 } 381 382 // NewFDVFS2 allocates a file descriptor greater than or equal to minfd for 383 // the given file description. If it succeeds, it takes a reference on file. 384 func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { 385 if minfd < 0 { 386 // Don't accept negative FDs. 387 return -1, unix.EINVAL 388 } 389 390 // Default limit. 391 end := int32(math.MaxInt32) 392 393 // Ensure we don't get past the provided limit. 394 if limitSet := limits.FromContext(ctx); limitSet != nil { 395 lim := limitSet.Get(limits.NumberOfFiles) 396 if lim.Cur != limits.Infinity { 397 end = int32(lim.Cur) 398 } 399 if minfd >= end { 400 return -1, unix.EMFILE 401 } 402 } 403 404 f.mu.Lock() 405 defer f.mu.Unlock() 406 407 // From f.next to find available fd. 408 fd := minfd 409 if fd < f.next { 410 fd = f.next 411 } 412 for fd < end { 413 if d, _, _ := f.getVFS2(fd); d == nil { 414 f.setVFS2(ctx, fd, file, flags) 415 if fd == f.next { 416 // Update next search start position. 417 f.next = fd + 1 418 } 419 return fd, nil 420 } 421 fd++ 422 } 423 return -1, unix.EMFILE 424 } 425 426 // NewFDAt sets the file reference for the given FD. If there is an active 427 // reference for that FD, the ref count for that existing reference is 428 // decremented. 429 func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error { 430 df, _, err := f.newFDAt(ctx, fd, file, nil, flags) 431 if err != nil { 432 return err 433 } 434 if df != nil { 435 f.drop(ctx, df) 436 } 437 return nil 438 } 439 440 // NewFDAtVFS2 sets the file reference for the given FD. If there is an active 441 // reference for that FD, the ref count for that existing reference is 442 // decremented. 443 func (f *FDTable) NewFDAtVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) error { 444 _, dfVFS2, err := f.newFDAt(ctx, fd, nil, file, flags) 445 if err != nil { 446 return err 447 } 448 if dfVFS2 != nil { 449 f.dropVFS2(ctx, dfVFS2) 450 } 451 return nil 452 } 453 454 func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) (*fs.File, *vfs.FileDescription, error) { 455 if fd < 0 { 456 // Don't accept negative FDs. 457 return nil, nil, unix.EBADF 458 } 459 460 // Check the limit for the provided file. 461 if limitSet := limits.FromContext(ctx); limitSet != nil { 462 if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur { 463 return nil, nil, unix.EMFILE 464 } 465 } 466 467 // Install the entry. 468 f.mu.Lock() 469 defer f.mu.Unlock() 470 471 df, dfVFS2 := f.setAll(ctx, fd, file, fileVFS2, flags) 472 return df, dfVFS2, nil 473 } 474 475 // SetFlags sets the flags for the given file descriptor. 476 // 477 // True is returned iff flags were changed. 478 func (f *FDTable) SetFlags(ctx context.Context, fd int32, flags FDFlags) error { 479 if fd < 0 { 480 // Don't accept negative FDs. 481 return unix.EBADF 482 } 483 484 f.mu.Lock() 485 defer f.mu.Unlock() 486 487 file, _, _ := f.get(fd) 488 if file == nil { 489 // No file found. 490 return unix.EBADF 491 } 492 493 // Update the flags. 494 f.set(ctx, fd, file, flags) 495 return nil 496 } 497 498 // SetFlagsVFS2 sets the flags for the given file descriptor. 499 // 500 // True is returned iff flags were changed. 501 func (f *FDTable) SetFlagsVFS2(ctx context.Context, fd int32, flags FDFlags) error { 502 if fd < 0 { 503 // Don't accept negative FDs. 504 return unix.EBADF 505 } 506 507 f.mu.Lock() 508 defer f.mu.Unlock() 509 510 file, _, _ := f.getVFS2(fd) 511 if file == nil { 512 // No file found. 513 return unix.EBADF 514 } 515 516 // Update the flags. 517 f.setVFS2(ctx, fd, file, flags) 518 return nil 519 } 520 521 // Get returns a reference to the file and the flags for the FD or nil if no 522 // file is defined for the given fd. 523 // 524 // N.B. Callers are required to use DecRef when they are done. 525 // 526 //go:nosplit 527 func (f *FDTable) Get(fd int32) (*fs.File, FDFlags) { 528 if fd < 0 { 529 return nil, FDFlags{} 530 } 531 532 for { 533 file, flags, _ := f.get(fd) 534 if file != nil { 535 if !file.TryIncRef() { 536 continue // Race caught. 537 } 538 // Reference acquired. 539 return file, flags 540 } 541 // No file available. 542 return nil, FDFlags{} 543 } 544 } 545 546 // GetVFS2 returns a reference to the file and the flags for the FD or nil if no 547 // file is defined for the given fd. 548 // 549 // N.B. Callers are required to use DecRef when they are done. 550 // 551 //go:nosplit 552 func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) { 553 if fd < 0 { 554 return nil, FDFlags{} 555 } 556 557 for { 558 file, flags, _ := f.getVFS2(fd) 559 if file != nil { 560 if !file.TryIncRef() { 561 continue // Race caught. 562 } 563 // Reference acquired. 564 return file, flags 565 } 566 // No file available. 567 return nil, FDFlags{} 568 } 569 } 570 571 // GetFDs returns a sorted list of valid fds. 572 // 573 // Precondition: The caller must be running on the task goroutine, or Task.mu 574 // must be locked. 575 func (f *FDTable) GetFDs(ctx context.Context) []int32 { 576 fds := make([]int32, 0, int(atomic.LoadInt32(&f.used))) 577 f.forEach(ctx, func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { 578 fds = append(fds, fd) 579 }) 580 return fds 581 } 582 583 // Fork returns an independent FDTable. 584 func (f *FDTable) Fork(ctx context.Context) *FDTable { 585 clone := f.k.NewFDTable() 586 587 f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { 588 // The set function here will acquire an appropriate table 589 // reference for the clone. We don't need anything else. 590 if df, dfVFS2 := clone.setAll(ctx, fd, file, fileVFS2, flags); df != nil || dfVFS2 != nil { 591 panic("VFS1 or VFS2 files set") 592 } 593 }) 594 return clone 595 } 596 597 // Remove removes an FD from and returns a non-file iff successful. 598 // 599 // N.B. Callers are required to use DecRef when they are done. 600 func (f *FDTable) Remove(ctx context.Context, fd int32) (*fs.File, *vfs.FileDescription) { 601 if fd < 0 { 602 return nil, nil 603 } 604 605 f.mu.Lock() 606 607 // Update current available position. 608 if fd < f.next { 609 f.next = fd 610 } 611 612 orig, orig2, _, _ := f.getAll(fd) 613 614 // Add reference for caller. 615 switch { 616 case orig != nil: 617 orig.IncRef() 618 case orig2 != nil: 619 orig2.IncRef() 620 } 621 622 if orig != nil || orig2 != nil { 623 orig, orig2 = f.setAll(ctx, fd, nil, nil, FDFlags{}) // Zap entry. 624 } 625 f.mu.Unlock() 626 627 if orig != nil { 628 f.drop(ctx, orig) 629 } 630 if orig2 != nil { 631 f.dropVFS2(ctx, orig2) 632 } 633 634 return orig, orig2 635 } 636 637 // RemoveIf removes all FDs where cond is true. 638 func (f *FDTable) RemoveIf(ctx context.Context, cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) { 639 // TODO(github.com/SagerNet/issue/1624): Remove fs.File slice. 640 var files []*fs.File 641 var filesVFS2 []*vfs.FileDescription 642 643 f.mu.Lock() 644 f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { 645 if cond(file, fileVFS2, flags) { 646 df, dfVFS2 := f.setAll(ctx, fd, nil, nil, FDFlags{}) // Clear from table. 647 if df != nil { 648 files = append(files, df) 649 } 650 if dfVFS2 != nil { 651 filesVFS2 = append(filesVFS2, dfVFS2) 652 } 653 // Update current available position. 654 if fd < f.next { 655 f.next = fd 656 } 657 } 658 }) 659 f.mu.Unlock() 660 661 for _, file := range files { 662 f.drop(ctx, file) 663 } 664 665 for _, file := range filesVFS2 { 666 f.dropVFS2(ctx, file) 667 } 668 }