gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/kernel/fd_table.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernel 16 17 import ( 18 goContext "context" 19 "fmt" 20 "math" 21 "strings" 22 23 "golang.org/x/sys/unix" 24 "gvisor.dev/gvisor/pkg/abi/linux" 25 "gvisor.dev/gvisor/pkg/bitmap" 26 "gvisor.dev/gvisor/pkg/context" 27 "gvisor.dev/gvisor/pkg/errors/linuxerr" 28 "gvisor.dev/gvisor/pkg/sentry/fsimpl/lock" 29 "gvisor.dev/gvisor/pkg/sentry/limits" 30 "gvisor.dev/gvisor/pkg/sentry/vfs" 31 ) 32 33 // FDFlags define flags for an individual descriptor. 34 // 35 // +stateify savable 36 type FDFlags struct { 37 // CloseOnExec indicates the descriptor should be closed on exec. 38 CloseOnExec bool 39 } 40 41 // ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags 42 // representation. 43 func (f FDFlags) ToLinuxFileFlags() (mask uint) { 44 if f.CloseOnExec { 45 mask |= linux.O_CLOEXEC 46 } 47 return 48 } 49 50 // ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags 51 // representation. 52 func (f FDFlags) ToLinuxFDFlags() (mask uint) { 53 if f.CloseOnExec { 54 mask |= linux.FD_CLOEXEC 55 } 56 return 57 } 58 59 // descriptor holds the details about a file descriptor, namely a pointer to 60 // the file itself and the descriptor flags. 61 // 62 // Note that this is immutable and can only be changed via operations on the 63 // descriptorTable. 64 // 65 // +stateify savable 66 type descriptor struct { 67 file *vfs.FileDescription 68 flags FDFlags 69 } 70 71 // MaxFdLimit defines the upper limit on the integer value of file descriptors. 72 const MaxFdLimit int32 = int32(bitmap.MaxBitEntryLimit) 73 74 // FDTable is used to manage File references and flags. 75 // 76 // +stateify savable 77 type FDTable struct { 78 FDTableRefs 79 80 k *Kernel 81 82 // mu protects below. 83 mu fdTableMutex `state:"nosave"` 84 85 // fdBitmap shows which fds are already in use. 86 fdBitmap bitmap.Bitmap `state:"nosave"` 87 88 // descriptorTable holds descriptors. 89 descriptorTable `state:".(map[int32]descriptor)"` 90 } 91 92 func (f *FDTable) saveDescriptorTable() map[int32]descriptor { 93 m := make(map[int32]descriptor) 94 f.mu.Lock() 95 defer f.mu.Unlock() 96 f.forEach(context.Background(), func(fd int32, file *vfs.FileDescription, flags FDFlags) { 97 m[fd] = descriptor{ 98 file: file, 99 flags: flags, 100 } 101 }) 102 return m 103 } 104 105 func (f *FDTable) loadDescriptorTable(_ goContext.Context, m map[int32]descriptor) { 106 ctx := context.Background() 107 f.initNoLeakCheck() // Initialize table. 108 f.fdBitmap = bitmap.New(uint32(math.MaxUint16)) 109 for fd, d := range m { 110 if fd < 0 { 111 panic(fmt.Sprintf("FD is not supposed to be negative. FD: %d", fd)) 112 } 113 114 if df := f.set(fd, d.file, d.flags); df != nil { 115 panic("file set") 116 } 117 f.fdBitmap.Add(uint32(fd)) 118 // Note that we do _not_ need to acquire a extra table reference here. The 119 // table reference will already be accounted for in the file, so we drop the 120 // reference taken by set above. 121 if d.file != nil { 122 d.file.DecRef(ctx) 123 } 124 } 125 } 126 127 // Release any POSIX lock possibly held by the FDTable. 128 func (f *FDTable) fileUnlock(ctx context.Context, file *vfs.FileDescription) { 129 if file.SupportsLocks() { 130 err := file.UnlockPOSIX(ctx, f, lock.LockRange{0, lock.LockEOF}) 131 if err != nil && !linuxerr.Equals(linuxerr.ENOLCK, err) { 132 panic(fmt.Sprintf("UnlockPOSIX failed: %v", err)) 133 } 134 } 135 } 136 137 // NewFDTable allocates a new FDTable that may be used by tasks in k. 138 func (k *Kernel) NewFDTable() *FDTable { 139 f := &FDTable{k: k} 140 f.init() 141 return f 142 } 143 144 // DecRef implements RefCounter.DecRef. 145 // 146 // If f reaches zero references, all of its file descriptors are removed. 147 func (f *FDTable) DecRef(ctx context.Context) { 148 f.FDTableRefs.DecRef(func() { 149 f.RemoveIf(ctx, func(*vfs.FileDescription, FDFlags) bool { 150 return true 151 }) 152 }) 153 } 154 155 // forEachUpTo iterates over all non-nil files upto maxFds (non-inclusive) in sorted order. 156 // 157 // It is the caller's responsibility to acquire an appropriate lock. 158 func (f *FDTable) forEachUpTo(ctx context.Context, maxFd int32, fn func(fd int32, file *vfs.FileDescription, flags FDFlags)) { 159 // Iterate through the fdBitmap. 160 f.fdBitmap.ForEach(0, uint32(maxFd), func(ufd uint32) bool { 161 fd := int32(ufd) 162 file, flags, ok := f.get(fd) 163 if !ok { 164 return true 165 } 166 if file != nil { 167 if !file.TryIncRef() { 168 return true 169 } 170 fn(fd, file, flags) 171 file.DecRef(ctx) 172 } 173 return true 174 }) 175 } 176 177 // forEach iterates over all non-nil files upto maxFd in sorted order. 178 // 179 // It is the caller's responsibility to acquire an appropriate lock. 180 func (f *FDTable) forEach(ctx context.Context, fn func(fd int32, file *vfs.FileDescription, flags FDFlags)) { 181 f.forEachUpTo(ctx, MaxFdLimit, fn) 182 } 183 184 // String is a stringer for FDTable. 185 func (f *FDTable) String() string { 186 var buf strings.Builder 187 ctx := context.Background() 188 files := make(map[int32]*vfs.FileDescription) 189 f.mu.Lock() 190 // Can't release f.mu from defer, because vfsObj.PathnameWithDeleted 191 // should not be called under the fdtable mutex. 192 f.forEach(ctx, func(fd int32, file *vfs.FileDescription, flags FDFlags) { 193 if file != nil { 194 file.IncRef() 195 files[fd] = file 196 } 197 }) 198 f.mu.Unlock() 199 defer func() { 200 for _, f := range files { 201 f.DecRef(ctx) 202 } 203 }() 204 205 for fd, file := range files { 206 vfsObj := file.Mount().Filesystem().VirtualFilesystem() 207 vd := file.VirtualDentry() 208 if vd.Dentry() == nil { 209 panic(fmt.Sprintf("fd %d (type %T) has nil dentry: %#v", fd, file.Impl(), file)) 210 } 211 name, err := vfsObj.PathnameWithDeleted(ctx, vfs.VirtualDentry{}, file.VirtualDentry()) 212 if err != nil { 213 fmt.Fprintf(&buf, "<err: %v>\n", err) 214 continue 215 } 216 fmt.Fprintf(&buf, "\tfd:%d => name %s\n", fd, name) 217 } 218 return buf.String() 219 } 220 221 // NewFDs allocates new FDs guaranteed to be the lowest number available 222 // greater than or equal to the minFD parameter. All files will share the set 223 // flags. Success is guaranteed to be all or none. 224 func (f *FDTable) NewFDs(ctx context.Context, minFD int32, files []*vfs.FileDescription, flags FDFlags) (fds []int32, err error) { 225 if minFD < 0 { 226 // Don't accept negative FDs. 227 return nil, unix.EINVAL 228 } 229 230 // Default limit. 231 end := f.k.MaxFDLimit.Load() 232 233 // Ensure we don't get past the provided limit. 234 if limitSet := limits.FromContext(ctx); limitSet != nil { 235 lim := limitSet.Get(limits.NumberOfFiles) 236 // Only set if the limit is smaller than the max to avoid overflow. 237 if lim.Cur != limits.Infinity && lim.Cur < uint64(end) { 238 end = int32(lim.Cur) 239 } 240 } 241 if minFD+int32(len(files)) > end { 242 return nil, unix.EMFILE 243 } 244 245 f.mu.Lock() 246 247 // max is used as the largest number in fdBitmap + 1. 248 max := int32(0) 249 if !f.fdBitmap.IsEmpty() { 250 max = int32(f.fdBitmap.Maximum()) 251 max++ 252 } 253 254 // Adjust max in case it is less than minFD. 255 if max < minFD { 256 max = minFD 257 } 258 // Install all entries. 259 for len(fds) < len(files) { 260 // Try to use free bit in fdBitmap. 261 // If all bits in fdBitmap are used, expand fd to the max. 262 fd, err := f.fdBitmap.FirstZero(uint32(minFD)) 263 if err != nil { 264 fd = uint32(max) 265 max++ 266 } 267 if fd >= uint32(end) { 268 break 269 } 270 f.fdBitmap.Add(fd) 271 if df := f.set(int32(fd), files[len(fds)], flags); df != nil { 272 panic("file set") 273 } 274 fds = append(fds, int32(fd)) 275 minFD = int32(fd) 276 } 277 278 // Failure? Unwind existing FDs. 279 if len(fds) < len(files) { 280 for _, i := range fds { 281 _ = f.set(i, nil, FDFlags{}) 282 f.fdBitmap.Remove(uint32(i)) 283 } 284 f.mu.Unlock() 285 286 // Drop the reference taken by the call to f.set() that 287 // originally installed the file. Don't call f.drop() 288 // (generating inotify events, etc.) since the file should 289 // appear to have never been inserted into f. 290 for _, file := range files[:len(fds)] { 291 file.DecRef(ctx) 292 } 293 return nil, unix.EMFILE 294 } 295 296 f.mu.Unlock() 297 return fds, nil 298 } 299 300 // NewFD allocates a file descriptor greater than or equal to minFD for 301 // the given file description. If it succeeds, it takes a reference on file. 302 func (f *FDTable) NewFD(ctx context.Context, minFD int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { 303 files := []*vfs.FileDescription{file} 304 fileSlice, error := f.NewFDs(ctx, minFD, files, flags) 305 if error != nil { 306 return -1, error 307 } 308 return fileSlice[0], nil 309 } 310 311 // NewFDAt sets the file reference for the given FD. If there is an existing 312 // file description for that FD, it is returned. 313 // 314 // N.B. Callers are required to use DecRef on the returned file when they are done. 315 // 316 // Precondition: file != nil. 317 func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) (*vfs.FileDescription, error) { 318 if fd < 0 { 319 // Don't accept negative FDs. 320 return nil, unix.EBADF 321 } 322 323 if fd >= f.k.MaxFDLimit.Load() { 324 return nil, unix.EMFILE 325 } 326 // Check the limit for the provided file. 327 if limitSet := limits.FromContext(ctx); limitSet != nil { 328 if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur { 329 return nil, unix.EMFILE 330 } 331 } 332 333 // Install the entry. 334 f.mu.Lock() 335 df := f.set(fd, file, flags) 336 // Add fd to fdBitmap. 337 if df == nil { 338 f.fdBitmap.Add(uint32(fd)) 339 } 340 f.mu.Unlock() 341 342 if df != nil { 343 f.fileUnlock(ctx, df) 344 // Table's reference on df is transferred to caller, so don't DecRef. 345 } 346 return df, nil 347 } 348 349 // SetFlags sets the flags for the given file descriptor. 350 // 351 // True is returned iff flags were changed. 352 func (f *FDTable) SetFlags(ctx context.Context, fd int32, flags FDFlags) error { 353 if fd < 0 { 354 // Don't accept negative FDs. 355 return unix.EBADF 356 } 357 358 f.mu.Lock() 359 defer f.mu.Unlock() 360 361 file, _, _ := f.get(fd) 362 if file == nil { 363 // No file found. 364 return unix.EBADF 365 } 366 367 // Update the flags. 368 if df := f.set(fd, file, flags); df != nil { 369 panic("file changed") 370 } 371 return nil 372 } 373 374 // SetFlagsForRange sets the flags for the given range of file descriptors 375 // (inclusive: [startFd, endFd]). 376 func (f *FDTable) SetFlagsForRange(ctx context.Context, startFd int32, endFd int32, flags FDFlags) error { 377 if startFd < 0 || startFd > endFd { 378 return unix.EBADF 379 } 380 381 f.mu.Lock() 382 defer f.mu.Unlock() 383 384 for fd, err := f.fdBitmap.FirstOne(uint32(startFd)); err == nil && fd <= uint32(endFd); fd, err = f.fdBitmap.FirstOne(fd + 1) { 385 fdI32 := int32(fd) 386 file, _, _ := f.get(fdI32) 387 if df := f.set(fdI32, file, flags); df != nil { 388 panic("file changed") 389 } 390 } 391 392 return nil 393 } 394 395 // Get returns a reference to the file and the flags for the FD or nil if no 396 // file is defined for the given fd. 397 // 398 // N.B. Callers are required to use DecRef when they are done. 399 // 400 //go:nosplit 401 func (f *FDTable) Get(fd int32) (*vfs.FileDescription, FDFlags) { 402 if fd < 0 { 403 return nil, FDFlags{} 404 } 405 406 for { 407 file, flags, _ := f.get(fd) 408 if file != nil { 409 if !file.TryIncRef() { 410 continue // Race caught. 411 } 412 // Reference acquired. 413 return file, flags 414 } 415 // No file available. 416 return nil, FDFlags{} 417 } 418 } 419 420 // GetFDs returns a sorted list of valid fds. 421 // 422 // Precondition: The caller must be running on the task goroutine, or Task.mu 423 // must be locked. 424 func (f *FDTable) GetFDs(ctx context.Context) []int32 { 425 f.mu.Lock() 426 defer f.mu.Unlock() 427 fds := make([]int32, 0, int(f.fdBitmap.GetNumOnes())) 428 f.forEach(ctx, func(fd int32, _ *vfs.FileDescription, _ FDFlags) { 429 fds = append(fds, fd) 430 }) 431 return fds 432 } 433 434 // Exists returns whether fd is defined in the table. It is inherently racy. 435 // 436 //go:nosplit 437 func (f *FDTable) Exists(fd int32) bool { 438 if fd < 0 { 439 return false 440 } 441 file, _, _ := f.get(fd) 442 return file != nil 443 } 444 445 // Fork returns an independent FDTable, cloning all FDs up to maxFds (non-inclusive). 446 func (f *FDTable) Fork(ctx context.Context, maxFd int32) *FDTable { 447 clone := f.k.NewFDTable() 448 f.mu.Lock() 449 defer f.mu.Unlock() 450 f.forEachUpTo(ctx, maxFd, func(fd int32, file *vfs.FileDescription, flags FDFlags) { 451 // The set function here will acquire an appropriate table 452 // reference for the clone. We don't need anything else. 453 if df := clone.set(fd, file, flags); df != nil { 454 panic("file set") 455 } 456 clone.fdBitmap.Add(uint32(fd)) 457 }) 458 return clone 459 } 460 461 // Remove removes an FD from f. It returns the removed file description. 462 // 463 // N.B. Callers are required to use DecRef on the returned file when they are done. 464 func (f *FDTable) Remove(ctx context.Context, fd int32) *vfs.FileDescription { 465 if fd < 0 { 466 return nil 467 } 468 469 f.mu.Lock() 470 df := f.set(fd, nil, FDFlags{}) // Zap entry. 471 if df != nil { 472 f.fdBitmap.Remove(uint32(fd)) 473 } 474 f.mu.Unlock() 475 476 if df != nil { 477 f.fileUnlock(ctx, df) 478 // Table's reference on df is transferred to caller, so don't DecRef. 479 } 480 return df 481 } 482 483 // RemoveIf removes all FDs where cond is true. 484 func (f *FDTable) RemoveIf(ctx context.Context, cond func(*vfs.FileDescription, FDFlags) bool) { 485 var files []*vfs.FileDescription 486 487 f.mu.Lock() 488 f.forEach(ctx, func(fd int32, file *vfs.FileDescription, flags FDFlags) { 489 if cond(file, flags) { 490 // Clear from table. 491 if df := f.set(fd, nil, FDFlags{}); df != nil { 492 f.fdBitmap.Remove(uint32(fd)) 493 files = append(files, df) 494 } 495 } 496 }) 497 f.mu.Unlock() 498 499 for _, file := range files { 500 f.fileUnlock(ctx, file) 501 file.DecRef(ctx) // Drop the table's reference. 502 } 503 } 504 505 // RemoveNextInRange removes the next FD that falls within the given range, 506 // and returns the FD number and FileDescription of the removed FD. 507 // 508 // N.B. Callers are required to use DecRef on the returned file when they are done. 509 func (f *FDTable) RemoveNextInRange(ctx context.Context, startFd int32, endFd int32) (int32, *vfs.FileDescription) { 510 if startFd < 0 || startFd > endFd { 511 return MaxFdLimit, nil 512 } 513 514 f.mu.Lock() 515 fdUint, err := f.fdBitmap.FirstOne(uint32(startFd)) 516 fd := int32(fdUint) 517 if err != nil || fd > endFd { 518 f.mu.Unlock() 519 return MaxFdLimit, nil 520 } 521 df := f.set(fd, nil, FDFlags{}) // Zap entry. 522 if df != nil { 523 f.fdBitmap.Remove(uint32(fd)) 524 } 525 f.mu.Unlock() 526 527 if df != nil { 528 f.fileUnlock(ctx, df) 529 // Table's reference on df is transferred to caller, so don't DecRef. 530 } 531 return fd, df 532 } 533 534 // GetLastFd returns the last set FD in the FDTable bitmap. 535 func (f *FDTable) GetLastFd() int32 { 536 f.mu.Lock() 537 defer f.mu.Unlock() 538 539 last := f.fdBitmap.Maximum() 540 if last > bitmap.MaxBitEntryLimit { 541 return MaxFdLimit 542 } 543 return int32(last) 544 }