github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/kernel/fd_table.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernel
    16  
    17  import (
    18  	goContext "context"
    19  	"fmt"
    20  	"math"
    21  	"strings"
    22  
    23  	"golang.org/x/sys/unix"
    24  	"github.com/metacubex/gvisor/pkg/abi/linux"
    25  	"github.com/metacubex/gvisor/pkg/bitmap"
    26  	"github.com/metacubex/gvisor/pkg/context"
    27  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    28  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/lock"
    29  	"github.com/metacubex/gvisor/pkg/sentry/limits"
    30  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    31  )
    32  
// FDFlags define flags for an individual descriptor.
//
// A value of this type is stored next to the file in each descriptor entry
// and is passed by value throughout the FDTable API.
//
// +stateify savable
type FDFlags struct {
	// CloseOnExec indicates the descriptor should be closed on exec.
	CloseOnExec bool
}
    40  
    41  // ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags
    42  // representation.
    43  func (f FDFlags) ToLinuxFileFlags() (mask uint) {
    44  	if f.CloseOnExec {
    45  		mask |= linux.O_CLOEXEC
    46  	}
    47  	return
    48  }
    49  
    50  // ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags
    51  // representation.
    52  func (f FDFlags) ToLinuxFDFlags() (mask uint) {
    53  	if f.CloseOnExec {
    54  		mask |= linux.FD_CLOEXEC
    55  	}
    56  	return
    57  }
    58  
// descriptor holds the details about a file descriptor, namely a pointer to
// the file itself and the descriptor flags.
//
// Note that this is immutable and can only be changed via operations on the
// descriptorTable.
//
// It is also the value type of the map[int32]descriptor produced by
// saveDescriptorTable and consumed by loadDescriptorTable.
//
// +stateify savable
type descriptor struct {
	// file is the file description installed at this descriptor.
	file *vfs.FileDescription
	// flags are the per-descriptor flags (e.g. close-on-exec).
	flags FDFlags
}
    70  
// MaxFdLimit defines the upper limit on the integer value of file descriptors.
//
// It is bitmap.MaxBitEntryLimit converted to int32. Besides bounding
// iteration in forEach, it is returned by RemoveNextInRange when nothing was
// removed and by GetLastFd when the bitmap maximum is out of range.
const MaxFdLimit int32 = int32(bitmap.MaxBitEntryLimit)
    73  
// FDTable is used to manage File references and flags.
//
// The table pairs a descriptorTable (fd -> file and flags) with a bitmap of
// the fds currently in use. Mutating methods take mu; Get and Exists read the
// descriptor table without taking it.
//
// +stateify savable
type FDTable struct {
	// FDTableRefs provides the table's reference count. See DecRef for what
	// happens when the last reference is dropped.
	FDTableRefs

	// k is the Kernel the table was created by (set in NewFDTable). It is
	// consulted for the kernel-wide MaxFDLimit.
	k *Kernel

	// mu protects below.
	mu fdTableMutex `state:"nosave"`

	// fdBitmap shows which fds are already in use. It is not saved;
	// loadDescriptorTable rebuilds it from the restored descriptors.
	fdBitmap bitmap.Bitmap `state:"nosave"`

	// descriptorTable holds descriptors. The state tag names
	// map[int32]descriptor as the saved form, matching the signatures of
	// saveDescriptorTable and loadDescriptorTable.
	descriptorTable `state:".(map[int32]descriptor)"`
}
    91  
    92  func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
    93  	m := make(map[int32]descriptor)
    94  	f.mu.Lock()
    95  	defer f.mu.Unlock()
    96  	f.forEach(context.Background(), func(fd int32, file *vfs.FileDescription, flags FDFlags) {
    97  		m[fd] = descriptor{
    98  			file:  file,
    99  			flags: flags,
   100  		}
   101  	})
   102  	return m
   103  }
   104  
   105  func (f *FDTable) loadDescriptorTable(_ goContext.Context, m map[int32]descriptor) {
   106  	ctx := context.Background()
   107  	f.initNoLeakCheck() // Initialize table.
   108  	f.fdBitmap = bitmap.New(uint32(math.MaxUint16))
   109  	for fd, d := range m {
   110  		if fd < 0 {
   111  			panic(fmt.Sprintf("FD is not supposed to be negative. FD: %d", fd))
   112  		}
   113  
   114  		if df := f.set(fd, d.file, d.flags); df != nil {
   115  			panic("file set")
   116  		}
   117  		f.fdBitmap.Add(uint32(fd))
   118  		// Note that we do _not_ need to acquire a extra table reference here. The
   119  		// table reference will already be accounted for in the file, so we drop the
   120  		// reference taken by set above.
   121  		if d.file != nil {
   122  			d.file.DecRef(ctx)
   123  		}
   124  	}
   125  }
   126  
   127  // Release any POSIX lock possibly held by the FDTable.
   128  func (f *FDTable) fileUnlock(ctx context.Context, file *vfs.FileDescription) {
   129  	if file.SupportsLocks() {
   130  		err := file.UnlockPOSIX(ctx, f, lock.LockRange{0, lock.LockEOF})
   131  		if err != nil && !linuxerr.Equals(linuxerr.ENOLCK, err) {
   132  			panic(fmt.Sprintf("UnlockPOSIX failed: %v", err))
   133  		}
   134  	}
   135  }
   136  
   137  // NewFDTable allocates a new FDTable that may be used by tasks in k.
   138  func (k *Kernel) NewFDTable() *FDTable {
   139  	f := &FDTable{k: k}
   140  	f.init()
   141  	return f
   142  }
   143  
   144  // DecRef implements RefCounter.DecRef.
   145  //
   146  // If f reaches zero references, all of its file descriptors are removed.
   147  func (f *FDTable) DecRef(ctx context.Context) {
   148  	f.FDTableRefs.DecRef(func() {
   149  		f.RemoveIf(ctx, func(*vfs.FileDescription, FDFlags) bool {
   150  			return true
   151  		})
   152  	})
   153  }
   154  
   155  // forEachUpTo iterates over all non-nil files upto maxFds (non-inclusive) in sorted order.
   156  //
   157  // It is the caller's responsibility to acquire an appropriate lock.
   158  func (f *FDTable) forEachUpTo(ctx context.Context, maxFd int32, fn func(fd int32, file *vfs.FileDescription, flags FDFlags)) {
   159  	// Iterate through the fdBitmap.
   160  	f.fdBitmap.ForEach(0, uint32(maxFd), func(ufd uint32) bool {
   161  		fd := int32(ufd)
   162  		file, flags, ok := f.get(fd)
   163  		if !ok {
   164  			return true
   165  		}
   166  		if file != nil {
   167  			if !file.TryIncRef() {
   168  				return true
   169  			}
   170  			fn(fd, file, flags)
   171  			file.DecRef(ctx)
   172  		}
   173  		return true
   174  	})
   175  }
   176  
   177  // forEach iterates over all non-nil files upto maxFd in sorted order.
   178  //
   179  // It is the caller's responsibility to acquire an appropriate lock.
   180  func (f *FDTable) forEach(ctx context.Context, fn func(fd int32, file *vfs.FileDescription, flags FDFlags)) {
   181  	f.forEachUpTo(ctx, MaxFdLimit, fn)
   182  }
   183  
   184  // String is a stringer for FDTable.
   185  func (f *FDTable) String() string {
   186  	var buf strings.Builder
   187  	ctx := context.Background()
   188  	files := make(map[int32]*vfs.FileDescription)
   189  	f.mu.Lock()
   190  	// Can't release f.mu from defer, because vfsObj.PathnameWithDeleted
   191  	// should not be called under the fdtable mutex.
   192  	f.forEach(ctx, func(fd int32, file *vfs.FileDescription, flags FDFlags) {
   193  		if file != nil {
   194  			file.IncRef()
   195  			files[fd] = file
   196  		}
   197  	})
   198  	f.mu.Unlock()
   199  	defer func() {
   200  		for _, f := range files {
   201  			f.DecRef(ctx)
   202  		}
   203  	}()
   204  
   205  	for fd, file := range files {
   206  		vfsObj := file.Mount().Filesystem().VirtualFilesystem()
   207  		vd := file.VirtualDentry()
   208  		if vd.Dentry() == nil {
   209  			panic(fmt.Sprintf("fd %d (type %T) has nil dentry: %#v", fd, file.Impl(), file))
   210  		}
   211  		name, err := vfsObj.PathnameWithDeleted(ctx, vfs.VirtualDentry{}, file.VirtualDentry())
   212  		if err != nil {
   213  			fmt.Fprintf(&buf, "<err: %v>\n", err)
   214  			continue
   215  		}
   216  		fmt.Fprintf(&buf, "\tfd:%d => name %s\n", fd, name)
   217  	}
   218  	return buf.String()
   219  }
   220  
   221  // NewFDs allocates new FDs guaranteed to be the lowest number available
   222  // greater than or equal to the minFD parameter. All files will share the set
   223  // flags. Success is guaranteed to be all or none.
   224  func (f *FDTable) NewFDs(ctx context.Context, minFD int32, files []*vfs.FileDescription, flags FDFlags) (fds []int32, err error) {
   225  	if minFD < 0 {
   226  		// Don't accept negative FDs.
   227  		return nil, unix.EINVAL
   228  	}
   229  
   230  	// Default limit.
   231  	end := f.k.MaxFDLimit.Load()
   232  
   233  	// Ensure we don't get past the provided limit.
   234  	if limitSet := limits.FromContext(ctx); limitSet != nil {
   235  		lim := limitSet.Get(limits.NumberOfFiles)
   236  		// Only set if the limit is smaller than the max to avoid overflow.
   237  		if lim.Cur != limits.Infinity && lim.Cur < uint64(end) {
   238  			end = int32(lim.Cur)
   239  		}
   240  	}
   241  	if minFD+int32(len(files)) > end {
   242  		return nil, unix.EMFILE
   243  	}
   244  
   245  	f.mu.Lock()
   246  
   247  	// max is used as the largest number in fdBitmap + 1.
   248  	max := int32(0)
   249  	if !f.fdBitmap.IsEmpty() {
   250  		max = int32(f.fdBitmap.Maximum())
   251  		max++
   252  	}
   253  
   254  	// Adjust max in case it is less than minFD.
   255  	if max < minFD {
   256  		max = minFD
   257  	}
   258  	// Install all entries.
   259  	for len(fds) < len(files) {
   260  		// Try to use free bit in fdBitmap.
   261  		// If all bits in fdBitmap are used, expand fd to the max.
   262  		fd, err := f.fdBitmap.FirstZero(uint32(minFD))
   263  		if err != nil {
   264  			fd = uint32(max)
   265  			max++
   266  		}
   267  		if fd >= uint32(end) {
   268  			break
   269  		}
   270  		f.fdBitmap.Add(fd)
   271  		if df := f.set(int32(fd), files[len(fds)], flags); df != nil {
   272  			panic("file set")
   273  		}
   274  		fds = append(fds, int32(fd))
   275  		minFD = int32(fd)
   276  	}
   277  
   278  	// Failure? Unwind existing FDs.
   279  	if len(fds) < len(files) {
   280  		for _, i := range fds {
   281  			_ = f.set(i, nil, FDFlags{})
   282  			f.fdBitmap.Remove(uint32(i))
   283  		}
   284  		f.mu.Unlock()
   285  
   286  		// Drop the reference taken by the call to f.set() that
   287  		// originally installed the file. Don't call f.drop()
   288  		// (generating inotify events, etc.) since the file should
   289  		// appear to have never been inserted into f.
   290  		for _, file := range files[:len(fds)] {
   291  			file.DecRef(ctx)
   292  		}
   293  		return nil, unix.EMFILE
   294  	}
   295  
   296  	f.mu.Unlock()
   297  	return fds, nil
   298  }
   299  
   300  // NewFD allocates a file descriptor greater than or equal to minFD for
   301  // the given file description. If it succeeds, it takes a reference on file.
   302  func (f *FDTable) NewFD(ctx context.Context, minFD int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
   303  	files := []*vfs.FileDescription{file}
   304  	fileSlice, error := f.NewFDs(ctx, minFD, files, flags)
   305  	if error != nil {
   306  		return -1, error
   307  	}
   308  	return fileSlice[0], nil
   309  }
   310  
   311  // NewFDAt sets the file reference for the given FD. If there is an existing
   312  // file description for that FD, it is returned.
   313  //
   314  // N.B. Callers are required to use DecRef on the returned file when they are done.
   315  //
   316  // Precondition: file != nil.
   317  func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) (*vfs.FileDescription, error) {
   318  	if fd < 0 {
   319  		// Don't accept negative FDs.
   320  		return nil, unix.EBADF
   321  	}
   322  
   323  	if fd >= f.k.MaxFDLimit.Load() {
   324  		return nil, unix.EMFILE
   325  	}
   326  	// Check the limit for the provided file.
   327  	if limitSet := limits.FromContext(ctx); limitSet != nil {
   328  		if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur {
   329  			return nil, unix.EMFILE
   330  		}
   331  	}
   332  
   333  	// Install the entry.
   334  	f.mu.Lock()
   335  	df := f.set(fd, file, flags)
   336  	// Add fd to fdBitmap.
   337  	if df == nil {
   338  		f.fdBitmap.Add(uint32(fd))
   339  	}
   340  	f.mu.Unlock()
   341  
   342  	if df != nil {
   343  		f.fileUnlock(ctx, df)
   344  		// Table's reference on df is transferred to caller, so don't DecRef.
   345  	}
   346  	return df, nil
   347  }
   348  
   349  // SetFlags sets the flags for the given file descriptor.
   350  //
   351  // True is returned iff flags were changed.
   352  func (f *FDTable) SetFlags(ctx context.Context, fd int32, flags FDFlags) error {
   353  	if fd < 0 {
   354  		// Don't accept negative FDs.
   355  		return unix.EBADF
   356  	}
   357  
   358  	f.mu.Lock()
   359  	defer f.mu.Unlock()
   360  
   361  	file, _, _ := f.get(fd)
   362  	if file == nil {
   363  		// No file found.
   364  		return unix.EBADF
   365  	}
   366  
   367  	// Update the flags.
   368  	if df := f.set(fd, file, flags); df != nil {
   369  		panic("file changed")
   370  	}
   371  	return nil
   372  }
   373  
   374  // SetFlagsForRange sets the flags for the given range of file descriptors
   375  // (inclusive: [startFd, endFd]).
   376  func (f *FDTable) SetFlagsForRange(ctx context.Context, startFd int32, endFd int32, flags FDFlags) error {
   377  	if startFd < 0 || startFd > endFd {
   378  		return unix.EBADF
   379  	}
   380  
   381  	f.mu.Lock()
   382  	defer f.mu.Unlock()
   383  
   384  	for fd, err := f.fdBitmap.FirstOne(uint32(startFd)); err == nil && fd <= uint32(endFd); fd, err = f.fdBitmap.FirstOne(fd + 1) {
   385  		fdI32 := int32(fd)
   386  		file, _, _ := f.get(fdI32)
   387  		if df := f.set(fdI32, file, flags); df != nil {
   388  			panic("file changed")
   389  		}
   390  	}
   391  
   392  	return nil
   393  }
   394  
   395  // Get returns a reference to the file and the flags for the FD or nil if no
   396  // file is defined for the given fd.
   397  //
   398  // N.B. Callers are required to use DecRef when they are done.
   399  //
   400  //go:nosplit
   401  func (f *FDTable) Get(fd int32) (*vfs.FileDescription, FDFlags) {
   402  	if fd < 0 {
   403  		return nil, FDFlags{}
   404  	}
   405  
   406  	for {
   407  		file, flags, _ := f.get(fd)
   408  		if file != nil {
   409  			if !file.TryIncRef() {
   410  				continue // Race caught.
   411  			}
   412  			// Reference acquired.
   413  			return file, flags
   414  		}
   415  		// No file available.
   416  		return nil, FDFlags{}
   417  	}
   418  }
   419  
   420  // GetFDs returns a sorted list of valid fds.
   421  //
   422  // Precondition: The caller must be running on the task goroutine, or Task.mu
   423  // must be locked.
   424  func (f *FDTable) GetFDs(ctx context.Context) []int32 {
   425  	f.mu.Lock()
   426  	defer f.mu.Unlock()
   427  	fds := make([]int32, 0, int(f.fdBitmap.GetNumOnes()))
   428  	f.forEach(ctx, func(fd int32, _ *vfs.FileDescription, _ FDFlags) {
   429  		fds = append(fds, fd)
   430  	})
   431  	return fds
   432  }
   433  
   434  // Exists returns whether fd is defined in the table. It is inherently racy.
   435  //
   436  //go:nosplit
   437  func (f *FDTable) Exists(fd int32) bool {
   438  	if fd < 0 {
   439  		return false
   440  	}
   441  	file, _, _ := f.get(fd)
   442  	return file != nil
   443  }
   444  
   445  // Fork returns an independent FDTable, cloning all FDs up to maxFds (non-inclusive).
   446  func (f *FDTable) Fork(ctx context.Context, maxFd int32) *FDTable {
   447  	clone := f.k.NewFDTable()
   448  	f.mu.Lock()
   449  	defer f.mu.Unlock()
   450  	f.forEachUpTo(ctx, maxFd, func(fd int32, file *vfs.FileDescription, flags FDFlags) {
   451  		// The set function here will acquire an appropriate table
   452  		// reference for the clone. We don't need anything else.
   453  		if df := clone.set(fd, file, flags); df != nil {
   454  			panic("file set")
   455  		}
   456  		clone.fdBitmap.Add(uint32(fd))
   457  	})
   458  	return clone
   459  }
   460  
   461  // Remove removes an FD from f. It returns the removed file description.
   462  //
   463  // N.B. Callers are required to use DecRef on the returned file when they are done.
   464  func (f *FDTable) Remove(ctx context.Context, fd int32) *vfs.FileDescription {
   465  	if fd < 0 {
   466  		return nil
   467  	}
   468  
   469  	f.mu.Lock()
   470  	df := f.set(fd, nil, FDFlags{}) // Zap entry.
   471  	if df != nil {
   472  		f.fdBitmap.Remove(uint32(fd))
   473  	}
   474  	f.mu.Unlock()
   475  
   476  	if df != nil {
   477  		f.fileUnlock(ctx, df)
   478  		// Table's reference on df is transferred to caller, so don't DecRef.
   479  	}
   480  	return df
   481  }
   482  
   483  // RemoveIf removes all FDs where cond is true.
   484  func (f *FDTable) RemoveIf(ctx context.Context, cond func(*vfs.FileDescription, FDFlags) bool) {
   485  	var files []*vfs.FileDescription
   486  
   487  	f.mu.Lock()
   488  	f.forEach(ctx, func(fd int32, file *vfs.FileDescription, flags FDFlags) {
   489  		if cond(file, flags) {
   490  			// Clear from table.
   491  			if df := f.set(fd, nil, FDFlags{}); df != nil {
   492  				f.fdBitmap.Remove(uint32(fd))
   493  				files = append(files, df)
   494  			}
   495  		}
   496  	})
   497  	f.mu.Unlock()
   498  
   499  	for _, file := range files {
   500  		f.fileUnlock(ctx, file)
   501  		file.DecRef(ctx) // Drop the table's reference.
   502  	}
   503  }
   504  
   505  // RemoveNextInRange removes the next FD that falls within the given range,
   506  // and returns the FD number and FileDescription of the removed FD.
   507  //
   508  // N.B. Callers are required to use DecRef on the returned file when they are done.
   509  func (f *FDTable) RemoveNextInRange(ctx context.Context, startFd int32, endFd int32) (int32, *vfs.FileDescription) {
   510  	if startFd < 0 || startFd > endFd {
   511  		return MaxFdLimit, nil
   512  	}
   513  
   514  	f.mu.Lock()
   515  	fdUint, err := f.fdBitmap.FirstOne(uint32(startFd))
   516  	fd := int32(fdUint)
   517  	if err != nil || fd > endFd {
   518  		f.mu.Unlock()
   519  		return MaxFdLimit, nil
   520  	}
   521  	df := f.set(fd, nil, FDFlags{}) // Zap entry.
   522  	if df != nil {
   523  		f.fdBitmap.Remove(uint32(fd))
   524  	}
   525  	f.mu.Unlock()
   526  
   527  	if df != nil {
   528  		f.fileUnlock(ctx, df)
   529  		// Table's reference on df is transferred to caller, so don't DecRef.
   530  	}
   531  	return fd, df
   532  }
   533  
   534  // GetLastFd returns the last set FD in the FDTable bitmap.
   535  func (f *FDTable) GetLastFd() int32 {
   536  	f.mu.Lock()
   537  	defer f.mu.Unlock()
   538  
   539  	last := f.fdBitmap.Maximum()
   540  	if last > bitmap.MaxBitEntryLimit {
   541  		return MaxFdLimit
   542  	}
   543  	return int32(last)
   544  }