github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/fsimpl/gofer/directory.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package gofer
    16  
    17  import (
    18  	"fmt"
    19  
    20  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    21  	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
    22  	"github.com/MerlinKodo/gvisor/pkg/context"
    23  	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
    24  	"github.com/MerlinKodo/gvisor/pkg/hostarch"
    25  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
    26  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/pipe"
    27  	"github.com/MerlinKodo/gvisor/pkg/sentry/socket/unix/transport"
    28  	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
    29  	"github.com/MerlinKodo/gvisor/pkg/sync"
    30  )
    31  
    32  func (d *dentry) isDir() bool {
    33  	return d.fileType() == linux.S_IFDIR
    34  }
    35  
    36  // cacheNewChildLocked will cache the new child dentry, and will panic if a
    37  // non-negative child is already cached. It is the caller's responsibility to
    38  // check that the child does not exist before calling this method.
    39  //
    40  // Preconditions:
    41  //   - filesystem.renameMu must be locked.
    42  //   - If the addition to the dentry tree is due to a read-only operation (like
    43  //     Walk), then d.opMu must be held for reading. Otherwise d.opMu must be
    44  //     held for writing.
    45  //   - d.childrenMu must be locked.
    46  //   - d.isDir().
    47  //   - child must be a newly-created dentry that has never had a parent.
    48  //   - d.children[name] must be unset or nil (a "negative child")
    49  //
    50  // +checklocksread:d.opMu
    51  // +checklocks:d.childrenMu
    52  func (d *dentry) cacheNewChildLocked(child *dentry, name string) {
    53  	d.IncRef() // reference held by child on its parent
    54  	child.parent = d
    55  	child.name = name
    56  	if d.children == nil {
    57  		d.children = make(map[string]*dentry)
    58  	} else if c, ok := d.children[name]; ok {
    59  		if c != nil {
    60  			panic(fmt.Sprintf("cacheNewChildLocked collision; child with name=%q already cached", name))
    61  		}
    62  
    63  		// Cached child is negative. OK to cache over, but we must
    64  		// update the count of negative children.
    65  		d.negativeChildren--
    66  	}
    67  	d.children[name] = child
    68  }
    69  
    70  // Preconditions:
    71  //   - d.childrenMu must be locked.
    72  //   - d.isDir().
    73  //   - name is not already a negative entry.
    74  //
    75  // +checklocks:d.childrenMu
    76  func (d *dentry) cacheNegativeLookupLocked(name string) {
    77  	// Don't cache negative lookups if InteropModeShared is in effect (since
    78  	// this makes remote lookup unavoidable), or if d.isSynthetic() (in which
    79  	// case the only files in the directory are those for which a dentry exists
    80  	// in d.children). Instead, just delete any previously-cached dentry.
    81  	if d.fs.opts.interop == InteropModeShared || d.isSynthetic() {
    82  		delete(d.children, name)
    83  		return
    84  	}
    85  	if d.children == nil {
    86  		d.children = make(map[string]*dentry)
    87  	}
    88  	d.children[name] = nil
    89  	d.negativeChildren++
    90  
    91  	if !d.negativeChildrenCache.isInited() {
    92  		// Initializing cache with all negative children name at the first time
    93  		// that negativeChildren increase upto max.
    94  		if d.negativeChildren >= maxCachedNegativeChildren {
    95  			d.negativeChildrenCache.init(maxCachedNegativeChildren)
    96  			for childName, child := range d.children {
    97  				if child == nil {
    98  					d.negativeChildrenCache.add(childName)
    99  				}
   100  			}
   101  		}
   102  	} else if victim := d.negativeChildrenCache.add(name); victim != "" {
   103  		// If victim is a negative entry in d.children, delete it.
   104  		if child, ok := d.children[victim]; ok && child == nil {
   105  			delete(d.children, victim)
   106  			d.negativeChildren--
   107  		}
   108  	}
   109  }
   110  
// createSyntheticOpts bundles the parameters passed to
// filesystem.newSyntheticDentry when creating a synthetic file.
type createSyntheticOpts struct {
	// name is the name of the file being created. newSyntheticDentry itself
	// does not read it; presumably the caller uses it when linking the new
	// dentry into its parent (verify against callers).
	name string
	// mode supplies the file type (via mode.FileType()) and is stored as the
	// new dentry's mode.
	mode linux.FileMode
	// kuid and kgid become the new dentry's uid and gid.
	kuid auth.KUID
	kgid auth.KGID

	// The endpoint for a synthetic socket. endpoint should be nil if the file
	// being created is not a socket.
	endpoint transport.BoundEndpoint

	// pipe should be nil if the file being created is not a pipe.
	pipe *pipe.VFSPipe
}
   124  
   125  // newSyntheticDentry creates a synthetic file with the given name.
   126  func (fs *filesystem) newSyntheticDentry(opts *createSyntheticOpts) *dentry {
   127  	now := fs.clock.Now().Nanoseconds()
   128  	child := &dentry{
   129  		refs:      atomicbitops.FromInt64(1), // held by parent.
   130  		fs:        fs,
   131  		ino:       fs.nextIno(),
   132  		mode:      atomicbitops.FromUint32(uint32(opts.mode)),
   133  		uid:       atomicbitops.FromUint32(uint32(opts.kuid)),
   134  		gid:       atomicbitops.FromUint32(uint32(opts.kgid)),
   135  		blockSize: atomicbitops.FromUint32(hostarch.PageSize), // arbitrary
   136  		atime:     atomicbitops.FromInt64(now),
   137  		mtime:     atomicbitops.FromInt64(now),
   138  		ctime:     atomicbitops.FromInt64(now),
   139  		btime:     atomicbitops.FromInt64(now),
   140  		readFD:    atomicbitops.FromInt32(-1),
   141  		writeFD:   atomicbitops.FromInt32(-1),
   142  		mmapFD:    atomicbitops.FromInt32(-1),
   143  		nlink:     atomicbitops.FromUint32(2),
   144  	}
   145  	switch opts.mode.FileType() {
   146  	case linux.S_IFDIR:
   147  		// Nothing else needs to be done.
   148  	case linux.S_IFSOCK:
   149  		child.endpoint = opts.endpoint
   150  	case linux.S_IFIFO:
   151  		child.pipe = opts.pipe
   152  	default:
   153  		panic(fmt.Sprintf("failed to create synthetic file of unrecognized type: %v", opts.mode.FileType()))
   154  	}
   155  	child.init(nil /* impl */)
   156  	return child
   157  }
   158  
   159  // Preconditions:
   160  //   - d.childrenMu must be locked.
   161  //
   162  // +checklocks:d.childrenMu
   163  func (d *dentry) clearDirentsLocked() {
   164  	d.dirents = nil
   165  	d.childrenSet = nil
   166  }
   167  
// directoryFD is the file description type used for directories. It keeps a
// per-FD snapshot of the directory's entries together with the position of
// the next entry to hand out.
//
// +stateify savable
type directoryFD struct {
	fileDescription
	vfs.DirectoryFileDescriptionDefaultImpl

	// mu is held by IterDirents and Seek for their whole duration, which
	// serializes their accesses to off and dirents.
	//
	// off is the index into dirents of the next entry IterDirents will pass
	// to its callback.
	//
	// dirents is the listing obtained from dentry.getDirents. It is nil until
	// the first IterDirents call and is reset to nil by Seek(0, SEEK_SET) so
	// that the next IterDirents call fetches the listing again.
	mu      sync.Mutex `state:"nosave"`
	off     int64
	dirents []vfs.Dirent
}
   177  
   178  // Release implements vfs.FileDescriptionImpl.Release.
   179  func (fd *directoryFD) Release(context.Context) {
   180  }
   181  
   182  // IterDirents implements vfs.FileDescriptionImpl.IterDirents.
   183  func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
   184  	fd.mu.Lock()
   185  	defer fd.mu.Unlock()
   186  
   187  	d := fd.dentry()
   188  	if fd.dirents == nil {
   189  		ds, err := d.getDirents(ctx)
   190  		if err != nil {
   191  			return err
   192  		}
   193  		fd.dirents = ds
   194  	}
   195  
   196  	if d.cachedMetadataAuthoritative() {
   197  		d.touchAtime(fd.vfsfd.Mount())
   198  	}
   199  
   200  	for fd.off < int64(len(fd.dirents)) {
   201  		if err := cb.Handle(fd.dirents[fd.off]); err != nil {
   202  			return err
   203  		}
   204  		fd.off++
   205  	}
   206  	return nil
   207  }
   208  
   209  // Preconditions:
   210  //   - d.isDir().
   211  //   - There exists at least one directoryFD representing d.
   212  func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
   213  	// NOTE(b/135560623): 9P2000.L's readdir does not specify behavior in the
   214  	// presence of concurrent mutation of an iterated directory, so
   215  	// implementations may duplicate or omit entries in this case, which
   216  	// violates POSIX semantics. Thus we read all directory entries while
   217  	// holding d.opMu to exclude directory mutations. (Note that it is
   218  	// impossible for the client to exclude concurrent mutation from other
   219  	// remote filesystem users. Since there is no way to detect if the server
   220  	// has incorrectly omitted directory entries, we simply assume that the
   221  	// server is well-behaved under InteropModeShared.) This is inconsistent
   222  	// with Linux (which appears to assume that directory fids have the correct
   223  	// semantics, and translates struct file_operations::readdir calls directly
   224  	// to readdir RPCs), but is consistent with VFS1.
   225  
   226  	// filesystem.renameMu is needed for d.parent, and must be locked before
   227  	// d.opMu.
   228  	d.fs.renameMu.RLock()
   229  	defer d.fs.renameMu.RUnlock()
   230  	d.opMu.RLock()
   231  	defer d.opMu.RUnlock()
   232  
   233  	// d.childrenMu must be locked after d.opMu and held for the entire
   234  	// function. This synchronizes concurrent getDirents() attempts.
   235  	// getdents(2) advances the file offset. To get complete results from
   236  	// multiple getdents(2) calls, the directory FD's offset needs to be
   237  	// protected.
   238  	d.childrenMu.Lock()
   239  	defer d.childrenMu.Unlock()
   240  
   241  	if d.dirents != nil {
   242  		return d.dirents, nil
   243  	}
   244  
   245  	// It's not clear if 9P2000.L's readdir is expected to return "." and "..",
   246  	// so we generate them here.
   247  	parent := genericParentOrSelf(d)
   248  	dirents := []vfs.Dirent{
   249  		{
   250  			Name:    ".",
   251  			Type:    linux.DT_DIR,
   252  			Ino:     uint64(d.ino),
   253  			NextOff: 1,
   254  		},
   255  		{
   256  			Name:    "..",
   257  			Type:    uint8(parent.mode.Load() >> 12),
   258  			Ino:     uint64(parent.ino),
   259  			NextOff: 2,
   260  		},
   261  	}
   262  	var realChildren map[string]struct{}
   263  	if !d.isSynthetic() {
   264  		if d.syntheticChildren != 0 && d.fs.opts.interop == InteropModeShared {
   265  			// Record the set of children d actually has so that we don't emit
   266  			// duplicate entries for synthetic children.
   267  			realChildren = make(map[string]struct{})
   268  		}
   269  		d.handleMu.RLock()
   270  		if !d.isReadHandleOk() {
   271  			// This should not be possible because a readable handle should
   272  			// have been opened when the calling directoryFD was opened.
   273  			panic("gofer.dentry.getDirents called without a readable handle")
   274  		}
   275  		err := d.getDirentsLocked(ctx, func(name string, key inoKey, dType uint8) {
   276  			dirent := vfs.Dirent{
   277  				Name:    name,
   278  				Ino:     d.fs.inoFromKey(key),
   279  				NextOff: int64(len(dirents) + 1),
   280  				Type:    dType,
   281  			}
   282  			dirents = append(dirents, dirent)
   283  			if realChildren != nil {
   284  				realChildren[name] = struct{}{}
   285  			}
   286  		})
   287  		d.handleMu.RUnlock()
   288  		if err != nil {
   289  			return nil, err
   290  		}
   291  	}
   292  
   293  	// Emit entries for synthetic children.
   294  	if d.syntheticChildren != 0 {
   295  		for _, child := range d.children {
   296  			if child == nil || !child.isSynthetic() {
   297  				continue
   298  			}
   299  			if _, ok := realChildren[child.name]; ok {
   300  				continue
   301  			}
   302  			dirents = append(dirents, vfs.Dirent{
   303  				Name:    child.name,
   304  				Type:    uint8(child.mode.Load() >> 12),
   305  				Ino:     uint64(child.ino),
   306  				NextOff: int64(len(dirents) + 1),
   307  			})
   308  		}
   309  	}
   310  	// Cache dirents for future directoryFDs if permitted.
   311  	if d.cachedMetadataAuthoritative() {
   312  		d.dirents = dirents
   313  		d.childrenSet = make(map[string]struct{}, len(dirents))
   314  		for _, dirent := range d.dirents {
   315  			d.childrenSet[dirent.Name] = struct{}{}
   316  		}
   317  	}
   318  	return dirents, nil
   319  }
   320  
   321  // Seek implements vfs.FileDescriptionImpl.Seek.
   322  func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
   323  	fd.mu.Lock()
   324  	defer fd.mu.Unlock()
   325  
   326  	switch whence {
   327  	case linux.SEEK_SET:
   328  		if offset < 0 {
   329  			return 0, linuxerr.EINVAL
   330  		}
   331  		if offset == 0 {
   332  			// Ensure that the next call to fd.IterDirents() calls
   333  			// fd.dentry().getDirents().
   334  			fd.dirents = nil
   335  		}
   336  		fd.off = offset
   337  		return fd.off, nil
   338  	case linux.SEEK_CUR:
   339  		offset += fd.off
   340  		if offset < 0 {
   341  			return 0, linuxerr.EINVAL
   342  		}
   343  		// Don't clear fd.dirents in this case, even if offset == 0.
   344  		fd.off = offset
   345  		return fd.off, nil
   346  	default:
   347  		return 0, linuxerr.EINVAL
   348  	}
   349  }
   350  
   351  // Sync implements vfs.FileDescriptionImpl.Sync.
   352  func (fd *directoryFD) Sync(ctx context.Context) error {
   353  	return fd.dentry().syncRemoteFile(ctx)
   354  }