github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/proc/tasks.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package proc
    16  
    17  import (
    18  	"bytes"
    19  	"sort"
    20  	"strconv"
    21  
    22  	"github.com/metacubex/gvisor/pkg/abi/linux"
    23  	"github.com/metacubex/gvisor/pkg/context"
    24  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    25  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/kernfs"
    26  	"github.com/metacubex/gvisor/pkg/sentry/kernel"
    27  	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
    28  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    29  )
    30  
    31  const (
    32  	selfName       = "self"
    33  	threadSelfName = "thread-self"
    34  )
    35  
// tasksInode represents the inode for the /proc/ directory.
//
// Static entries are kept in the embedded OrderedChildren. The "self" and
// "thread-self" symlinks and the numeric per-task entries are not stored;
// Lookup and IterDirents produce them on demand.
//
// +stateify savable
type tasksInode struct {
	implStatFS
	kernfs.InodeAlwaysValid
	kernfs.InodeAttrs
	kernfs.InodeDirectoryNoNewChildren
	kernfs.InodeNotAnonymous
	kernfs.InodeNotSymlink
	kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid.
	kernfs.InodeWatches
	kernfs.OrderedChildren
	tasksInodeRefs

	// locks is handed to kernfs.NewGenericDirectoryFD whenever this directory
	// is opened.
	locks vfs.FileLocks

	// fs is the proc filesystem that owns this inode; it supplies inode
	// numbers and builds task inodes. pidns is the PID namespace whose tasks
	// and thread groups back the numeric entries.
	fs    *filesystem
	pidns *kernel.PIDNamespace

	// '/proc/self' and '/proc/thread-self' have custom directory offsets in
	// Linux, so they are handled outside of OrderedChildren and have no
	// field here.

	// fakeCgroupControllers is a map of controller name to directory in the
	// cgroup hierarchy. These controllers are immutable and will be listed
	// in /proc/pid/cgroup if not nil.
	fakeCgroupControllers map[string]string
}

// Compile-time check that *tasksInode satisfies kernfs.Inode.
var _ kernfs.Inode = (*tasksInode)(nil)
    66  
    67  func (fs *filesystem) newTasksInode(ctx context.Context, k *kernel.Kernel, pidns *kernel.PIDNamespace, fakeCgroupControllers map[string]string) *tasksInode {
    68  	root := auth.NewRootCredentials(pidns.UserNamespace())
    69  	contents := map[string]kernfs.Inode{
    70  		"cmdline":        fs.newInode(ctx, root, 0444, &cmdLineData{}),
    71  		"cpuinfo":        fs.newInode(ctx, root, 0444, newStaticFileSetStat(cpuInfoData(k))),
    72  		"filesystems":    fs.newInode(ctx, root, 0444, &filesystemsData{}),
    73  		"loadavg":        fs.newInode(ctx, root, 0444, &loadavgData{}),
    74  		"sys":            fs.newSysDir(ctx, root, k),
    75  		"bus":            fs.newStaticDir(ctx, root, map[string]kernfs.Inode{}),
    76  		"fs":             fs.newStaticDir(ctx, root, map[string]kernfs.Inode{}),
    77  		"irq":            fs.newStaticDir(ctx, root, map[string]kernfs.Inode{}),
    78  		"meminfo":        fs.newInode(ctx, root, 0444, &meminfoData{}),
    79  		"mounts":         kernfs.NewStaticSymlink(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/mounts"),
    80  		"net":            kernfs.NewStaticSymlink(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/net"),
    81  		"sentry-meminfo": fs.newInode(ctx, root, 0444, &sentryMeminfoData{}),
    82  		"stat":           fs.newInode(ctx, root, 0444, &statData{}),
    83  		"sysrq-trigger":  fs.newInode(ctx, root, 0200, newStaticFile("")),
    84  		"uptime":         fs.newInode(ctx, root, 0444, &uptimeData{}),
    85  		"version":        fs.newInode(ctx, root, 0444, &versionData{}),
    86  	}
    87  	// If fakeCgroupControllers are provided, don't create a cgroupfs backed
    88  	// /proc/cgroup as it will not match the fake controllers.
    89  	if len(fakeCgroupControllers) == 0 {
    90  		contents["cgroups"] = fs.newInode(ctx, root, 0444, &cgroupsData{})
    91  	}
    92  
    93  	inode := &tasksInode{
    94  		pidns:                 pidns,
    95  		fs:                    fs,
    96  		fakeCgroupControllers: fakeCgroupControllers,
    97  	}
    98  	inode.InodeAttrs.Init(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
    99  	inode.InitRefs()
   100  
   101  	inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
   102  	links := inode.OrderedChildren.Populate(contents)
   103  	inode.IncLinks(links)
   104  
   105  	return inode
   106  }
   107  
   108  // Lookup implements kernfs.inodeDirectory.Lookup.
   109  func (i *tasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
   110  	// Check if a static entry was looked up.
   111  	if d, err := i.OrderedChildren.Lookup(ctx, name); err == nil {
   112  		return d, nil
   113  	}
   114  
   115  	// Not a static entry. Try to lookup a corresponding task.
   116  	tid, err := strconv.ParseUint(name, 10, 64)
   117  	if err != nil {
   118  		root := auth.NewRootCredentials(i.pidns.UserNamespace())
   119  		// If it failed to parse, check if it's one of the special handled files.
   120  		switch name {
   121  		case selfName:
   122  			return i.newSelfSymlink(ctx, root), nil
   123  		case threadSelfName:
   124  			return i.newThreadSelfSymlink(ctx, root), nil
   125  		}
   126  		return nil, linuxerr.ENOENT
   127  	}
   128  
   129  	task := i.pidns.TaskWithID(kernel.ThreadID(tid))
   130  	if task == nil {
   131  		return nil, linuxerr.ENOENT
   132  	}
   133  
   134  	return i.fs.newTaskInode(ctx, task, i.pidns, true, i.fakeCgroupControllers)
   135  }
   136  
   137  // IterDirents implements kernfs.inodeDirectory.IterDirents.
   138  func (i *tasksInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) {
   139  	// fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256
   140  	const FIRST_PROCESS_ENTRY = 256
   141  
   142  	// Use maxTaskID to shortcut searches that will result in 0 entries.
   143  	const maxTaskID = kernel.TasksLimit + 1
   144  	if offset >= maxTaskID {
   145  		return offset, nil
   146  	}
   147  
   148  	// According to Linux (fs/proc/base.c:proc_pid_readdir()), process directories
   149  	// start at offset FIRST_PROCESS_ENTRY with '/proc/self', followed by
   150  	// '/proc/thread-self' and then '/proc/[pid]'.
   151  	if offset < FIRST_PROCESS_ENTRY {
   152  		offset = FIRST_PROCESS_ENTRY
   153  	}
   154  
   155  	if offset == FIRST_PROCESS_ENTRY {
   156  		dirent := vfs.Dirent{
   157  			Name:    selfName,
   158  			Type:    linux.DT_LNK,
   159  			Ino:     i.fs.NextIno(),
   160  			NextOff: offset + 1,
   161  		}
   162  		if err := cb.Handle(dirent); err != nil {
   163  			return offset, err
   164  		}
   165  		offset++
   166  	}
   167  	if offset == FIRST_PROCESS_ENTRY+1 {
   168  		dirent := vfs.Dirent{
   169  			Name:    threadSelfName,
   170  			Type:    linux.DT_LNK,
   171  			Ino:     i.fs.NextIno(),
   172  			NextOff: offset + 1,
   173  		}
   174  		if err := cb.Handle(dirent); err != nil {
   175  			return offset, err
   176  		}
   177  		offset++
   178  	}
   179  
   180  	// Collect all tasks that TGIDs are greater than the offset specified. Per
   181  	// Linux we only include in directory listings if it's the leader. But for
   182  	// whatever crazy reason, you can still walk to the given node.
   183  	var tids []int
   184  	startTid := offset - FIRST_PROCESS_ENTRY - 2
   185  	for _, tg := range i.pidns.ThreadGroups() {
   186  		tid := i.pidns.IDOfThreadGroup(tg)
   187  		if int64(tid) < startTid {
   188  			continue
   189  		}
   190  		if leader := tg.Leader(); leader != nil {
   191  			tids = append(tids, int(tid))
   192  		}
   193  	}
   194  
   195  	if len(tids) == 0 {
   196  		return offset, nil
   197  	}
   198  
   199  	sort.Ints(tids)
   200  	for _, tid := range tids {
   201  		dirent := vfs.Dirent{
   202  			Name:    strconv.FormatUint(uint64(tid), 10),
   203  			Type:    linux.DT_DIR,
   204  			Ino:     i.fs.NextIno(),
   205  			NextOff: FIRST_PROCESS_ENTRY + 2 + int64(tid) + 1,
   206  		}
   207  		if err := cb.Handle(dirent); err != nil {
   208  			return offset, err
   209  		}
   210  		offset++
   211  	}
   212  	return maxTaskID, nil
   213  }
   214  
   215  // Open implements kernfs.Inode.Open.
   216  func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
   217  	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
   218  		SeekEnd: kernfs.SeekEndZero,
   219  	})
   220  	if err != nil {
   221  		return nil, err
   222  	}
   223  	return fd.VFSFileDescription(), nil
   224  }
   225  
   226  func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
   227  	stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts)
   228  	if err != nil {
   229  		return linux.Statx{}, err
   230  	}
   231  
   232  	if opts.Mask&linux.STATX_NLINK != 0 {
   233  		// Add dynamic children to link count.
   234  		for _, tg := range i.pidns.ThreadGroups() {
   235  			if leader := tg.Leader(); leader != nil {
   236  				stat.Nlink++
   237  			}
   238  		}
   239  	}
   240  
   241  	return stat, nil
   242  }
   243  
   244  // DecRef implements kernfs.Inode.DecRef.
   245  func (i *tasksInode) DecRef(ctx context.Context) {
   246  	i.tasksInodeRefs.DecRef(func() { i.Destroy(ctx) })
   247  }
   248  
// staticFileSetStat implements a special static file that allows inode
// attributes to be set. This is to support /proc files that are readonly, but
// allow attributes to be set.
//
// The file contents come from the embedded vfs.StaticData.
//
// +stateify savable
type staticFileSetStat struct {
	dynamicBytesFileSetAttr
	vfs.StaticData
}

// Compile-time check that *staticFileSetStat satisfies dynamicInode.
var _ dynamicInode = (*staticFileSetStat)(nil)
   260  
   261  func newStaticFileSetStat(data string) *staticFileSetStat {
   262  	return &staticFileSetStat{StaticData: vfs.StaticData{Data: data}}
   263  }
   264  
   265  func cpuInfoData(k *kernel.Kernel) string {
   266  	features := k.FeatureSet()
   267  	var buf bytes.Buffer
   268  	for i, max := uint(0), k.ApplicationCores(); i < max; i++ {
   269  		features.WriteCPUInfoTo(i, &buf)
   270  	}
   271  	return buf.String()
   272  }
   273  
   274  func ipcData(v uint64) dynamicInode {
   275  	return newStaticFile(strconv.FormatUint(v, 10))
   276  }