github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/proc/task.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package proc
    16  
import (
	"bytes"
	"fmt"
	"sort"

	"github.com/metacubex/gvisor/pkg/abi/linux"
	"github.com/metacubex/gvisor/pkg/context"
	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/kernfs"
	"github.com/metacubex/gvisor/pkg/sentry/kernel"
	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
	"github.com/metacubex/gvisor/pkg/sentry/mm"
	"github.com/metacubex/gvisor/pkg/sentry/vfs"
)
    30  
// taskInode represents the inode for /proc/PID/ directory.
//
// The directory's entries are populated once, in newTaskInode, and the inode
// reports itself as invalid once the backing task has reached
// kernel.TaskExitDead (see Valid).
//
// +stateify savable
type taskInode struct {
	implStatFS
	kernfs.InodeAttrs
	kernfs.InodeDirectoryNoNewChildren
	kernfs.InodeNotAnonymous
	kernfs.InodeNotSymlink
	kernfs.InodeTemporary
	kernfs.InodeWatches
	kernfs.OrderedChildren
	taskInodeRefs

	// locks is handed to every directory file description created by Open.
	locks vfs.FileLocks

	// task is the task this directory describes; Valid consults its exit
	// state.
	task *kernel.Task
}

// Compile-time check that *taskInode implements kernfs.Inode.
var _ kernfs.Inode = (*taskInode)(nil)
    51  
// newTaskInode builds the /proc/PID (or /proc/PID/task/TID) directory inode
// for task and returns it wrapped in a taskOwnedInode.
//
// It returns linuxerr.ESRCH if task has already reached
// kernel.TaskExitDead.
//
// isThreadGroup selects the thread-group flavour of the directory: it adds a
// "task" subdirectory (instead of a "children" file) and is forwarded to the
// "io" and "stat" entries. If fakeCgroupControllers is non-empty, the
// "cgroup" entry is a static file generated from it; otherwise the entry is
// backed by taskCgroupData.
func (fs *filesystem) newTaskInode(ctx context.Context, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, fakeCgroupControllers map[string]string) (kernfs.Inode, error) {
	if task.ExitState() == kernel.TaskExitDead {
		return nil, linuxerr.ESRCH
	}

	// Entries present in every task directory. Inode numbers are drawn from
	// fs.NextIno() as each entry is constructed.
	contents := map[string]kernfs.Inode{
		"auxv":      fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &auxvData{task: task}),
		"cmdline":   fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &metadataData{task: task, metaType: Cmdline}),
		"comm":      fs.newComm(ctx, task, fs.NextIno(), 0644),
		"cwd":       fs.newCwdSymlink(ctx, task, fs.NextIno()),
		"environ":   fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &metadataData{task: task, metaType: Environ}),
		"exe":       fs.newExeSymlink(ctx, task, fs.NextIno()),
		"fd":        fs.newFDDirInode(ctx, task),
		"fdinfo":    fs.newFDInfoDirInode(ctx, task),
		"gid_map":   fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}),
		"io":        fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0400, newIO(task, isThreadGroup)),
		"limits":    fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &limitsData{task: task}),
		"maps":      fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mapsData{task: task}),
		"mem":       fs.newMemInode(ctx, task, fs.NextIno(), 0400),
		"mountinfo": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountInfoData{fs: fs, task: task}),
		"mounts":    fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountsData{fs: fs, task: task}),
		"net":       fs.newTaskNetDir(ctx, task),
		"ns": fs.newTaskOwnedDir(ctx, task, fs.NextIno(), 0511, map[string]kernfs.Inode{
			"net":  fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWNET),
			"mnt":  fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWNS),
			"pid":  fs.newPIDNamespaceSymlink(ctx, task, fs.NextIno()),
			"user": fs.newFakeNamespaceSymlink(ctx, task, fs.NextIno(), "user"),
			"ipc":  fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWIPC),
			"uts":  fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWUTS),
		}),
		"oom_score":     fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, newStaticFile("0\n")),
		"oom_score_adj": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0644, &oomScoreAdj{task: task}),
		"root":          fs.newRootSymlink(ctx, task, fs.NextIno()),
		"smaps":         fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &smapsData{task: task}),
		"stat":          fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
		"statm":         fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &statmData{task: task}),
		"status":        fs.newStatusInode(ctx, task, pidns, fs.NextIno(), 0444),
		"uid_map":       fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}),
	}
	// Thread-group directories list their threads under "task"; per-thread
	// directories expose a "children" file instead.
	if isThreadGroup {
		contents["task"] = fs.newSubtasks(ctx, task, pidns, fakeCgroupControllers)
	} else {
		contents["children"] = fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0644, &childrenData{task: task, pidns: pidns})
	}
	// Fake cgroup controllers, when supplied, take precedence over the
	// task-backed cgroup data.
	if len(fakeCgroupControllers) > 0 {
		contents["cgroup"] = fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, newFakeCgroupData(fakeCgroupControllers))
	} else {
		contents["cgroup"] = fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &taskCgroupData{task: task})
	}

	taskInode := &taskInode{task: task}
	// Note: credentials are overridden by taskOwnedInode.
	taskInode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
	taskInode.InitRefs()

	inode := &taskOwnedInode{Inode: taskInode, owner: task}

	// Install the entries and account for the links that Populate reports.
	taskInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
	links := taskInode.OrderedChildren.Populate(contents)
	taskInode.IncLinks(links)

	return inode, nil
}
   115  
   116  // Valid implements kernfs.Inode.Valid. This inode remains valid as long
   117  // as the task is still running. When it's dead, another tasks with the same
   118  // PID could replace it.
   119  func (i *taskInode) Valid(ctx context.Context) bool {
   120  	return i.task.ExitState() != kernel.TaskExitDead
   121  }
   122  
   123  // Open implements kernfs.Inode.Open.
   124  func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
   125  	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
   126  		SeekEnd: kernfs.SeekEndZero,
   127  	})
   128  	if err != nil {
   129  		return nil, err
   130  	}
   131  	return fd.VFSFileDescription(), nil
   132  }
   133  
// SetStat implements kernfs.Inode.SetStat. Attributes of a task directory
// cannot be changed: every request is rejected with linuxerr.EPERM and all
// arguments are ignored.
func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
	return linuxerr.EPERM
}
   138  
   139  // DecRef implements kernfs.Inode.DecRef.
   140  func (i *taskInode) DecRef(ctx context.Context) {
   141  	i.taskInodeRefs.DecRef(func() { i.Destroy(ctx) })
   142  }
   143  
// taskOwnedInode implements kernfs.Inode and overrides inode owner with task
// effective user and group.
//
// It wraps another kernfs.Inode and replaces Valid, Stat and
// CheckPermissions so that validity and ownership are derived from the owning
// task rather than from the credentials the wrapped inode was initialized
// with.
//
// +stateify savable
type taskOwnedInode struct {
	// Inode is the wrapped inode; methods not overridden here are promoted
	// from it.
	kernfs.Inode

	// owner is the task that owns this inode.
	owner *kernel.Task
}

// Compile-time check that *taskOwnedInode implements kernfs.Inode.
var _ kernfs.Inode = (*taskOwnedInode)(nil)
   156  
   157  func (fs *filesystem) newTaskOwnedInode(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) kernfs.Inode {
   158  	// Note: credentials are overridden by taskOwnedInode.
   159  	inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm)
   160  
   161  	return &taskOwnedInode{Inode: inode, owner: task}
   162  }
   163  
   164  func (fs *filesystem) newTaskOwnedDir(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]kernfs.Inode) kernfs.Inode {
   165  	// Note: credentials are overridden by taskOwnedInode.
   166  	fdOpts := kernfs.GenericDirectoryFDOptions{SeekEnd: kernfs.SeekEndZero}
   167  	dir := kernfs.NewStaticDir(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, children, fdOpts)
   168  
   169  	return &taskOwnedInode{Inode: dir, owner: task}
   170  }
   171  
   172  func (i *taskOwnedInode) Valid(ctx context.Context) bool {
   173  	return i.owner.ExitState() != kernel.TaskExitDead && i.Inode.Valid(ctx)
   174  }
   175  
   176  // Stat implements kernfs.Inode.Stat.
   177  func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
   178  	stat, err := i.Inode.Stat(ctx, fs, opts)
   179  	if err != nil {
   180  		return linux.Statx{}, err
   181  	}
   182  	if opts.Mask&(linux.STATX_UID|linux.STATX_GID) != 0 {
   183  		uid, gid := i.getOwner(linux.FileMode(stat.Mode))
   184  		if opts.Mask&linux.STATX_UID != 0 {
   185  			stat.UID = uint32(uid)
   186  		}
   187  		if opts.Mask&linux.STATX_GID != 0 {
   188  			stat.GID = uint32(gid)
   189  		}
   190  	}
   191  	return stat, nil
   192  }
   193  
   194  // CheckPermissions implements kernfs.Inode.CheckPermissions.
   195  func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
   196  	mode := i.Mode()
   197  	uid, gid := i.getOwner(mode)
   198  	return vfs.GenericCheckPermissions(creds, ats, mode, uid, gid)
   199  }
   200  
   201  func (i *taskOwnedInode) getOwner(mode linux.FileMode) (auth.KUID, auth.KGID) {
   202  	// By default, set the task owner as the file owner.
   203  	creds := i.owner.Credentials()
   204  	uid := creds.EffectiveKUID
   205  	gid := creds.EffectiveKGID
   206  
   207  	// Linux doesn't apply dumpability adjustments to world readable/executable
   208  	// directories so that applications can stat /proc/PID to determine the
   209  	// effective UID of a process. See fs/proc/base.c:task_dump_owner.
   210  	if mode.FileType() == linux.ModeDirectory && mode.Permissions() == 0555 {
   211  		return uid, gid
   212  	}
   213  
   214  	// If the task is not dumpable, then root (in the namespace preferred)
   215  	// owns the file.
   216  	m := getMM(i.owner)
   217  	if m == nil {
   218  		return auth.RootKUID, auth.RootKGID
   219  	}
   220  	if m.Dumpability() != mm.UserDumpable {
   221  		uid = auth.RootKUID
   222  		if kuid := creds.UserNamespace.MapToKUID(auth.RootUID); kuid.Ok() {
   223  			uid = kuid
   224  		}
   225  		gid = auth.RootKGID
   226  		if kgid := creds.UserNamespace.MapToKGID(auth.RootGID); kgid.Ok() {
   227  			gid = kgid
   228  		}
   229  	}
   230  	return uid, gid
   231  }
   232  
   233  func newIO(t *kernel.Task, isThreadGroup bool) *ioData {
   234  	if isThreadGroup {
   235  		return &ioData{ioUsage: t.ThreadGroup()}
   236  	}
   237  	return &ioData{ioUsage: t}
   238  }
   239  
   240  // newFakeCgroupData creates an inode that shows fake cgroup
   241  // information passed in as mount options.  From man 7 cgroups: "For
   242  // each cgroup hierarchy of which the process is a member, there is
   243  // one entry containing three colon-separated fields:
   244  // hierarchy-ID:controller-list:cgroup-path"
   245  //
   246  // TODO(b/182488796): Remove once all users adopt cgroupfs.
   247  func newFakeCgroupData(controllers map[string]string) dynamicInode {
   248  	var buf bytes.Buffer
   249  
   250  	// The hierarchy ids must be positive integers (for cgroup v1), but the
   251  	// exact number does not matter, so long as they are unique. We can
   252  	// just use a counter, but since linux sorts this file in descending
   253  	// order, we must count down to preserve this behavior.
   254  	i := len(controllers)
   255  	for name, dir := range controllers {
   256  		fmt.Fprintf(&buf, "%d:%s:%s\n", i, name, dir)
   257  		i--
   258  	}
   259  	return newStaticFile(buf.String())
   260  }