github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/fsimpl/proc/task.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package proc
    16  
import (
	"bytes"
	"fmt"
	"sort"

	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
	"github.com/nicocha30/gvisor-ligolo/pkg/context"
	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/kernfs"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/mm"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
)
    30  
// taskInode represents the inode for /proc/PID/ directory.
//
// The struct embeds several kernfs helper types; Valid, Open, SetStat and
// DecRef are defined directly on *taskInode in this file.
//
// +stateify savable
type taskInode struct {
	implStatFS
	kernfs.InodeAttrs
	kernfs.InodeDirectoryNoNewChildren
	kernfs.InodeNotAnonymous
	kernfs.InodeNotSymlink
	kernfs.InodeTemporary
	kernfs.InodeWatches
	kernfs.OrderedChildren
	taskInodeRefs

	// locks is handed to kernfs.NewGenericDirectoryFD whenever the directory
	// is opened.
	locks vfs.FileLocks

	// task is the task this directory describes. Valid checks its exit state
	// to decide whether the inode may still be used.
	task *kernel.Task
}

// Compile-time check that *taskInode satisfies kernfs.Inode.
var _ kernfs.Inode = (*taskInode)(nil)
    51  
// newTaskInode creates the directory inode for task together with all of its
// child entries, and returns it wrapped in a taskOwnedInode.
//
// It returns linuxerr.ESRCH if task's exit state is already
// kernel.TaskExitDead. When isThreadGroup is true a "task" child is added as
// well, and the flag is forwarded to the "io" and "stat" entries. If
// fakeCgroupControllers is non-empty, the "cgroup" entry is built from it by
// newFakeCgroupData; otherwise it is backed by taskCgroupData.
func (fs *filesystem) newTaskInode(ctx context.Context, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, fakeCgroupControllers map[string]string) (kernfs.Inode, error) {
	// Refuse to build a directory for a task that is already dead.
	if task.ExitState() == kernel.TaskExitDead {
		return nil, linuxerr.ESRCH
	}

	// Children of the task directory, keyed by file name. Entries that take an
	// explicit inode number obtain it from fs.NextIno().
	contents := map[string]kernfs.Inode{
		"auxv":      fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &auxvData{task: task}),
		"cmdline":   fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &metadataData{task: task, metaType: Cmdline}),
		"comm":      fs.newComm(ctx, task, fs.NextIno(), 0644),
		"cwd":       fs.newCwdSymlink(ctx, task, fs.NextIno()),
		"environ":   fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &metadataData{task: task, metaType: Environ}),
		"exe":       fs.newExeSymlink(ctx, task, fs.NextIno()),
		"fd":        fs.newFDDirInode(ctx, task),
		"fdinfo":    fs.newFDInfoDirInode(ctx, task),
		"gid_map":   fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}),
		"io":        fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0400, newIO(task, isThreadGroup)),
		"limits":    fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &limitsData{task: task}),
		"maps":      fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mapsData{task: task}),
		"mem":       fs.newMemInode(ctx, task, fs.NextIno(), 0400),
		"mountinfo": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountInfoData{fs: fs, task: task}),
		"mounts":    fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountsData{fs: fs, task: task}),
		"net":       fs.newTaskNetDir(ctx, task),
		"ns": fs.newTaskOwnedDir(ctx, task, fs.NextIno(), 0511, map[string]kernfs.Inode{
			"net":  fs.newNamespaceSymlink(ctx, task, fs.NextIno(), linux.CLONE_NEWNET),
			"pid":  fs.newPIDNamespaceSymlink(ctx, task, fs.NextIno()),
			"user": fs.newFakeNamespaceSymlink(ctx, task, fs.NextIno(), "user"),
		}),
		"oom_score":     fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, newStaticFile("0\n")),
		"oom_score_adj": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0644, &oomScoreAdj{task: task}),
		"root":          fs.newRootSymlink(ctx, task, fs.NextIno()),
		"smaps":         fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &smapsData{task: task}),
		"stat":          fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
		"statm":         fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &statmData{task: task}),
		"status":        fs.newStatusInode(ctx, task, pidns, fs.NextIno(), 0444),
		"uid_map":       fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}),
	}
	// The "task" subdirectory is only added when isThreadGroup is set.
	if isThreadGroup {
		contents["task"] = fs.newSubtasks(ctx, task, pidns, fakeCgroupControllers)
	}
	// "cgroup" is always present; only its backing data differs depending on
	// whether fake controllers were supplied.
	if len(fakeCgroupControllers) > 0 {
		contents["cgroup"] = fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, newFakeCgroupData(fakeCgroupControllers))
	} else {
		contents["cgroup"] = fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &taskCgroupData{task: task})
	}

	// Note that the local variable below shadows the taskInode type name for
	// the remainder of the function.
	taskInode := &taskInode{task: task}
	// Note: credentials are overridden by taskOwnedInode.
	taskInode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
	taskInode.InitRefs()

	// Callers receive the wrapper, not the bare taskInode.
	inode := &taskOwnedInode{Inode: taskInode, owner: task}

	// Install the children and add the link count reported by Populate to
	// this inode.
	taskInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
	links := taskInode.OrderedChildren.Populate(contents)
	taskInode.IncLinks(links)

	return inode, nil
}
   110  
   111  // Valid implements kernfs.Inode.Valid. This inode remains valid as long
   112  // as the task is still running. When it's dead, another tasks with the same
   113  // PID could replace it.
   114  func (i *taskInode) Valid(ctx context.Context) bool {
   115  	return i.task.ExitState() != kernel.TaskExitDead
   116  }
   117  
   118  // Open implements kernfs.Inode.Open.
   119  func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
   120  	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
   121  		SeekEnd: kernfs.SeekEndZero,
   122  	})
   123  	if err != nil {
   124  		return nil, err
   125  	}
   126  	return fd.VFSFileDescription(), nil
   127  }
   128  
   129  // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
   130  func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
   131  	return linuxerr.EPERM
   132  }
   133  
   134  // DecRef implements kernfs.Inode.DecRef.
   135  func (i *taskInode) DecRef(ctx context.Context) {
   136  	i.taskInodeRefs.DecRef(func() { i.Destroy(ctx) })
   137  }
   138  
// taskOwnedInode implements kernfs.Inode and overrides inode owner with task
// effective user and group.
//
// Stat and CheckPermissions derive the owner through getOwner rather than
// from the wrapped inode's own attributes, and Valid additionally requires
// that the owning task is not dead.
//
// +stateify savable
type taskOwnedInode struct {
	// Inode is the wrapped inode; methods not overridden on taskOwnedInode
	// are promoted from it.
	kernfs.Inode

	// owner is the task that owns this inode.
	owner *kernel.Task
}

// Compile-time check that *taskOwnedInode satisfies kernfs.Inode.
var _ kernfs.Inode = (*taskOwnedInode)(nil)
   151  
   152  func (fs *filesystem) newTaskOwnedInode(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) kernfs.Inode {
   153  	// Note: credentials are overridden by taskOwnedInode.
   154  	inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm)
   155  
   156  	return &taskOwnedInode{Inode: inode, owner: task}
   157  }
   158  
   159  func (fs *filesystem) newTaskOwnedDir(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]kernfs.Inode) kernfs.Inode {
   160  	// Note: credentials are overridden by taskOwnedInode.
   161  	fdOpts := kernfs.GenericDirectoryFDOptions{SeekEnd: kernfs.SeekEndZero}
   162  	dir := kernfs.NewStaticDir(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, children, fdOpts)
   163  
   164  	return &taskOwnedInode{Inode: dir, owner: task}
   165  }
   166  
   167  func (i *taskOwnedInode) Valid(ctx context.Context) bool {
   168  	return i.owner.ExitState() != kernel.TaskExitDead && i.Inode.Valid(ctx)
   169  }
   170  
   171  // Stat implements kernfs.Inode.Stat.
   172  func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
   173  	stat, err := i.Inode.Stat(ctx, fs, opts)
   174  	if err != nil {
   175  		return linux.Statx{}, err
   176  	}
   177  	if opts.Mask&(linux.STATX_UID|linux.STATX_GID) != 0 {
   178  		uid, gid := i.getOwner(linux.FileMode(stat.Mode))
   179  		if opts.Mask&linux.STATX_UID != 0 {
   180  			stat.UID = uint32(uid)
   181  		}
   182  		if opts.Mask&linux.STATX_GID != 0 {
   183  			stat.GID = uint32(gid)
   184  		}
   185  	}
   186  	return stat, nil
   187  }
   188  
   189  // CheckPermissions implements kernfs.Inode.CheckPermissions.
   190  func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
   191  	mode := i.Mode()
   192  	uid, gid := i.getOwner(mode)
   193  	return vfs.GenericCheckPermissions(creds, ats, mode, uid, gid)
   194  }
   195  
   196  func (i *taskOwnedInode) getOwner(mode linux.FileMode) (auth.KUID, auth.KGID) {
   197  	// By default, set the task owner as the file owner.
   198  	creds := i.owner.Credentials()
   199  	uid := creds.EffectiveKUID
   200  	gid := creds.EffectiveKGID
   201  
   202  	// Linux doesn't apply dumpability adjustments to world readable/executable
   203  	// directories so that applications can stat /proc/PID to determine the
   204  	// effective UID of a process. See fs/proc/base.c:task_dump_owner.
   205  	if mode.FileType() == linux.ModeDirectory && mode.Permissions() == 0555 {
   206  		return uid, gid
   207  	}
   208  
   209  	// If the task is not dumpable, then root (in the namespace preferred)
   210  	// owns the file.
   211  	m := getMM(i.owner)
   212  	if m == nil {
   213  		return auth.RootKUID, auth.RootKGID
   214  	}
   215  	if m.Dumpability() != mm.UserDumpable {
   216  		uid = auth.RootKUID
   217  		if kuid := creds.UserNamespace.MapToKUID(auth.RootUID); kuid.Ok() {
   218  			uid = kuid
   219  		}
   220  		gid = auth.RootKGID
   221  		if kgid := creds.UserNamespace.MapToKGID(auth.RootGID); kgid.Ok() {
   222  			gid = kgid
   223  		}
   224  	}
   225  	return uid, gid
   226  }
   227  
   228  func newIO(t *kernel.Task, isThreadGroup bool) *ioData {
   229  	if isThreadGroup {
   230  		return &ioData{ioUsage: t.ThreadGroup()}
   231  	}
   232  	return &ioData{ioUsage: t}
   233  }
   234  
   235  // newFakeCgroupData creates an inode that shows fake cgroup
   236  // information passed in as mount options.  From man 7 cgroups: "For
   237  // each cgroup hierarchy of which the process is a member, there is
   238  // one entry containing three colon-separated fields:
   239  // hierarchy-ID:controller-list:cgroup-path"
   240  //
   241  // TODO(b/182488796): Remove once all users adopt cgroupfs.
   242  func newFakeCgroupData(controllers map[string]string) dynamicInode {
   243  	var buf bytes.Buffer
   244  
   245  	// The hierarchy ids must be positive integers (for cgroup v1), but the
   246  	// exact number does not matter, so long as they are unique. We can
   247  	// just use a counter, but since linux sorts this file in descending
   248  	// order, we must count down to preserve this behavior.
   249  	i := len(controllers)
   250  	for name, dir := range controllers {
   251  		fmt.Fprintf(&buf, "%d:%s:%s\n", i, name, dir)
   252  		i--
   253  	}
   254  	return newStaticFile(buf.String())
   255  }