github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/fsimpl/proc/task_files.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package proc
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"io"
    21  	"sort"
    22  	"strconv"
    23  	"strings"
    24  
    25  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    26  	"github.com/MerlinKodo/gvisor/pkg/context"
    27  	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
    28  	"github.com/MerlinKodo/gvisor/pkg/hostarch"
    29  	"github.com/MerlinKodo/gvisor/pkg/safemem"
    30  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/kernfs"
    31  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/nsfs"
    32  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel"
    33  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
    34  	"github.com/MerlinKodo/gvisor/pkg/sentry/limits"
    35  	"github.com/MerlinKodo/gvisor/pkg/sentry/mm"
    36  	"github.com/MerlinKodo/gvisor/pkg/sentry/usage"
    37  	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
    38  	"github.com/MerlinKodo/gvisor/pkg/sync"
    39  	"github.com/MerlinKodo/gvisor/pkg/usermem"
    40  )
    41  
// maxIDMapLines is the maximum number of lines accepted by a single write to
// /proc/[pid]/{gid_map|uid_map}.
//
// "There is an (arbitrary) limit on the number of lines in the file. As at
// Linux 3.18, the limit is five lines." - user_namespaces(7)
const maxIDMapLines = 5
    45  
    46  // getMM gets the kernel task's MemoryManager. No additional reference is taken on
    47  // mm here. This is safe because MemoryManager.destroy is required to leave the
    48  // MemoryManager in a state where it's still usable as a DynamicBytesSource.
    49  func getMM(task *kernel.Task) *mm.MemoryManager {
    50  	var tmm *mm.MemoryManager
    51  	task.WithMuLocked(func(t *kernel.Task) {
    52  		if mm := t.MemoryManager(); mm != nil {
    53  			tmm = mm
    54  		}
    55  	})
    56  	return tmm
    57  }
    58  
    59  // getMMIncRef returns t's MemoryManager. If getMMIncRef succeeds, the
    60  // MemoryManager's users count is incremented, and must be decremented by the
    61  // caller when it is no longer in use.
    62  func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) {
    63  	var m *mm.MemoryManager
    64  	task.WithMuLocked(func(t *kernel.Task) {
    65  		m = t.MemoryManager()
    66  	})
    67  	if m == nil || !m.IncUsers() {
    68  		return nil, io.EOF
    69  	}
    70  	return m, nil
    71  }
    72  
    73  func checkTaskState(t *kernel.Task) error {
    74  	switch t.ExitState() {
    75  	case kernel.TaskExitZombie:
    76  		return linuxerr.EACCES
    77  	case kernel.TaskExitDead:
    78  		return linuxerr.ESRCH
    79  	}
    80  	return nil
    81  }
    82  
// bufferWriter appends every block handed to WriteFromBlocks to buf.
type bufferWriter struct {
	// buf receives the written bytes.
	buf *bytes.Buffer
}
    86  
    87  // WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns
    88  // the number of bytes written. It may return a partial write without an
    89  // error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not
    90  // return a full write with an error (i.e. srcs.NumBytes(), err) where err
    91  // != nil).
    92  func (w *bufferWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
    93  	written := srcs.NumBytes()
    94  	for !srcs.IsEmpty() {
    95  		w.buf.Write(srcs.Head().ToSlice())
    96  		srcs = srcs.Tail()
    97  	}
    98  	return written, nil
    99  }
   100  
// auxvData implements vfs.DynamicBytesSource for /proc/[pid]/auxv.
//
// +stateify savable
type auxvData struct {
	kernfs.DynamicBytesFile

	// task is the task whose auxiliary vector is reported.
	task *kernel.Task
}

// Compile-time check that *auxvData satisfies dynamicInode.
var _ dynamicInode = (*auxvData)(nil)
   111  
   112  // Generate implements vfs.DynamicBytesSource.Generate.
   113  func (d *auxvData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   114  	if d.task.ExitState() == kernel.TaskExitDead {
   115  		return linuxerr.ESRCH
   116  	}
   117  	m, err := getMMIncRef(d.task)
   118  	if err != nil {
   119  		// Return empty file.
   120  		return nil
   121  	}
   122  	defer m.DecUsers(ctx)
   123  
   124  	auxv := m.Auxv()
   125  	// Space for buffer with AT_NULL (0) terminator at the end.
   126  	buf.Grow((len(auxv) + 1) * 16)
   127  	for _, e := range auxv {
   128  		var tmp [16]byte
   129  		hostarch.ByteOrder.PutUint64(tmp[:8], e.Key)
   130  		hostarch.ByteOrder.PutUint64(tmp[8:], uint64(e.Value))
   131  		buf.Write(tmp[:])
   132  	}
   133  	var atNull [16]byte
   134  	buf.Write(atNull[:])
   135  
   136  	return nil
   137  }
   138  
// MetadataType enumerates the types of metadata that are exposed through
// proc.
type MetadataType int

const (
	// Cmdline represents /proc/[pid]/cmdline.
	Cmdline MetadataType = iota

	// Environ represents /proc/[pid]/environ.
	Environ
)
   149  
   150  // GetMetadata fetches the process's metadata of type t and writes it into
   151  // buf. The process is identified by mm.
   152  func GetMetadata(ctx context.Context, mm *mm.MemoryManager, buf *bytes.Buffer, t MetadataType) error {
   153  	// Figure out the bounds of the exec arg we are trying to read.
   154  	var ar hostarch.AddrRange
   155  	switch t {
   156  	case Cmdline:
   157  		ar = hostarch.AddrRange{
   158  			Start: mm.ArgvStart(),
   159  			End:   mm.ArgvEnd(),
   160  		}
   161  	case Environ:
   162  		ar = hostarch.AddrRange{
   163  			Start: mm.EnvvStart(),
   164  			End:   mm.EnvvEnd(),
   165  		}
   166  	default:
   167  		panic(fmt.Sprintf("unknown exec arg type %v", t))
   168  	}
   169  	if ar.Start == 0 || ar.End == 0 {
   170  		// Don't attempt to read before the start/end are set up.
   171  		return io.EOF
   172  	}
   173  
   174  	// N.B. Technically this should be usermem.IOOpts.IgnorePermissions = true
   175  	// until Linux 4.9 (272ddc8b3735 "proc: don't use FOLL_FORCE for reading
   176  	// cmdline and environment").
   177  	writer := &bufferWriter{buf: buf}
   178  	if n, err := mm.CopyInTo(ctx, hostarch.AddrRangeSeqOf(ar), writer, usermem.IOOpts{}); n == 0 || err != nil {
   179  		// Nothing to copy or something went wrong.
   180  		return err
   181  	}
   182  
   183  	// On Linux, if the NULL byte at the end of the argument vector has been
   184  	// overwritten, it continues reading the environment vector as part of
   185  	// the argument vector.
   186  	if t == Cmdline && buf.Bytes()[buf.Len()-1] != 0 {
   187  		if end := bytes.IndexByte(buf.Bytes(), 0); end != -1 {
   188  			// If we found a NULL character somewhere else in argv, truncate the
   189  			// return up to the NULL terminator (including it).
   190  			buf.Truncate(end)
   191  			return nil
   192  		}
   193  
   194  		// There is no NULL terminator in the string, return into envp.
   195  		arEnvv := hostarch.AddrRange{
   196  			Start: mm.EnvvStart(),
   197  			End:   mm.EnvvEnd(),
   198  		}
   199  
   200  		// Upstream limits the returned amount to one page of slop.
   201  		// https://elixir.bootlin.com/linux/v4.20/source/fs/proc/base.c#L208
   202  		// we'll return one page total between argv and envp because of the
   203  		// above page restrictions.
   204  		if buf.Len() >= hostarch.PageSize {
   205  			// Returned at least one page already, nothing else to add.
   206  			return nil
   207  		}
   208  		remaining := hostarch.PageSize - buf.Len()
   209  		if int(arEnvv.Length()) > remaining {
   210  			end, ok := arEnvv.Start.AddLength(uint64(remaining))
   211  			if !ok {
   212  				return linuxerr.EFAULT
   213  			}
   214  			arEnvv.End = end
   215  		}
   216  		if _, err := mm.CopyInTo(ctx, hostarch.AddrRangeSeqOf(arEnvv), writer, usermem.IOOpts{}); err != nil {
   217  			return err
   218  		}
   219  
   220  		// Linux will return envp up to and including the first NULL character,
   221  		// so find it.
   222  		envStart := int(ar.Length())
   223  		if nullIdx := bytes.IndexByte(buf.Bytes()[envStart:], 0); nullIdx != -1 {
   224  			buf.Truncate(envStart + nullIdx)
   225  		}
   226  	}
   227  
   228  	return nil
   229  }
   230  
// metadataData implements vfs.DynamicBytesSource for proc metadata fields like:
//
//   - /proc/[pid]/cmdline
//   - /proc/[pid]/environ
//
// +stateify savable
type metadataData struct {
	kernfs.DynamicBytesFile

	// task is the task whose metadata is reported.
	task *kernel.Task

	// metaType is the type of metadata this file contains.
	metaType MetadataType
}

// Compile-time check that *metadataData satisfies dynamicInode.
var _ dynamicInode = (*metadataData)(nil)
   247  
   248  // Generate implements vfs.DynamicBytesSource.Generate.
   249  func (d *metadataData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   250  	if d.task.ExitState() == kernel.TaskExitDead {
   251  		return linuxerr.ESRCH
   252  	}
   253  	m, err := getMMIncRef(d.task)
   254  	if err != nil {
   255  		// Return empty file.
   256  		return nil
   257  	}
   258  	defer m.DecUsers(ctx)
   259  	return GetMetadata(ctx, m, buf, d.metaType)
   260  }
   261  
// commInode is the inode for /proc/[pid]/comm. It wraps a
// kernfs.DynamicBytesFile and overrides CheckPermissions.
//
// +stateify savable
type commInode struct {
	kernfs.DynamicBytesFile

	// task is the task whose name this file exposes.
	task *kernel.Task
}
   268  
   269  func (fs *filesystem) newComm(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode {
   270  	inode := &commInode{task: task}
   271  	inode.DynamicBytesFile.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm)
   272  	return inode
   273  }
   274  
   275  func (i *commInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
   276  	// This file can always be read or written by members of the same thread
   277  	// group. See fs/proc/base.c:proc_tid_comm_permission.
   278  	t := kernel.TaskFromContext(ctx)
   279  	if t != nil && t.ThreadGroup() == i.task.ThreadGroup() && !ats.MayExec() {
   280  		return nil
   281  	}
   282  
   283  	return i.DynamicBytesFile.CheckPermissions(ctx, creds, ats)
   284  }
   285  
// commData implements vfs.WritableDynamicBytesSource for /proc/[pid]/comm.
//
// +stateify savable
type commData struct {
	kernfs.DynamicBytesFile

	// task is the task whose name is read and written through this file.
	task *kernel.Task
}

// Compile-time checks that *commData satisfies the interfaces it is used as.
var _ dynamicInode = (*commData)(nil)
var _ vfs.WritableDynamicBytesSource = (*commData)(nil)
   297  
   298  // Generate implements vfs.DynamicBytesSource.Generate.
   299  func (d *commData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   300  	buf.WriteString(d.task.Name())
   301  	buf.WriteString("\n")
   302  	return nil
   303  }
   304  
   305  // Write implements vfs.WritableDynamicBytesSource.Write.
   306  func (d *commData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
   307  	srclen := src.NumBytes()
   308  	name := make([]byte, srclen)
   309  	if _, err := src.CopyIn(ctx, name); err != nil {
   310  		return 0, err
   311  	}
   312  
   313  	// Only allow writes from the same thread group, otherwise return
   314  	// EINVAL. See fs/proc/base.c:comm_write.
   315  	//
   316  	// Note that this check exists in addition to the same-thread-group
   317  	// check in CheckPermissions.
   318  	t := kernel.TaskFromContext(ctx)
   319  	if t == nil || t.ThreadGroup() != d.task.ThreadGroup() {
   320  		return 0, linuxerr.EINVAL
   321  	}
   322  	d.task.SetName(string(name))
   323  	return int64(srclen), nil
   324  }
   325  
// idMapData implements vfs.WritableDynamicBytesSource for
// /proc/[pid]/{gid_map|uid_map}.
//
// +stateify savable
type idMapData struct {
	kernfs.DynamicBytesFile

	// task is the task whose user namespace mappings are exposed.
	task *kernel.Task
	// gids selects the GID map when true and the UID map when false.
	gids bool
}

// Compile-time checks that *idMapData satisfies the interfaces it is used as.
var _ dynamicInode = (*idMapData)(nil)
var _ vfs.WritableDynamicBytesSource = (*idMapData)(nil)
   339  
   340  // Generate implements vfs.WritableDynamicBytesSource.Generate.
   341  func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   342  	var entries []auth.IDMapEntry
   343  	if d.gids {
   344  		entries = d.task.UserNamespace().GIDMap()
   345  	} else {
   346  		entries = d.task.UserNamespace().UIDMap()
   347  	}
   348  	for _, e := range entries {
   349  		fmt.Fprintf(buf, "%10d %10d %10d\n", e.FirstID, e.FirstParentID, e.Length)
   350  	}
   351  	return nil
   352  }
   353  
   354  // Write implements vfs.WritableDynamicBytesSource.Write.
   355  func (d *idMapData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
   356  	// "In addition, the number of bytes written to the file must be less than
   357  	// the system page size, and the write must be performed at the start of
   358  	// the file ..." - user_namespaces(7)
   359  	srclen := src.NumBytes()
   360  	if srclen >= hostarch.PageSize || offset != 0 {
   361  		return 0, linuxerr.EINVAL
   362  	}
   363  	b := make([]byte, srclen)
   364  	if _, err := src.CopyIn(ctx, b); err != nil {
   365  		return 0, err
   366  	}
   367  
   368  	// Truncate from the first NULL byte.
   369  	var nul int64
   370  	nul = int64(bytes.IndexByte(b, 0))
   371  	if nul == -1 {
   372  		nul = srclen
   373  	}
   374  	b = b[:nul]
   375  	// Remove the last \n.
   376  	if nul >= 1 && b[nul-1] == '\n' {
   377  		b = b[:nul-1]
   378  	}
   379  	lines := bytes.SplitN(b, []byte("\n"), maxIDMapLines+1)
   380  	if len(lines) > maxIDMapLines {
   381  		return 0, linuxerr.EINVAL
   382  	}
   383  
   384  	entries := make([]auth.IDMapEntry, len(lines))
   385  	for i, l := range lines {
   386  		var e auth.IDMapEntry
   387  		_, err := fmt.Sscan(string(l), &e.FirstID, &e.FirstParentID, &e.Length)
   388  		if err != nil {
   389  			return 0, linuxerr.EINVAL
   390  		}
   391  		entries[i] = e
   392  	}
   393  	var err error
   394  	if d.gids {
   395  		err = d.task.UserNamespace().SetGIDMap(ctx, entries)
   396  	} else {
   397  		err = d.task.UserNamespace().SetUIDMap(ctx, entries)
   398  	}
   399  	if err != nil {
   400  		return 0, err
   401  	}
   402  
   403  	// On success, Linux's kernel/user_namespace.c:map_write() always returns
   404  	// count, even if fewer bytes were used.
   405  	return int64(srclen), nil
   406  }
   407  
// Compile-time check that *memInode satisfies kernfs.Inode.
var _ kernfs.Inode = (*memInode)(nil)

// memInode implements kernfs.Inode for /proc/[pid]/mem.
//
// +stateify savable
type memInode struct {
	kernfs.InodeAttrs
	kernfs.InodeNoStatFS
	kernfs.InodeNoopRefCount
	kernfs.InodeNotAnonymous
	kernfs.InodeNotDirectory
	kernfs.InodeNotSymlink
	kernfs.InodeWatches

	// task is the task whose memory is exposed through this file.
	task *kernel.Task
	// locks is the file lock state handed to each memFD opened on this
	// inode.
	locks vfs.FileLocks
}
   425  
   426  func (fs *filesystem) newMemInode(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode {
   427  	// Note: credentials are overridden by taskOwnedInode.
   428  	inode := &memInode{task: task}
   429  	inode.init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm)
   430  	return &taskOwnedInode{Inode: inode, owner: task}
   431  }
   432  
   433  func (f *memInode) init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
   434  	if perm&^linux.PermissionsMask != 0 {
   435  		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
   436  	}
   437  	f.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
   438  }
   439  
   440  // Open implements kernfs.Inode.Open.
   441  func (f *memInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
   442  	// TODO(gvisor.dev/issue/260): Add check for PTRACE_MODE_ATTACH_FSCREDS
   443  	// Permission to read this file is governed by PTRACE_MODE_ATTACH_FSCREDS
   444  	// Since we dont implement setfsuid/setfsgid we can just use PTRACE_MODE_ATTACH
   445  	if !kernel.ContextCanTrace(ctx, f.task, true) {
   446  		return nil, linuxerr.EACCES
   447  	}
   448  	if err := checkTaskState(f.task); err != nil {
   449  		return nil, err
   450  	}
   451  	fd := &memFD{}
   452  	if err := fd.Init(rp.Mount(), d, f, opts.Flags); err != nil {
   453  		return nil, err
   454  	}
   455  	return &fd.vfsfd, nil
   456  }
   457  
   458  // SetStat implements kernfs.Inode.SetStat.
   459  func (*memInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
   460  	return linuxerr.EPERM
   461  }
   462  
// Compile-time check that *memFD satisfies vfs.FileDescriptionImpl.
var _ vfs.FileDescriptionImpl = (*memFD)(nil)

// memFD implements vfs.FileDescriptionImpl for /proc/[pid]/mem.
//
// +stateify savable
type memFD struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.LockFD

	// inode is the memInode this file description was opened on.
	inode *memInode

	// mu guards the fields below.
	mu sync.Mutex `state:"nosave"`
	// offset is the current file position, used by Read and Seek.
	offset int64
}
   479  
   480  // Init initializes memFD.
   481  func (fd *memFD) Init(m *vfs.Mount, d *kernfs.Dentry, inode *memInode, flags uint32) error {
   482  	fd.LockFD.Init(&inode.locks)
   483  	if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
   484  		return err
   485  	}
   486  	fd.inode = inode
   487  	return nil
   488  }
   489  
   490  // Seek implements vfs.FileDescriptionImpl.Seek.
   491  func (fd *memFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
   492  	fd.mu.Lock()
   493  	defer fd.mu.Unlock()
   494  	switch whence {
   495  	case linux.SEEK_SET:
   496  	case linux.SEEK_CUR:
   497  		offset += fd.offset
   498  	default:
   499  		return 0, linuxerr.EINVAL
   500  	}
   501  	if offset < 0 {
   502  		return 0, linuxerr.EINVAL
   503  	}
   504  	fd.offset = offset
   505  	return offset, nil
   506  }
   507  
   508  // PRead implements vfs.FileDescriptionImpl.PRead.
   509  func (fd *memFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
   510  	if dst.NumBytes() == 0 {
   511  		return 0, nil
   512  	}
   513  	m, err := getMMIncRef(fd.inode.task)
   514  	if err != nil {
   515  		return 0, err
   516  	}
   517  	defer m.DecUsers(ctx)
   518  	// Buffer the read data because of MM locks
   519  	buf := make([]byte, dst.NumBytes())
   520  	n, readErr := m.CopyIn(ctx, hostarch.Addr(offset), buf, usermem.IOOpts{IgnorePermissions: true})
   521  	if n > 0 {
   522  		if _, err := dst.CopyOut(ctx, buf[:n]); err != nil {
   523  			return 0, linuxerr.EFAULT
   524  		}
   525  		return int64(n), nil
   526  	}
   527  	if readErr != nil {
   528  		return 0, linuxerr.EIO
   529  	}
   530  	return 0, nil
   531  }
   532  
   533  // Read implements vfs.FileDescriptionImpl.Read.
   534  func (fd *memFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
   535  	fd.mu.Lock()
   536  	n, err := fd.PRead(ctx, dst, fd.offset, opts)
   537  	fd.offset += n
   538  	fd.mu.Unlock()
   539  	return n, err
   540  }
   541  
   542  // Stat implements vfs.FileDescriptionImpl.Stat.
   543  func (fd *memFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
   544  	fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
   545  	return fd.inode.Stat(ctx, fs, opts)
   546  }
   547  
   548  // SetStat implements vfs.FileDescriptionImpl.SetStat.
   549  func (fd *memFD) SetStat(context.Context, vfs.SetStatOptions) error {
   550  	return linuxerr.EPERM
   551  }
   552  
   553  // Release implements vfs.FileDescriptionImpl.Release.
   554  func (fd *memFD) Release(context.Context) {}
   555  
// limitsData implements vfs.DynamicBytesSource for /proc/[pid]/limits.
//
// +stateify savable
type limitsData struct {
	kernfs.DynamicBytesFile

	// task is the task whose resource limits are reported.
	task *kernel.Task
}
   564  
   565  func (d *limitsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   566  	taskLimits := d.task.Limits()
   567  	// formatting matches the kernel output from linux/fs/proc/base.c:proc_pid_limits()
   568  	fmt.Fprintf(buf, "Limit                     Soft Limit           Hard Limit           Units     \n")
   569  	for _, lt := range limits.AllLimitTypes {
   570  		fmt.Fprintf(buf, "%-25s ", lt.Name())
   571  
   572  		l := taskLimits.Get(lt)
   573  		if l.Cur == limits.Infinity {
   574  			fmt.Fprintf(buf, "%-20s ", "unlimited")
   575  		} else {
   576  			fmt.Fprintf(buf, "%-20d ", l.Cur)
   577  		}
   578  
   579  		if l.Max == limits.Infinity {
   580  			fmt.Fprintf(buf, "%-20s ", "unlimited")
   581  		} else {
   582  			fmt.Fprintf(buf, "%-20d ", l.Max)
   583  		}
   584  
   585  		if u := lt.Unit(); u != "" {
   586  			fmt.Fprintf(buf, "%-10s", u)
   587  		}
   588  
   589  		buf.WriteByte('\n')
   590  	}
   591  	return nil
   592  }
   593  
// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps.
//
// +stateify savable
type mapsData struct {
	kernfs.DynamicBytesFile

	// task is the task whose memory mappings are reported.
	task *kernel.Task
}

// Compile-time check that *mapsData satisfies dynamicInode.
var _ dynamicInode = (*mapsData)(nil)
   604  
   605  // Generate implements vfs.DynamicBytesSource.Generate.
   606  func (d *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   607  	if mm := getMM(d.task); mm != nil {
   608  		mm.ReadMapsDataInto(ctx, mm.MapsCallbackFuncForBuffer(buf))
   609  	}
   610  	return nil
   611  }
   612  
// smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps.
//
// +stateify savable
type smapsData struct {
	kernfs.DynamicBytesFile

	// task is the task whose memory mappings are reported.
	task *kernel.Task
}

// Compile-time check that *smapsData satisfies dynamicInode.
var _ dynamicInode = (*smapsData)(nil)
   623  
   624  // Generate implements vfs.DynamicBytesSource.Generate.
   625  func (d *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   626  	if mm := getMM(d.task); mm != nil {
   627  		mm.ReadSmapsDataInto(ctx, buf)
   628  	}
   629  	return nil
   630  }
   631  
// taskStatData implements vfs.DynamicBytesSource for a task's stat file.
//
// +stateify savable
type taskStatData struct {
	kernfs.DynamicBytesFile

	// task is the task whose statistics are reported.
	task *kernel.Task

	// If tgstats is true, accumulate fault stats (not implemented) and CPU
	// time across all tasks in t's thread group.
	tgstats bool

	// pidns is the PID namespace associated with the proc filesystem that
	// includes the file using this statData.
	pidns *kernel.PIDNamespace
}

// Compile-time check that *taskStatData satisfies dynamicInode.
var _ dynamicInode = (*taskStatData)(nil)
   648  
   649  // Generate implements vfs.DynamicBytesSource.Generate.
   650  func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   651  	fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.task))
   652  	fmt.Fprintf(buf, "(%s) ", s.task.Name())
   653  	fmt.Fprintf(buf, "%c ", s.task.StateStatus()[0])
   654  	ppid := kernel.ThreadID(0)
   655  	if parent := s.task.Parent(); parent != nil {
   656  		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
   657  	}
   658  	fmt.Fprintf(buf, "%d ", ppid)
   659  	fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.task.ThreadGroup().ProcessGroup()))
   660  	fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.task.ThreadGroup().Session()))
   661  	fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */)
   662  	fmt.Fprintf(buf, "0 " /* flags */)
   663  	fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */)
   664  	var cputime usage.CPUStats
   665  	if s.tgstats {
   666  		cputime = s.task.ThreadGroup().CPUStats()
   667  	} else {
   668  		cputime = s.task.CPUStats()
   669  	}
   670  	fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
   671  	cputime = s.task.ThreadGroup().JoinedChildCPUStats()
   672  	fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
   673  	fmt.Fprintf(buf, "%d %d ", s.task.Priority(), s.task.Niceness())
   674  	fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Count())
   675  
   676  	// itrealvalue. Since kernel 2.6.17, this field is no longer
   677  	// maintained, and is hard coded as 0.
   678  	fmt.Fprintf(buf, "0 ")
   679  
   680  	// Start time is relative to boot time, expressed in clock ticks.
   681  	fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.task.StartTime().Sub(s.task.Kernel().Timekeeper().BootTime())))
   682  
   683  	var vss, rss uint64
   684  	if mm := getMM(s.task); mm != nil {
   685  		vss = mm.VirtualMemorySize()
   686  		rss = mm.ResidentSetSize()
   687  	}
   688  	fmt.Fprintf(buf, "%d %d ", vss, rss/hostarch.PageSize)
   689  
   690  	// rsslim.
   691  	fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Limits().Get(limits.Rss).Cur)
   692  
   693  	fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */)
   694  	fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */)
   695  	fmt.Fprintf(buf, "0 0 " /* nswap cnswap */)
   696  	terminationSignal := linux.Signal(0)
   697  	if s.task == s.task.ThreadGroup().Leader() {
   698  		terminationSignal = s.task.ThreadGroup().TerminationSignal()
   699  	}
   700  	fmt.Fprintf(buf, "%d ", terminationSignal)
   701  	fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */)
   702  	fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */)
   703  	fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */)
   704  	fmt.Fprintf(buf, "0\n" /* exit_code */)
   705  
   706  	return nil
   707  }
   708  
// statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm.
//
// +stateify savable
type statmData struct {
	kernfs.DynamicBytesFile

	// task is the task whose memory usage is reported.
	task *kernel.Task
}

// Compile-time check that *statmData satisfies dynamicInode.
var _ dynamicInode = (*statmData)(nil)
   719  
   720  // Generate implements vfs.DynamicBytesSource.Generate.
   721  func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   722  	var vss, rss uint64
   723  	if mm := getMM(s.task); mm != nil {
   724  		vss = mm.VirtualMemorySize()
   725  		rss = mm.ResidentSetSize()
   726  	}
   727  	fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/hostarch.PageSize, rss/hostarch.PageSize)
   728  	return nil
   729  }
   730  
// statusInode implements kernfs.Inode for /proc/[pid]/status.
//
// +stateify savable
type statusInode struct {
	kernfs.InodeAttrs
	kernfs.InodeNoStatFS
	kernfs.InodeNoopRefCount
	kernfs.InodeNotAnonymous
	kernfs.InodeNotDirectory
	kernfs.InodeNotSymlink
	kernfs.InodeWatches

	// task is the task whose status is reported.
	task *kernel.Task
	// pidns is the PID namespace passed to each statusFD opened on this
	// inode.
	pidns *kernel.PIDNamespace
	// locks is the file lock state handed to each statusFD opened on this
	// inode.
	locks vfs.FileLocks
}
   747  
// statusFD implements vfs.FileDescriptionImpl and vfs.DynamicBytesSource for
// /proc/[pid]/status.
//
// +stateify savable
type statusFD struct {
	statusFDLowerBase
	vfs.DynamicBytesFileDescriptionImpl
	vfs.LockFD

	vfsfd vfs.FileDescription

	// inode is the statusInode this file description was opened on.
	inode *statusInode
	// task and pidns are copied from inode when the file is opened.
	task  *kernel.Task
	pidns *kernel.PIDNamespace
	// userns is taken from the opener's credentials.
	userns *auth.UserNamespace // equivalent to struct file::f_cred::user_ns
}
   764  
// statusFDLowerBase is a dumb hack to ensure that statusFD prefers
// vfs.DynamicBytesFileDescriptionImpl methods to vfs.FileDescriptionDefaultImpl
// methods: embedding the default implementation one level deeper gives its
// methods lower priority in statusFD's method set.
//
// +stateify savable
type statusFDLowerBase struct {
	vfs.FileDescriptionDefaultImpl
}
   773  
   774  func (fs *filesystem) newStatusInode(ctx context.Context, task *kernel.Task, pidns *kernel.PIDNamespace, ino uint64, perm linux.FileMode) kernfs.Inode {
   775  	// Note: credentials are overridden by taskOwnedInode.
   776  	inode := &statusInode{
   777  		task:  task,
   778  		pidns: pidns,
   779  	}
   780  	inode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeRegular|perm)
   781  	return &taskOwnedInode{Inode: inode, owner: task}
   782  }
   783  
   784  // Open implements kernfs.Inode.Open.
   785  func (s *statusInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
   786  	fd := &statusFD{
   787  		inode:  s,
   788  		task:   s.task,
   789  		pidns:  s.pidns,
   790  		userns: rp.Credentials().UserNamespace,
   791  	}
   792  	fd.LockFD.Init(&s.locks)
   793  	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
   794  		return nil, err
   795  	}
   796  	fd.DynamicBytesFileDescriptionImpl.Init(&fd.vfsfd, fd)
   797  	return &fd.vfsfd, nil
   798  }
   799  
   800  // SetStat implements kernfs.Inode.SetStat.
   801  func (*statusInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
   802  	return linuxerr.EPERM
   803  }
   804  
   805  // Release implements vfs.FileDescriptionImpl.Release.
   806  func (s *statusFD) Release(ctx context.Context) {
   807  }
   808  
   809  // Stat implements vfs.FileDescriptionImpl.Stat.
   810  func (s *statusFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
   811  	fs := s.vfsfd.VirtualDentry().Mount().Filesystem()
   812  	return s.inode.Stat(ctx, fs, opts)
   813  }
   814  
   815  // SetStat implements vfs.FileDescriptionImpl.SetStat.
   816  func (s *statusFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
   817  	return linuxerr.EPERM
   818  }
   819  
   820  // Generate implements vfs.DynamicBytesSource.Generate.
   821  func (s *statusFD) Generate(ctx context.Context, buf *bytes.Buffer) error {
   822  	fmt.Fprintf(buf, "Name:\t%s\n", s.task.Name())
   823  	fmt.Fprintf(buf, "State:\t%s\n", s.task.StateStatus())
   824  	fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.task.ThreadGroup()))
   825  	fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.task))
   826  
   827  	ppid := kernel.ThreadID(0)
   828  	if parent := s.task.Parent(); parent != nil {
   829  		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
   830  	}
   831  	fmt.Fprintf(buf, "PPid:\t%d\n", ppid)
   832  
   833  	tpid := kernel.ThreadID(0)
   834  	if tracer := s.task.Tracer(); tracer != nil {
   835  		tpid = s.pidns.IDOfTask(tracer)
   836  	}
   837  	fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid)
   838  
   839  	creds := s.task.Credentials()
   840  	ruid := creds.RealKUID.In(s.userns).OrOverflow()
   841  	euid := creds.EffectiveKUID.In(s.userns).OrOverflow()
   842  	suid := creds.SavedKUID.In(s.userns).OrOverflow()
   843  	rgid := creds.RealKGID.In(s.userns).OrOverflow()
   844  	egid := creds.EffectiveKGID.In(s.userns).OrOverflow()
   845  	sgid := creds.SavedKGID.In(s.userns).OrOverflow()
   846  	var fds int
   847  	var vss, rss, data uint64
   848  	s.task.WithMuLocked(func(t *kernel.Task) {
   849  		if fdTable := t.FDTable(); fdTable != nil {
   850  			fds = fdTable.CurrentMaxFDs()
   851  		}
   852  	})
   853  	if mm := getMM(s.task); mm != nil {
   854  		vss = mm.VirtualMemorySize()
   855  		rss = mm.ResidentSetSize()
   856  		data = mm.VirtualDataSize()
   857  	}
   858  	// Filesystem user/group IDs aren't implemented; effective UID/GID are used
   859  	// instead.
   860  	fmt.Fprintf(buf, "Uid:\t%d\t%d\t%d\t%d\n", ruid, euid, suid, euid)
   861  	fmt.Fprintf(buf, "Gid:\t%d\t%d\t%d\t%d\n", rgid, egid, sgid, egid)
   862  	fmt.Fprintf(buf, "FDSize:\t%d\n", fds)
   863  	buf.WriteString("Groups:\t")
   864  	// There is a space between each pair of supplemental GIDs, as well as an
   865  	// unconditional trailing space that some applications actually depend on.
   866  	var sep string
   867  	for _, kgid := range creds.ExtraKGIDs {
   868  		fmt.Fprintf(buf, "%s%d", sep, kgid.In(s.userns).OrOverflow())
   869  		sep = " "
   870  	}
   871  	buf.WriteString(" \n")
   872  
   873  	fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10)
   874  	fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10)
   875  	fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10)
   876  
   877  	fmt.Fprintf(buf, "Threads:\t%d\n", s.task.ThreadGroup().Count())
   878  	fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps)
   879  	fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps)
   880  	fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps)
   881  	fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps)
   882  	fmt.Fprintf(buf, "Seccomp:\t%d\n", s.task.SeccompMode())
   883  	// We unconditionally report a single NUMA node. See
   884  	// pkg/sentry/syscalls/linux/sys_mempolicy.go.
   885  	fmt.Fprintf(buf, "Mems_allowed:\t1\n")
   886  	fmt.Fprintf(buf, "Mems_allowed_list:\t0\n")
   887  	return nil
   888  }
   889  
// ioUsage is the /proc/[pid]/io and /proc/[pid]/task/[tid]/io data provider.
// ioData embeds it and reads the counters through it when generating the
// file contents.
type ioUsage interface {
	// IOUsage returns the io usage data.
	IOUsage() *usage.IO
}
   895  
// ioData is a dynamic file whose contents are rendered from the I/O counters
// supplied by the embedded ioUsage provider.
//
// +stateify savable
type ioData struct {
	kernfs.DynamicBytesFile

	// ioUsage supplies the counters reported by Generate.
	ioUsage
}

// Compile-time check that ioData satisfies dynamicInode.
var _ dynamicInode = (*ioData)(nil)
   904  
   905  // Generate implements vfs.DynamicBytesSource.Generate.
   906  func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   907  	io := usage.IO{}
   908  	io.Accumulate(i.IOUsage())
   909  
   910  	fmt.Fprintf(buf, "char: %d\n", io.CharsRead.RacyLoad())
   911  	fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten.RacyLoad())
   912  	fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls.RacyLoad())
   913  	fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls.RacyLoad())
   914  	fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead.RacyLoad())
   915  	fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten.RacyLoad())
   916  	fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled.RacyLoad())
   917  	return nil
   918  }
   919  
// oomScoreAdj is a stub of the /proc/<pid>/oom_score_adj file. Reads report
// the task's current adjustment and writes update it.
//
// +stateify savable
type oomScoreAdj struct {
	kernfs.DynamicBytesFile

	// task is the task whose OOM score adjustment is read and written.
	task *kernel.Task
}

// Compile-time check that oomScoreAdj is a writable dynamic bytes source.
var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil)
   930  
   931  // Generate implements vfs.DynamicBytesSource.Generate.
   932  func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error {
   933  	if o.task.ExitState() == kernel.TaskExitDead {
   934  		return linuxerr.ESRCH
   935  	}
   936  	fmt.Fprintf(buf, "%d\n", o.task.OOMScoreAdj())
   937  	return nil
   938  }
   939  
   940  // Write implements vfs.WritableDynamicBytesSource.Write.
   941  func (o *oomScoreAdj) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
   942  	if src.NumBytes() == 0 {
   943  		return 0, nil
   944  	}
   945  
   946  	// Limit input size so as not to impact performance if input size is large.
   947  	src = src.TakeFirst(hostarch.PageSize - 1)
   948  
   949  	str, err := usermem.CopyStringIn(ctx, src.IO, src.Addrs.Head().Start, int(src.Addrs.Head().Length()), src.Opts)
   950  	if err != nil && err != linuxerr.ENAMETOOLONG {
   951  		return 0, err
   952  	}
   953  
   954  	str = strings.TrimSpace(str)
   955  	v, err := strconv.ParseInt(str, 0, 32)
   956  	if err != nil {
   957  		return 0, linuxerr.EINVAL
   958  	}
   959  
   960  	if o.task.ExitState() == kernel.TaskExitDead {
   961  		return 0, linuxerr.ESRCH
   962  	}
   963  	if err := o.task.SetOOMScoreAdj(int32(v)); err != nil {
   964  		return 0, err
   965  	}
   966  
   967  	return src.NumBytes(), nil
   968  }
   969  
// exeSymlink is a symlink for the /proc/[pid]/exe file. It resolves to the
// executable of the task's MemoryManager. fs is the owning procfs instance
// (used to release references) and task is the task being described.
//
// +stateify savable
type exeSymlink struct {
	implStatFS
	kernfs.InodeAttrs
	kernfs.InodeNoopRefCount
	kernfs.InodeNotAnonymous
	kernfs.InodeSymlink
	kernfs.InodeWatches

	fs   *filesystem
	task *kernel.Task
}

// Compile-time check that exeSymlink satisfies kernfs.Inode.
var _ kernfs.Inode = (*exeSymlink)(nil)
   986  
   987  func (fs *filesystem) newExeSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode {
   988  	inode := &exeSymlink{
   989  		fs:   fs,
   990  		task: task,
   991  	}
   992  	inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
   993  	return inode
   994  }
   995  
   996  // Readlink implements kernfs.Inode.Readlink.
   997  func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
   998  	exec, _, err := s.Getlink(ctx, nil)
   999  	if err != nil {
  1000  		return "", err
  1001  	}
  1002  	defer s.fs.SafeDecRef(ctx, exec)
  1003  
  1004  	root := vfs.RootFromContext(ctx)
  1005  	if !root.Ok() {
  1006  		panic("procfs Readlink requires context with root value")
  1007  	}
  1008  	defer s.fs.SafeDecRef(ctx, root)
  1009  
  1010  	vfsObj := exec.Mount().Filesystem().VirtualFilesystem()
  1011  	name, _ := vfsObj.PathnameWithDeleted(ctx, root, exec)
  1012  	return name, nil
  1013  }
  1014  
  1015  // Getlink implements kernfs.Inode.Getlink.
  1016  func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
  1017  	if !kernel.ContextCanTrace(ctx, s.task, false) {
  1018  		return vfs.VirtualDentry{}, "", linuxerr.EACCES
  1019  	}
  1020  	if err := checkTaskState(s.task); err != nil {
  1021  		return vfs.VirtualDentry{}, "", err
  1022  	}
  1023  
  1024  	mm := getMM(s.task)
  1025  	if mm == nil {
  1026  		return vfs.VirtualDentry{}, "", linuxerr.EACCES
  1027  	}
  1028  
  1029  	// The MemoryManager may be destroyed, in which case
  1030  	// MemoryManager.destroy will simply set the executable to nil
  1031  	// (with locks held).
  1032  	exec := mm.Executable()
  1033  	if exec == nil {
  1034  		return vfs.VirtualDentry{}, "", linuxerr.ESRCH
  1035  	}
  1036  	defer exec.DecRef(ctx)
  1037  
  1038  	vd := exec.VirtualDentry()
  1039  	vd.IncRef()
  1040  	return vd, "", nil
  1041  }
  1042  
// cwdSymlink is a symlink for the /proc/[pid]/cwd file. It resolves to the
// working directory of the task's FSContext. fs is the owning procfs instance
// (used to release references) and task is the task being described.
//
// +stateify savable
type cwdSymlink struct {
	implStatFS
	kernfs.InodeAttrs
	kernfs.InodeNoopRefCount
	kernfs.InodeNotAnonymous
	kernfs.InodeSymlink
	kernfs.InodeWatches

	fs   *filesystem
	task *kernel.Task
}

// Compile-time check that cwdSymlink satisfies kernfs.Inode.
var _ kernfs.Inode = (*cwdSymlink)(nil)
  1059  
  1060  func (fs *filesystem) newCwdSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode {
  1061  	inode := &cwdSymlink{
  1062  		fs:   fs,
  1063  		task: task,
  1064  	}
  1065  	inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
  1066  	return inode
  1067  }
  1068  
  1069  // Readlink implements kernfs.Inode.Readlink.
  1070  func (s *cwdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
  1071  	cwd, _, err := s.Getlink(ctx, nil)
  1072  	if err != nil {
  1073  		return "", err
  1074  	}
  1075  	defer s.fs.SafeDecRef(ctx, cwd)
  1076  
  1077  	root := vfs.RootFromContext(ctx)
  1078  	if !root.Ok() {
  1079  		panic("procfs Readlink requires context with root value")
  1080  	}
  1081  	defer s.fs.SafeDecRef(ctx, root)
  1082  
  1083  	vfsObj := cwd.Mount().Filesystem().VirtualFilesystem()
  1084  	name, _ := vfsObj.PathnameWithDeleted(ctx, root, cwd)
  1085  	return name, nil
  1086  }
  1087  
  1088  // Getlink implements kernfs.Inode.Getlink.
  1089  func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
  1090  	if !kernel.ContextCanTrace(ctx, s.task, false) {
  1091  		return vfs.VirtualDentry{}, "", linuxerr.EACCES
  1092  	}
  1093  	if err := checkTaskState(s.task); err != nil {
  1094  		return vfs.VirtualDentry{}, "", err
  1095  	}
  1096  	cwd := s.task.FSContext().WorkingDirectory()
  1097  	if !cwd.Ok() {
  1098  		// It could have raced with process deletion.
  1099  		return vfs.VirtualDentry{}, "", linuxerr.ESRCH
  1100  	}
  1101  	// The reference is transferred to the caller.
  1102  	return cwd, "", nil
  1103  }
  1104  
// rootSymlink is a symlink for the /proc/[pid]/root file. It resolves to the
// root directory of the task's FSContext. fs is the owning procfs instance
// (used to release references) and task is the task being described.
//
// +stateify savable
type rootSymlink struct {
	implStatFS
	kernfs.InodeAttrs
	kernfs.InodeNoopRefCount
	kernfs.InodeNotAnonymous
	kernfs.InodeSymlink
	kernfs.InodeWatches

	fs   *filesystem
	task *kernel.Task
}

// Compile-time check that rootSymlink satisfies kernfs.Inode.
var _ kernfs.Inode = (*rootSymlink)(nil)
  1121  
  1122  func (fs *filesystem) newRootSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode {
  1123  	inode := &rootSymlink{
  1124  		fs:   fs,
  1125  		task: task,
  1126  	}
  1127  	inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
  1128  	return inode
  1129  }
  1130  
  1131  // Readlink implements kernfs.Inode.Readlink.
  1132  func (s *rootSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
  1133  	root, _, err := s.Getlink(ctx, nil)
  1134  	if err != nil {
  1135  		return "", err
  1136  	}
  1137  	defer s.fs.SafeDecRef(ctx, root)
  1138  
  1139  	vfsRoot := vfs.RootFromContext(ctx)
  1140  	if !vfsRoot.Ok() {
  1141  		panic("procfs Readlink requires context with root value")
  1142  	}
  1143  	defer s.fs.SafeDecRef(ctx, vfsRoot)
  1144  
  1145  	vfsObj := root.Mount().Filesystem().VirtualFilesystem()
  1146  	name, _ := vfsObj.PathnameWithDeleted(ctx, vfsRoot, root)
  1147  	return name, nil
  1148  }
  1149  
  1150  // Getlink implements kernfs.Inode.Getlink.
  1151  func (s *rootSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
  1152  	if !kernel.ContextCanTrace(ctx, s.task, false) {
  1153  		return vfs.VirtualDentry{}, "", linuxerr.EACCES
  1154  	}
  1155  	if err := checkTaskState(s.task); err != nil {
  1156  		return vfs.VirtualDentry{}, "", err
  1157  	}
  1158  	root := s.task.FSContext().RootDirectory()
  1159  	if !root.Ok() {
  1160  		// It could have raced with process deletion.
  1161  		return vfs.VirtualDentry{}, "", linuxerr.ESRCH
  1162  	}
  1163  	// The reference is transferred to the caller.
  1164  	return root, "", nil
  1165  }
  1166  
// mountInfoData is used to implement /proc/[pid]/mountinfo. fs is the owning
// procfs instance (used to release references) and task is the task whose
// root directory selects the mounts that are listed.
//
// +stateify savable
type mountInfoData struct {
	kernfs.DynamicBytesFile

	fs   *filesystem
	task *kernel.Task
}

// Compile-time check that mountInfoData satisfies dynamicInode.
var _ dynamicInode = (*mountInfoData)(nil)
  1178  
  1179  // Generate implements vfs.DynamicBytesSource.Generate.
  1180  func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
  1181  	var fsctx *kernel.FSContext
  1182  	i.task.WithMuLocked(func(t *kernel.Task) {
  1183  		fsctx = t.FSContext()
  1184  	})
  1185  	if fsctx == nil {
  1186  		// The task has been destroyed. Nothing to show here.
  1187  		return nil
  1188  	}
  1189  	rootDir := fsctx.RootDirectory()
  1190  	if !rootDir.Ok() {
  1191  		// Root has been destroyed. Don't try to read mounts.
  1192  		return nil
  1193  	}
  1194  	defer i.fs.SafeDecRef(ctx, rootDir)
  1195  	i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf)
  1196  	return nil
  1197  }
  1198  
// mountsData is used to implement /proc/[pid]/mounts. fs is the owning procfs
// instance (used to release references) and task is the task whose root
// directory selects the mounts that are listed.
//
// +stateify savable
type mountsData struct {
	kernfs.DynamicBytesFile

	fs   *filesystem
	task *kernel.Task
}

// Compile-time check that mountsData satisfies dynamicInode.
var _ dynamicInode = (*mountsData)(nil)
  1210  
  1211  // Generate implements vfs.DynamicBytesSource.Generate.
  1212  func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
  1213  	var fsctx *kernel.FSContext
  1214  	i.task.WithMuLocked(func(t *kernel.Task) {
  1215  		fsctx = t.FSContext()
  1216  	})
  1217  	if fsctx == nil {
  1218  		// The task has been destroyed. Nothing to show here.
  1219  		return nil
  1220  	}
  1221  	rootDir := fsctx.RootDirectory()
  1222  	if !rootDir.Ok() {
  1223  		// Root has been destroyed. Don't try to read mounts.
  1224  		return nil
  1225  	}
  1226  	defer i.fs.SafeDecRef(ctx, rootDir)
  1227  	i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf)
  1228  	return nil
  1229  }
  1230  
// namespaceSymlink is a symlink inode describing one of a task's namespaces.
// task is the task being described. nsType is a linux.CLONE_NEW* value
// selecting which namespace's nsfs inode the link resolves to; when it is 0
// the static target of the embedded StaticSymlink is used instead.
//
// +stateify savable
type namespaceSymlink struct {
	kernfs.StaticSymlink

	task   *kernel.Task
	nsType int
}
  1238  
  1239  func (fs *filesystem) newNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64, nsType int) kernfs.Inode {
  1240  	inode := &namespaceSymlink{task: task, nsType: nsType}
  1241  
  1242  	// Note: credentials are overridden by taskOwnedInode.
  1243  	inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, "")
  1244  
  1245  	taskInode := &taskOwnedInode{Inode: inode, owner: task}
  1246  	return taskInode
  1247  }
  1248  
  1249  func (fs *filesystem) newPIDNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode {
  1250  	target := fmt.Sprintf("pid:[%d]", task.PIDNamespace().ID())
  1251  
  1252  	inode := &namespaceSymlink{task: task}
  1253  	// Note: credentials are overridden by taskOwnedInode.
  1254  	inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target)
  1255  
  1256  	taskInode := &taskOwnedInode{Inode: inode, owner: task}
  1257  	return taskInode
  1258  }
  1259  
  1260  func (fs *filesystem) newFakeNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64, ns string) kernfs.Inode {
  1261  	// Namespace symlinks should contain the namespace name and the inode number
  1262  	// for the namespace instance, so for example user:[123456]. We currently fake
  1263  	// the inode number by sticking the symlink inode in its place.
  1264  	target := fmt.Sprintf("%s:[%d]", ns, ino)
  1265  
  1266  	inode := &namespaceSymlink{task: task}
  1267  	// Note: credentials are overridden by taskOwnedInode.
  1268  	inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target)
  1269  
  1270  	taskInode := &taskOwnedInode{Inode: inode, owner: task}
  1271  	return taskInode
  1272  }
  1273  
  1274  func (s *namespaceSymlink) getInode(t *kernel.Task) *nsfs.Inode {
  1275  	switch s.nsType {
  1276  	case linux.CLONE_NEWNET:
  1277  		netns := t.GetNetworkNamespace()
  1278  		if netns == nil {
  1279  			return nil
  1280  		}
  1281  		return netns.GetInode()
  1282  	case linux.CLONE_NEWIPC:
  1283  		if ipcns := t.GetIPCNamespace(); ipcns != nil {
  1284  			return ipcns.GetInode()
  1285  		}
  1286  		return nil
  1287  	case linux.CLONE_NEWUTS:
  1288  		if utsns := t.GetUTSNamespace(); utsns != nil {
  1289  			return utsns.GetInode()
  1290  		}
  1291  		return nil
  1292  	case linux.CLONE_NEWNS:
  1293  		mntns := t.GetMountNamespace()
  1294  		if mntns == nil {
  1295  			return nil
  1296  		}
  1297  		inode, _ := mntns.Refs.(*nsfs.Inode)
  1298  		return inode
  1299  	default:
  1300  		panic("unknown namespace")
  1301  	}
  1302  }
  1303  
  1304  // Readlink implements kernfs.Inode.Readlink.
  1305  func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
  1306  	if err := checkTaskState(s.task); err != nil {
  1307  		return "", err
  1308  	}
  1309  	if s.nsType != 0 {
  1310  		inode := s.getInode(s.task)
  1311  		if inode == nil {
  1312  			return "", linuxerr.ENOENT
  1313  		}
  1314  		target := inode.Name()
  1315  		inode.DecRef(ctx)
  1316  		return target, nil
  1317  	}
  1318  	return s.StaticSymlink.Readlink(ctx, mnt)
  1319  }
  1320  
  1321  // Getlink implements kernfs.Inode.Getlink.
  1322  func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
  1323  	if err := checkTaskState(s.task); err != nil {
  1324  		return vfs.VirtualDentry{}, "", err
  1325  	}
  1326  
  1327  	if s.nsType != 0 {
  1328  		inode := s.getInode(s.task)
  1329  		if inode == nil {
  1330  			return vfs.VirtualDentry{}, "", linuxerr.ENOENT
  1331  		}
  1332  		defer inode.DecRef(ctx)
  1333  		return inode.VirtualDentry(), "", nil
  1334  	}
  1335  	// Create a synthetic inode to represent the namespace.
  1336  	fs := mnt.Filesystem().Impl().(*filesystem)
  1337  	nsInode := &namespaceInode{}
  1338  	nsInode.Init(ctx, auth.CredentialsFromContext(ctx), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0444)
  1339  	dentry := &kernfs.Dentry{}
  1340  	dentry.Init(&fs.Filesystem, nsInode)
  1341  	vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry())
  1342  	// Only IncRef vd.Mount() because vd.Dentry() already holds a ref of 1.
  1343  	mnt.IncRef()
  1344  	return vd, "", nil
  1345  }
  1346  
// namespaceInode is a synthetic inode created to represent a namespace in
// /proc/[pid]/ns/*. It is a regular file (neither directory nor symlink)
// whose reference counting is a no-op.
//
// +stateify savable
type namespaceInode struct {
	implStatFS
	kernfs.InodeAttrs
	kernfs.InodeNoopRefCount
	kernfs.InodeNotAnonymous
	kernfs.InodeNotDirectory
	kernfs.InodeNotSymlink
	kernfs.InodeWatches

	// locks is handed to the LockFD of every namespaceFD opened on this
	// inode.
	locks vfs.FileLocks
}

// Compile-time check that namespaceInode satisfies kernfs.Inode.
var _ kernfs.Inode = (*namespaceInode)(nil)
  1364  
  1365  // Init initializes a namespace inode.
  1366  func (i *namespaceInode) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
  1367  	if perm&^linux.PermissionsMask != 0 {
  1368  		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
  1369  	}
  1370  	i.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
  1371  }
  1372  
  1373  // Open implements kernfs.Inode.Open.
  1374  func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
  1375  	fd := &namespaceFD{inode: i}
  1376  	i.IncRef()
  1377  	fd.LockFD.Init(&i.locks)
  1378  	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
  1379  		return nil, err
  1380  	}
  1381  	return &fd.vfsfd, nil
  1382  }
  1383  
// namespaceFD is a synthetic file that represents a namespace in
// /proc/[pid]/ns/*.
//
// +stateify savable
type namespaceFD struct {
	vfs.FileDescriptionDefaultImpl
	vfs.LockFD

	// vfsfd is the embedded VFS file description returned by Open.
	vfsfd vfs.FileDescription
	// inode is the namespaceInode this file was opened on; Release drops the
	// reference Open took on it.
	inode *namespaceInode
}

// Compile-time check that namespaceFD satisfies vfs.FileDescriptionImpl.
var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil)
  1397  
  1398  // Stat implements vfs.FileDescriptionImpl.Stat.
  1399  func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
  1400  	vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
  1401  	return fd.inode.Stat(ctx, vfs, opts)
  1402  }
  1403  
  1404  // SetStat implements vfs.FileDescriptionImpl.SetStat.
  1405  func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
  1406  	vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
  1407  	creds := auth.CredentialsFromContext(ctx)
  1408  	return fd.inode.SetStat(ctx, vfs, creds, opts)
  1409  }
  1410  
// Release implements vfs.FileDescriptionImpl.Release.
//
// It drops the inode reference that namespaceInode.Open took when this file
// description was created.
func (fd *namespaceFD) Release(ctx context.Context) {
	fd.inode.DecRef(ctx)
}
  1415  
// taskCgroupData generates data for /proc/[pid]/cgroup.
//
// +stateify savable
type taskCgroupData struct {
	dynamicBytesFileSetAttr
	// task is the task whose cgroup membership is reported.
	task *kernel.Task
}

// Compile-time check that taskCgroupData satisfies dynamicInode.
var _ dynamicInode = (*taskCgroupData)(nil)
  1425  
  1426  // Generate implements vfs.DynamicBytesSource.Generate.
  1427  func (d *taskCgroupData) Generate(ctx context.Context, buf *bytes.Buffer) error {
  1428  	// When a task is existing on Linux, a task's cgroup set is cleared and
  1429  	// reset to the initial cgroup set, which is essentially the set of root
  1430  	// cgroups. Because of this, the /proc/<pid>/cgroup file is always readable
  1431  	// on Linux throughout a task's lifetime.
  1432  	//
  1433  	// The sentry removes tasks from cgroups during the exit process, but
  1434  	// doesn't move them into an initial cgroup set, so partway through task
  1435  	// exit this file show a task is in no cgroups, which is incorrect. Instead,
  1436  	// once a task has left its cgroups, we return an error.
  1437  	if d.task.ExitState() >= kernel.TaskExitInitiated {
  1438  		return linuxerr.ESRCH
  1439  	}
  1440  
  1441  	d.task.GenerateProcTaskCgroup(buf)
  1442  	return nil
  1443  }
  1444  
// childrenData implements vfs.DynamicBytesSource for /proc/[pid]/task/[tid]/children.
//
// +stateify savable
type childrenData struct {
	kernfs.DynamicBytesFile

	// task is the task whose children are listed.
	task *kernel.Task

	// pidns is the PID namespace associated with the proc filesystem that
	// includes the file using this childrenData.
	pidns *kernel.PIDNamespace
}
  1457  
  1458  // Generate implements vfs.DynamicBytesSource.Generate.
  1459  func (d *childrenData) Generate(ctx context.Context, buf *bytes.Buffer) error {
  1460  	children := d.task.Children()
  1461  	var childrenTIDs []int
  1462  	for childTask := range children {
  1463  		childrenTIDs = append(childrenTIDs, int(d.pidns.IDOfTask(childTask)))
  1464  	}
  1465  
  1466  	// The TIDs need to be in sorted order in accordance with the Linux implementation.
  1467  	sort.Ints(childrenTIDs)
  1468  
  1469  	for _, childrenTID := range childrenTIDs {
  1470  		// It contains a space-separated list of child tasks of the `task`.
  1471  		// Each task is represented by its TID.
  1472  		fmt.Fprintf(buf, "%d ", childrenTID)
  1473  	}
  1474  
  1475  	return nil
  1476  }