github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/fsimpl/proc/task_files.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package proc
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"io"
    21  	"strconv"
    22  	"strings"
    23  
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    25  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    26  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    27  	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
    28  	"github.com/nicocha30/gvisor-ligolo/pkg/safemem"
    29  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/kernfs"
    30  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/nsfs"
    31  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel"
    32  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
    33  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/limits"
    34  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/mm"
    35  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/usage"
    36  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
    37  	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
    38  	"github.com/nicocha30/gvisor-ligolo/pkg/usermem"
    39  )
    40  
// maxIDMapLines is the largest number of lines that idMapData.Write accepts in
// a single write to an ID map file.
//
// "There is an (arbitrary) limit on the number of lines in the file. As at
// Linux 3.18, the limit is five lines." - user_namespaces(7)
const maxIDMapLines = 5
    44  
    45  // getMM gets the kernel task's MemoryManager. No additional reference is taken on
    46  // mm here. This is safe because MemoryManager.destroy is required to leave the
    47  // MemoryManager in a state where it's still usable as a DynamicBytesSource.
    48  func getMM(task *kernel.Task) *mm.MemoryManager {
    49  	var tmm *mm.MemoryManager
    50  	task.WithMuLocked(func(t *kernel.Task) {
    51  		if mm := t.MemoryManager(); mm != nil {
    52  			tmm = mm
    53  		}
    54  	})
    55  	return tmm
    56  }
    57  
    58  // getMMIncRef returns t's MemoryManager. If getMMIncRef succeeds, the
    59  // MemoryManager's users count is incremented, and must be decremented by the
    60  // caller when it is no longer in use.
    61  func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) {
    62  	var m *mm.MemoryManager
    63  	task.WithMuLocked(func(t *kernel.Task) {
    64  		m = t.MemoryManager()
    65  	})
    66  	if m == nil || !m.IncUsers() {
    67  		return nil, io.EOF
    68  	}
    69  	return m, nil
    70  }
    71  
    72  func checkTaskState(t *kernel.Task) error {
    73  	switch t.ExitState() {
    74  	case kernel.TaskExitZombie:
    75  		return linuxerr.EACCES
    76  	case kernel.TaskExitDead:
    77  		return linuxerr.ESRCH
    78  	}
    79  	return nil
    80  }
    81  
    82  type bufferWriter struct {
    83  	buf *bytes.Buffer
    84  }
    85  
    86  // WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns
    87  // the number of bytes written. It may return a partial write without an
    88  // error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not
    89  // return a full write with an error (i.e. srcs.NumBytes(), err) where err
    90  // != nil).
    91  func (w *bufferWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
    92  	written := srcs.NumBytes()
    93  	for !srcs.IsEmpty() {
    94  		w.buf.Write(srcs.Head().ToSlice())
    95  		srcs = srcs.Tail()
    96  	}
    97  	return written, nil
    98  }
    99  
   100  // auxvData implements vfs.DynamicBytesSource for /proc/[pid]/auxv.
   101  //
   102  // +stateify savable
   103  type auxvData struct {
   104  	kernfs.DynamicBytesFile
   105  
   106  	task *kernel.Task
   107  }
   108  
   109  var _ dynamicInode = (*auxvData)(nil)
   110  
   111  // Generate implements vfs.DynamicBytesSource.Generate.
   112  func (d *auxvData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   113  	if d.task.ExitState() == kernel.TaskExitDead {
   114  		return linuxerr.ESRCH
   115  	}
   116  	m, err := getMMIncRef(d.task)
   117  	if err != nil {
   118  		// Return empty file.
   119  		return nil
   120  	}
   121  	defer m.DecUsers(ctx)
   122  
   123  	auxv := m.Auxv()
   124  	// Space for buffer with AT_NULL (0) terminator at the end.
   125  	buf.Grow((len(auxv) + 1) * 16)
   126  	for _, e := range auxv {
   127  		var tmp [16]byte
   128  		hostarch.ByteOrder.PutUint64(tmp[:8], e.Key)
   129  		hostarch.ByteOrder.PutUint64(tmp[8:], uint64(e.Value))
   130  		buf.Write(tmp[:])
   131  	}
   132  	var atNull [16]byte
   133  	buf.Write(atNull[:])
   134  
   135  	return nil
   136  }
   137  
// MetadataType enumerates the types of metadata that are exposed through
// proc. It selects which address range GetMetadata reads.
type MetadataType int

const (
	// Cmdline represents /proc/[pid]/cmdline.
	Cmdline MetadataType = iota

	// Environ represents /proc/[pid]/environ.
	Environ
)
   148  
// GetMetadata fetches the process's metadata of type t and writes it into
// buf. The process is identified by mm.
//
// It returns io.EOF when either bound of the relevant address range is still
// zero, and propagates errors from copying data out of mm. It panics if t is
// not one of the known MetadataType values.
//
// NOTE(review): the index arithmetic in the Cmdline fallback below looks like
// it assumes buf is empty on entry — confirm against callers.
func GetMetadata(ctx context.Context, mm *mm.MemoryManager, buf *bytes.Buffer, t MetadataType) error {
	// Figure out the bounds of the exec arg we are trying to read.
	var ar hostarch.AddrRange
	switch t {
	case Cmdline:
		ar = hostarch.AddrRange{
			Start: mm.ArgvStart(),
			End:   mm.ArgvEnd(),
		}
	case Environ:
		ar = hostarch.AddrRange{
			Start: mm.EnvvStart(),
			End:   mm.EnvvEnd(),
		}
	default:
		panic(fmt.Sprintf("unknown exec arg type %v", t))
	}
	if ar.Start == 0 || ar.End == 0 {
		// Don't attempt to read before the start/end are set up.
		return io.EOF
	}

	// N.B. Technically this should be usermem.IOOpts.IgnorePermissions = true
	// until Linux 4.9 (272ddc8b3735 "proc: don't use FOLL_FORCE for reading
	// cmdline and environment").
	writer := &bufferWriter{buf: buf}
	if n, err := mm.CopyInTo(ctx, hostarch.AddrRangeSeqOf(ar), writer, usermem.IOOpts{}); n == 0 || err != nil {
		// Nothing to copy or something went wrong. err is nil in the former
		// case, so an empty range is not reported as a failure.
		return err
	}

	// On Linux, if the NULL byte at the end of the argument vector has been
	// overwritten, it continues reading the environment vector as part of
	// the argument vector.
	if t == Cmdline && buf.Bytes()[buf.Len()-1] != 0 {
		if end := bytes.IndexByte(buf.Bytes(), 0); end != -1 {
			// If we found a NULL character somewhere else in argv, cut the
			// result off at that NULL. Truncate(end) keeps only the bytes
			// before it, so the terminator itself is dropped as well.
			buf.Truncate(end)
			return nil
		}

		// There is no NULL terminator in the string, return into envp.
		arEnvv := hostarch.AddrRange{
			Start: mm.EnvvStart(),
			End:   mm.EnvvEnd(),
		}

		// Upstream limits the returned amount to one page of slop.
		// https://elixir.bootlin.com/linux/v4.20/source/fs/proc/base.c#L208
		// we'll return one page total between argv and envp because of the
		// above page restrictions.
		if buf.Len() >= hostarch.PageSize {
			// Returned at least one page already, nothing else to add.
			return nil
		}
		remaining := hostarch.PageSize - buf.Len()
		if int(arEnvv.Length()) > remaining {
			// Shrink the envp range so argv plus envp stays within one page.
			end, ok := arEnvv.Start.AddLength(uint64(remaining))
			if !ok {
				return linuxerr.EFAULT
			}
			arEnvv.End = end
		}
		if _, err := mm.CopyInTo(ctx, hostarch.AddrRangeSeqOf(arEnvv), writer, usermem.IOOpts{}); err != nil {
			return err
		}

		// Linux stops reading envp at the first NULL character, so find it.
		// The search starts at the offset where the envp bytes begin, taken
		// to be the length of the argv range. As above, Truncate discards the
		// NULL itself along with everything after it.
		envStart := int(ar.Length())
		if nullIdx := bytes.IndexByte(buf.Bytes()[envStart:], 0); nullIdx != -1 {
			buf.Truncate(envStart + nullIdx)
		}
	}

	return nil
}
   229  
   230  // metadataData implements vfs.DynamicBytesSource for proc metadata fields like:
   231  //
   232  //   - /proc/[pid]/cmdline
   233  //   - /proc/[pid]/environ
   234  //
   235  // +stateify savable
   236  type metadataData struct {
   237  	kernfs.DynamicBytesFile
   238  
   239  	task *kernel.Task
   240  
   241  	// arg is the type of exec argument this file contains.
   242  	metaType MetadataType
   243  }
   244  
   245  var _ dynamicInode = (*metadataData)(nil)
   246  
   247  // Generate implements vfs.DynamicBytesSource.Generate.
   248  func (d *metadataData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   249  	if d.task.ExitState() == kernel.TaskExitDead {
   250  		return linuxerr.ESRCH
   251  	}
   252  	m, err := getMMIncRef(d.task)
   253  	if err != nil {
   254  		// Return empty file.
   255  		return nil
   256  	}
   257  	defer m.DecUsers(ctx)
   258  	return GetMetadata(ctx, m, buf, d.metaType)
   259  }
   260  
   261  // +stateify savable
   262  type commInode struct {
   263  	kernfs.DynamicBytesFile
   264  
   265  	task *kernel.Task
   266  }
   267  
   268  func (fs *filesystem) newComm(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode {
   269  	inode := &commInode{task: task}
   270  	inode.DynamicBytesFile.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm)
   271  	return inode
   272  }
   273  
   274  func (i *commInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
   275  	// This file can always be read or written by members of the same thread
   276  	// group. See fs/proc/base.c:proc_tid_comm_permission.
   277  	t := kernel.TaskFromContext(ctx)
   278  	if t != nil && t.ThreadGroup() == i.task.ThreadGroup() && !ats.MayExec() {
   279  		return nil
   280  	}
   281  
   282  	return i.DynamicBytesFile.CheckPermissions(ctx, creds, ats)
   283  }
   284  
   285  // commData implements vfs.WritableDynamicBytesSource for /proc/[pid]/comm.
   286  //
   287  // +stateify savable
   288  type commData struct {
   289  	kernfs.DynamicBytesFile
   290  
   291  	task *kernel.Task
   292  }
   293  
   294  var _ dynamicInode = (*commData)(nil)
   295  var _ vfs.WritableDynamicBytesSource = (*commData)(nil)
   296  
   297  // Generate implements vfs.DynamicBytesSource.Generate.
   298  func (d *commData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   299  	buf.WriteString(d.task.Name())
   300  	buf.WriteString("\n")
   301  	return nil
   302  }
   303  
   304  // Write implements vfs.WritableDynamicBytesSource.Write.
   305  func (d *commData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
   306  	srclen := src.NumBytes()
   307  	name := make([]byte, srclen)
   308  	if _, err := src.CopyIn(ctx, name); err != nil {
   309  		return 0, err
   310  	}
   311  
   312  	// Only allow writes from the same thread group, otherwise return
   313  	// EINVAL. See fs/proc/base.c:comm_write.
   314  	//
   315  	// Note that this check exists in addition to the same-thread-group
   316  	// check in CheckPermissions.
   317  	t := kernel.TaskFromContext(ctx)
   318  	if t == nil || t.ThreadGroup() != d.task.ThreadGroup() {
   319  		return 0, linuxerr.EINVAL
   320  	}
   321  	d.task.SetName(string(name))
   322  	return int64(srclen), nil
   323  }
   324  
   325  // idMapData implements vfs.WritableDynamicBytesSource for
   326  // /proc/[pid]/{gid_map|uid_map}.
   327  //
   328  // +stateify savable
   329  type idMapData struct {
   330  	kernfs.DynamicBytesFile
   331  
   332  	task *kernel.Task
   333  	gids bool
   334  }
   335  
   336  var _ dynamicInode = (*idMapData)(nil)
   337  var _ vfs.WritableDynamicBytesSource = (*idMapData)(nil)
   338  
   339  // Generate implements vfs.WritableDynamicBytesSource.Generate.
   340  func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   341  	var entries []auth.IDMapEntry
   342  	if d.gids {
   343  		entries = d.task.UserNamespace().GIDMap()
   344  	} else {
   345  		entries = d.task.UserNamespace().UIDMap()
   346  	}
   347  	for _, e := range entries {
   348  		fmt.Fprintf(buf, "%10d %10d %10d\n", e.FirstID, e.FirstParentID, e.Length)
   349  	}
   350  	return nil
   351  }
   352  
   353  // Write implements vfs.WritableDynamicBytesSource.Write.
   354  func (d *idMapData) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
   355  	// "In addition, the number of bytes written to the file must be less than
   356  	// the system page size, and the write must be performed at the start of
   357  	// the file ..." - user_namespaces(7)
   358  	srclen := src.NumBytes()
   359  	if srclen >= hostarch.PageSize || offset != 0 {
   360  		return 0, linuxerr.EINVAL
   361  	}
   362  	b := make([]byte, srclen)
   363  	if _, err := src.CopyIn(ctx, b); err != nil {
   364  		return 0, err
   365  	}
   366  
   367  	// Truncate from the first NULL byte.
   368  	var nul int64
   369  	nul = int64(bytes.IndexByte(b, 0))
   370  	if nul == -1 {
   371  		nul = srclen
   372  	}
   373  	b = b[:nul]
   374  	// Remove the last \n.
   375  	if nul >= 1 && b[nul-1] == '\n' {
   376  		b = b[:nul-1]
   377  	}
   378  	lines := bytes.SplitN(b, []byte("\n"), maxIDMapLines+1)
   379  	if len(lines) > maxIDMapLines {
   380  		return 0, linuxerr.EINVAL
   381  	}
   382  
   383  	entries := make([]auth.IDMapEntry, len(lines))
   384  	for i, l := range lines {
   385  		var e auth.IDMapEntry
   386  		_, err := fmt.Sscan(string(l), &e.FirstID, &e.FirstParentID, &e.Length)
   387  		if err != nil {
   388  			return 0, linuxerr.EINVAL
   389  		}
   390  		entries[i] = e
   391  	}
   392  	var err error
   393  	if d.gids {
   394  		err = d.task.UserNamespace().SetGIDMap(ctx, entries)
   395  	} else {
   396  		err = d.task.UserNamespace().SetUIDMap(ctx, entries)
   397  	}
   398  	if err != nil {
   399  		return 0, err
   400  	}
   401  
   402  	// On success, Linux's kernel/user_namespace.c:map_write() always returns
   403  	// count, even if fewer bytes were used.
   404  	return int64(srclen), nil
   405  }
   406  
   407  var _ kernfs.Inode = (*memInode)(nil)
   408  
   409  // memInode implements kernfs.Inode for /proc/[pid]/mem.
   410  //
   411  // +stateify savable
   412  type memInode struct {
   413  	kernfs.InodeAttrs
   414  	kernfs.InodeNoStatFS
   415  	kernfs.InodeNoopRefCount
   416  	kernfs.InodeNotAnonymous
   417  	kernfs.InodeNotDirectory
   418  	kernfs.InodeNotSymlink
   419  	kernfs.InodeWatches
   420  
   421  	task  *kernel.Task
   422  	locks vfs.FileLocks
   423  }
   424  
   425  func (fs *filesystem) newMemInode(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode {
   426  	// Note: credentials are overridden by taskOwnedInode.
   427  	inode := &memInode{task: task}
   428  	inode.init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm)
   429  	return &taskOwnedInode{Inode: inode, owner: task}
   430  }
   431  
   432  func (f *memInode) init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
   433  	if perm&^linux.PermissionsMask != 0 {
   434  		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
   435  	}
   436  	f.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
   437  }
   438  
   439  // Open implements kernfs.Inode.Open.
   440  func (f *memInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
   441  	// TODO(gvisor.dev/issue/260): Add check for PTRACE_MODE_ATTACH_FSCREDS
   442  	// Permission to read this file is governed by PTRACE_MODE_ATTACH_FSCREDS
   443  	// Since we dont implement setfsuid/setfsgid we can just use PTRACE_MODE_ATTACH
   444  	if !kernel.ContextCanTrace(ctx, f.task, true) {
   445  		return nil, linuxerr.EACCES
   446  	}
   447  	if err := checkTaskState(f.task); err != nil {
   448  		return nil, err
   449  	}
   450  	fd := &memFD{}
   451  	if err := fd.Init(rp.Mount(), d, f, opts.Flags); err != nil {
   452  		return nil, err
   453  	}
   454  	return &fd.vfsfd, nil
   455  }
   456  
// SetStat implements kernfs.Inode.SetStat. Attribute changes are never
// permitted on this inode, so it unconditionally returns EPERM.
func (*memInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
	return linuxerr.EPERM
}
   461  
   462  var _ vfs.FileDescriptionImpl = (*memFD)(nil)
   463  
   464  // memFD implements vfs.FileDescriptionImpl for /proc/[pid]/mem.
   465  //
   466  // +stateify savable
   467  type memFD struct {
   468  	vfsfd vfs.FileDescription
   469  	vfs.FileDescriptionDefaultImpl
   470  	vfs.LockFD
   471  
   472  	inode *memInode
   473  
   474  	// mu guards the fields below.
   475  	mu     sync.Mutex `state:"nosave"`
   476  	offset int64
   477  }
   478  
   479  // Init initializes memFD.
   480  func (fd *memFD) Init(m *vfs.Mount, d *kernfs.Dentry, inode *memInode, flags uint32) error {
   481  	fd.LockFD.Init(&inode.locks)
   482  	if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
   483  		return err
   484  	}
   485  	fd.inode = inode
   486  	return nil
   487  }
   488  
   489  // Seek implements vfs.FileDescriptionImpl.Seek.
   490  func (fd *memFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
   491  	fd.mu.Lock()
   492  	defer fd.mu.Unlock()
   493  	switch whence {
   494  	case linux.SEEK_SET:
   495  	case linux.SEEK_CUR:
   496  		offset += fd.offset
   497  	default:
   498  		return 0, linuxerr.EINVAL
   499  	}
   500  	if offset < 0 {
   501  		return 0, linuxerr.EINVAL
   502  	}
   503  	fd.offset = offset
   504  	return offset, nil
   505  }
   506  
   507  // PRead implements vfs.FileDescriptionImpl.PRead.
   508  func (fd *memFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
   509  	if dst.NumBytes() == 0 {
   510  		return 0, nil
   511  	}
   512  	m, err := getMMIncRef(fd.inode.task)
   513  	if err != nil {
   514  		return 0, err
   515  	}
   516  	defer m.DecUsers(ctx)
   517  	// Buffer the read data because of MM locks
   518  	buf := make([]byte, dst.NumBytes())
   519  	n, readErr := m.CopyIn(ctx, hostarch.Addr(offset), buf, usermem.IOOpts{IgnorePermissions: true})
   520  	if n > 0 {
   521  		if _, err := dst.CopyOut(ctx, buf[:n]); err != nil {
   522  			return 0, linuxerr.EFAULT
   523  		}
   524  		return int64(n), nil
   525  	}
   526  	if readErr != nil {
   527  		return 0, linuxerr.EIO
   528  	}
   529  	return 0, nil
   530  }
   531  
   532  // Read implements vfs.FileDescriptionImpl.Read.
   533  func (fd *memFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
   534  	fd.mu.Lock()
   535  	n, err := fd.PRead(ctx, dst, fd.offset, opts)
   536  	fd.offset += n
   537  	fd.mu.Unlock()
   538  	return n, err
   539  }
   540  
   541  // Stat implements vfs.FileDescriptionImpl.Stat.
   542  func (fd *memFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
   543  	fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
   544  	return fd.inode.Stat(ctx, fs, opts)
   545  }
   546  
// SetStat implements vfs.FileDescriptionImpl.SetStat. It unconditionally
// returns EPERM.
func (fd *memFD) SetStat(context.Context, vfs.SetStatOptions) error {
	return linuxerr.EPERM
}
   551  
// Release implements vfs.FileDescriptionImpl.Release. It is a no-op.
func (fd *memFD) Release(context.Context) {}
   554  
   555  // limitsData implements vfs.DynamicBytesSource for /proc/[pid]/limits.
   556  //
   557  // +stateify savable
   558  type limitsData struct {
   559  	kernfs.DynamicBytesFile
   560  
   561  	task *kernel.Task
   562  }
   563  
   564  func (d *limitsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   565  	taskLimits := d.task.Limits()
   566  	// formatting matches the kernel output from linux/fs/proc/base.c:proc_pid_limits()
   567  	fmt.Fprintf(buf, "Limit                     Soft Limit           Hard Limit           Units     \n")
   568  	for _, lt := range limits.AllLimitTypes {
   569  		fmt.Fprintf(buf, "%-25s ", lt.Name())
   570  
   571  		l := taskLimits.Get(lt)
   572  		if l.Cur == limits.Infinity {
   573  			fmt.Fprintf(buf, "%-20s ", "unlimited")
   574  		} else {
   575  			fmt.Fprintf(buf, "%-20d ", l.Cur)
   576  		}
   577  
   578  		if l.Max == limits.Infinity {
   579  			fmt.Fprintf(buf, "%-20s ", "unlimited")
   580  		} else {
   581  			fmt.Fprintf(buf, "%-20d ", l.Max)
   582  		}
   583  
   584  		if u := lt.Unit(); u != "" {
   585  			fmt.Fprintf(buf, "%-10s", u)
   586  		}
   587  
   588  		buf.WriteByte('\n')
   589  	}
   590  	return nil
   591  }
   592  
   593  // mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps.
   594  //
   595  // +stateify savable
   596  type mapsData struct {
   597  	kernfs.DynamicBytesFile
   598  
   599  	task *kernel.Task
   600  }
   601  
   602  var _ dynamicInode = (*mapsData)(nil)
   603  
   604  // Generate implements vfs.DynamicBytesSource.Generate.
   605  func (d *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   606  	if mm := getMM(d.task); mm != nil {
   607  		mm.ReadMapsDataInto(ctx, mm.MapsCallbackFuncForBuffer(buf))
   608  	}
   609  	return nil
   610  }
   611  
   612  // smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps.
   613  //
   614  // +stateify savable
   615  type smapsData struct {
   616  	kernfs.DynamicBytesFile
   617  
   618  	task *kernel.Task
   619  }
   620  
   621  var _ dynamicInode = (*smapsData)(nil)
   622  
   623  // Generate implements vfs.DynamicBytesSource.Generate.
   624  func (d *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   625  	if mm := getMM(d.task); mm != nil {
   626  		mm.ReadSmapsDataInto(ctx, buf)
   627  	}
   628  	return nil
   629  }
   630  
// taskStatData is a dynamic bytes source whose Generate method emits a single
// line of space-separated status fields for a task.
//
// +stateify savable
type taskStatData struct {
	kernfs.DynamicBytesFile

	task *kernel.Task

	// If tgstats is true, accumulate fault stats (not implemented) and CPU
	// time across all tasks in t's thread group.
	tgstats bool

	// pidns is the PID namespace associated with the proc filesystem that
	// includes the file using this statData.
	pidns *kernel.PIDNamespace
}

var _ dynamicInode = (*taskStatData)(nil)
   647  
   648  // Generate implements vfs.DynamicBytesSource.Generate.
   649  func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   650  	fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.task))
   651  	fmt.Fprintf(buf, "(%s) ", s.task.Name())
   652  	fmt.Fprintf(buf, "%c ", s.task.StateStatus()[0])
   653  	ppid := kernel.ThreadID(0)
   654  	if parent := s.task.Parent(); parent != nil {
   655  		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
   656  	}
   657  	fmt.Fprintf(buf, "%d ", ppid)
   658  	fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.task.ThreadGroup().ProcessGroup()))
   659  	fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.task.ThreadGroup().Session()))
   660  	fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */)
   661  	fmt.Fprintf(buf, "0 " /* flags */)
   662  	fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */)
   663  	var cputime usage.CPUStats
   664  	if s.tgstats {
   665  		cputime = s.task.ThreadGroup().CPUStats()
   666  	} else {
   667  		cputime = s.task.CPUStats()
   668  	}
   669  	fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
   670  	cputime = s.task.ThreadGroup().JoinedChildCPUStats()
   671  	fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
   672  	fmt.Fprintf(buf, "%d %d ", s.task.Priority(), s.task.Niceness())
   673  	fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Count())
   674  
   675  	// itrealvalue. Since kernel 2.6.17, this field is no longer
   676  	// maintained, and is hard coded as 0.
   677  	fmt.Fprintf(buf, "0 ")
   678  
   679  	// Start time is relative to boot time, expressed in clock ticks.
   680  	fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.task.StartTime().Sub(s.task.Kernel().Timekeeper().BootTime())))
   681  
   682  	var vss, rss uint64
   683  	if mm := getMM(s.task); mm != nil {
   684  		vss = mm.VirtualMemorySize()
   685  		rss = mm.ResidentSetSize()
   686  	}
   687  	fmt.Fprintf(buf, "%d %d ", vss, rss/hostarch.PageSize)
   688  
   689  	// rsslim.
   690  	fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Limits().Get(limits.Rss).Cur)
   691  
   692  	fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */)
   693  	fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */)
   694  	fmt.Fprintf(buf, "0 0 " /* nswap cnswap */)
   695  	terminationSignal := linux.Signal(0)
   696  	if s.task == s.task.ThreadGroup().Leader() {
   697  		terminationSignal = s.task.ThreadGroup().TerminationSignal()
   698  	}
   699  	fmt.Fprintf(buf, "%d ", terminationSignal)
   700  	fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */)
   701  	fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */)
   702  	fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */)
   703  	fmt.Fprintf(buf, "0\n" /* exit_code */)
   704  
   705  	return nil
   706  }
   707  
   708  // statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm.
   709  //
   710  // +stateify savable
   711  type statmData struct {
   712  	kernfs.DynamicBytesFile
   713  
   714  	task *kernel.Task
   715  }
   716  
   717  var _ dynamicInode = (*statmData)(nil)
   718  
   719  // Generate implements vfs.DynamicBytesSource.Generate.
   720  func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   721  	var vss, rss uint64
   722  	if mm := getMM(s.task); mm != nil {
   723  		vss = mm.VirtualMemorySize()
   724  		rss = mm.ResidentSetSize()
   725  	}
   726  	fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/hostarch.PageSize, rss/hostarch.PageSize)
   727  	return nil
   728  }
   729  
// statusInode implements kernfs.Inode for /proc/[pid]/status.
//
// +stateify savable
type statusInode struct {
	kernfs.InodeAttrs
	kernfs.InodeNoStatFS
	kernfs.InodeNoopRefCount
	kernfs.InodeNotAnonymous
	kernfs.InodeNotDirectory
	kernfs.InodeNotSymlink
	kernfs.InodeWatches

	// task is the task whose status is reported.
	task *kernel.Task
	// pidns is copied into every statusFD opened from this inode.
	pidns *kernel.PIDNamespace
	// locks backs the LockFD of every statusFD opened from this inode.
	locks vfs.FileLocks
}
   746  
// statusFD implements vfs.FileDescriptionImpl and vfs.DynamicBytesSource for
// /proc/[pid]/status.
//
// +stateify savable
type statusFD struct {
	statusFDLowerBase
	vfs.DynamicBytesFileDescriptionImpl
	vfs.LockFD

	vfsfd vfs.FileDescription

	inode  *statusInode
	task   *kernel.Task
	pidns  *kernel.PIDNamespace
	userns *auth.UserNamespace // equivalent to struct file::f_cred::user_ns
}
   763  
// statusFDLowerBase is a dumb hack to ensure that statusFD prefers
// vfs.DynamicBytesFileDescriptionImpl methods to vfs.FileDescriptionDefaultImpl
// methods. Embedding the default implementation one level deeper makes its
// promoted methods lose to those promoted directly into statusFD.
//
// +stateify savable
type statusFDLowerBase struct {
	vfs.FileDescriptionDefaultImpl
}
   772  
   773  func (fs *filesystem) newStatusInode(ctx context.Context, task *kernel.Task, pidns *kernel.PIDNamespace, ino uint64, perm linux.FileMode) kernfs.Inode {
   774  	// Note: credentials are overridden by taskOwnedInode.
   775  	inode := &statusInode{
   776  		task:  task,
   777  		pidns: pidns,
   778  	}
   779  	inode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeRegular|perm)
   780  	return &taskOwnedInode{Inode: inode, owner: task}
   781  }
   782  
   783  // Open implements kernfs.Inode.Open.
   784  func (s *statusInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
   785  	fd := &statusFD{
   786  		inode:  s,
   787  		task:   s.task,
   788  		pidns:  s.pidns,
   789  		userns: rp.Credentials().UserNamespace,
   790  	}
   791  	fd.LockFD.Init(&s.locks)
   792  	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
   793  		return nil, err
   794  	}
   795  	fd.DynamicBytesFileDescriptionImpl.Init(&fd.vfsfd, fd)
   796  	return &fd.vfsfd, nil
   797  }
   798  
// SetStat implements kernfs.Inode.SetStat. It unconditionally returns EPERM.
func (*statusInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
	return linuxerr.EPERM
}
   803  
// Release implements vfs.FileDescriptionImpl.Release. It is a no-op.
func (s *statusFD) Release(ctx context.Context) {
}
   807  
   808  // Stat implements vfs.FileDescriptionImpl.Stat.
   809  func (s *statusFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
   810  	fs := s.vfsfd.VirtualDentry().Mount().Filesystem()
   811  	return s.inode.Stat(ctx, fs, opts)
   812  }
   813  
// SetStat implements vfs.FileDescriptionImpl.SetStat. It unconditionally
// returns EPERM.
func (s *statusFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
	return linuxerr.EPERM
}
   818  
   819  // Generate implements vfs.DynamicBytesSource.Generate.
   820  func (s *statusFD) Generate(ctx context.Context, buf *bytes.Buffer) error {
   821  	fmt.Fprintf(buf, "Name:\t%s\n", s.task.Name())
   822  	fmt.Fprintf(buf, "State:\t%s\n", s.task.StateStatus())
   823  	fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.task.ThreadGroup()))
   824  	fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.task))
   825  
   826  	ppid := kernel.ThreadID(0)
   827  	if parent := s.task.Parent(); parent != nil {
   828  		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
   829  	}
   830  	fmt.Fprintf(buf, "PPid:\t%d\n", ppid)
   831  
   832  	tpid := kernel.ThreadID(0)
   833  	if tracer := s.task.Tracer(); tracer != nil {
   834  		tpid = s.pidns.IDOfTask(tracer)
   835  	}
   836  	fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid)
   837  
   838  	creds := s.task.Credentials()
   839  	ruid := creds.RealKUID.In(s.userns).OrOverflow()
   840  	euid := creds.EffectiveKUID.In(s.userns).OrOverflow()
   841  	suid := creds.SavedKUID.In(s.userns).OrOverflow()
   842  	rgid := creds.RealKGID.In(s.userns).OrOverflow()
   843  	egid := creds.EffectiveKGID.In(s.userns).OrOverflow()
   844  	sgid := creds.SavedKGID.In(s.userns).OrOverflow()
   845  	var fds int
   846  	var vss, rss, data uint64
   847  	s.task.WithMuLocked(func(t *kernel.Task) {
   848  		if fdTable := t.FDTable(); fdTable != nil {
   849  			fds = fdTable.CurrentMaxFDs()
   850  		}
   851  	})
   852  	if mm := getMM(s.task); mm != nil {
   853  		vss = mm.VirtualMemorySize()
   854  		rss = mm.ResidentSetSize()
   855  		data = mm.VirtualDataSize()
   856  	}
   857  	// Filesystem user/group IDs aren't implemented; effective UID/GID are used
   858  	// instead.
   859  	fmt.Fprintf(buf, "Uid:\t%d\t%d\t%d\t%d\n", ruid, euid, suid, euid)
   860  	fmt.Fprintf(buf, "Gid:\t%d\t%d\t%d\t%d\n", rgid, egid, sgid, egid)
   861  	fmt.Fprintf(buf, "FDSize:\t%d\n", fds)
   862  	buf.WriteString("Groups:\t")
   863  	// There is a space between each pair of supplemental GIDs, as well as an
   864  	// unconditional trailing space that some applications actually depend on.
   865  	var sep string
   866  	for _, kgid := range creds.ExtraKGIDs {
   867  		fmt.Fprintf(buf, "%s%d", sep, kgid.In(s.userns).OrOverflow())
   868  		sep = " "
   869  	}
   870  	buf.WriteString(" \n")
   871  
   872  	fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10)
   873  	fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10)
   874  	fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10)
   875  
   876  	fmt.Fprintf(buf, "Threads:\t%d\n", s.task.ThreadGroup().Count())
   877  	fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps)
   878  	fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps)
   879  	fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps)
   880  	fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps)
   881  	fmt.Fprintf(buf, "Seccomp:\t%d\n", s.task.SeccompMode())
   882  	// We unconditionally report a single NUMA node. See
   883  	// pkg/sentry/syscalls/linux/sys_mempolicy.go.
   884  	fmt.Fprintf(buf, "Mems_allowed:\t1\n")
   885  	fmt.Fprintf(buf, "Mems_allowed_list:\t0\n")
   886  	return nil
   887  }
   888  
// ioUsage is the /proc/[pid]/io and /proc/[pid]/task/[tid]/io data provider.
type ioUsage interface {
	// IOUsage returns the I/O usage counters to report. ioData.Generate
	// accumulates them into a fresh usage.IO before printing.
	IOUsage() *usage.IO
}
   894  
// ioData is a dynamic-bytes file whose contents are rendered from the I/O
// counters supplied by the embedded ioUsage.
//
// +stateify savable
type ioData struct {
	kernfs.DynamicBytesFile

	// ioUsage supplies the counters printed by Generate.
	ioUsage
}

// Compile-time check that ioData satisfies dynamicInode.
var _ dynamicInode = (*ioData)(nil)
   903  
   904  // Generate implements vfs.DynamicBytesSource.Generate.
   905  func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error {
   906  	io := usage.IO{}
   907  	io.Accumulate(i.IOUsage())
   908  
   909  	fmt.Fprintf(buf, "char: %d\n", io.CharsRead.RacyLoad())
   910  	fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten.RacyLoad())
   911  	fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls.RacyLoad())
   912  	fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls.RacyLoad())
   913  	fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead.RacyLoad())
   914  	fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten.RacyLoad())
   915  	fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled.RacyLoad())
   916  	return nil
   917  }
   918  
// oomScoreAdj is a stub of the /proc/<pid>/oom_score_adj file. Reads report
// the task's current OOM score adjustment and writes update it.
//
// +stateify savable
type oomScoreAdj struct {
	kernfs.DynamicBytesFile

	// task is the task whose OOM score adjustment is read and written.
	task *kernel.Task
}

// Compile-time check that oomScoreAdj is a writable dynamic-bytes source.
var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil)
   929  
   930  // Generate implements vfs.DynamicBytesSource.Generate.
   931  func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error {
   932  	if o.task.ExitState() == kernel.TaskExitDead {
   933  		return linuxerr.ESRCH
   934  	}
   935  	fmt.Fprintf(buf, "%d\n", o.task.OOMScoreAdj())
   936  	return nil
   937  }
   938  
   939  // Write implements vfs.WritableDynamicBytesSource.Write.
   940  func (o *oomScoreAdj) Write(ctx context.Context, _ *vfs.FileDescription, src usermem.IOSequence, offset int64) (int64, error) {
   941  	if src.NumBytes() == 0 {
   942  		return 0, nil
   943  	}
   944  
   945  	// Limit input size so as not to impact performance if input size is large.
   946  	src = src.TakeFirst(hostarch.PageSize - 1)
   947  
   948  	str, err := usermem.CopyStringIn(ctx, src.IO, src.Addrs.Head().Start, int(src.Addrs.Head().Length()), src.Opts)
   949  	if err != nil && err != linuxerr.ENAMETOOLONG {
   950  		return 0, err
   951  	}
   952  
   953  	str = strings.TrimSpace(str)
   954  	v, err := strconv.ParseInt(str, 0, 32)
   955  	if err != nil {
   956  		return 0, linuxerr.EINVAL
   957  	}
   958  
   959  	if o.task.ExitState() == kernel.TaskExitDead {
   960  		return 0, linuxerr.ESRCH
   961  	}
   962  	if err := o.task.SetOOMScoreAdj(int32(v)); err != nil {
   963  		return 0, err
   964  	}
   965  
   966  	return src.NumBytes(), nil
   967  }
   968  
// exeSymlink is a symlink for the /proc/[pid]/exe file. Its target is
// resolved on demand from the executable of the task's memory manager.
//
// +stateify savable
type exeSymlink struct {
	implStatFS
	kernfs.InodeAttrs
	kernfs.InodeNoopRefCount
	kernfs.InodeNotAnonymous
	kernfs.InodeSymlink
	kernfs.InodeWatches

	// fs is the owning procfs filesystem; Readlink uses it to drop
	// references via SafeDecRef.
	fs *filesystem
	// task is the task whose executable the link points to.
	task *kernel.Task
}

// Compile-time check that exeSymlink satisfies kernfs.Inode.
var _ kernfs.Inode = (*exeSymlink)(nil)
   985  
   986  func (fs *filesystem) newExeSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode {
   987  	inode := &exeSymlink{
   988  		fs:   fs,
   989  		task: task,
   990  	}
   991  	inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
   992  	return inode
   993  }
   994  
   995  // Readlink implements kernfs.Inode.Readlink.
   996  func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
   997  	exec, _, err := s.Getlink(ctx, nil)
   998  	if err != nil {
   999  		return "", err
  1000  	}
  1001  	defer s.fs.SafeDecRef(ctx, exec)
  1002  
  1003  	root := vfs.RootFromContext(ctx)
  1004  	if !root.Ok() {
  1005  		panic("procfs Readlink requires context with root value")
  1006  	}
  1007  	defer s.fs.SafeDecRef(ctx, root)
  1008  
  1009  	vfsObj := exec.Mount().Filesystem().VirtualFilesystem()
  1010  	name, _ := vfsObj.PathnameWithDeleted(ctx, root, exec)
  1011  	return name, nil
  1012  }
  1013  
  1014  // Getlink implements kernfs.Inode.Getlink.
  1015  func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
  1016  	if !kernel.ContextCanTrace(ctx, s.task, false) {
  1017  		return vfs.VirtualDentry{}, "", linuxerr.EACCES
  1018  	}
  1019  	if err := checkTaskState(s.task); err != nil {
  1020  		return vfs.VirtualDentry{}, "", err
  1021  	}
  1022  
  1023  	mm := getMM(s.task)
  1024  	if mm == nil {
  1025  		return vfs.VirtualDentry{}, "", linuxerr.EACCES
  1026  	}
  1027  
  1028  	// The MemoryManager may be destroyed, in which case
  1029  	// MemoryManager.destroy will simply set the executable to nil
  1030  	// (with locks held).
  1031  	exec := mm.Executable()
  1032  	if exec == nil {
  1033  		return vfs.VirtualDentry{}, "", linuxerr.ESRCH
  1034  	}
  1035  	defer exec.DecRef(ctx)
  1036  
  1037  	vd := exec.VirtualDentry()
  1038  	vd.IncRef()
  1039  	return vd, "", nil
  1040  }
  1041  
// cwdSymlink is a symlink for the /proc/[pid]/cwd file. Its target is
// resolved on demand from the working directory of the task's FSContext.
//
// +stateify savable
type cwdSymlink struct {
	implStatFS
	kernfs.InodeAttrs
	kernfs.InodeNoopRefCount
	kernfs.InodeNotAnonymous
	kernfs.InodeSymlink
	kernfs.InodeWatches

	// fs is the owning procfs filesystem; Readlink uses it to drop
	// references via SafeDecRef.
	fs *filesystem
	// task is the task whose working directory the link points to.
	task *kernel.Task
}

// Compile-time check that cwdSymlink satisfies kernfs.Inode.
var _ kernfs.Inode = (*cwdSymlink)(nil)
  1058  
  1059  func (fs *filesystem) newCwdSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode {
  1060  	inode := &cwdSymlink{
  1061  		fs:   fs,
  1062  		task: task,
  1063  	}
  1064  	inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
  1065  	return inode
  1066  }
  1067  
  1068  // Readlink implements kernfs.Inode.Readlink.
  1069  func (s *cwdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
  1070  	cwd, _, err := s.Getlink(ctx, nil)
  1071  	if err != nil {
  1072  		return "", err
  1073  	}
  1074  	defer s.fs.SafeDecRef(ctx, cwd)
  1075  
  1076  	root := vfs.RootFromContext(ctx)
  1077  	if !root.Ok() {
  1078  		panic("procfs Readlink requires context with root value")
  1079  	}
  1080  	defer s.fs.SafeDecRef(ctx, root)
  1081  
  1082  	vfsObj := cwd.Mount().Filesystem().VirtualFilesystem()
  1083  	name, _ := vfsObj.PathnameWithDeleted(ctx, root, cwd)
  1084  	return name, nil
  1085  }
  1086  
  1087  // Getlink implements kernfs.Inode.Getlink.
  1088  func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
  1089  	if !kernel.ContextCanTrace(ctx, s.task, false) {
  1090  		return vfs.VirtualDentry{}, "", linuxerr.EACCES
  1091  	}
  1092  	if err := checkTaskState(s.task); err != nil {
  1093  		return vfs.VirtualDentry{}, "", err
  1094  	}
  1095  	cwd := s.task.FSContext().WorkingDirectory()
  1096  	if !cwd.Ok() {
  1097  		// It could have raced with process deletion.
  1098  		return vfs.VirtualDentry{}, "", linuxerr.ESRCH
  1099  	}
  1100  	// The reference is transferred to the caller.
  1101  	return cwd, "", nil
  1102  }
  1103  
// rootSymlink is a symlink for the /proc/[pid]/root file. Its target is
// resolved on demand from the root directory of the task's FSContext.
//
// +stateify savable
type rootSymlink struct {
	implStatFS
	kernfs.InodeAttrs
	kernfs.InodeNoopRefCount
	kernfs.InodeNotAnonymous
	kernfs.InodeSymlink
	kernfs.InodeWatches

	// fs is the owning procfs filesystem; Readlink uses it to drop
	// references via SafeDecRef.
	fs *filesystem
	// task is the task whose root directory the link points to.
	task *kernel.Task
}

// Compile-time check that rootSymlink satisfies kernfs.Inode.
var _ kernfs.Inode = (*rootSymlink)(nil)
  1120  
  1121  func (fs *filesystem) newRootSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode {
  1122  	inode := &rootSymlink{
  1123  		fs:   fs,
  1124  		task: task,
  1125  	}
  1126  	inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
  1127  	return inode
  1128  }
  1129  
  1130  // Readlink implements kernfs.Inode.Readlink.
  1131  func (s *rootSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
  1132  	root, _, err := s.Getlink(ctx, nil)
  1133  	if err != nil {
  1134  		return "", err
  1135  	}
  1136  	defer s.fs.SafeDecRef(ctx, root)
  1137  
  1138  	vfsRoot := vfs.RootFromContext(ctx)
  1139  	if !vfsRoot.Ok() {
  1140  		panic("procfs Readlink requires context with root value")
  1141  	}
  1142  	defer s.fs.SafeDecRef(ctx, vfsRoot)
  1143  
  1144  	vfsObj := root.Mount().Filesystem().VirtualFilesystem()
  1145  	name, _ := vfsObj.PathnameWithDeleted(ctx, vfsRoot, root)
  1146  	return name, nil
  1147  }
  1148  
  1149  // Getlink implements kernfs.Inode.Getlink.
  1150  func (s *rootSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
  1151  	if !kernel.ContextCanTrace(ctx, s.task, false) {
  1152  		return vfs.VirtualDentry{}, "", linuxerr.EACCES
  1153  	}
  1154  	if err := checkTaskState(s.task); err != nil {
  1155  		return vfs.VirtualDentry{}, "", err
  1156  	}
  1157  	root := s.task.FSContext().RootDirectory()
  1158  	if !root.Ok() {
  1159  		// It could have raced with process deletion.
  1160  		return vfs.VirtualDentry{}, "", linuxerr.ESRCH
  1161  	}
  1162  	// The reference is transferred to the caller.
  1163  	return root, "", nil
  1164  }
  1165  
// mountInfoData is used to implement /proc/[pid]/mountinfo.
//
// +stateify savable
type mountInfoData struct {
	kernfs.DynamicBytesFile

	// fs is the owning procfs filesystem; Generate uses it to drop the root
	// directory reference via SafeDecRef.
	fs *filesystem
	// task is the task whose root directory determines what is reported.
	task *kernel.Task
}

// Compile-time check that mountInfoData satisfies dynamicInode.
var _ dynamicInode = (*mountInfoData)(nil)
  1177  
  1178  // Generate implements vfs.DynamicBytesSource.Generate.
  1179  func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
  1180  	var fsctx *kernel.FSContext
  1181  	i.task.WithMuLocked(func(t *kernel.Task) {
  1182  		fsctx = t.FSContext()
  1183  	})
  1184  	if fsctx == nil {
  1185  		// The task has been destroyed. Nothing to show here.
  1186  		return nil
  1187  	}
  1188  	rootDir := fsctx.RootDirectory()
  1189  	if !rootDir.Ok() {
  1190  		// Root has been destroyed. Don't try to read mounts.
  1191  		return nil
  1192  	}
  1193  	defer i.fs.SafeDecRef(ctx, rootDir)
  1194  	i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf)
  1195  	return nil
  1196  }
  1197  
// mountsData is used to implement /proc/[pid]/mounts.
//
// +stateify savable
type mountsData struct {
	kernfs.DynamicBytesFile

	// fs is the owning procfs filesystem; Generate uses it to drop the root
	// directory reference via SafeDecRef.
	fs *filesystem
	// task is the task whose root directory determines what is reported.
	task *kernel.Task
}

// Compile-time check that mountsData satisfies dynamicInode.
var _ dynamicInode = (*mountsData)(nil)
  1209  
  1210  // Generate implements vfs.DynamicBytesSource.Generate.
  1211  func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
  1212  	var fsctx *kernel.FSContext
  1213  	i.task.WithMuLocked(func(t *kernel.Task) {
  1214  		fsctx = t.FSContext()
  1215  	})
  1216  	if fsctx == nil {
  1217  		// The task has been destroyed. Nothing to show here.
  1218  		return nil
  1219  	}
  1220  	rootDir := fsctx.RootDirectory()
  1221  	if !rootDir.Ok() {
  1222  		// Root has been destroyed. Don't try to read mounts.
  1223  		return nil
  1224  	}
  1225  	defer i.fs.SafeDecRef(ctx, rootDir)
  1226  	i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf)
  1227  	return nil
  1228  }
  1229  
// namespaceSymlink is a symlink under /proc/[pid]/ns/. Depending on nsType it
// either resolves to the nsfs inode of one of the task's namespaces or falls
// back to the static target of the embedded StaticSymlink.
//
// +stateify savable
type namespaceSymlink struct {
	kernfs.StaticSymlink

	// task is the task whose namespace the link refers to.
	task *kernel.Task
	// nsType selects the namespace looked up by getInode (only
	// linux.CLONE_NEWNET is handled there). Zero means the link has no
	// backing nsfs inode and the static target is used instead.
	nsType int
}
  1237  
  1238  func (fs *filesystem) newNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64, nsType int) kernfs.Inode {
  1239  	inode := &namespaceSymlink{task: task, nsType: nsType}
  1240  
  1241  	// Note: credentials are overridden by taskOwnedInode.
  1242  	inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, "")
  1243  
  1244  	taskInode := &taskOwnedInode{Inode: inode, owner: task}
  1245  	return taskInode
  1246  }
  1247  
  1248  func (fs *filesystem) newPIDNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode {
  1249  	target := fmt.Sprintf("pid:[%d]", task.PIDNamespace().ID())
  1250  
  1251  	inode := &namespaceSymlink{task: task}
  1252  	// Note: credentials are overridden by taskOwnedInode.
  1253  	inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target)
  1254  
  1255  	taskInode := &taskOwnedInode{Inode: inode, owner: task}
  1256  	return taskInode
  1257  }
  1258  
  1259  func (fs *filesystem) newFakeNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64, ns string) kernfs.Inode {
  1260  	// Namespace symlinks should contain the namespace name and the inode number
  1261  	// for the namespace instance, so for example user:[123456]. We currently fake
  1262  	// the inode number by sticking the symlink inode in its place.
  1263  	target := fmt.Sprintf("%s:[%d]", ns, ino)
  1264  
  1265  	inode := &namespaceSymlink{task: task}
  1266  	// Note: credentials are overridden by taskOwnedInode.
  1267  	inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target)
  1268  
  1269  	taskInode := &taskOwnedInode{Inode: inode, owner: task}
  1270  	return taskInode
  1271  }
  1272  
  1273  func (s *namespaceSymlink) getInode(t *kernel.Task) *nsfs.Inode {
  1274  	switch s.nsType {
  1275  	case linux.CLONE_NEWNET:
  1276  		return t.GetNetworkNamespace().GetInode()
  1277  	default:
  1278  		panic("unknown namespace")
  1279  	}
  1280  }
  1281  
  1282  // Readlink implements kernfs.Inode.Readlink.
  1283  func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
  1284  	if err := checkTaskState(s.task); err != nil {
  1285  		return "", err
  1286  	}
  1287  	if s.nsType != 0 {
  1288  		inode := s.getInode(s.task)
  1289  		if inode == nil {
  1290  			return "", linuxerr.ENOENT
  1291  		}
  1292  		target := inode.Name()
  1293  		inode.DecRef(ctx)
  1294  		return target, nil
  1295  	}
  1296  	return s.StaticSymlink.Readlink(ctx, mnt)
  1297  }
  1298  
  1299  // Getlink implements kernfs.Inode.Getlink.
  1300  func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
  1301  	if err := checkTaskState(s.task); err != nil {
  1302  		return vfs.VirtualDentry{}, "", err
  1303  	}
  1304  
  1305  	if s.nsType != 0 {
  1306  		inode := s.getInode(s.task)
  1307  		if inode == nil {
  1308  			return vfs.VirtualDentry{}, "", linuxerr.ENOENT
  1309  		}
  1310  		defer inode.DecRef(ctx)
  1311  		return inode.VirtualDentry(), "", nil
  1312  	}
  1313  	// Create a synthetic inode to represent the namespace.
  1314  	fs := mnt.Filesystem().Impl().(*filesystem)
  1315  	nsInode := &namespaceInode{}
  1316  	nsInode.Init(ctx, auth.CredentialsFromContext(ctx), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0444)
  1317  	dentry := &kernfs.Dentry{}
  1318  	dentry.Init(&fs.Filesystem, nsInode)
  1319  	vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry())
  1320  	// Only IncRef vd.Mount() because vd.Dentry() already holds a ref of 1.
  1321  	mnt.IncRef()
  1322  	return vd, "", nil
  1323  }
  1324  
// namespaceInode is a synthetic inode created to represent a namespace in
// /proc/[pid]/ns/*. It is initialized as a regular file and opens as a
// namespaceFD.
//
// +stateify savable
type namespaceInode struct {
	implStatFS
	kernfs.InodeAttrs
	kernfs.InodeNoopRefCount
	kernfs.InodeNotAnonymous
	kernfs.InodeNotDirectory
	kernfs.InodeNotSymlink
	kernfs.InodeWatches

	// locks is handed to the LockFD of every file description opened on
	// this inode.
	locks vfs.FileLocks
}

// Compile-time check that namespaceInode satisfies kernfs.Inode.
var _ kernfs.Inode = (*namespaceInode)(nil)
  1342  
  1343  // Init initializes a namespace inode.
  1344  func (i *namespaceInode) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
  1345  	if perm&^linux.PermissionsMask != 0 {
  1346  		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
  1347  	}
  1348  	i.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
  1349  }
  1350  
  1351  // Open implements kernfs.Inode.Open.
  1352  func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
  1353  	fd := &namespaceFD{inode: i}
  1354  	i.IncRef()
  1355  	fd.LockFD.Init(&i.locks)
  1356  	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
  1357  		return nil, err
  1358  	}
  1359  	return &fd.vfsfd, nil
  1360  }
  1361  
// namespaceFD is a synthetic file that represents a namespace in
// /proc/[pid]/ns/*.
//
// +stateify savable
type namespaceFD struct {
	vfs.FileDescriptionDefaultImpl
	vfs.LockFD

	// vfsfd is the VFS file description returned by namespaceInode.Open.
	vfsfd vfs.FileDescription
	// inode is the backing inode; Stat and SetStat are forwarded to it and
	// Release drops the reference taken in Open.
	inode *namespaceInode
}

// Compile-time check that namespaceFD satisfies vfs.FileDescriptionImpl.
var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil)
  1375  
  1376  // Stat implements vfs.FileDescriptionImpl.Stat.
  1377  func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
  1378  	vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
  1379  	return fd.inode.Stat(ctx, vfs, opts)
  1380  }
  1381  
  1382  // SetStat implements vfs.FileDescriptionImpl.SetStat.
  1383  func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
  1384  	vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
  1385  	creds := auth.CredentialsFromContext(ctx)
  1386  	return fd.inode.SetStat(ctx, vfs, creds, opts)
  1387  }
  1388  
// Release implements vfs.FileDescriptionImpl.Release.
//
// It drops the inode reference that namespaceInode.Open took for this file
// description.
func (fd *namespaceFD) Release(ctx context.Context) {
	fd.inode.DecRef(ctx)
}
  1393  
// taskCgroupData generates data for /proc/[pid]/cgroup.
//
// +stateify savable
type taskCgroupData struct {
	dynamicBytesFileSetAttr
	// task is the task whose cgroup membership is reported.
	task *kernel.Task
}

// Compile-time check that taskCgroupData satisfies dynamicInode.
var _ dynamicInode = (*taskCgroupData)(nil)
  1403  
  1404  // Generate implements vfs.DynamicBytesSource.Generate.
  1405  func (d *taskCgroupData) Generate(ctx context.Context, buf *bytes.Buffer) error {
  1406  	// When a task is existing on Linux, a task's cgroup set is cleared and
  1407  	// reset to the initial cgroup set, which is essentially the set of root
  1408  	// cgroups. Because of this, the /proc/<pid>/cgroup file is always readable
  1409  	// on Linux throughout a task's lifetime.
  1410  	//
  1411  	// The sentry removes tasks from cgroups during the exit process, but
  1412  	// doesn't move them into an initial cgroup set, so partway through task
  1413  	// exit this file show a task is in no cgroups, which is incorrect. Instead,
  1414  	// once a task has left its cgroups, we return an error.
  1415  	if d.task.ExitState() >= kernel.TaskExitInitiated {
  1416  		return linuxerr.ESRCH
  1417  	}
  1418  
  1419  	d.task.GenerateProcTaskCgroup(buf)
  1420  	return nil
  1421  }