github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/control/proc.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package control
    16  
    17  import (
    18  	"bytes"
    19  	"encoding/json"
    20  	"fmt"
    21  	"sort"
    22  	"strings"
    23  	"text/tabwriter"
    24  	"time"
    25  
    26  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    27  	"github.com/SagerNet/gvisor/pkg/fd"
    28  	"github.com/SagerNet/gvisor/pkg/sentry/fdimport"
    29  	"github.com/SagerNet/gvisor/pkg/sentry/fs"
    30  	"github.com/SagerNet/gvisor/pkg/sentry/fs/host"
    31  	"github.com/SagerNet/gvisor/pkg/sentry/fs/user"
    32  	hostvfs2 "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/host"
    33  	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
    34  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    35  	ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
    36  	"github.com/SagerNet/gvisor/pkg/sentry/limits"
    37  	"github.com/SagerNet/gvisor/pkg/sentry/usage"
    38  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    39  	"github.com/SagerNet/gvisor/pkg/urpc"
    40  )
    41  
    42  // Proc includes task-related functions.
    43  //
    44  // Its methods currently cover process execution (Exec) and listing (Ps).
    45  type Proc struct {
        	// Kernel is the kernel in which processes are created and listed.
    46  	Kernel *kernel.Kernel
    47  }
    48  
    49  // ExecArgs is the set of arguments to exec.
    50  type ExecArgs struct {
    51  	// Filename is the filename to load.
    52  	//
    53  	// If this is provided as "", then the file will be guessed via Argv[0].
    54  	Filename string `json:"filename"`
    55  
    56  	// Argv is a list of arguments.
    57  	Argv []string `json:"argv"`
    58  
    59  	// Envv is a list of environment variables.
    60  	Envv []string `json:"envv"`
    61  
    62  	// MountNamespace is the mount namespace to execute the new process in.
    63  	// A reference on MountNamespace must be held for the lifetime of the
    64  	// ExecArgs. If MountNamespace is nil, it will default to the init
    65  	// process's MountNamespace.
    66  	MountNamespace *fs.MountNamespace
    67  
    68  	// MountNamespaceVFS2 is the VFS2 mount namespace to execute the new
    69  	// process in. A reference on MountNamespaceVFS2 must be held for the
    70  	// lifetime of the ExecArgs. If MountNamespaceVFS2 is nil and VFS2 is
    71  	// enabled, it will default to the init process's MountNamespaceVFS2.
    72  	MountNamespaceVFS2 *vfs.MountNamespace
    73  
    74  	// WorkingDirectory defines the working directory for the new process.
    75  	WorkingDirectory string `json:"wd"`
    76  
    77  	// KUID is the UID to run with in the root user namespace. Defaults to
    78  	// root if not set explicitly.
    79  	KUID auth.KUID
    80  
    81  	// KGID is the GID to run with in the root user namespace. Defaults to
    82  	// the root group if not set explicitly.
    83  	KGID auth.KGID
    84  
    85  	// ExtraKGIDs is the list of additional groups to which the user belongs.
    86  	ExtraKGIDs []auth.KGID
    87  
    88  	// Capabilities is the list of capabilities to give to the process.
    89  	Capabilities *auth.TaskCapabilities
    90  
    91  	// StdioIsPty indicates that FDs 0, 1, and 2 are connected to a host pty FD.
    92  	StdioIsPty bool
    93  
    94  	// FilePayload determines the files to give to the new process.
    95  	urpc.FilePayload
    96  
    97  	// ContainerID is the container for the process being executed.
    98  	ContainerID string
    99  
   100  	// PIDNamespace is the pid namespace for the process being executed.
        	// If nil, the kernel's root PID namespace is used.
   101  	PIDNamespace *kernel.PIDNamespace
   102  
   103  	// Limits is the limit set for the process being executed.
        	// If nil, a new limit set from limits.NewLimitSet is used.
   104  	Limits *limits.LimitSet
   105  }
   106  
   107  // String prints the arguments as a string.
   108  func (args ExecArgs) String() string {
   109  	if len(args.Argv) == 0 {
   110  		return args.Filename
   111  	}
   112  	a := make([]string, len(args.Argv))
   113  	copy(a, args.Argv)
   114  	if args.Filename != "" {
   115  		a[0] = args.Filename
   116  	}
   117  	return strings.Join(a, " ")
   118  }
   119  
   120  // Exec runs a new task.
   121  func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error {
   122  	newTG, _, _, _, err := proc.execAsync(args)
   123  	if err != nil {
   124  		return err
   125  	}
   126  
   127  	// Wait for completion.
   128  	newTG.WaitExited()
   129  	*waitStatus = newTG.ExitStatus().Status()
   130  	return nil
   131  }
   132  
   133  // ExecAsync runs a new task, but doesn't wait for it to finish. It is defined
   134  // as a function rather than a method to avoid exposing execAsync as an RPC.
   135  func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
   136  	return proc.execAsync(args)
   137  }
   138  
   139  // execAsync runs a new task, but doesn't wait for it to finish. It returns the
   140  // newly created thread group and its PID. If the stdio FDs are TTYs, then the
   141  // file that wraps the TTY is also returned (a TTYFileOperations, or a
        // TTYFileDescription under VFS2). On error, every other result is zero.
   142  func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
   143  	// Create the FD table for the new process; files are imported into it below.
   144  	fdTable := proc.Kernel.NewFDTable()
   145  
   146  	creds := auth.NewUserCredentials(
   147  		args.KUID,
   148  		args.KGID,
   149  		args.ExtraKGIDs,
   150  		args.Capabilities,
   151  		proc.Kernel.RootUserNamespace())
   152  
        	// Fall back to the root PID namespace and a fresh limit set when the
        	// caller did not supply them.
   153  	pidns := args.PIDNamespace
   154  	if pidns == nil {
   155  		pidns = proc.Kernel.RootPIDNamespace()
   156  	}
   157  	limitSet := args.Limits
   158  	if limitSet == nil {
   159  		limitSet = limits.NewLimitSet()
   160  	}
   161  	initArgs := kernel.CreateProcessArgs{
   162  		Filename:                args.Filename,
   163  		Argv:                    args.Argv,
   164  		Envv:                    args.Envv,
   165  		WorkingDirectory:        args.WorkingDirectory,
   166  		MountNamespace:          args.MountNamespace,
   167  		MountNamespaceVFS2:      args.MountNamespaceVFS2,
   168  		Credentials:             creds,
   169  		FDTable:                 fdTable,
   170  		Umask:                   0022,
   171  		Limits:                  limitSet,
   172  		MaxSymlinkTraversals:    linux.MaxSymlinkTraversals,
   173  		UTSNamespace:            proc.Kernel.RootUTSNamespace(),
   174  		IPCNamespace:            proc.Kernel.RootIPCNamespace(),
   175  		AbstractSocketNamespace: proc.Kernel.RootAbstractSocketNamespace(),
   176  		ContainerID:             args.ContainerID,
   177  		PIDNamespace:            pidns,
   178  	}
   179  	if initArgs.MountNamespace != nil {
   180  		// initArgs must hold a reference on MountNamespace, which will
   181  		// be donated to the new process in CreateProcess.
   182  		initArgs.MountNamespace.IncRef()
   183  	}
   184  	if initArgs.MountNamespaceVFS2 != nil {
   185  		// initArgs must hold a reference on MountNamespaceVFS2, which will
   186  		// be donated to the new process in CreateProcess.
   187  		initArgs.MountNamespaceVFS2.IncRef()
   188  	}
   189  	ctx := initArgs.NewContext(proc.Kernel)
        	// Drop this function's reference on fdTable when it returns.
   190  	defer fdTable.DecRef(ctx)
   191  
   192  	if kernel.VFS2Enabled {
   193  		// Fall back to the init process's VFS2 mount namespace if none was given.
   194  		if initArgs.MountNamespaceVFS2 == nil {
   195  			// Set initArgs so that 'ctx' returns the namespace.
   196  			//
   197  			// Add a reference to the namespace, which is transferred to the new process.
   198  			initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2()
   199  			initArgs.MountNamespaceVFS2.IncRef()
   200  		}
   201  	} else {
   202  		if initArgs.MountNamespace == nil {
   203  			// Set initArgs so that 'ctx' returns the namespace.
   204  			initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace()
   205  
   206  			// initArgs must hold a reference on MountNamespace, which will
   207  			// be donated to the new process in CreateProcess.
   208  			initArgs.MountNamespace.IncRef()
   209  		}
   210  	}
        	// NOTE(review): the mount namespace references taken above are not
        	// dropped on the error returns below; confirm who releases them when
        	// the process is never created.
        	//
        	// Get the full path to the filename (presumably by searching PATH from
        	// Envv; see user.ResolveExecutablePath) and run that path instead.
   211  	resolved, err := user.ResolveExecutablePath(ctx, &initArgs)
   212  	if err != nil {
   213  		return nil, 0, nil, nil, err
   214  	}
   215  	initArgs.Filename = resolved
   216  
        	// Duplicate the payload files. The duplicates are closed when this
        	// function returns, whether or not the process was started.
   217  	fds, err := fd.NewFromFiles(args.Files)
   218  	if err != nil {
   219  		return nil, 0, nil, nil, fmt.Errorf("duplicating payload files: %w", err)
   220  	}
   221  	defer func() {
   222  		for _, fd := range fds {
   223  			_ = fd.Close()
   224  		}
   225  	}()
        	// Import the files into fdTable. At most one of the two TTY results is
        	// expected to be non-nil (see the switch below).
   226  	ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, args.StdioIsPty, fds)
   227  	if err != nil {
   228  		return nil, 0, nil, nil, err
   229  	}
   230  
   231  	tg, tid, err := proc.Kernel.CreateProcess(initArgs)
   232  	if err != nil {
   233  		return nil, 0, nil, nil, err
   234  	}
   235  
   236  	// Set the foreground process group on the TTY before starting the process.
   237  	switch {
   238  	case ttyFile != nil:
   239  		ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
   240  	case ttyFileVFS2 != nil:
   241  		ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup())
   242  	}
   243  
   244  	// Start the newly created process.
   245  	proc.Kernel.StartProcess(tg)
   246  
   247  	return tg, tid, ttyFile, ttyFileVFS2, nil
   248  }
   249  
   250  // PsArgs is the set of arguments to ps.
   251  type PsArgs struct {
   252  	// JSON selects the format of Ps's output: JSON if true, a text table if false.
   253  	JSON bool
   254  }
   255  
   256  // Ps provides a process listing for the running kernel.
   257  func (proc *Proc) Ps(args *PsArgs, out *string) error {
   258  	var p []*Process
   259  	if e := Processes(proc.Kernel, "", &p); e != nil {
   260  		return e
   261  	}
   262  	if !args.JSON {
   263  		*out = ProcessListToTable(p)
   264  	} else {
   265  		s, e := ProcessListToJSON(p)
   266  		if e != nil {
   267  			return e
   268  		}
   269  		*out = s
   270  	}
   271  	return nil
   272  }
   273  
   274  // Process contains information about a single process in a Sandbox.
   275  type Process struct {
   276  	UID auth.KUID       `json:"uid"`
   277  	PID kernel.ThreadID `json:"pid"`
   278  	// Parent PID
   279  	PPID    kernel.ThreadID   `json:"ppid"`
   280  	Threads []kernel.ThreadID `json:"threads"`
   281  	// Processor utilization as a percentage (percentCPU caps it at 99)
   282  	C int32 `json:"c"`
   283  	// TTY name of the process. Will be of the form "pts/N" if there is a
   284  	// TTY, or "?" if there is not.
   285  	TTY string `json:"tty"`
   286  	// Start time, formatted as HH:MM, MonDD or YYYY depending on its age
   287  	STime string `json:"stime"`
   288  	// CPU time; Processes currently fills this with system time only
   289  	Time string `json:"time"`
   290  	// Executable shortname (e.g. "sh" for /bin/sh)
   291  	Cmd string `json:"cmd"`
   292  }
   293  
   294  // ProcessListToTable prints a table with the following format:
   295  // UID       PID       PPID      C         TTY		STIME     TIME       CMD
   296  // 0         1         0         0         pty/4	14:04     505262ns   tail
   297  func ProcessListToTable(pl []*Process) string {
   298  	var buf bytes.Buffer
   299  	tw := tabwriter.NewWriter(&buf, 10, 1, 3, ' ', 0)
   300  	fmt.Fprint(tw, "UID\tPID\tPPID\tC\tTTY\tSTIME\tTIME\tCMD")
   301  	for _, d := range pl {
   302  		fmt.Fprintf(tw, "\n%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s",
   303  			d.UID,
   304  			d.PID,
   305  			d.PPID,
   306  			d.C,
   307  			d.TTY,
   308  			d.STime,
   309  			d.Time,
   310  			d.Cmd)
   311  	}
   312  	tw.Flush()
   313  	return buf.String()
   314  }
   315  
   316  // ProcessListToJSON will return the JSON representation of ps.
   317  func ProcessListToJSON(pl []*Process) (string, error) {
   318  	b, err := json.MarshalIndent(pl, "", "  ")
   319  	if err != nil {
   320  		return "", fmt.Errorf("couldn't marshal process list %v: %v", pl, err)
   321  	}
   322  	return string(b), nil
   323  }
   324  
   325  // PrintPIDsJSON prints a JSON object containing only the PIDs in pl. This
   326  // behavior is the same as runc's.
   327  func PrintPIDsJSON(pl []*Process) (string, error) {
   328  	pids := make([]kernel.ThreadID, 0, len(pl))
   329  	for _, d := range pl {
   330  		pids = append(pids, d.PID)
   331  	}
   332  	b, err := json.Marshal(pids)
   333  	if err != nil {
   334  		return "", fmt.Errorf("couldn't marshal PIDs %v: %v", pids, err)
   335  	}
   336  	return string(b), nil
   337  }
   338  
   339  // Processes retrieves information about processes running in the sandbox with
   340  // the given container id. All processes are returned if 'containerID' is empty.
   341  func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error {
   342  	ts := k.TaskSet()
   343  	now := k.RealtimeClock().Now()
   344  	pidns := ts.Root
   345  	for _, tg := range pidns.ThreadGroups() {
   346  		pid := pidns.IDOfThreadGroup(tg)
   347  
   348  		// If tg has already been reaped ignore it.
   349  		if pid == 0 {
   350  			continue
   351  		}
   352  		if containerID != "" && containerID != tg.Leader().ContainerID() {
   353  			continue
   354  		}
   355  
   356  		ppid := kernel.ThreadID(0)
   357  		if p := tg.Leader().Parent(); p != nil {
   358  			ppid = pidns.IDOfThreadGroup(p.ThreadGroup())
   359  		}
   360  		threads := tg.MemberIDs(pidns)
   361  		*out = append(*out, &Process{
   362  			UID:     tg.Leader().Credentials().EffectiveKUID,
   363  			PID:     pid,
   364  			PPID:    ppid,
   365  			Threads: threads,
   366  			STime:   formatStartTime(now, tg.Leader().StartTime()),
   367  			C:       percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now),
   368  			Time:    tg.CPUStats().SysTime.String(),
   369  			Cmd:     tg.Leader().Name(),
   370  			TTY:     ttyName(tg.TTY()),
   371  		})
   372  	}
   373  	sort.Slice(*out, func(i, j int) bool { return (*out)[i].PID < (*out)[j].PID })
   374  	return nil
   375  }
   376  
   377  // formatStartTime formats startTime depending on the current time:
   378  // - If startTime was today, HH:MM is used.
   379  // - If startTime was not today but was this year, MonDD is used (e.g. Jan02)
   380  // - If startTime was not this year, the year is used.
   381  func formatStartTime(now, startTime ktime.Time) string {
   382  	nowS, nowNs := now.Unix()
   383  	n := time.Unix(nowS, nowNs)
   384  	startTimeS, startTimeNs := startTime.Unix()
   385  	st := time.Unix(startTimeS, startTimeNs)
   386  	format := "15:04"
   387  	if st.YearDay() != n.YearDay() {
   388  		format = "Jan02"
   389  	}
   390  	if st.Year() != n.Year() {
   391  		format = "2006"
   392  	}
   393  	return st.Format(format)
   394  }
   395  
   396  func percentCPU(stats usage.CPUStats, startTime, now ktime.Time) int32 {
   397  	// Note: In procps, there is an option to include child CPU stats. As
   398  	// it is disabled by default, we do not include them.
   399  	total := stats.UserTime + stats.SysTime
   400  	lifetime := now.Sub(startTime)
   401  	if lifetime <= 0 {
   402  		return 0
   403  	}
   404  	percentCPU := total * 100 / lifetime
   405  	// Cap at 99% since procps does the same.
   406  	if percentCPU > 99 {
   407  		percentCPU = 99
   408  	}
   409  	return int32(percentCPU)
   410  }
   411  
   412  func ttyName(tty *kernel.TTY) string {
   413  	if tty == nil {
   414  		return "?"
   415  	}
   416  	return fmt.Sprintf("pts/%d", tty.Index)
   417  }
   418  
   419  // ContainerUsage retrieves per-container CPU usage.
   420  func ContainerUsage(kr *kernel.Kernel) map[string]uint64 {
   421  	cusage := make(map[string]uint64)
   422  	for _, tg := range kr.TaskSet().Root.ThreadGroups() {
   423  		// We want each tg's usage including reaped children.
   424  		cid := tg.Leader().ContainerID()
   425  		stats := tg.CPUStats()
   426  		stats.Accumulate(tg.JoinedChildCPUStats())
   427  		cusage[cid] += uint64(stats.UserTime.Nanoseconds()) + uint64(stats.SysTime.Nanoseconds())
   428  	}
   429  	return cusage
   430  }