github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/control/proc.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package control
    16  
    17  import (
    18  	"bytes"
    19  	"encoding/json"
    20  	"fmt"
    21  	"os"
    22  	"sort"
    23  	"strings"
    24  	"text/tabwriter"
    25  	"time"
    26  
    27  	"github.com/metacubex/gvisor/pkg/abi/linux"
    28  	"github.com/metacubex/gvisor/pkg/fd"
    29  	"github.com/metacubex/gvisor/pkg/log"
    30  	"github.com/metacubex/gvisor/pkg/sentry/fdimport"
    31  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/host"
    32  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/user"
    33  	"github.com/metacubex/gvisor/pkg/sentry/kernel"
    34  	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
    35  	ktime "github.com/metacubex/gvisor/pkg/sentry/kernel/time"
    36  	"github.com/metacubex/gvisor/pkg/sentry/limits"
    37  	"github.com/metacubex/gvisor/pkg/sentry/usage"
    38  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    39  	"github.com/metacubex/gvisor/pkg/urpc"
    40  )
    41  
// Proc includes task-related functions.
//
// It currently provides process execution (Exec) and process listing (Ps).
type Proc struct {
	// Kernel is the kernel that new processes are created in and whose
	// tasks are listed.
	Kernel *kernel.Kernel
}
    48  
// FilePayload couples the files transferred via RPC with the guest file
// descriptors they are meant for. Constructing it through the NewFilePayload
// helper keeps the two lists consistent with each other.
type FilePayload struct {
	// FilePayload is the file payload that is transferred via RPC.
	urpc.FilePayload

	// GuestFDs are the file descriptors in the file descriptor map of the
	// executed application. They correspond 1:1 to the files in the
	// urpc.FilePayload. If a program is executed from a host file descriptor,
	// the file payload may contain one additional file. In that case, the file
	// used for program execution is the last file in the Files array.
	GuestFDs []int
}
    62  
    63  // NewFilePayload returns a FilePayload that maps file descriptors to files inside
    64  // the executed process and provides a file for execution.
    65  func NewFilePayload(fdMap map[int]*os.File, execFile *os.File) FilePayload {
    66  	fileCount := len(fdMap)
    67  	if execFile != nil {
    68  		fileCount++
    69  	}
    70  	files := make([]*os.File, 0, fileCount)
    71  	guestFDs := make([]int, 0, len(fdMap))
    72  
    73  	// Make the map iteration order deterministic for the sake of testing.
    74  	// Otherwise, the order is randomized and tests relying on the comparison
    75  	// of equality will fail.
    76  	for key := range fdMap {
    77  		guestFDs = append(guestFDs, key)
    78  	}
    79  	sort.Ints(guestFDs)
    80  
    81  	for _, guestFD := range guestFDs {
    82  		files = append(files, fdMap[guestFD])
    83  	}
    84  
    85  	if execFile != nil {
    86  		files = append(files, execFile)
    87  	}
    88  
    89  	return FilePayload{
    90  		FilePayload: urpc.FilePayload{Files: files},
    91  		GuestFDs:    guestFDs,
    92  	}
    93  }
    94  
// ExecArgs is the set of arguments to exec.
type ExecArgs struct {
	// Filename is the filename to load.
	//
	// If this is provided as "", then the file will be guessed via Argv[0].
	// It must be empty when the payload carries a file to execute from;
	// execAsync rejects requests that supply both.
	Filename string `json:"filename"`

	// Argv is a list of arguments.
	Argv []string `json:"argv"`

	// Envv is a list of environment variables.
	Envv []string `json:"envv"`

	// MountNamespace is the mount namespace to execute the new process in.
	// A reference on MountNamespace must be held for the lifetime of the
	// ExecArgs. If MountNamespace is nil, it will default to the init
	// process's MountNamespace.
	MountNamespace *vfs.MountNamespace

	// WorkingDirectory defines the working directory for the new process.
	WorkingDirectory string `json:"wd"`

	// KUID is the UID to run with in the root user namespace. Defaults to
	// root if not set explicitly.
	KUID auth.KUID

	// KGID is the GID to run with in the root user namespace. Defaults to
	// the root group if not set explicitly.
	KGID auth.KGID

	// ExtraKGIDs is the list of additional groups to which the user belongs.
	ExtraKGIDs []auth.KGID

	// Capabilities is the list of capabilities to give to the process.
	Capabilities *auth.TaskCapabilities

	// StdioIsPty indicates that FDs 0, 1, and 2 are connected to a host pty FD.
	StdioIsPty bool

	// FilePayload determines the files to give to the new process. Its Files
	// and GuestFDs are turned into host FDs by unpackFiles.
	FilePayload

	// ContainerID is the container for the process being executed.
	ContainerID string

	// PIDNamespace is the pid namespace for the process being executed. If
	// nil, execAsync falls back to the kernel's root PID namespace.
	PIDNamespace *kernel.PIDNamespace

	// Limits is the limit set for the process being executed. If nil,
	// execAsync uses a fresh limits.NewLimitSet().
	Limits *limits.LimitSet
}
   146  
   147  // String prints the arguments as a string.
   148  func (args *ExecArgs) String() string {
   149  	if len(args.Argv) == 0 {
   150  		return args.Filename
   151  	}
   152  	a := make([]string, len(args.Argv))
   153  	copy(a, args.Argv)
   154  	if args.Filename != "" {
   155  		a[0] = args.Filename
   156  	}
   157  	return strings.Join(a, " ")
   158  }
   159  
   160  // Exec runs a new task.
   161  func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error {
   162  	newTG, _, _, err := proc.execAsync(args)
   163  	if err != nil {
   164  		return err
   165  	}
   166  
   167  	// Wait for completion.
   168  	newTG.WaitExited()
   169  	*waitStatus = uint32(newTG.ExitStatus())
   170  	return nil
   171  }
   172  
   173  // ExecAsync runs a new task, but doesn't wait for it to finish. It is defined
   174  // as a function rather than a method to avoid exposing execAsync as an RPC.
   175  func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileDescription, error) {
   176  	return proc.execAsync(args)
   177  }
   178  
   179  // execAsync runs a new task, but doesn't wait for it to finish. It returns the
   180  // newly created thread group and its PID. If the stdio FDs are TTYs, then a
   181  // TTYFileOperations that wraps the TTY is also returned.
   182  func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileDescription, error) {
   183  	// Import file descriptors.
   184  	fdTable := proc.Kernel.NewFDTable()
   185  
   186  	creds := auth.NewUserCredentials(
   187  		args.KUID,
   188  		args.KGID,
   189  		args.ExtraKGIDs,
   190  		args.Capabilities,
   191  		proc.Kernel.RootUserNamespace())
   192  
   193  	pidns := args.PIDNamespace
   194  	if pidns == nil {
   195  		pidns = proc.Kernel.RootPIDNamespace()
   196  	}
   197  	limitSet := args.Limits
   198  	if limitSet == nil {
   199  		limitSet = limits.NewLimitSet()
   200  	}
   201  	initArgs := kernel.CreateProcessArgs{
   202  		Filename:             args.Filename,
   203  		Argv:                 args.Argv,
   204  		Envv:                 args.Envv,
   205  		WorkingDirectory:     args.WorkingDirectory,
   206  		MountNamespace:       args.MountNamespace,
   207  		Credentials:          creds,
   208  		FDTable:              fdTable,
   209  		Umask:                0022,
   210  		Limits:               limitSet,
   211  		MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
   212  		UTSNamespace:         proc.Kernel.RootUTSNamespace(),
   213  		IPCNamespace:         proc.Kernel.RootIPCNamespace(),
   214  		ContainerID:          args.ContainerID,
   215  		PIDNamespace:         pidns,
   216  	}
   217  	if initArgs.MountNamespace != nil {
   218  		// initArgs must hold a reference on MountNamespace, which will
   219  		// be donated to the new process in CreateProcess.
   220  		initArgs.MountNamespace.IncRef()
   221  	}
   222  	ctx := initArgs.NewContext(proc.Kernel)
   223  	defer fdTable.DecRef(ctx)
   224  
   225  	// Get the full path to the filename from the PATH env variable.
   226  	if initArgs.MountNamespace == nil {
   227  		// Set initArgs so that 'ctx' returns the namespace.
   228  		//
   229  		// Add a reference to the namespace, which is transferred to the new process.
   230  		initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace()
   231  		initArgs.MountNamespace.IncRef()
   232  	}
   233  
   234  	fdMap, execFD, err := args.unpackFiles()
   235  	if err != nil {
   236  		return nil, 0, nil, fmt.Errorf("creating fd map: %w", err)
   237  	}
   238  	defer func() {
   239  		for _, hostFD := range fdMap {
   240  			_ = hostFD.Close()
   241  		}
   242  	}()
   243  
   244  	if execFD != nil {
   245  		if initArgs.Filename != "" {
   246  			return nil, 0, nil, fmt.Errorf("process must either be started from a file or a filename, not both")
   247  		}
   248  		file, err := host.NewFD(ctx, proc.Kernel.HostMount(), execFD.FD(), &host.NewFDOptions{
   249  			Readonly:     true,
   250  			Savable:      true,
   251  			VirtualOwner: true,
   252  			UID:          args.KUID,
   253  			GID:          args.KGID,
   254  		})
   255  		if err != nil {
   256  			return nil, 0, nil, err
   257  		}
   258  		defer file.DecRef(ctx)
   259  		execFD.Release()
   260  		initArgs.File = file
   261  	} else {
   262  		resolved, err := user.ResolveExecutablePath(ctx, &initArgs)
   263  		if err != nil {
   264  			return nil, 0, nil, err
   265  		}
   266  		initArgs.Filename = resolved
   267  	}
   268  
   269  	// TODO(gvisor.dev/issue/1956): Container name is not really needed because
   270  	// exec processes are not restored, but add it for completeness.
   271  	ttyFile, err := fdimport.Import(ctx, fdTable, args.StdioIsPty, args.KUID, args.KGID, fdMap, "")
   272  	if err != nil {
   273  		return nil, 0, nil, err
   274  	}
   275  
   276  	// Set cgroups to the new exec task if cgroups are mounted.
   277  	cgroupRegistry := proc.Kernel.CgroupRegistry()
   278  	initialCgrps := map[kernel.Cgroup]struct{}{}
   279  	for _, ctrl := range kernel.CgroupCtrls {
   280  		cg, err := cgroupRegistry.FindCgroup(ctx, ctrl, "/"+args.ContainerID)
   281  		if err != nil {
   282  			log.Warningf("cgroup mount for controller %v not found", ctrl)
   283  			continue
   284  		}
   285  		initialCgrps[cg] = struct{}{}
   286  	}
   287  	if len(initialCgrps) > 0 {
   288  		initArgs.InitialCgroups = initialCgrps
   289  	}
   290  
   291  	tg, tid, err := proc.Kernel.CreateProcess(initArgs)
   292  	if err != nil {
   293  		return nil, 0, nil, err
   294  	}
   295  
   296  	// Set the foreground process group on the TTY before starting the process.
   297  	if ttyFile != nil {
   298  		ttyFile.InitForegroundProcessGroup(tg.ProcessGroup())
   299  	}
   300  
   301  	// Start the newly created process.
   302  	proc.Kernel.StartProcess(tg)
   303  
   304  	return tg, tid, ttyFile, nil
   305  }
   306  
// PsArgs is the set of arguments to ps.
type PsArgs struct {
	// JSON will force calls to Ps to return the result as a JSON payload.
	// When false, Ps returns a text table instead.
	JSON bool
}
   312  
   313  // Ps provides a process listing for the running kernel.
   314  func (proc *Proc) Ps(args *PsArgs, out *string) error {
   315  	var p []*Process
   316  	if e := Processes(proc.Kernel, "", &p); e != nil {
   317  		return e
   318  	}
   319  	if !args.JSON {
   320  		*out = ProcessListToTable(p)
   321  	} else {
   322  		s, e := ProcessListToJSON(p)
   323  		if e != nil {
   324  			return e
   325  		}
   326  		*out = s
   327  	}
   328  	return nil
   329  }
   330  
// Process contains information about a single process in a Sandbox.
type Process struct {
	// UID is the effective KUID of the thread group leader.
	UID auth.KUID       `json:"uid"`
	// PID is the thread group's ID in the root PID namespace.
	PID kernel.ThreadID `json:"pid"`
	// Parent PID; 0 if the leader has no parent.
	PPID    kernel.ThreadID   `json:"ppid"`
	// Threads lists the IDs of the thread group's members.
	Threads []kernel.ThreadID `json:"threads"`
	// Processor utilization, as an integer percentage.
	C int32 `json:"c"`
	// TTY name of the process. Will be of the form "pts/N" if there is a
	// TTY, or "?" if there is not.
	TTY string `json:"tty"`
	// Start time, already formatted for display.
	STime string `json:"stime"`
	// CPU time, already formatted for display.
	Time string `json:"time"`
	// Executable shortname (e.g. "sh" for /bin/sh)
	Cmd string `json:"cmd"`
}
   350  
   351  // ProcessListToTable prints a table with the following format:
   352  // UID       PID       PPID      C         TTY		STIME     TIME       CMD
   353  // 0         1         0         0         pty/4	14:04     505262ns   tail
   354  func ProcessListToTable(pl []*Process) string {
   355  	var buf bytes.Buffer
   356  	tw := tabwriter.NewWriter(&buf, 10, 1, 3, ' ', 0)
   357  	fmt.Fprint(tw, "UID\tPID\tPPID\tC\tTTY\tSTIME\tTIME\tCMD")
   358  	for _, d := range pl {
   359  		fmt.Fprintf(tw, "\n%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s",
   360  			d.UID,
   361  			d.PID,
   362  			d.PPID,
   363  			d.C,
   364  			d.TTY,
   365  			d.STime,
   366  			d.Time,
   367  			d.Cmd)
   368  	}
   369  	tw.Flush()
   370  	return buf.String()
   371  }
   372  
   373  // ProcessListToJSON will return the JSON representation of ps.
   374  func ProcessListToJSON(pl []*Process) (string, error) {
   375  	b, err := json.MarshalIndent(pl, "", "  ")
   376  	if err != nil {
   377  		return "", fmt.Errorf("couldn't marshal process list %v: %v", pl, err)
   378  	}
   379  	return string(b), nil
   380  }
   381  
   382  // PrintPIDsJSON prints a JSON object containing only the PIDs in pl. This
   383  // behavior is the same as runc's.
   384  func PrintPIDsJSON(pl []*Process) (string, error) {
   385  	pids := make([]kernel.ThreadID, 0, len(pl))
   386  	for _, d := range pl {
   387  		pids = append(pids, d.PID)
   388  	}
   389  	b, err := json.Marshal(pids)
   390  	if err != nil {
   391  		return "", fmt.Errorf("couldn't marshal PIDs %v: %v", pids, err)
   392  	}
   393  	return string(b), nil
   394  }
   395  
   396  // Processes retrieves information about processes running in the sandbox with
   397  // the given container id. All processes are returned if 'containerID' is empty.
   398  func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error {
   399  	ts := k.TaskSet()
   400  	now := k.RealtimeClock().Now()
   401  	pidns := ts.Root
   402  	for _, tg := range pidns.ThreadGroups() {
   403  		pid := pidns.IDOfThreadGroup(tg)
   404  
   405  		// If tg has already been reaped ignore it.
   406  		if pid == 0 {
   407  			continue
   408  		}
   409  		if containerID != "" && containerID != tg.Leader().ContainerID() {
   410  			continue
   411  		}
   412  
   413  		ppid := kernel.ThreadID(0)
   414  		if p := tg.Leader().Parent(); p != nil {
   415  			ppid = pidns.IDOfThreadGroup(p.ThreadGroup())
   416  		}
   417  		threads := tg.MemberIDs(pidns)
   418  		*out = append(*out, &Process{
   419  			UID:     tg.Leader().Credentials().EffectiveKUID,
   420  			PID:     pid,
   421  			PPID:    ppid,
   422  			Threads: threads,
   423  			STime:   formatStartTime(now, tg.Leader().StartTime()),
   424  			C:       percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now),
   425  			Time:    tg.CPUStats().SysTime.String(),
   426  			Cmd:     tg.Leader().Name(),
   427  			TTY:     ttyName(tg.TTY()),
   428  		})
   429  	}
   430  	sort.Slice(*out, func(i, j int) bool { return (*out)[i].PID < (*out)[j].PID })
   431  	return nil
   432  }
   433  
   434  // formatStartTime formats startTime depending on the current time:
   435  //   - If startTime was today, HH:MM is used.
   436  //   - If startTime was not today but was this year, MonDD is used (e.g. Jan02)
   437  //   - If startTime was not this year, the year is used.
   438  func formatStartTime(now, startTime ktime.Time) string {
   439  	nowS, nowNs := now.Unix()
   440  	n := time.Unix(nowS, nowNs)
   441  	startTimeS, startTimeNs := startTime.Unix()
   442  	st := time.Unix(startTimeS, startTimeNs)
   443  	format := "15:04"
   444  	if st.YearDay() != n.YearDay() {
   445  		format = "Jan02"
   446  	}
   447  	if st.Year() != n.Year() {
   448  		format = "2006"
   449  	}
   450  	return st.Format(format)
   451  }
   452  
   453  func percentCPU(stats usage.CPUStats, startTime, now ktime.Time) int32 {
   454  	// Note: In procps, there is an option to include child CPU stats. As
   455  	// it is disabled by default, we do not include them.
   456  	total := stats.UserTime + stats.SysTime
   457  	lifetime := now.Sub(startTime)
   458  	if lifetime <= 0 {
   459  		return 0
   460  	}
   461  	percentCPU := total * 100 / lifetime
   462  	// Cap at 99% since procps does the same.
   463  	if percentCPU > 99 {
   464  		percentCPU = 99
   465  	}
   466  	return int32(percentCPU)
   467  }
   468  
   469  func ttyName(tty *kernel.TTY) string {
   470  	if tty == nil {
   471  		return "?"
   472  	}
   473  	return fmt.Sprintf("pts/%d", tty.Index)
   474  }
   475  
   476  // ContainerUsage retrieves per-container CPU usage.
   477  func ContainerUsage(kr *kernel.Kernel) map[string]uint64 {
   478  	cusage := make(map[string]uint64)
   479  	for _, tg := range kr.TaskSet().Root.ThreadGroups() {
   480  		// We want each tg's usage including reaped children.
   481  		cid := tg.Leader().ContainerID()
   482  		stats := tg.CPUStats()
   483  		stats.Accumulate(tg.JoinedChildCPUStats())
   484  		cusage[cid] += uint64(stats.UserTime.Nanoseconds()) + uint64(stats.SysTime.Nanoseconds())
   485  	}
   486  	return cusage
   487  }
   488  
   489  // unpackFiles unpacks the file descriptor map and, if applicable, the file
   490  // descriptor to be used for execution from the unmarshalled ExecArgs.
   491  func (args *ExecArgs) unpackFiles() (map[int]*fd.FD, *fd.FD, error) {
   492  	var execFD *fd.FD
   493  	var err error
   494  
   495  	// If there is one additional file, the last file is used for program
   496  	// execution.
   497  	if len(args.Files) == len(args.GuestFDs)+1 {
   498  		execFD, err = fd.NewFromFile(args.Files[len(args.Files)-1])
   499  		if err != nil {
   500  			return nil, nil, fmt.Errorf("duplicating exec file: %w", err)
   501  		}
   502  	} else if len(args.Files) != len(args.GuestFDs) {
   503  		return nil, nil, fmt.Errorf("length of payload files does not match length of file descriptor array")
   504  	}
   505  
   506  	// GuestFDs are the indexes of our FD map.
   507  	fdMap := make(map[int]*fd.FD, len(args.GuestFDs))
   508  	for i, appFD := range args.GuestFDs {
   509  		file := args.Files[i]
   510  		if appFD < 0 {
   511  			return nil, nil, fmt.Errorf("guest file descriptors must be 0 or greater")
   512  		}
   513  		hostFD, err := fd.NewFromFile(file)
   514  		if err != nil {
   515  			return nil, nil, fmt.Errorf("duplicating payload files: %w", err)
   516  		}
   517  		fdMap[appFD] = hostFD
   518  	}
   519  	return fdMap, execFD, nil
   520  }