github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/control/proc.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package control 16 17 import ( 18 "bytes" 19 "encoding/json" 20 "fmt" 21 "sort" 22 "strings" 23 "text/tabwriter" 24 "time" 25 26 "github.com/SagerNet/gvisor/pkg/abi/linux" 27 "github.com/SagerNet/gvisor/pkg/fd" 28 "github.com/SagerNet/gvisor/pkg/sentry/fdimport" 29 "github.com/SagerNet/gvisor/pkg/sentry/fs" 30 "github.com/SagerNet/gvisor/pkg/sentry/fs/host" 31 "github.com/SagerNet/gvisor/pkg/sentry/fs/user" 32 hostvfs2 "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/host" 33 "github.com/SagerNet/gvisor/pkg/sentry/kernel" 34 "github.com/SagerNet/gvisor/pkg/sentry/kernel/auth" 35 ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time" 36 "github.com/SagerNet/gvisor/pkg/sentry/limits" 37 "github.com/SagerNet/gvisor/pkg/sentry/usage" 38 "github.com/SagerNet/gvisor/pkg/sentry/vfs" 39 "github.com/SagerNet/gvisor/pkg/urpc" 40 ) 41 42 // Proc includes task-related functions. 43 // 44 // At the moment, this is limited to exec support. 45 type Proc struct { 46 Kernel *kernel.Kernel 47 } 48 49 // ExecArgs is the set of arguments to exec. 50 type ExecArgs struct { 51 // Filename is the filename to load. 52 // 53 // If this is provided as "", then the file will be guessed via Argv[0]. 54 Filename string `json:"filename"` 55 56 // Argv is a list of arguments. 57 Argv []string `json:"argv"` 58 59 // Envv is a list of environment variables. 60 Envv []string `json:"envv"` 61 62 // MountNamespace is the mount namespace to execute the new process in. 63 // A reference on MountNamespace must be held for the lifetime of the 64 // ExecArgs. If MountNamespace is nil, it will default to the init 65 // process's MountNamespace. 66 MountNamespace *fs.MountNamespace 67 68 // MountNamespaceVFS2 is the mount namespace to execute the new process in. 69 // A reference on MountNamespace must be held for the lifetime of the 70 // ExecArgs. If MountNamespace is nil, it will default to the init 71 // process's MountNamespace. 72 MountNamespaceVFS2 *vfs.MountNamespace 73 74 // WorkingDirectory defines the working directory for the new process. 75 WorkingDirectory string `json:"wd"` 76 77 // KUID is the UID to run with in the root user namespace. Defaults to 78 // root if not set explicitly. 79 KUID auth.KUID 80 81 // KGID is the GID to run with in the root user namespace. Defaults to 82 // the root group if not set explicitly. 83 KGID auth.KGID 84 85 // ExtraKGIDs is the list of additional groups to which the user belongs. 86 ExtraKGIDs []auth.KGID 87 88 // Capabilities is the list of capabilities to give to the process. 89 Capabilities *auth.TaskCapabilities 90 91 // StdioIsPty indicates that FDs 0, 1, and 2 are connected to a host pty FD. 92 StdioIsPty bool 93 94 // FilePayload determines the files to give to the new process. 95 urpc.FilePayload 96 97 // ContainerID is the container for the process being executed. 98 ContainerID string 99 100 // PIDNamespace is the pid namespace for the process being executed. 101 PIDNamespace *kernel.PIDNamespace 102 103 // Limits is the limit set for the process being executed. 104 Limits *limits.LimitSet 105 } 106 107 // String prints the arguments as a string. 108 func (args ExecArgs) String() string { 109 if len(args.Argv) == 0 { 110 return args.Filename 111 } 112 a := make([]string, len(args.Argv)) 113 copy(a, args.Argv) 114 if args.Filename != "" { 115 a[0] = args.Filename 116 } 117 return strings.Join(a, " ") 118 } 119 120 // Exec runs a new task. 121 func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { 122 newTG, _, _, _, err := proc.execAsync(args) 123 if err != nil { 124 return err 125 } 126 127 // Wait for completion. 128 newTG.WaitExited() 129 *waitStatus = newTG.ExitStatus().Status() 130 return nil 131 } 132 133 // ExecAsync runs a new task, but doesn't wait for it to finish. It is defined 134 // as a function rather than a method to avoid exposing execAsync as an RPC. 135 func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { 136 return proc.execAsync(args) 137 } 138 139 // execAsync runs a new task, but doesn't wait for it to finish. It returns the 140 // newly created thread group and its PID. If the stdio FDs are TTYs, then a 141 // TTYFileOperations that wraps the TTY is also returned. 142 func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { 143 // Import file descriptors. 144 fdTable := proc.Kernel.NewFDTable() 145 146 creds := auth.NewUserCredentials( 147 args.KUID, 148 args.KGID, 149 args.ExtraKGIDs, 150 args.Capabilities, 151 proc.Kernel.RootUserNamespace()) 152 153 pidns := args.PIDNamespace 154 if pidns == nil { 155 pidns = proc.Kernel.RootPIDNamespace() 156 } 157 limitSet := args.Limits 158 if limitSet == nil { 159 limitSet = limits.NewLimitSet() 160 } 161 initArgs := kernel.CreateProcessArgs{ 162 Filename: args.Filename, 163 Argv: args.Argv, 164 Envv: args.Envv, 165 WorkingDirectory: args.WorkingDirectory, 166 MountNamespace: args.MountNamespace, 167 MountNamespaceVFS2: args.MountNamespaceVFS2, 168 Credentials: creds, 169 FDTable: fdTable, 170 Umask: 0022, 171 Limits: limitSet, 172 MaxSymlinkTraversals: linux.MaxSymlinkTraversals, 173 UTSNamespace: proc.Kernel.RootUTSNamespace(), 174 IPCNamespace: proc.Kernel.RootIPCNamespace(), 175 AbstractSocketNamespace: proc.Kernel.RootAbstractSocketNamespace(), 176 ContainerID: args.ContainerID, 177 PIDNamespace: pidns, 178 } 179 if initArgs.MountNamespace != nil { 180 // initArgs must hold a reference on MountNamespace, which will 181 // be donated to the new process in CreateProcess. 182 initArgs.MountNamespace.IncRef() 183 } 184 if initArgs.MountNamespaceVFS2 != nil { 185 // initArgs must hold a reference on MountNamespaceVFS2, which will 186 // be donated to the new process in CreateProcess. 187 initArgs.MountNamespaceVFS2.IncRef() 188 } 189 ctx := initArgs.NewContext(proc.Kernel) 190 defer fdTable.DecRef(ctx) 191 192 if kernel.VFS2Enabled { 193 // Get the full path to the filename from the PATH env variable. 194 if initArgs.MountNamespaceVFS2 == nil { 195 // Set initArgs so that 'ctx' returns the namespace. 196 // 197 // Add a reference to the namespace, which is transferred to the new process. 198 initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2() 199 initArgs.MountNamespaceVFS2.IncRef() 200 } 201 } else { 202 if initArgs.MountNamespace == nil { 203 // Set initArgs so that 'ctx' returns the namespace. 204 initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace() 205 206 // initArgs must hold a reference on MountNamespace, which will 207 // be donated to the new process in CreateProcess. 208 initArgs.MountNamespace.IncRef() 209 } 210 } 211 resolved, err := user.ResolveExecutablePath(ctx, &initArgs) 212 if err != nil { 213 return nil, 0, nil, nil, err 214 } 215 initArgs.Filename = resolved 216 217 fds, err := fd.NewFromFiles(args.Files) 218 if err != nil { 219 return nil, 0, nil, nil, fmt.Errorf("duplicating payload files: %w", err) 220 } 221 defer func() { 222 for _, fd := range fds { 223 _ = fd.Close() 224 } 225 }() 226 ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, args.StdioIsPty, fds) 227 if err != nil { 228 return nil, 0, nil, nil, err 229 } 230 231 tg, tid, err := proc.Kernel.CreateProcess(initArgs) 232 if err != nil { 233 return nil, 0, nil, nil, err 234 } 235 236 // Set the foreground process group on the TTY before starting the process. 237 switch { 238 case ttyFile != nil: 239 ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) 240 case ttyFileVFS2 != nil: 241 ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup()) 242 } 243 244 // Start the newly created process. 245 proc.Kernel.StartProcess(tg) 246 247 return tg, tid, ttyFile, ttyFileVFS2, nil 248 } 249 250 // PsArgs is the set of arguments to ps. 251 type PsArgs struct { 252 // JSON will force calls to Ps to return the result as a JSON payload. 253 JSON bool 254 } 255 256 // Ps provides a process listing for the running kernel. 257 func (proc *Proc) Ps(args *PsArgs, out *string) error { 258 var p []*Process 259 if e := Processes(proc.Kernel, "", &p); e != nil { 260 return e 261 } 262 if !args.JSON { 263 *out = ProcessListToTable(p) 264 } else { 265 s, e := ProcessListToJSON(p) 266 if e != nil { 267 return e 268 } 269 *out = s 270 } 271 return nil 272 } 273 274 // Process contains information about a single process in a Sandbox. 275 type Process struct { 276 UID auth.KUID `json:"uid"` 277 PID kernel.ThreadID `json:"pid"` 278 // Parent PID 279 PPID kernel.ThreadID `json:"ppid"` 280 Threads []kernel.ThreadID `json:"threads"` 281 // Processor utilization 282 C int32 `json:"c"` 283 // TTY name of the process. Will be of the form "pts/N" if there is a 284 // TTY, or "?" if there is not. 285 TTY string `json:"tty"` 286 // Start time 287 STime string `json:"stime"` 288 // CPU time 289 Time string `json:"time"` 290 // Executable shortname (e.g. "sh" for /bin/sh) 291 Cmd string `json:"cmd"` 292 } 293 294 // ProcessListToTable prints a table with the following format: 295 // UID PID PPID C TTY STIME TIME CMD 296 // 0 1 0 0 pty/4 14:04 505262ns tail 297 func ProcessListToTable(pl []*Process) string { 298 var buf bytes.Buffer 299 tw := tabwriter.NewWriter(&buf, 10, 1, 3, ' ', 0) 300 fmt.Fprint(tw, "UID\tPID\tPPID\tC\tTTY\tSTIME\tTIME\tCMD") 301 for _, d := range pl { 302 fmt.Fprintf(tw, "\n%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s", 303 d.UID, 304 d.PID, 305 d.PPID, 306 d.C, 307 d.TTY, 308 d.STime, 309 d.Time, 310 d.Cmd) 311 } 312 tw.Flush() 313 return buf.String() 314 } 315 316 // ProcessListToJSON will return the JSON representation of ps. 317 func ProcessListToJSON(pl []*Process) (string, error) { 318 b, err := json.MarshalIndent(pl, "", " ") 319 if err != nil { 320 return "", fmt.Errorf("couldn't marshal process list %v: %v", pl, err) 321 } 322 return string(b), nil 323 } 324 325 // PrintPIDsJSON prints a JSON object containing only the PIDs in pl. This 326 // behavior is the same as runc's. 327 func PrintPIDsJSON(pl []*Process) (string, error) { 328 pids := make([]kernel.ThreadID, 0, len(pl)) 329 for _, d := range pl { 330 pids = append(pids, d.PID) 331 } 332 b, err := json.Marshal(pids) 333 if err != nil { 334 return "", fmt.Errorf("couldn't marshal PIDs %v: %v", pids, err) 335 } 336 return string(b), nil 337 } 338 339 // Processes retrieves information about processes running in the sandbox with 340 // the given container id. All processes are returned if 'containerID' is empty. 341 func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error { 342 ts := k.TaskSet() 343 now := k.RealtimeClock().Now() 344 pidns := ts.Root 345 for _, tg := range pidns.ThreadGroups() { 346 pid := pidns.IDOfThreadGroup(tg) 347 348 // If tg has already been reaped ignore it. 349 if pid == 0 { 350 continue 351 } 352 if containerID != "" && containerID != tg.Leader().ContainerID() { 353 continue 354 } 355 356 ppid := kernel.ThreadID(0) 357 if p := tg.Leader().Parent(); p != nil { 358 ppid = pidns.IDOfThreadGroup(p.ThreadGroup()) 359 } 360 threads := tg.MemberIDs(pidns) 361 *out = append(*out, &Process{ 362 UID: tg.Leader().Credentials().EffectiveKUID, 363 PID: pid, 364 PPID: ppid, 365 Threads: threads, 366 STime: formatStartTime(now, tg.Leader().StartTime()), 367 C: percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now), 368 Time: tg.CPUStats().SysTime.String(), 369 Cmd: tg.Leader().Name(), 370 TTY: ttyName(tg.TTY()), 371 }) 372 } 373 sort.Slice(*out, func(i, j int) bool { return (*out)[i].PID < (*out)[j].PID }) 374 return nil 375 } 376 377 // formatStartTime formats startTime depending on the current time: 378 // - If startTime was today, HH:MM is used. 379 // - If startTime was not today but was this year, MonDD is used (e.g. Jan02) 380 // - If startTime was not this year, the year is used. 381 func formatStartTime(now, startTime ktime.Time) string { 382 nowS, nowNs := now.Unix() 383 n := time.Unix(nowS, nowNs) 384 startTimeS, startTimeNs := startTime.Unix() 385 st := time.Unix(startTimeS, startTimeNs) 386 format := "15:04" 387 if st.YearDay() != n.YearDay() { 388 format = "Jan02" 389 } 390 if st.Year() != n.Year() { 391 format = "2006" 392 } 393 return st.Format(format) 394 } 395 396 func percentCPU(stats usage.CPUStats, startTime, now ktime.Time) int32 { 397 // Note: In procps, there is an option to include child CPU stats. As 398 // it is disabled by default, we do not include them. 399 total := stats.UserTime + stats.SysTime 400 lifetime := now.Sub(startTime) 401 if lifetime <= 0 { 402 return 0 403 } 404 percentCPU := total * 100 / lifetime 405 // Cap at 99% since procps does the same. 406 if percentCPU > 99 { 407 percentCPU = 99 408 } 409 return int32(percentCPU) 410 } 411 412 func ttyName(tty *kernel.TTY) string { 413 if tty == nil { 414 return "?" 415 } 416 return fmt.Sprintf("pts/%d", tty.Index) 417 } 418 419 // ContainerUsage retrieves per-container CPU usage. 420 func ContainerUsage(kr *kernel.Kernel) map[string]uint64 { 421 cusage := make(map[string]uint64) 422 for _, tg := range kr.TaskSet().Root.ThreadGroups() { 423 // We want each tg's usage including reaped children. 424 cid := tg.Leader().ContainerID() 425 stats := tg.CPUStats() 426 stats.Accumulate(tg.JoinedChildCPUStats()) 427 cusage[cid] += uint64(stats.UserTime.Nanoseconds()) + uint64(stats.SysTime.Nanoseconds()) 428 } 429 return cusage 430 }