gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/control/proc.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package control 16 17 import ( 18 "bytes" 19 "encoding/json" 20 "fmt" 21 "os" 22 "sort" 23 "strings" 24 "text/tabwriter" 25 "time" 26 27 "gvisor.dev/gvisor/pkg/abi/linux" 28 "gvisor.dev/gvisor/pkg/fd" 29 "gvisor.dev/gvisor/pkg/log" 30 "gvisor.dev/gvisor/pkg/sentry/fdimport" 31 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" 32 "gvisor.dev/gvisor/pkg/sentry/fsimpl/user" 33 "gvisor.dev/gvisor/pkg/sentry/kernel" 34 "gvisor.dev/gvisor/pkg/sentry/kernel/auth" 35 ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" 36 "gvisor.dev/gvisor/pkg/sentry/limits" 37 "gvisor.dev/gvisor/pkg/sentry/usage" 38 "gvisor.dev/gvisor/pkg/sentry/vfs" 39 "gvisor.dev/gvisor/pkg/urpc" 40 ) 41 42 // Proc includes task-related functions. 43 // 44 // At the moment, this is limited to exec support. 45 type Proc struct { 46 Kernel *kernel.Kernel 47 } 48 49 // FilePayload aids to ensure that payload files and guest file descriptors are 50 // consistent when instantiated through the NewFilePayload helper method. 51 type FilePayload struct { 52 // FilePayload is the file payload that is transferred via RPC. 53 urpc.FilePayload 54 55 // GuestFDs are the file descriptors in the file descriptor map of the 56 // executed application. They correspond 1:1 to the files in the 57 // urpc.FilePayload. If a program is executed from a host file descriptor, 58 // the file payload may contain one additional file. In that case, the file 59 // used for program execution is the last file in the Files array. 60 GuestFDs []int 61 } 62 63 // NewFilePayload returns a FilePayload that maps file descriptors to files inside 64 // the executed process and provides a file for execution. 65 func NewFilePayload(fdMap map[int]*os.File, execFile *os.File) FilePayload { 66 fileCount := len(fdMap) 67 if execFile != nil { 68 fileCount++ 69 } 70 files := make([]*os.File, 0, fileCount) 71 guestFDs := make([]int, 0, len(fdMap)) 72 73 // Make the map iteration order deterministic for the sake of testing. 74 // Otherwise, the order is randomized and tests relying on the comparison 75 // of equality will fail. 76 for key := range fdMap { 77 guestFDs = append(guestFDs, key) 78 } 79 sort.Ints(guestFDs) 80 81 for _, guestFD := range guestFDs { 82 files = append(files, fdMap[guestFD]) 83 } 84 85 if execFile != nil { 86 files = append(files, execFile) 87 } 88 89 return FilePayload{ 90 FilePayload: urpc.FilePayload{Files: files}, 91 GuestFDs: guestFDs, 92 } 93 } 94 95 // ExecArgs is the set of arguments to exec. 96 type ExecArgs struct { 97 // Filename is the filename to load. 98 // 99 // If this is provided as "", then the file will be guessed via Argv[0]. 100 Filename string `json:"filename"` 101 102 // Argv is a list of arguments. 103 Argv []string `json:"argv"` 104 105 // Envv is a list of environment variables. 106 Envv []string `json:"envv"` 107 108 // MountNamespace is the mount namespace to execute the new process in. 109 // A reference on MountNamespace must be held for the lifetime of the 110 // ExecArgs. If MountNamespace is nil, it will default to the init 111 // process's MountNamespace. 112 MountNamespace *vfs.MountNamespace 113 114 // WorkingDirectory defines the working directory for the new process. 115 WorkingDirectory string `json:"wd"` 116 117 // KUID is the UID to run with in the root user namespace. Defaults to 118 // root if not set explicitly. 119 KUID auth.KUID 120 121 // KGID is the GID to run with in the root user namespace. Defaults to 122 // the root group if not set explicitly. 123 KGID auth.KGID 124 125 // ExtraKGIDs is the list of additional groups to which the user belongs. 126 ExtraKGIDs []auth.KGID 127 128 // Capabilities is the list of capabilities to give to the process. 129 Capabilities *auth.TaskCapabilities 130 131 // StdioIsPty indicates that FDs 0, 1, and 2 are connected to a host pty FD. 132 StdioIsPty bool 133 134 // FilePayload determines the files to give to the new process. 135 FilePayload 136 137 // ContainerID is the container for the process being executed. 138 ContainerID string 139 140 // PIDNamespace is the pid namespace for the process being executed. 141 PIDNamespace *kernel.PIDNamespace 142 143 // Limits is the limit set for the process being executed. 144 Limits *limits.LimitSet 145 } 146 147 // String prints the arguments as a string. 148 func (args *ExecArgs) String() string { 149 if len(args.Argv) == 0 { 150 return args.Filename 151 } 152 a := make([]string, len(args.Argv)) 153 copy(a, args.Argv) 154 if args.Filename != "" { 155 a[0] = args.Filename 156 } 157 return strings.Join(a, " ") 158 } 159 160 // Exec runs a new task. 161 func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { 162 newTG, _, _, err := proc.execAsync(args) 163 if err != nil { 164 return err 165 } 166 167 // Wait for completion. 168 newTG.WaitExited() 169 *waitStatus = uint32(newTG.ExitStatus()) 170 return nil 171 } 172 173 // ExecAsync runs a new task, but doesn't wait for it to finish. It is defined 174 // as a function rather than a method to avoid exposing execAsync as an RPC. 175 func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileDescription, error) { 176 return proc.execAsync(args) 177 } 178 179 // execAsync runs a new task, but doesn't wait for it to finish. It returns the 180 // newly created thread group and its PID. If the stdio FDs are TTYs, then a 181 // TTYFileOperations that wraps the TTY is also returned. 182 func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileDescription, error) { 183 // Import file descriptors. 184 fdTable := proc.Kernel.NewFDTable() 185 186 creds := auth.NewUserCredentials( 187 args.KUID, 188 args.KGID, 189 args.ExtraKGIDs, 190 args.Capabilities, 191 proc.Kernel.RootUserNamespace()) 192 193 pidns := args.PIDNamespace 194 if pidns == nil { 195 pidns = proc.Kernel.RootPIDNamespace() 196 } 197 limitSet := args.Limits 198 if limitSet == nil { 199 limitSet = limits.NewLimitSet() 200 } 201 initArgs := kernel.CreateProcessArgs{ 202 Filename: args.Filename, 203 Argv: args.Argv, 204 Envv: args.Envv, 205 WorkingDirectory: args.WorkingDirectory, 206 MountNamespace: args.MountNamespace, 207 Credentials: creds, 208 FDTable: fdTable, 209 Umask: 0022, 210 Limits: limitSet, 211 MaxSymlinkTraversals: linux.MaxSymlinkTraversals, 212 UTSNamespace: proc.Kernel.RootUTSNamespace(), 213 IPCNamespace: proc.Kernel.RootIPCNamespace(), 214 ContainerID: args.ContainerID, 215 PIDNamespace: pidns, 216 Origin: kernel.OriginExec, 217 } 218 if initArgs.MountNamespace != nil { 219 // initArgs must hold a reference on MountNamespace, which will 220 // be donated to the new process in CreateProcess. 221 initArgs.MountNamespace.IncRef() 222 } 223 ctx := initArgs.NewContext(proc.Kernel) 224 defer fdTable.DecRef(ctx) 225 226 // Get the full path to the filename from the PATH env variable. 227 if initArgs.MountNamespace == nil { 228 // Set initArgs so that 'ctx' returns the namespace. 229 // 230 // Add a reference to the namespace, which is transferred to the new process. 231 initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace() 232 initArgs.MountNamespace.IncRef() 233 } 234 235 fdMap, execFD, err := args.unpackFiles() 236 if err != nil { 237 return nil, 0, nil, fmt.Errorf("creating fd map: %w", err) 238 } 239 defer func() { 240 for _, hostFD := range fdMap { 241 _ = hostFD.Close() 242 } 243 }() 244 245 if execFD != nil { 246 if initArgs.Filename != "" { 247 return nil, 0, nil, fmt.Errorf("process must either be started from a file or a filename, not both") 248 } 249 file, err := host.NewFD(ctx, proc.Kernel.HostMount(), execFD.FD(), &host.NewFDOptions{ 250 Readonly: true, 251 Savable: true, 252 VirtualOwner: true, 253 UID: args.KUID, 254 GID: args.KGID, 255 }) 256 if err != nil { 257 return nil, 0, nil, err 258 } 259 defer file.DecRef(ctx) 260 execFD.Release() 261 initArgs.File = file 262 } else { 263 resolved, err := user.ResolveExecutablePath(ctx, &initArgs) 264 if err != nil { 265 return nil, 0, nil, err 266 } 267 initArgs.Filename = resolved 268 } 269 270 // TODO(gvisor.dev/issue/1956): Container name is not really needed because 271 // exec processes are not restored, but add it for completeness. 272 ttyFile, err := fdimport.Import(ctx, fdTable, args.StdioIsPty, args.KUID, args.KGID, fdMap, "") 273 if err != nil { 274 return nil, 0, nil, err 275 } 276 277 // Set cgroups to the new exec task if cgroups are mounted. 278 cgroupRegistry := proc.Kernel.CgroupRegistry() 279 initialCgrps := map[kernel.Cgroup]struct{}{} 280 for _, ctrl := range kernel.CgroupCtrls { 281 cg, err := cgroupRegistry.FindCgroup(ctx, ctrl, "/"+args.ContainerID) 282 if err != nil { 283 log.Warningf("cgroup mount for controller %v not found", ctrl) 284 continue 285 } 286 initialCgrps[cg] = struct{}{} 287 } 288 if len(initialCgrps) > 0 { 289 initArgs.InitialCgroups = initialCgrps 290 } 291 292 tg, tid, err := proc.Kernel.CreateProcess(initArgs) 293 if err != nil { 294 return nil, 0, nil, err 295 } 296 297 // Set the foreground process group on the TTY before starting the process. 298 if ttyFile != nil { 299 ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) 300 } 301 302 // Start the newly created process. 303 proc.Kernel.StartProcess(tg) 304 305 return tg, tid, ttyFile, nil 306 } 307 308 // PsArgs is the set of arguments to ps. 309 type PsArgs struct { 310 // JSON will force calls to Ps to return the result as a JSON payload. 311 JSON bool 312 } 313 314 // Ps provides a process listing for the running kernel. 315 func (proc *Proc) Ps(args *PsArgs, out *string) error { 316 var p []*Process 317 if e := Processes(proc.Kernel, "", &p); e != nil { 318 return e 319 } 320 if !args.JSON { 321 *out = ProcessListToTable(p) 322 } else { 323 s, e := ProcessListToJSON(p) 324 if e != nil { 325 return e 326 } 327 *out = s 328 } 329 return nil 330 } 331 332 // Process contains information about a single process in a Sandbox. 333 type Process struct { 334 UID auth.KUID `json:"uid"` 335 PID kernel.ThreadID `json:"pid"` 336 // Parent PID 337 PPID kernel.ThreadID `json:"ppid"` 338 Threads []kernel.ThreadID `json:"threads"` 339 // Processor utilization 340 C int32 `json:"c"` 341 // TTY name of the process. Will be of the form "pts/N" if there is a 342 // TTY, or "?" if there is not. 343 TTY string `json:"tty"` 344 // Start time 345 STime string `json:"stime"` 346 // CPU time 347 Time string `json:"time"` 348 // Executable shortname (e.g. "sh" for /bin/sh) 349 Cmd string `json:"cmd"` 350 } 351 352 // ProcessListToTable prints a table with the following format: 353 // UID PID PPID C TTY STIME TIME CMD 354 // 0 1 0 0 pty/4 14:04 505262ns tail 355 func ProcessListToTable(pl []*Process) string { 356 var buf bytes.Buffer 357 tw := tabwriter.NewWriter(&buf, 10, 1, 3, ' ', 0) 358 fmt.Fprint(tw, "UID\tPID\tPPID\tC\tTTY\tSTIME\tTIME\tCMD") 359 for _, d := range pl { 360 fmt.Fprintf(tw, "\n%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s", 361 d.UID, 362 d.PID, 363 d.PPID, 364 d.C, 365 d.TTY, 366 d.STime, 367 d.Time, 368 d.Cmd) 369 } 370 tw.Flush() 371 return buf.String() 372 } 373 374 // ProcessListToJSON will return the JSON representation of ps. 375 func ProcessListToJSON(pl []*Process) (string, error) { 376 b, err := json.MarshalIndent(pl, "", " ") 377 if err != nil { 378 return "", fmt.Errorf("couldn't marshal process list %v: %v", pl, err) 379 } 380 return string(b), nil 381 } 382 383 // PrintPIDsJSON prints a JSON object containing only the PIDs in pl. This 384 // behavior is the same as runc's. 385 func PrintPIDsJSON(pl []*Process) (string, error) { 386 pids := make([]kernel.ThreadID, 0, len(pl)) 387 for _, d := range pl { 388 pids = append(pids, d.PID) 389 } 390 b, err := json.Marshal(pids) 391 if err != nil { 392 return "", fmt.Errorf("couldn't marshal PIDs %v: %v", pids, err) 393 } 394 return string(b), nil 395 } 396 397 // Processes retrieves information about processes running in the sandbox with 398 // the given container id. All processes are returned if 'containerID' is empty. 399 func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error { 400 ts := k.TaskSet() 401 now := k.RealtimeClock().Now() 402 pidns := ts.Root 403 for _, tg := range pidns.ThreadGroups() { 404 pid := pidns.IDOfThreadGroup(tg) 405 406 // If tg has already been reaped ignore it. 407 if pid == 0 { 408 continue 409 } 410 if containerID != "" && containerID != tg.Leader().ContainerID() { 411 continue 412 } 413 414 ppid := kernel.ThreadID(0) 415 if p := tg.Leader().Parent(); p != nil { 416 ppid = pidns.IDOfThreadGroup(p.ThreadGroup()) 417 } 418 threads := tg.MemberIDs(pidns) 419 *out = append(*out, &Process{ 420 UID: tg.Leader().Credentials().EffectiveKUID, 421 PID: pid, 422 PPID: ppid, 423 Threads: threads, 424 STime: formatStartTime(now, tg.Leader().StartTime()), 425 C: percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now), 426 Time: tg.CPUStats().SysTime.String(), 427 Cmd: tg.Leader().Name(), 428 TTY: ttyName(tg.TTY()), 429 }) 430 } 431 sort.Slice(*out, func(i, j int) bool { return (*out)[i].PID < (*out)[j].PID }) 432 return nil 433 } 434 435 // formatStartTime formats startTime depending on the current time: 436 // - If startTime was today, HH:MM is used. 437 // - If startTime was not today but was this year, MonDD is used (e.g. Jan02) 438 // - If startTime was not this year, the year is used. 439 func formatStartTime(now, startTime ktime.Time) string { 440 nowS, nowNs := now.Unix() 441 n := time.Unix(nowS, nowNs) 442 startTimeS, startTimeNs := startTime.Unix() 443 st := time.Unix(startTimeS, startTimeNs) 444 format := "15:04" 445 if st.YearDay() != n.YearDay() { 446 format = "Jan02" 447 } 448 if st.Year() != n.Year() { 449 format = "2006" 450 } 451 return st.Format(format) 452 } 453 454 func percentCPU(stats usage.CPUStats, startTime, now ktime.Time) int32 { 455 // Note: In procps, there is an option to include child CPU stats. As 456 // it is disabled by default, we do not include them. 457 total := stats.UserTime + stats.SysTime 458 lifetime := now.Sub(startTime) 459 if lifetime <= 0 { 460 return 0 461 } 462 percentCPU := total * 100 / lifetime 463 // Cap at 99% since procps does the same. 464 if percentCPU > 99 { 465 percentCPU = 99 466 } 467 return int32(percentCPU) 468 } 469 470 func ttyName(tty *kernel.TTY) string { 471 if tty == nil { 472 return "?" 473 } 474 return fmt.Sprintf("pts/%d", tty.Index) 475 } 476 477 // ContainerUsage retrieves per-container CPU usage. 478 func ContainerUsage(kr *kernel.Kernel) map[string]uint64 { 479 cusage := make(map[string]uint64) 480 for _, tg := range kr.TaskSet().Root.ThreadGroups() { 481 // We want each tg's usage including reaped children. 482 cid := tg.Leader().ContainerID() 483 stats := tg.CPUStats() 484 stats.Accumulate(tg.JoinedChildCPUStats()) 485 cusage[cid] += uint64(stats.UserTime.Nanoseconds()) + uint64(stats.SysTime.Nanoseconds()) 486 } 487 return cusage 488 } 489 490 // unpackFiles unpacks the file descriptor map and, if applicable, the file 491 // descriptor to be used for execution from the unmarshalled ExecArgs. 492 func (args *ExecArgs) unpackFiles() (map[int]*fd.FD, *fd.FD, error) { 493 var execFD *fd.FD 494 var err error 495 496 // If there is one additional file, the last file is used for program 497 // execution. 498 if len(args.Files) == len(args.GuestFDs)+1 { 499 execFD, err = fd.NewFromFile(args.Files[len(args.Files)-1]) 500 if err != nil { 501 return nil, nil, fmt.Errorf("duplicating exec file: %w", err) 502 } 503 } else if len(args.Files) != len(args.GuestFDs) { 504 return nil, nil, fmt.Errorf("length of payload files does not match length of file descriptor array") 505 } 506 507 // GuestFDs are the indexes of our FD map. 508 fdMap := make(map[int]*fd.FD, len(args.GuestFDs)) 509 for i, appFD := range args.GuestFDs { 510 file := args.Files[i] 511 if appFD < 0 { 512 return nil, nil, fmt.Errorf("guest file descriptors must be 0 or greater") 513 } 514 hostFD, err := fd.NewFromFile(file) 515 if err != nil { 516 return nil, nil, fmt.Errorf("duplicating payload files: %w", err) 517 } 518 fdMap[appFD] = hostFD 519 } 520 return fdMap, execFD, nil 521 }