github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/control/proc.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package control 16 17 import ( 18 "bytes" 19 "encoding/json" 20 "fmt" 21 "os" 22 "sort" 23 "strings" 24 "text/tabwriter" 25 "time" 26 27 "github.com/metacubex/gvisor/pkg/abi/linux" 28 "github.com/metacubex/gvisor/pkg/fd" 29 "github.com/metacubex/gvisor/pkg/log" 30 "github.com/metacubex/gvisor/pkg/sentry/fdimport" 31 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/host" 32 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/user" 33 "github.com/metacubex/gvisor/pkg/sentry/kernel" 34 "github.com/metacubex/gvisor/pkg/sentry/kernel/auth" 35 ktime "github.com/metacubex/gvisor/pkg/sentry/kernel/time" 36 "github.com/metacubex/gvisor/pkg/sentry/limits" 37 "github.com/metacubex/gvisor/pkg/sentry/usage" 38 "github.com/metacubex/gvisor/pkg/sentry/vfs" 39 "github.com/metacubex/gvisor/pkg/urpc" 40 ) 41 42 // Proc includes task-related functions. 43 // 44 // At the moment, this is limited to exec support. 45 type Proc struct { 46 Kernel *kernel.Kernel 47 } 48 49 // FilePayload aids to ensure that payload files and guest file descriptors are 50 // consistent when instantiated through the NewFilePayload helper method. 51 type FilePayload struct { 52 // FilePayload is the file payload that is transferred via RPC. 53 urpc.FilePayload 54 55 // GuestFDs are the file descriptors in the file descriptor map of the 56 // executed application. They correspond 1:1 to the files in the 57 // urpc.FilePayload. If a program is executed from a host file descriptor, 58 // the file payload may contain one additional file. In that case, the file 59 // used for program execution is the last file in the Files array. 60 GuestFDs []int 61 } 62 63 // NewFilePayload returns a FilePayload that maps file descriptors to files inside 64 // the executed process and provides a file for execution. 65 func NewFilePayload(fdMap map[int]*os.File, execFile *os.File) FilePayload { 66 fileCount := len(fdMap) 67 if execFile != nil { 68 fileCount++ 69 } 70 files := make([]*os.File, 0, fileCount) 71 guestFDs := make([]int, 0, len(fdMap)) 72 73 // Make the map iteration order deterministic for the sake of testing. 74 // Otherwise, the order is randomized and tests relying on the comparison 75 // of equality will fail. 76 for key := range fdMap { 77 guestFDs = append(guestFDs, key) 78 } 79 sort.Ints(guestFDs) 80 81 for _, guestFD := range guestFDs { 82 files = append(files, fdMap[guestFD]) 83 } 84 85 if execFile != nil { 86 files = append(files, execFile) 87 } 88 89 return FilePayload{ 90 FilePayload: urpc.FilePayload{Files: files}, 91 GuestFDs: guestFDs, 92 } 93 } 94 95 // ExecArgs is the set of arguments to exec. 96 type ExecArgs struct { 97 // Filename is the filename to load. 98 // 99 // If this is provided as "", then the file will be guessed via Argv[0]. 100 Filename string `json:"filename"` 101 102 // Argv is a list of arguments. 103 Argv []string `json:"argv"` 104 105 // Envv is a list of environment variables. 106 Envv []string `json:"envv"` 107 108 // MountNamespace is the mount namespace to execute the new process in. 109 // A reference on MountNamespace must be held for the lifetime of the 110 // ExecArgs. If MountNamespace is nil, it will default to the init 111 // process's MountNamespace. 112 MountNamespace *vfs.MountNamespace 113 114 // WorkingDirectory defines the working directory for the new process. 115 WorkingDirectory string `json:"wd"` 116 117 // KUID is the UID to run with in the root user namespace. Defaults to 118 // root if not set explicitly. 119 KUID auth.KUID 120 121 // KGID is the GID to run with in the root user namespace. Defaults to 122 // the root group if not set explicitly. 123 KGID auth.KGID 124 125 // ExtraKGIDs is the list of additional groups to which the user belongs. 126 ExtraKGIDs []auth.KGID 127 128 // Capabilities is the list of capabilities to give to the process. 129 Capabilities *auth.TaskCapabilities 130 131 // StdioIsPty indicates that FDs 0, 1, and 2 are connected to a host pty FD. 132 StdioIsPty bool 133 134 // FilePayload determines the files to give to the new process. 135 FilePayload 136 137 // ContainerID is the container for the process being executed. 138 ContainerID string 139 140 // PIDNamespace is the pid namespace for the process being executed. 141 PIDNamespace *kernel.PIDNamespace 142 143 // Limits is the limit set for the process being executed. 144 Limits *limits.LimitSet 145 } 146 147 // String prints the arguments as a string. 148 func (args *ExecArgs) String() string { 149 if len(args.Argv) == 0 { 150 return args.Filename 151 } 152 a := make([]string, len(args.Argv)) 153 copy(a, args.Argv) 154 if args.Filename != "" { 155 a[0] = args.Filename 156 } 157 return strings.Join(a, " ") 158 } 159 160 // Exec runs a new task. 161 func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { 162 newTG, _, _, err := proc.execAsync(args) 163 if err != nil { 164 return err 165 } 166 167 // Wait for completion. 168 newTG.WaitExited() 169 *waitStatus = uint32(newTG.ExitStatus()) 170 return nil 171 } 172 173 // ExecAsync runs a new task, but doesn't wait for it to finish. It is defined 174 // as a function rather than a method to avoid exposing execAsync as an RPC. 175 func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileDescription, error) { 176 return proc.execAsync(args) 177 } 178 179 // execAsync runs a new task, but doesn't wait for it to finish. It returns the 180 // newly created thread group and its PID. If the stdio FDs are TTYs, then a 181 // TTYFileOperations that wraps the TTY is also returned. 182 func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileDescription, error) { 183 // Import file descriptors. 184 fdTable := proc.Kernel.NewFDTable() 185 186 creds := auth.NewUserCredentials( 187 args.KUID, 188 args.KGID, 189 args.ExtraKGIDs, 190 args.Capabilities, 191 proc.Kernel.RootUserNamespace()) 192 193 pidns := args.PIDNamespace 194 if pidns == nil { 195 pidns = proc.Kernel.RootPIDNamespace() 196 } 197 limitSet := args.Limits 198 if limitSet == nil { 199 limitSet = limits.NewLimitSet() 200 } 201 initArgs := kernel.CreateProcessArgs{ 202 Filename: args.Filename, 203 Argv: args.Argv, 204 Envv: args.Envv, 205 WorkingDirectory: args.WorkingDirectory, 206 MountNamespace: args.MountNamespace, 207 Credentials: creds, 208 FDTable: fdTable, 209 Umask: 0022, 210 Limits: limitSet, 211 MaxSymlinkTraversals: linux.MaxSymlinkTraversals, 212 UTSNamespace: proc.Kernel.RootUTSNamespace(), 213 IPCNamespace: proc.Kernel.RootIPCNamespace(), 214 ContainerID: args.ContainerID, 215 PIDNamespace: pidns, 216 } 217 if initArgs.MountNamespace != nil { 218 // initArgs must hold a reference on MountNamespace, which will 219 // be donated to the new process in CreateProcess. 220 initArgs.MountNamespace.IncRef() 221 } 222 ctx := initArgs.NewContext(proc.Kernel) 223 defer fdTable.DecRef(ctx) 224 225 // Get the full path to the filename from the PATH env variable. 226 if initArgs.MountNamespace == nil { 227 // Set initArgs so that 'ctx' returns the namespace. 228 // 229 // Add a reference to the namespace, which is transferred to the new process. 230 initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace() 231 initArgs.MountNamespace.IncRef() 232 } 233 234 fdMap, execFD, err := args.unpackFiles() 235 if err != nil { 236 return nil, 0, nil, fmt.Errorf("creating fd map: %w", err) 237 } 238 defer func() { 239 for _, hostFD := range fdMap { 240 _ = hostFD.Close() 241 } 242 }() 243 244 if execFD != nil { 245 if initArgs.Filename != "" { 246 return nil, 0, nil, fmt.Errorf("process must either be started from a file or a filename, not both") 247 } 248 file, err := host.NewFD(ctx, proc.Kernel.HostMount(), execFD.FD(), &host.NewFDOptions{ 249 Readonly: true, 250 Savable: true, 251 VirtualOwner: true, 252 UID: args.KUID, 253 GID: args.KGID, 254 }) 255 if err != nil { 256 return nil, 0, nil, err 257 } 258 defer file.DecRef(ctx) 259 execFD.Release() 260 initArgs.File = file 261 } else { 262 resolved, err := user.ResolveExecutablePath(ctx, &initArgs) 263 if err != nil { 264 return nil, 0, nil, err 265 } 266 initArgs.Filename = resolved 267 } 268 269 // TODO(gvisor.dev/issue/1956): Container name is not really needed because 270 // exec processes are not restored, but add it for completeness. 271 ttyFile, err := fdimport.Import(ctx, fdTable, args.StdioIsPty, args.KUID, args.KGID, fdMap, "") 272 if err != nil { 273 return nil, 0, nil, err 274 } 275 276 // Set cgroups to the new exec task if cgroups are mounted. 277 cgroupRegistry := proc.Kernel.CgroupRegistry() 278 initialCgrps := map[kernel.Cgroup]struct{}{} 279 for _, ctrl := range kernel.CgroupCtrls { 280 cg, err := cgroupRegistry.FindCgroup(ctx, ctrl, "/"+args.ContainerID) 281 if err != nil { 282 log.Warningf("cgroup mount for controller %v not found", ctrl) 283 continue 284 } 285 initialCgrps[cg] = struct{}{} 286 } 287 if len(initialCgrps) > 0 { 288 initArgs.InitialCgroups = initialCgrps 289 } 290 291 tg, tid, err := proc.Kernel.CreateProcess(initArgs) 292 if err != nil { 293 return nil, 0, nil, err 294 } 295 296 // Set the foreground process group on the TTY before starting the process. 297 if ttyFile != nil { 298 ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) 299 } 300 301 // Start the newly created process. 302 proc.Kernel.StartProcess(tg) 303 304 return tg, tid, ttyFile, nil 305 } 306 307 // PsArgs is the set of arguments to ps. 308 type PsArgs struct { 309 // JSON will force calls to Ps to return the result as a JSON payload. 310 JSON bool 311 } 312 313 // Ps provides a process listing for the running kernel. 314 func (proc *Proc) Ps(args *PsArgs, out *string) error { 315 var p []*Process 316 if e := Processes(proc.Kernel, "", &p); e != nil { 317 return e 318 } 319 if !args.JSON { 320 *out = ProcessListToTable(p) 321 } else { 322 s, e := ProcessListToJSON(p) 323 if e != nil { 324 return e 325 } 326 *out = s 327 } 328 return nil 329 } 330 331 // Process contains information about a single process in a Sandbox. 332 type Process struct { 333 UID auth.KUID `json:"uid"` 334 PID kernel.ThreadID `json:"pid"` 335 // Parent PID 336 PPID kernel.ThreadID `json:"ppid"` 337 Threads []kernel.ThreadID `json:"threads"` 338 // Processor utilization 339 C int32 `json:"c"` 340 // TTY name of the process. Will be of the form "pts/N" if there is a 341 // TTY, or "?" if there is not. 342 TTY string `json:"tty"` 343 // Start time 344 STime string `json:"stime"` 345 // CPU time 346 Time string `json:"time"` 347 // Executable shortname (e.g. "sh" for /bin/sh) 348 Cmd string `json:"cmd"` 349 } 350 351 // ProcessListToTable prints a table with the following format: 352 // UID PID PPID C TTY STIME TIME CMD 353 // 0 1 0 0 pty/4 14:04 505262ns tail 354 func ProcessListToTable(pl []*Process) string { 355 var buf bytes.Buffer 356 tw := tabwriter.NewWriter(&buf, 10, 1, 3, ' ', 0) 357 fmt.Fprint(tw, "UID\tPID\tPPID\tC\tTTY\tSTIME\tTIME\tCMD") 358 for _, d := range pl { 359 fmt.Fprintf(tw, "\n%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s", 360 d.UID, 361 d.PID, 362 d.PPID, 363 d.C, 364 d.TTY, 365 d.STime, 366 d.Time, 367 d.Cmd) 368 } 369 tw.Flush() 370 return buf.String() 371 } 372 373 // ProcessListToJSON will return the JSON representation of ps. 374 func ProcessListToJSON(pl []*Process) (string, error) { 375 b, err := json.MarshalIndent(pl, "", " ") 376 if err != nil { 377 return "", fmt.Errorf("couldn't marshal process list %v: %v", pl, err) 378 } 379 return string(b), nil 380 } 381 382 // PrintPIDsJSON prints a JSON object containing only the PIDs in pl. This 383 // behavior is the same as runc's. 384 func PrintPIDsJSON(pl []*Process) (string, error) { 385 pids := make([]kernel.ThreadID, 0, len(pl)) 386 for _, d := range pl { 387 pids = append(pids, d.PID) 388 } 389 b, err := json.Marshal(pids) 390 if err != nil { 391 return "", fmt.Errorf("couldn't marshal PIDs %v: %v", pids, err) 392 } 393 return string(b), nil 394 } 395 396 // Processes retrieves information about processes running in the sandbox with 397 // the given container id. All processes are returned if 'containerID' is empty. 398 func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error { 399 ts := k.TaskSet() 400 now := k.RealtimeClock().Now() 401 pidns := ts.Root 402 for _, tg := range pidns.ThreadGroups() { 403 pid := pidns.IDOfThreadGroup(tg) 404 405 // If tg has already been reaped ignore it. 406 if pid == 0 { 407 continue 408 } 409 if containerID != "" && containerID != tg.Leader().ContainerID() { 410 continue 411 } 412 413 ppid := kernel.ThreadID(0) 414 if p := tg.Leader().Parent(); p != nil { 415 ppid = pidns.IDOfThreadGroup(p.ThreadGroup()) 416 } 417 threads := tg.MemberIDs(pidns) 418 *out = append(*out, &Process{ 419 UID: tg.Leader().Credentials().EffectiveKUID, 420 PID: pid, 421 PPID: ppid, 422 Threads: threads, 423 STime: formatStartTime(now, tg.Leader().StartTime()), 424 C: percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now), 425 Time: tg.CPUStats().SysTime.String(), 426 Cmd: tg.Leader().Name(), 427 TTY: ttyName(tg.TTY()), 428 }) 429 } 430 sort.Slice(*out, func(i, j int) bool { return (*out)[i].PID < (*out)[j].PID }) 431 return nil 432 } 433 434 // formatStartTime formats startTime depending on the current time: 435 // - If startTime was today, HH:MM is used. 436 // - If startTime was not today but was this year, MonDD is used (e.g. Jan02) 437 // - If startTime was not this year, the year is used. 438 func formatStartTime(now, startTime ktime.Time) string { 439 nowS, nowNs := now.Unix() 440 n := time.Unix(nowS, nowNs) 441 startTimeS, startTimeNs := startTime.Unix() 442 st := time.Unix(startTimeS, startTimeNs) 443 format := "15:04" 444 if st.YearDay() != n.YearDay() { 445 format = "Jan02" 446 } 447 if st.Year() != n.Year() { 448 format = "2006" 449 } 450 return st.Format(format) 451 } 452 453 func percentCPU(stats usage.CPUStats, startTime, now ktime.Time) int32 { 454 // Note: In procps, there is an option to include child CPU stats. As 455 // it is disabled by default, we do not include them. 456 total := stats.UserTime + stats.SysTime 457 lifetime := now.Sub(startTime) 458 if lifetime <= 0 { 459 return 0 460 } 461 percentCPU := total * 100 / lifetime 462 // Cap at 99% since procps does the same. 463 if percentCPU > 99 { 464 percentCPU = 99 465 } 466 return int32(percentCPU) 467 } 468 469 func ttyName(tty *kernel.TTY) string { 470 if tty == nil { 471 return "?" 472 } 473 return fmt.Sprintf("pts/%d", tty.Index) 474 } 475 476 // ContainerUsage retrieves per-container CPU usage. 477 func ContainerUsage(kr *kernel.Kernel) map[string]uint64 { 478 cusage := make(map[string]uint64) 479 for _, tg := range kr.TaskSet().Root.ThreadGroups() { 480 // We want each tg's usage including reaped children. 481 cid := tg.Leader().ContainerID() 482 stats := tg.CPUStats() 483 stats.Accumulate(tg.JoinedChildCPUStats()) 484 cusage[cid] += uint64(stats.UserTime.Nanoseconds()) + uint64(stats.SysTime.Nanoseconds()) 485 } 486 return cusage 487 } 488 489 // unpackFiles unpacks the file descriptor map and, if applicable, the file 490 // descriptor to be used for execution from the unmarshalled ExecArgs. 491 func (args *ExecArgs) unpackFiles() (map[int]*fd.FD, *fd.FD, error) { 492 var execFD *fd.FD 493 var err error 494 495 // If there is one additional file, the last file is used for program 496 // execution. 497 if len(args.Files) == len(args.GuestFDs)+1 { 498 execFD, err = fd.NewFromFile(args.Files[len(args.Files)-1]) 499 if err != nil { 500 return nil, nil, fmt.Errorf("duplicating exec file: %w", err) 501 } 502 } else if len(args.Files) != len(args.GuestFDs) { 503 return nil, nil, fmt.Errorf("length of payload files does not match length of file descriptor array") 504 } 505 506 // GuestFDs are the indexes of our FD map. 507 fdMap := make(map[int]*fd.FD, len(args.GuestFDs)) 508 for i, appFD := range args.GuestFDs { 509 file := args.Files[i] 510 if appFD < 0 { 511 return nil, nil, fmt.Errorf("guest file descriptors must be 0 or greater") 512 } 513 hostFD, err := fd.NewFromFile(file) 514 if err != nil { 515 return nil, nil, fmt.Errorf("duplicating payload files: %w", err) 516 } 517 fdMap[appFD] = hostFD 518 } 519 return fdMap, execFD, nil 520 }