github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/control/proc.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package control 16 17 import ( 18 "bytes" 19 "encoding/json" 20 "fmt" 21 "os" 22 "sort" 23 "strings" 24 "text/tabwriter" 25 "time" 26 27 "github.com/MerlinKodo/gvisor/pkg/abi/linux" 28 "github.com/MerlinKodo/gvisor/pkg/fd" 29 "github.com/MerlinKodo/gvisor/pkg/sentry/fdimport" 30 "github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/host" 31 "github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/user" 32 "github.com/MerlinKodo/gvisor/pkg/sentry/kernel" 33 "github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth" 34 ktime "github.com/MerlinKodo/gvisor/pkg/sentry/kernel/time" 35 "github.com/MerlinKodo/gvisor/pkg/sentry/limits" 36 "github.com/MerlinKodo/gvisor/pkg/sentry/usage" 37 "github.com/MerlinKodo/gvisor/pkg/sentry/vfs" 38 "github.com/MerlinKodo/gvisor/pkg/urpc" 39 ) 40 41 // Proc includes task-related functions. 42 // 43 // At the moment, this is limited to exec support. 44 type Proc struct { 45 Kernel *kernel.Kernel 46 } 47 48 // FilePayload aids to ensure that payload files and guest file descriptors are 49 // consistent when instantiated through the NewFilePayload helper method. 50 type FilePayload struct { 51 // FilePayload is the file payload that is transferred via RPC. 52 urpc.FilePayload 53 54 // GuestFDs are the file descriptors in the file descriptor map of the 55 // executed application. They correspond 1:1 to the files in the 56 // urpc.FilePayload. If a program is executed from a host file descriptor, 57 // the file payload may contain one additional file. In that case, the file 58 // used for program execution is the last file in the Files array. 59 GuestFDs []int 60 } 61 62 // NewFilePayload returns a FilePayload that maps file descriptors to files inside 63 // the executed process and provides a file for execution. 64 func NewFilePayload(fdMap map[int]*os.File, execFile *os.File) FilePayload { 65 fileCount := len(fdMap) 66 if execFile != nil { 67 fileCount++ 68 } 69 files := make([]*os.File, 0, fileCount) 70 guestFDs := make([]int, 0, len(fdMap)) 71 72 // Make the map iteration order deterministic for the sake of testing. 73 // Otherwise, the order is randomized and tests relying on the comparison 74 // of equality will fail. 75 for key := range fdMap { 76 guestFDs = append(guestFDs, key) 77 } 78 sort.Ints(guestFDs) 79 80 for _, guestFD := range guestFDs { 81 files = append(files, fdMap[guestFD]) 82 } 83 84 if execFile != nil { 85 files = append(files, execFile) 86 } 87 88 return FilePayload{ 89 FilePayload: urpc.FilePayload{Files: files}, 90 GuestFDs: guestFDs, 91 } 92 } 93 94 // ExecArgs is the set of arguments to exec. 95 type ExecArgs struct { 96 // Filename is the filename to load. 97 // 98 // If this is provided as "", then the file will be guessed via Argv[0]. 99 Filename string `json:"filename"` 100 101 // Argv is a list of arguments. 102 Argv []string `json:"argv"` 103 104 // Envv is a list of environment variables. 105 Envv []string `json:"envv"` 106 107 // MountNamespace is the mount namespace to execute the new process in. 108 // A reference on MountNamespace must be held for the lifetime of the 109 // ExecArgs. If MountNamespace is nil, it will default to the init 110 // process's MountNamespace. 111 MountNamespace *vfs.MountNamespace 112 113 // WorkingDirectory defines the working directory for the new process. 114 WorkingDirectory string `json:"wd"` 115 116 // KUID is the UID to run with in the root user namespace. Defaults to 117 // root if not set explicitly. 118 KUID auth.KUID 119 120 // KGID is the GID to run with in the root user namespace. Defaults to 121 // the root group if not set explicitly. 122 KGID auth.KGID 123 124 // ExtraKGIDs is the list of additional groups to which the user belongs. 125 ExtraKGIDs []auth.KGID 126 127 // Capabilities is the list of capabilities to give to the process. 128 Capabilities *auth.TaskCapabilities 129 130 // StdioIsPty indicates that FDs 0, 1, and 2 are connected to a host pty FD. 131 StdioIsPty bool 132 133 // FilePayload determines the files to give to the new process. 134 FilePayload 135 136 // ContainerID is the container for the process being executed. 137 ContainerID string 138 139 // PIDNamespace is the pid namespace for the process being executed. 140 PIDNamespace *kernel.PIDNamespace 141 142 // Limits is the limit set for the process being executed. 143 Limits *limits.LimitSet 144 } 145 146 // String prints the arguments as a string. 147 func (args *ExecArgs) String() string { 148 if len(args.Argv) == 0 { 149 return args.Filename 150 } 151 a := make([]string, len(args.Argv)) 152 copy(a, args.Argv) 153 if args.Filename != "" { 154 a[0] = args.Filename 155 } 156 return strings.Join(a, " ") 157 } 158 159 // Exec runs a new task. 160 func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { 161 newTG, _, _, err := proc.execAsync(args) 162 if err != nil { 163 return err 164 } 165 166 // Wait for completion. 167 newTG.WaitExited() 168 *waitStatus = uint32(newTG.ExitStatus()) 169 return nil 170 } 171 172 // ExecAsync runs a new task, but doesn't wait for it to finish. It is defined 173 // as a function rather than a method to avoid exposing execAsync as an RPC. 174 func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileDescription, error) { 175 return proc.execAsync(args) 176 } 177 178 // execAsync runs a new task, but doesn't wait for it to finish. It returns the 179 // newly created thread group and its PID. If the stdio FDs are TTYs, then a 180 // TTYFileOperations that wraps the TTY is also returned. 181 func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileDescription, error) { 182 // Import file descriptors. 183 fdTable := proc.Kernel.NewFDTable() 184 185 creds := auth.NewUserCredentials( 186 args.KUID, 187 args.KGID, 188 args.ExtraKGIDs, 189 args.Capabilities, 190 proc.Kernel.RootUserNamespace()) 191 192 pidns := args.PIDNamespace 193 if pidns == nil { 194 pidns = proc.Kernel.RootPIDNamespace() 195 } 196 limitSet := args.Limits 197 if limitSet == nil { 198 limitSet = limits.NewLimitSet() 199 } 200 initArgs := kernel.CreateProcessArgs{ 201 Filename: args.Filename, 202 Argv: args.Argv, 203 Envv: args.Envv, 204 WorkingDirectory: args.WorkingDirectory, 205 MountNamespace: args.MountNamespace, 206 Credentials: creds, 207 FDTable: fdTable, 208 Umask: 0022, 209 Limits: limitSet, 210 MaxSymlinkTraversals: linux.MaxSymlinkTraversals, 211 UTSNamespace: proc.Kernel.RootUTSNamespace(), 212 IPCNamespace: proc.Kernel.RootIPCNamespace(), 213 AbstractSocketNamespace: proc.Kernel.RootAbstractSocketNamespace(), 214 ContainerID: args.ContainerID, 215 PIDNamespace: pidns, 216 } 217 if initArgs.MountNamespace != nil { 218 // initArgs must hold a reference on MountNamespace, which will 219 // be donated to the new process in CreateProcess. 220 initArgs.MountNamespace.IncRef() 221 } 222 ctx := initArgs.NewContext(proc.Kernel) 223 defer fdTable.DecRef(ctx) 224 225 // Get the full path to the filename from the PATH env variable. 226 if initArgs.MountNamespace == nil { 227 // Set initArgs so that 'ctx' returns the namespace. 228 // 229 // Add a reference to the namespace, which is transferred to the new process. 230 initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace() 231 initArgs.MountNamespace.IncRef() 232 } 233 234 fdMap, execFD, err := args.unpackFiles() 235 if err != nil { 236 return nil, 0, nil, fmt.Errorf("creating fd map: %w", err) 237 } 238 defer func() { 239 for _, hostFD := range fdMap { 240 _ = hostFD.Close() 241 } 242 }() 243 244 if execFD != nil { 245 if initArgs.Filename != "" { 246 return nil, 0, nil, fmt.Errorf("process must either be started from a file or a filename, not both") 247 } 248 file, err := host.NewFD(ctx, proc.Kernel.HostMount(), execFD.FD(), &host.NewFDOptions{ 249 Readonly: true, 250 Savable: true, 251 VirtualOwner: true, 252 UID: args.KUID, 253 GID: args.KGID, 254 }) 255 if err != nil { 256 return nil, 0, nil, err 257 } 258 defer file.DecRef(ctx) 259 execFD.Release() 260 initArgs.File = file 261 } else { 262 resolved, err := user.ResolveExecutablePath(ctx, &initArgs) 263 if err != nil { 264 return nil, 0, nil, err 265 } 266 initArgs.Filename = resolved 267 } 268 269 ttyFile, err := fdimport.Import(ctx, fdTable, args.StdioIsPty, args.KUID, args.KGID, fdMap) 270 if err != nil { 271 return nil, 0, nil, err 272 } 273 274 tg, tid, err := proc.Kernel.CreateProcess(initArgs) 275 if err != nil { 276 return nil, 0, nil, err 277 } 278 279 // Set the foreground process group on the TTY before starting the process. 280 if ttyFile != nil { 281 ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) 282 } 283 284 // Start the newly created process. 285 proc.Kernel.StartProcess(tg) 286 287 return tg, tid, ttyFile, nil 288 } 289 290 // PsArgs is the set of arguments to ps. 291 type PsArgs struct { 292 // JSON will force calls to Ps to return the result as a JSON payload. 293 JSON bool 294 } 295 296 // Ps provides a process listing for the running kernel. 297 func (proc *Proc) Ps(args *PsArgs, out *string) error { 298 var p []*Process 299 if e := Processes(proc.Kernel, "", &p); e != nil { 300 return e 301 } 302 if !args.JSON { 303 *out = ProcessListToTable(p) 304 } else { 305 s, e := ProcessListToJSON(p) 306 if e != nil { 307 return e 308 } 309 *out = s 310 } 311 return nil 312 } 313 314 // Process contains information about a single process in a Sandbox. 315 type Process struct { 316 UID auth.KUID `json:"uid"` 317 PID kernel.ThreadID `json:"pid"` 318 // Parent PID 319 PPID kernel.ThreadID `json:"ppid"` 320 Threads []kernel.ThreadID `json:"threads"` 321 // Processor utilization 322 C int32 `json:"c"` 323 // TTY name of the process. Will be of the form "pts/N" if there is a 324 // TTY, or "?" if there is not. 325 TTY string `json:"tty"` 326 // Start time 327 STime string `json:"stime"` 328 // CPU time 329 Time string `json:"time"` 330 // Executable shortname (e.g. "sh" for /bin/sh) 331 Cmd string `json:"cmd"` 332 } 333 334 // ProcessListToTable prints a table with the following format: 335 // UID PID PPID C TTY STIME TIME CMD 336 // 0 1 0 0 pty/4 14:04 505262ns tail 337 func ProcessListToTable(pl []*Process) string { 338 var buf bytes.Buffer 339 tw := tabwriter.NewWriter(&buf, 10, 1, 3, ' ', 0) 340 fmt.Fprint(tw, "UID\tPID\tPPID\tC\tTTY\tSTIME\tTIME\tCMD") 341 for _, d := range pl { 342 fmt.Fprintf(tw, "\n%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s", 343 d.UID, 344 d.PID, 345 d.PPID, 346 d.C, 347 d.TTY, 348 d.STime, 349 d.Time, 350 d.Cmd) 351 } 352 tw.Flush() 353 return buf.String() 354 } 355 356 // ProcessListToJSON will return the JSON representation of ps. 357 func ProcessListToJSON(pl []*Process) (string, error) { 358 b, err := json.MarshalIndent(pl, "", " ") 359 if err != nil { 360 return "", fmt.Errorf("couldn't marshal process list %v: %v", pl, err) 361 } 362 return string(b), nil 363 } 364 365 // PrintPIDsJSON prints a JSON object containing only the PIDs in pl. This 366 // behavior is the same as runc's. 367 func PrintPIDsJSON(pl []*Process) (string, error) { 368 pids := make([]kernel.ThreadID, 0, len(pl)) 369 for _, d := range pl { 370 pids = append(pids, d.PID) 371 } 372 b, err := json.Marshal(pids) 373 if err != nil { 374 return "", fmt.Errorf("couldn't marshal PIDs %v: %v", pids, err) 375 } 376 return string(b), nil 377 } 378 379 // Processes retrieves information about processes running in the sandbox with 380 // the given container id. All processes are returned if 'containerID' is empty. 381 func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error { 382 ts := k.TaskSet() 383 now := k.RealtimeClock().Now() 384 pidns := ts.Root 385 for _, tg := range pidns.ThreadGroups() { 386 pid := pidns.IDOfThreadGroup(tg) 387 388 // If tg has already been reaped ignore it. 389 if pid == 0 { 390 continue 391 } 392 if containerID != "" && containerID != tg.Leader().ContainerID() { 393 continue 394 } 395 396 ppid := kernel.ThreadID(0) 397 if p := tg.Leader().Parent(); p != nil { 398 ppid = pidns.IDOfThreadGroup(p.ThreadGroup()) 399 } 400 threads := tg.MemberIDs(pidns) 401 *out = append(*out, &Process{ 402 UID: tg.Leader().Credentials().EffectiveKUID, 403 PID: pid, 404 PPID: ppid, 405 Threads: threads, 406 STime: formatStartTime(now, tg.Leader().StartTime()), 407 C: percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now), 408 Time: tg.CPUStats().SysTime.String(), 409 Cmd: tg.Leader().Name(), 410 TTY: ttyName(tg.TTY()), 411 }) 412 } 413 sort.Slice(*out, func(i, j int) bool { return (*out)[i].PID < (*out)[j].PID }) 414 return nil 415 } 416 417 // formatStartTime formats startTime depending on the current time: 418 // - If startTime was today, HH:MM is used. 419 // - If startTime was not today but was this year, MonDD is used (e.g. Jan02) 420 // - If startTime was not this year, the year is used. 421 func formatStartTime(now, startTime ktime.Time) string { 422 nowS, nowNs := now.Unix() 423 n := time.Unix(nowS, nowNs) 424 startTimeS, startTimeNs := startTime.Unix() 425 st := time.Unix(startTimeS, startTimeNs) 426 format := "15:04" 427 if st.YearDay() != n.YearDay() { 428 format = "Jan02" 429 } 430 if st.Year() != n.Year() { 431 format = "2006" 432 } 433 return st.Format(format) 434 } 435 436 func percentCPU(stats usage.CPUStats, startTime, now ktime.Time) int32 { 437 // Note: In procps, there is an option to include child CPU stats. As 438 // it is disabled by default, we do not include them. 439 total := stats.UserTime + stats.SysTime 440 lifetime := now.Sub(startTime) 441 if lifetime <= 0 { 442 return 0 443 } 444 percentCPU := total * 100 / lifetime 445 // Cap at 99% since procps does the same. 446 if percentCPU > 99 { 447 percentCPU = 99 448 } 449 return int32(percentCPU) 450 } 451 452 func ttyName(tty *kernel.TTY) string { 453 if tty == nil { 454 return "?" 455 } 456 return fmt.Sprintf("pts/%d", tty.Index) 457 } 458 459 // ContainerUsage retrieves per-container CPU usage. 460 func ContainerUsage(kr *kernel.Kernel) map[string]uint64 { 461 cusage := make(map[string]uint64) 462 for _, tg := range kr.TaskSet().Root.ThreadGroups() { 463 // We want each tg's usage including reaped children. 464 cid := tg.Leader().ContainerID() 465 stats := tg.CPUStats() 466 stats.Accumulate(tg.JoinedChildCPUStats()) 467 cusage[cid] += uint64(stats.UserTime.Nanoseconds()) + uint64(stats.SysTime.Nanoseconds()) 468 } 469 return cusage 470 } 471 472 // unpackFiles unpacks the file descriptor map and, if applicable, the file 473 // descriptor to be used for execution from the unmarshalled ExecArgs. 474 func (args *ExecArgs) unpackFiles() (map[int]*fd.FD, *fd.FD, error) { 475 var execFD *fd.FD 476 var err error 477 478 // If there is one additional file, the last file is used for program 479 // execution. 480 if len(args.Files) == len(args.GuestFDs)+1 { 481 execFD, err = fd.NewFromFile(args.Files[len(args.Files)-1]) 482 if err != nil { 483 return nil, nil, fmt.Errorf("duplicating exec file: %w", err) 484 } 485 } else if len(args.Files) != len(args.GuestFDs) { 486 return nil, nil, fmt.Errorf("length of payload files does not match length of file descriptor array") 487 } 488 489 // GuestFDs are the indexes of our FD map. 490 fdMap := make(map[int]*fd.FD, len(args.GuestFDs)) 491 for i, appFD := range args.GuestFDs { 492 file := args.Files[i] 493 if appFD < 0 { 494 return nil, nil, fmt.Errorf("guest file descriptors must be 0 or greater") 495 } 496 hostFD, err := fd.NewFromFile(file) 497 if err != nil { 498 return nil, nil, fmt.Errorf("duplicating payload files: %w", err) 499 } 500 fdMap[appFD] = hostFD 501 } 502 return fdMap, execFD, nil 503 }