gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/control/lifecycle.go

// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package control

import (
	"encoding/json"
	"fmt"
	"time"

	"google.golang.org/protobuf/types/known/timestamppb"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/eventchannel"
	"gvisor.dev/gvisor/pkg/fd"
	"gvisor.dev/gvisor/pkg/log"
	pb "gvisor.dev/gvisor/pkg/sentry/control/control_go_proto"
	"gvisor.dev/gvisor/pkg/sentry/fdimport"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/user"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/limits"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/urpc"
)

// Lifecycle provides functions related to starting and stopping tasks.
type Lifecycle struct {
	// Kernel is the kernel that the tasks belong to.
	Kernel *kernel.Kernel

	// ShutdownCh is the channel used to signal the sentry to shut down
	// the sentry/sandbox.
	ShutdownCh chan struct{}

	// mu protects the fields below.
	mu sync.RWMutex

	// MountNamespacesMap maps container IDs/names to their mount
	// namespaces.
	MountNamespacesMap map[string]*vfs.MountNamespace

	// containerMap maps container IDs to containers.
	containerMap map[string]*Container
}

// containerState is the state of a container.
type containerState int

const (
	// stateCreated is the state when the container has been created. It is
	// the initial state.
	stateCreated containerState = iota

	// stateRunning is the state when the container/application is running.
	stateRunning

	// stateStopped is the state when the container has exited.
	stateStopped
)

// Container contains the set of parameters to represent a container.
type Container struct {
	// containerID uniquely identifies the container.
	containerID string

	// tg is the init (PID 1) thread group of the container.
	tg *kernel.ThreadGroup

	// state is the current state of the container.
	state containerState
}

// StartContainerArgs is the set of arguments to start a container.
type StartContainerArgs struct {
	// Filename is the filename to load.
	//
	// If this is provided as "", then the file will be guessed via Argv[0].
	Filename string `json:"filename"`

	// Argv is a list of arguments.
	Argv []string `json:"argv"`

	// Envv is a list of environment variables.
	Envv []string `json:"envv"`

	// SecretEnvv is a list of secret environment variables.
	//
	// NOTE: This field must never be logged!
	SecretEnvv []string `json:"secret_envv"`

	// WorkingDirectory defines the working directory for the new process.
	WorkingDirectory string `json:"wd"`

	// KUID is the UID to run with in the root user namespace. Defaults to
	// root if not set explicitly.
	KUID auth.KUID `json:"KUID"`

	// KGID is the GID to run with in the root user namespace. Defaults to
	// the root group if not set explicitly.
	KGID auth.KGID `json:"KGID"`

	// User is the user string used to retrieve the UID/GID.
	User string `json:"user"`

	// ContainerID is the container for the process being executed.
	ContainerID string `json:"container_id"`

	// InitialCgroups is the set of cgroup controllers that the container
	// needs to be initialized with.
	InitialCgroups map[kernel.CgroupControllerType]string `json:"initial_cgroups"`

	// Limits is the limit set for the process being executed.
	Limits map[string]limits.Limit `json:"limits"`

	// If the HOME environment variable is not provided, and this flag is
	// set, then the HOME environment variable will be set inside the
	// container based on the user's home directory in /etc/passwd.
	ResolveHome bool `json:"resolve_home"`

	// If set, attempt to resolve the binary_path via the following procedure:
	// 1) If binary_path is absolute, it is used directly.
	// 2) If binary_path contains a slash, then it is resolved relative to the
	//    working_directory (or the root if working_directory is not set).
	// 3) Otherwise, search the PATH environment variable for the first
	//    directory that contains an executable file with name in binary_path.
	ResolveBinaryPath bool `json:"resolve_binary_path"`

	// DonatedFDs is the list of sentry-internal file descriptors that will
	// be donated. They correspond to the donated files in FilePayload.
	DonatedFDs []int `json:"donated_fds"`

	// FilePayload determines the files to give to the new process.
	urpc.FilePayload
}

// String formats the StartContainerArgs without the SecretEnvv field.
func (sca StartContainerArgs) String() string {
	sca.SecretEnvv = make([]string, len(sca.SecretEnvv))
	for i := range sca.SecretEnvv {
		sca.SecretEnvv[i] = "(hidden)"
	}
	b, err := json.Marshal(sca)
	if err != nil {
		return fmt.Sprintf("error marshaling: %s", err)
	}
	return string(b)
}
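// A minimal sketch of filling in StartContainerArgs and logging it safely;
// the argv, environment, and container ID below are hypothetical:
//
//	args := StartContainerArgs{
//		Argv:        []string{"/bin/sleep", "100"},
//		Envv:        []string{"PATH=/usr/bin:/bin"},
//		SecretEnvv:  []string{"API_TOKEN=..."},
//		ContainerID: "c1",
//	}
//	// String() replaces each SecretEnvv entry with "(hidden)", so the
//	// secret values never reach the log.
//	log.Infof("starting: %v", args)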
func (l *Lifecycle) updateContainerState(containerID string, newState containerState) error {
	l.mu.Lock()
	defer l.mu.Unlock()

	c, ok := l.containerMap[containerID]
	if !ok {
		return fmt.Errorf("container %v not started", containerID)
	}

	switch newState {
	case stateCreated:
		// Impossible.
		panic(fmt.Sprintf("invalid state transition: %v => %v", c.state, newState))

	case stateRunning:
		if c.state != stateCreated {
			// Impossible.
			panic(fmt.Sprintf("invalid state transition: %v => %v", c.state, newState))
		}

	case stateStopped:
		// Valid state transition.

	default:
		// Invalid new state.
		panic(fmt.Sprintf("invalid new state: %v", newState))
	}

	c.state = newState
	return nil
}
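// The switch above encodes a one-way state machine: a container only moves
// forward, never back to stateCreated. A sketch of the permitted transitions:
//
//	stateCreated -> stateRunning  (StartContainer)
//	stateRunning -> stateStopped  (reap, once the init thread group exits)
//	stateCreated -> stateStopped  (allowed by the switch, though in practice
//	                               StartContainer always passes through
//	                               stateRunning first)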
// StartContainer will start a new container in the sandbox.
func (l *Lifecycle) StartContainer(args *StartContainerArgs, _ *uint32) error {
	timeRequested := time.Now()
	timeRequestReceived := &timestamppb.Timestamp{
		Seconds: timeRequested.Unix(),
		Nanos:   int32(timeRequested.Nanosecond()),
	}
	log.Infof("StartContainer: %v", args)
	if len(args.Files) != len(args.DonatedFDs) {
		return fmt.Errorf("FilePayload.Files and DonatedFDs must have the same number of elements (%d != %d)", len(args.Files), len(args.DonatedFDs))
	}

	l.mu.RLock()
	mntns, ok := l.MountNamespacesMap[args.ContainerID]
	if !ok {
		l.mu.RUnlock()
		return fmt.Errorf("mount namespace is nil for %s", args.ContainerID)
	}
	l.mu.RUnlock()

	uid := args.KUID
	gid := args.KGID
	if args.User != "" {
		if uid != 0 || gid != 0 {
			return fmt.Errorf("container spec specified both an explicit UID/GID and a user name; only one or the other may be provided")
		}
		var err error
		uid, gid, err = user.GetExecUIDGIDFromUser(l.Kernel.SupervisorContext(), mntns, args.User)
		if err != nil {
			return fmt.Errorf("couldn't retrieve UID and GID for user %v, err: %v", args.User, err)
		}
	}

	creds := auth.NewUserCredentials(
		uid,
		gid,
		nil, /* extraKGIDs */
		nil, /* capabilities */
		l.Kernel.RootUserNamespace())

	ls, err := limits.NewLinuxDistroLimitSet()
	if err != nil {
		return fmt.Errorf("error creating default limit set: %w", err)
	}
	for name, limit := range args.Limits {
		lt, ok := limits.FromLinuxResourceName[name]
		if !ok {
			return fmt.Errorf("unknown limit %q", name)
		}
		ls.SetUnchecked(lt, limit)
	}

	// Create a new pid namespace for the container. Each container must run
	// in its own pid namespace.
	pidNs := l.Kernel.RootPIDNamespace().NewChild(l.Kernel.RootUserNamespace())

	initArgs := kernel.CreateProcessArgs{
		Filename: args.Filename,
		Argv:     args.Argv,
		// Order Envv before SecretEnvv.
		Envv:                 append(args.Envv, args.SecretEnvv...),
		WorkingDirectory:     args.WorkingDirectory,
		Credentials:          creds,
		Umask:                0022,
		Limits:               ls,
		MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
		UTSNamespace:         l.Kernel.RootUTSNamespace(),
		IPCNamespace:         l.Kernel.RootIPCNamespace(),
		ContainerID:          args.ContainerID,
		PIDNamespace:         pidNs,
	}

	ctx := initArgs.NewContext(l.Kernel)

	// Import file descriptors.
	fdTable := l.Kernel.NewFDTable()
	defer fdTable.DecRef(ctx)
	hostFDs, err := fd.NewFromFiles(args.Files)
	if err != nil {
		return fmt.Errorf("error donating host files: %w", err)
	}
	defer func() {
		for _, hfd := range hostFDs {
			_ = hfd.Close()
		}
	}()
	fdMap := make(map[int]*fd.FD, len(args.DonatedFDs))
	for i, appFD := range args.DonatedFDs {
		fdMap[appFD] = hostFDs[i]
	}
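	// As an illustration (values hypothetical): with DonatedFDs = [0, 1, 2]
	// and three files in the payload, the i-th donated host file is
	// installed as FD DonatedFDs[i] in the container's FD table, so the
	// child would see them as its stdin/stdout/stderr.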
	// Use ContainerID since containers don't have names here.
	if _, err := fdimport.Import(ctx, fdTable, false, args.KUID, args.KGID, fdMap, initArgs.ContainerID); err != nil {
		return fmt.Errorf("error importing host files: %w", err)
	}
	initArgs.FDTable = fdTable

	initArgs.MountNamespace = mntns
	initArgs.MountNamespace.IncRef()

	if args.ResolveBinaryPath {
		resolved, err := user.ResolveExecutablePath(ctx, &initArgs)
		if err != nil {
			return fmt.Errorf("failed to resolve binary path: %w", err)
		}
		initArgs.Filename = resolved
	}

	if args.ResolveHome {
		envVars, err := user.MaybeAddExecUserHome(ctx, initArgs.MountNamespace, creds.RealKUID, initArgs.Envv)
		if err != nil {
			return fmt.Errorf("failed to get user home dir: %w", err)
		}
		initArgs.Envv = envVars
	}

	fds, err := fd.NewFromFiles(args.Files)
	if err != nil {
		return fmt.Errorf("duplicating payload files: %w", err)
	}
	defer func() {
		for _, fd := range fds {
			_ = fd.Close()
		}
	}()

	initialCgroups := make(map[kernel.Cgroup]struct{}, len(args.InitialCgroups))
	cgroupRegistry := l.Kernel.CgroupRegistry()
	// path is relative to the container's cgroup controller of the
	// specified type.
	for initialCgroupController, path := range args.InitialCgroups {
		cg, err := cgroupRegistry.FindCgroup(ctx, initialCgroupController, path)
		if err != nil {
			return fmt.Errorf("FindCgroup can't locate cgroup controller: %v err: %v", initialCgroupController, err)
		}
		initialCgroups[cg] = struct{}{}
	}
	initArgs.InitialCgroups = initialCgroups

	tg, _, err := l.Kernel.CreateProcess(initArgs)
	if err != nil {
		return err
	}

	c := &Container{
		containerID: initArgs.ContainerID,
		tg:          tg,
		state:       stateCreated,
	}

	l.mu.Lock()
	if l.containerMap == nil {
		l.containerMap = make(map[string]*Container)
	}

	if _, ok := l.containerMap[initArgs.ContainerID]; ok {
		l.mu.Unlock()
		return fmt.Errorf("container id: %v already exists", initArgs.ContainerID)
	}

	l.containerMap[initArgs.ContainerID] = c
	l.mu.Unlock()

	// Start the newly created process.
	l.Kernel.StartProcess(tg)
	log.Infof("Started the new container %v", initArgs.ContainerID)

	if err := l.updateContainerState(initArgs.ContainerID, stateRunning); err != nil {
		// Sanity check: shouldn't fail to update the state at this point.
		panic(fmt.Sprintf("Failed to set running state: %v", err))
	}

	timeRequestCompleted := time.Now()
	eventchannel.LogEmit(&pb.ContainerStartedEvent{
		Started:         true,
		ContainerId:     initArgs.ContainerID,
		RequestReceived: timeRequestReceived,
		RequestCompleted: &timestamppb.Timestamp{
			Seconds: timeRequestCompleted.Unix(),
			Nanos:   int32(timeRequestCompleted.Nanosecond()),
		},
	})

	// TODO(b/251490950): reap thread needs to synchronize with Save, so the
	// container state update doesn't race with state serialization.
	go l.reap(initArgs.ContainerID, tg) // S/R-SAFE: see above.

	return nil
}
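// A minimal sketch of invoking StartContainer directly (in practice it is
// reached via the sentry's control server); the argv and container ID are
// hypothetical, and a mount namespace for "c1" must already be present in
// MountNamespacesMap:
//
//	args := &StartContainerArgs{
//		Argv:        []string{"/bin/true"},
//		ContainerID: "c1",
//	}
//	if err := l.StartContainer(args, nil); err != nil {
//		log.Warningf("StartContainer failed: %v", err)
//	}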
func (l *Lifecycle) reap(containerID string, tg *kernel.ThreadGroup) {
	tg.WaitExited()
	if err := l.updateContainerState(containerID, stateStopped); err != nil {
		panic(err)
	}
	eventchannel.LogEmit(&pb.ContainerExitEvent{
		ContainerId: containerID,
		ExitStatus:  uint32(tg.ExitStatus()),
	})
}

// Pause pauses all tasks, blocking until they are stopped.
func (l *Lifecycle) Pause(_, _ *struct{}) error {
	l.Kernel.Pause()
	return nil
}

// Resume resumes all tasks.
func (l *Lifecycle) Resume(_, _ *struct{}) error {
	l.Kernel.Unpause()
	return nil
}

// Shutdown sends a signal to destroy the sentry/sandbox.
func (l *Lifecycle) Shutdown(_, _ *struct{}) error {
	close(l.ShutdownCh)
	return nil
}

func (l *Lifecycle) getInitContainerProcess(containerID string) (*kernel.ThreadGroup, error) {
	l.mu.Lock()
	defer l.mu.Unlock()

	c, ok := l.containerMap[containerID]
	if !ok {
		return nil, fmt.Errorf("container %v not started", containerID)
	}
	return c.tg, nil
}

// ContainerArgs is the set of arguments for container-related APIs after
// the container has been started.
type ContainerArgs struct {
	ContainerID string `json:"container_id"`
}

// GetExitStatus returns the container exit status if it has stopped.
func (l *Lifecycle) GetExitStatus(args *ContainerArgs, status *uint32) error {
	l.mu.Lock()
	defer l.mu.Unlock()

	c, ok := l.containerMap[args.ContainerID]
	if !ok {
		return fmt.Errorf("container %q doesn't exist, or has not been started", args.ContainerID)
	}

	if c.state != stateStopped {
		return fmt.Errorf("container %q hasn't exited yet", args.ContainerID)
	}

	*status = uint32(c.tg.ExitStatus())
	eventchannel.LogEmit(&pb.ContainerExitEvent{
		ContainerId: args.ContainerID,
		ExitStatus:  *status,
	})
	return nil
}

// Reap notifies the sandbox that the caller is interested in the exit status
// via an exit event. The caller is responsible for handling any corresponding
// exit events, especially if they're interested in waiting for the exit.
func (l *Lifecycle) Reap(args *ContainerArgs, _ *struct{}) error {
	// Check if there are any real emitters registered. If there are no
	// emitters, the caller will never be notified, so fail immediately.
	if !eventchannel.HaveEmitters() {
		return fmt.Errorf("no event emitters configured")
	}

	l.mu.Lock()

	c, ok := l.containerMap[args.ContainerID]
	if !ok {
		l.mu.Unlock()
		return fmt.Errorf("no container with id %q", args.ContainerID)
	}

	// Once a container enters the stopped state, the state never changes.
	// It's safe to cache a stopped state outside an l.mu critical section.
	isStopped := c.state == stateStopped
	l.mu.Unlock()

	if isStopped {
		// Already stopped, so emit the exit event to ensure that any
		// callbacks registered after the actual stop are called. This may be
		// a duplicate event, but it is necessary in case the reap goroutine
		// transitions the container to the stopped state before the caller
		// starts observing the event channel.
		eventchannel.LogEmit(&pb.ContainerExitEvent{
			ContainerId: args.ContainerID,
			ExitStatus:  uint32(c.tg.ExitStatus()),
		})
	}

	// The caller is now responsible for blocking on the exit event.
	return nil
}
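// A sketch of collecting an exit status after the reap goroutine has moved
// the container to stateStopped (container ID hypothetical):
//
//	var status uint32
//	if err := l.GetExitStatus(&ContainerArgs{ContainerID: "c1"}, &status); err != nil {
//		// The container is still running, or was never started.
//	}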
// IsContainerRunning returns true if the container is running.
func (l *Lifecycle) IsContainerRunning(args *ContainerArgs, isRunning *bool) error {
	l.mu.Lock()
	defer l.mu.Unlock()

	c, ok := l.containerMap[args.ContainerID]
	// We may be racing with the reaper goroutine updating c.state, so also
	// check the number of non-exited tasks.
	if !ok || c.state != stateRunning || c.tg.Count() == 0 {
		return nil
	}

	*isRunning = true
	return nil
}

// SignalContainerArgs is the set of arguments for signalling a container.
type SignalContainerArgs struct {
	ContainerID string `json:"container_id"`
	Signo       int32  `json:"signo"`
	SignalAll   bool   `json:"signalAll"`
}

// SignalContainer signals the container in multi-container mode. It returns an
// error if the container hasn't started or has exited.
func (l *Lifecycle) SignalContainer(args *SignalContainerArgs, _ *struct{}) error {
	tg, err := l.getInitContainerProcess(args.ContainerID)
	if err != nil {
		return err
	}

	l.mu.Lock()
	c, ok := l.containerMap[args.ContainerID]
	if !ok || c.state != stateRunning {
		l.mu.Unlock()
		return fmt.Errorf("container %v is not running", args.ContainerID)
	}
	l.mu.Unlock()

	// Signalling a single process is supported only for the init process.
	if !args.SignalAll {
		if tg == nil {
			return fmt.Errorf("no init process exists in container %v", args.ContainerID)
		}
		return l.Kernel.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: args.Signo})
	}

	l.Kernel.Pause()
	defer l.Kernel.Unpause()
	return l.Kernel.SendContainerSignal(args.ContainerID, &linux.SignalInfo{Signo: args.Signo})
}
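// A sketch of sending SIGTERM to every process in a container; the container
// ID is hypothetical:
//
//	args := &SignalContainerArgs{
//		ContainerID: "c1",
//		Signo:       int32(linux.SIGTERM),
//		SignalAll:   true,
//	}
//	if err := l.SignalContainer(args, nil); err != nil {
//		// The container isn't running, or was never started.
//	}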