github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/control/lifecycle.go

// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package control

import (
	"encoding/json"
	"fmt"
	"time"

	"google.golang.org/protobuf/types/known/timestamppb"
	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
	"github.com/MerlinKodo/gvisor/pkg/eventchannel"
	"github.com/MerlinKodo/gvisor/pkg/fd"
	"github.com/MerlinKodo/gvisor/pkg/log"
	pb "github.com/MerlinKodo/gvisor/pkg/sentry/control/control_go_proto"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fdimport"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/user"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
	"github.com/MerlinKodo/gvisor/pkg/sentry/limits"
	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
	"github.com/MerlinKodo/gvisor/pkg/sync"
	"github.com/MerlinKodo/gvisor/pkg/urpc"
)

// Lifecycle provides functions related to starting and stopping tasks.
type Lifecycle struct {
	// Kernel is the kernel that the tasks belong to.
	Kernel *kernel.Kernel

	// ShutdownCh is the channel used to signal the sentry to shut down
	// the sentry/sandbox.
	ShutdownCh chan struct{}

	// mu protects the fields below.
	mu sync.RWMutex

	// MountNamespacesMap maps container IDs/names to their mount
	// namespaces.
	MountNamespacesMap map[string]*vfs.MountNamespace

	// containerMap maps container IDs to their containers.
	containerMap map[string]*Container
}

// containerState is the state of the container.
type containerState int

const (
	// stateCreated is the state when the container was created. It is the
	// initial state.
	stateCreated containerState = iota

	// stateRunning is the state when the container/application is running.
	stateRunning

	// stateStopped is the state when the container has exited.
	stateStopped
)

// Container contains the set of parameters to represent a container.
type Container struct {
	// containerID.
	containerID string

	// tg is the init (PID 1) thread group of the container.
	tg *kernel.ThreadGroup

	// state is the current state of the container.
	state containerState
}

// StartContainerArgs is the set of arguments to start a container.
type StartContainerArgs struct {
	// Filename is the filename to load.
	//
	// If this is provided as "", then the file will be guessed via Argv[0].
	Filename string `json:"filename"`

	// Argv is a list of arguments.
	Argv []string `json:"argv"`

	// Envv is a list of environment variables.
	Envv []string `json:"envv"`

	// SecretEnvv is a list of secret environment variables.
	//
	// NOTE: This field must never be logged!
	SecretEnvv []string `json:"secret_envv"`

	// WorkingDirectory defines the working directory for the new process.
	WorkingDirectory string `json:"wd"`

	// KUID is the UID to run with in the root user namespace. Defaults to
	// root if not set explicitly.
	KUID auth.KUID `json:"KUID"`

	// KGID is the GID to run with in the root user namespace. Defaults to
	// the root group if not set explicitly.
	KGID auth.KGID `json:"KGID"`

	// ContainerID is the ID of the container for the process being executed.
	ContainerID string `json:"container_id"`

	// InitialCgroups is the set of cgroup controllers the container needs to
	// be initialized to.
	InitialCgroups map[kernel.CgroupControllerType]string `json:"initial_cgroups"`

	// Limits is the limit set for the process being executed.
	Limits map[string]limits.Limit `json:"limits"`

	// If the HOME environment variable is not provided, and this flag is
	// set, then the HOME environment variable will be set inside the
	// container based on the user's home directory in /etc/passwd.
	ResolveHome bool `json:"resolve_home"`

	// If set, attempt to resolve the binary_path via the following procedure:
	// 1) If binary_path is absolute, it is used directly.
	// 2) If binary_path contains a slash, then it is resolved relative to the
	//    working_directory (or the root if working_directory is not set).
	// 3) Otherwise, search the PATH environment variable for the first
	//    directory that contains an executable file with name in binary_path.
	ResolveBinaryPath bool `json:"resolve_binary_path"`

	// DonatedFDs is the list of sentry-internal file descriptors that will
	// be donated. They correspond to the donated files in FilePayload.
	DonatedFDs []int `json:"donated_fds"`

	// FilePayload determines the files to give to the new process.
	urpc.FilePayload
}

// String formats the StartContainerArgs, hiding the values of SecretEnvv.
func (sca StartContainerArgs) String() string {
	sca.SecretEnvv = make([]string, len(sca.SecretEnvv))
	for i := range sca.SecretEnvv {
		sca.SecretEnvv[i] = "(hidden)"
	}
	b, err := json.Marshal(sca)
	if err != nil {
		return fmt.Sprintf("error marshaling: %s", err)
	}
	return string(b)
}

func (l *Lifecycle) updateContainerState(containerID string, newState containerState) error {
	l.mu.Lock()
	defer l.mu.Unlock()

	c, ok := l.containerMap[containerID]
	if !ok {
		return fmt.Errorf("container %v not started", containerID)
	}

	switch newState {
	case stateCreated:
		// Impossible.
		panic(fmt.Sprintf("invalid state transition: %v => %v", c.state, newState))

	case stateRunning:
		if c.state != stateCreated {
			// Impossible.
			panic(fmt.Sprintf("invalid state transition: %v => %v", c.state, newState))
		}

	case stateStopped:
		// Valid state transition.

	default:
		// Invalid new state.
		panic(fmt.Sprintf("invalid new state: %v", newState))
	}

	c.state = newState
	return nil
}

// StartContainer will start a new container in the sandbox.
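//
// The container's init process is created in a fresh PID namespace, with any
// donated files from args.FilePayload installed into its FD table. The mount
// namespace for args.ContainerID must already be present in
// MountNamespacesMap. Once the process is started, a reaper goroutine waits
// for it to exit and emits a ContainerExitEvent on the event channel.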
func (l *Lifecycle) StartContainer(args *StartContainerArgs, _ *uint32) error {
	timeRequested := time.Now()
	timeRequestReceived := &timestamppb.Timestamp{
		Seconds: timeRequested.Unix(),
		Nanos:   int32(timeRequested.Nanosecond()),
	}
	log.Infof("StartContainer: %v", args)
	if len(args.Files) != len(args.DonatedFDs) {
		return fmt.Errorf("FilePayload.Files and DonatedFDs must have same number of elements (%d != %d)", len(args.Files), len(args.DonatedFDs))
	}

	creds := auth.NewUserCredentials(
		args.KUID,
		args.KGID,
		nil, /* extraKGIDs */
		nil, /* capabilities */
		l.Kernel.RootUserNamespace())

	ls, err := limits.NewLinuxDistroLimitSet()
	if err != nil {
		return fmt.Errorf("error creating default limit set: %w", err)
	}
	for name, limit := range args.Limits {
		lt, ok := limits.FromLinuxResourceName[name]
		if !ok {
			return fmt.Errorf("unknown limit %q", name)
		}
		ls.SetUnchecked(lt, limit)
	}

	// Create a new pid namespace for the container. Each container must run
	// in its own pid namespace.
	pidNs := l.Kernel.RootPIDNamespace().NewChild(l.Kernel.RootUserNamespace())

	initArgs := kernel.CreateProcessArgs{
		Filename: args.Filename,
		Argv:     args.Argv,
		// Order Envv before SecretEnvv.
		Envv:                    append(args.Envv, args.SecretEnvv...),
		WorkingDirectory:        args.WorkingDirectory,
		Credentials:             creds,
		Umask:                   0022,
		Limits:                  ls,
		MaxSymlinkTraversals:    linux.MaxSymlinkTraversals,
		UTSNamespace:            l.Kernel.RootUTSNamespace(),
		IPCNamespace:            l.Kernel.RootIPCNamespace(),
		AbstractSocketNamespace: l.Kernel.RootAbstractSocketNamespace(),
		ContainerID:             args.ContainerID,
		PIDNamespace:            pidNs,
	}

	ctx := initArgs.NewContext(l.Kernel)

	// Import file descriptors.
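	// Each donated host file is installed into the new FD table at the
	// application FD number given by the matching entry in args.DonatedFDs;
	// the two slices were checked above to have the same length.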
	fdTable := l.Kernel.NewFDTable()
	defer fdTable.DecRef(ctx)
	hostFDs, err := fd.NewFromFiles(args.Files)
	if err != nil {
		return fmt.Errorf("error donating host files: %w", err)
	}
	defer func() {
		for _, hfd := range hostFDs {
			_ = hfd.Close()
		}
	}()
	fdMap := make(map[int]*fd.FD, len(args.DonatedFDs))
	for i, appFD := range args.DonatedFDs {
		fdMap[appFD] = hostFDs[i]
	}
	if _, err := fdimport.Import(ctx, fdTable, false, args.KUID, args.KGID, fdMap); err != nil {
		return fmt.Errorf("error importing host files: %w", err)
	}
	initArgs.FDTable = fdTable

	l.mu.RLock()
	mntns, ok := l.MountNamespacesMap[initArgs.ContainerID]
	if !ok {
		l.mu.RUnlock()
		return fmt.Errorf("mount namespace is nil for %s", initArgs.ContainerID)
	}
	initArgs.MountNamespace = mntns
	l.mu.RUnlock()
	initArgs.MountNamespace.IncRef()

	if args.ResolveBinaryPath {
		resolved, err := user.ResolveExecutablePath(ctx, &initArgs)
		if err != nil {
			return fmt.Errorf("failed to resolve binary path: %w", err)
		}
		initArgs.Filename = resolved
	}

	if args.ResolveHome {
		envVars, err := user.MaybeAddExecUserHome(ctx, initArgs.MountNamespace, creds.RealKUID, initArgs.Envv)
		if err != nil {
			return fmt.Errorf("failed to get user home dir: %w", err)
		}
		initArgs.Envv = envVars
	}

	fds, err := fd.NewFromFiles(args.Files)
	if err != nil {
		return fmt.Errorf("duplicating payload files: %w", err)
	}
	defer func() {
		for _, fd := range fds {
			_ = fd.Close()
		}
	}()

	initialCgroups := make(map[kernel.Cgroup]struct{}, len(args.InitialCgroups))
	cgroupRegistry := l.Kernel.CgroupRegistry()
	// path is relative to the container's cgroup controller of the specified
	// type.
	for initialCgroupController, path := range args.InitialCgroups {
		cg, err := cgroupRegistry.FindCgroup(ctx, initialCgroupController, path)
		if err != nil {
			return fmt.Errorf("FindCgroup can't locate cgroup controller: %v err: %v", initialCgroupController, err)
		}
		initialCgroups[cg] = struct{}{}
	}
	initArgs.InitialCgroups = initialCgroups

	tg, _, err := l.Kernel.CreateProcess(initArgs)
	if err != nil {
		return err
	}

	c := &Container{
		containerID: initArgs.ContainerID,
		tg:          tg,
		state:       stateCreated,
	}

	l.mu.Lock()
	if l.containerMap == nil {
		l.containerMap = make(map[string]*Container)
	}

	if _, ok := l.containerMap[initArgs.ContainerID]; ok {
		l.mu.Unlock()
		return fmt.Errorf("container id: %v already exists", initArgs.ContainerID)
	}

	l.containerMap[initArgs.ContainerID] = c
	l.mu.Unlock()

	// Start the newly created process.
	l.Kernel.StartProcess(tg)
	log.Infof("Started the new container %v ", initArgs.ContainerID)

	if err := l.updateContainerState(initArgs.ContainerID, stateRunning); err != nil {
		// Sanity check: shouldn't fail to update the state at this point.
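		// The container was registered in containerMap above and entries are
		// never removed, so a failure here indicates a bug.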
		panic(fmt.Sprintf("Failed to set running state: %v", err))
	}

	timeRequestCompleted := time.Now()
	eventchannel.LogEmit(&pb.ContainerStartedEvent{
		Started:         true,
		ContainerId:     initArgs.ContainerID,
		RequestReceived: timeRequestReceived,
		RequestCompleted: &timestamppb.Timestamp{
			Seconds: timeRequestCompleted.Unix(),
			Nanos:   int32(timeRequestCompleted.Nanosecond()),
		},
	})

	// TODO(b/251490950): reap thread needs to synchronize with Save, so the
	// container state update doesn't race with state serialization.
	go l.reap(initArgs.ContainerID, tg) // S/R-SAFE: see above.

	return nil
}

func (l *Lifecycle) reap(containerID string, tg *kernel.ThreadGroup) {
	tg.WaitExited()
	if err := l.updateContainerState(containerID, stateStopped); err != nil {
		panic(err)
	}
	eventchannel.LogEmit(&pb.ContainerExitEvent{
		ContainerId: containerID,
		ExitStatus:  uint32(tg.ExitStatus()),
	})
}

// Pause pauses all tasks, blocking until they are stopped.
func (l *Lifecycle) Pause(_, _ *struct{}) error {
	l.Kernel.Pause()
	return nil
}

// Resume resumes all tasks.
func (l *Lifecycle) Resume(_, _ *struct{}) error {
	l.Kernel.Unpause()
	return nil
}

// Shutdown sends a signal to destroy the sentry/sandbox.
func (l *Lifecycle) Shutdown(_, _ *struct{}) error {
	close(l.ShutdownCh)
	return nil
}

func (l *Lifecycle) getInitContainerProcess(containerID string) (*kernel.ThreadGroup, error) {
	l.mu.Lock()
	defer l.mu.Unlock()

	c, ok := l.containerMap[containerID]
	if !ok {
		return nil, fmt.Errorf("container %v not started", containerID)
	}
	return c.tg, nil
}

// ContainerArgs is the set of arguments for container-related APIs after
// starting the container.
type ContainerArgs struct {
	ContainerID string `json:"container_id"`
}

// GetExitStatus returns the container exit status if it has stopped.
func (l *Lifecycle) GetExitStatus(args *ContainerArgs, status *uint32) error {
	l.mu.Lock()
	defer l.mu.Unlock()

	c, ok := l.containerMap[args.ContainerID]
	if !ok {
		return fmt.Errorf("container %q doesn't exist, or has not been started", args.ContainerID)
	}

	if c.state != stateStopped {
		return fmt.Errorf("container %q hasn't exited yet", args.ContainerID)
	}

	*status = uint32(c.tg.ExitStatus())
	eventchannel.LogEmit(&pb.ContainerExitEvent{
		ContainerId: args.ContainerID,
		ExitStatus:  *status,
	})
	return nil
}

// Reap notifies the sandbox that the caller is interested in the exit status via
// an exit event. The caller is responsible for handling any corresponding exit
// events, especially if they're interested in waiting for the exit.
func (l *Lifecycle) Reap(args *ContainerArgs, _ *struct{}) error {
	// Check if there are any real emitters registered. If there are no
	// emitters, the caller will never be notified, so fail immediately.
	if !eventchannel.HaveEmitters() {
		return fmt.Errorf("no event emitters configured")
	}

	l.mu.Lock()

	c, ok := l.containerMap[args.ContainerID]
	if !ok {
		l.mu.Unlock()
		return fmt.Errorf("no container with id %q", args.ContainerID)
	}

	// Once a container enters the stopped state, the state never changes. It
	// is safe to cache a stopped state outside an l.mu critical section.
	isStopped := c.state == stateStopped
	l.mu.Unlock()

	if isStopped {
		// Already stopped; emit the exit event again so that observers that
		// registered after the actual stop still see it. This may be a
		// duplicate event, but it is necessary in case the reap goroutine
		// transitions the container to the stopped state before the caller
		// starts observing the event channel.
		eventchannel.LogEmit(&pb.ContainerExitEvent{
			ContainerId: args.ContainerID,
			ExitStatus:  uint32(c.tg.ExitStatus()),
		})
	}

	// The caller is now responsible for blocking on the exit event.
	return nil
}

// IsContainerRunning returns true if the container is running.
func (l *Lifecycle) IsContainerRunning(args *ContainerArgs, isRunning *bool) error {
	l.mu.Lock()
	defer l.mu.Unlock()

	c, ok := l.containerMap[args.ContainerID]
	// We may be racing with the reaper goroutine updating c.state, so also
	// check the number of non-exited tasks.
	if !ok || c.state != stateRunning || c.tg.Count() == 0 {
		return nil
	}

	*isRunning = true
	return nil
}

// SignalContainerArgs is the set of arguments for signalling a container.
type SignalContainerArgs struct {
	ContainerID string `json:"container_id"`
	Signo       int32  `json:"signo"`
	SignalAll   bool   `json:"signalAll"`
}

// SignalContainer signals the container in multi-container mode. It returns an
// error if the container hasn't started or has exited.
func (l *Lifecycle) SignalContainer(args *SignalContainerArgs, _ *struct{}) error {
	tg, err := l.getInitContainerProcess(args.ContainerID)
	if err != nil {
		return err
	}

	l.mu.Lock()
	c, ok := l.containerMap[args.ContainerID]
	if !ok || c.state != stateRunning {
		l.mu.Unlock()
		return fmt.Errorf("%v container not running", args.ContainerID)
	}
	l.mu.Unlock()

	// Signalling a single process is supported only for the init process.
	if !args.SignalAll {
		if tg == nil {
			return fmt.Errorf("no process exists in %v", tg)
		}
		return l.Kernel.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: args.Signo})
	}

	l.Kernel.Pause()
	defer l.Kernel.Unpause()
	return l.Kernel.SendContainerSignal(args.ContainerID, &linux.SignalInfo{Signo: args.Signo})
}
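
// The Lifecycle methods above are intended to be invoked remotely through the
// sentry's urpc-based control server. The sketch below shows roughly how a
// client might start a container; it assumes the Lifecycle object has been
// registered on the control socket under the name "Lifecycle", and the helper
// name startHelloContainer and the exact urpc client call signature are
// illustrative assumptions rather than part of this package.
//
//	func startHelloContainer(socket *unet.Socket) error {
//		client := urpc.NewClient(socket)
//		defer client.Close()
//
//		args := control.StartContainerArgs{
//			Argv:        []string{"/bin/hello"},
//			ContainerID: "hello",
//		}
//		// Method names are of the form "<registered object>.<method>".
//		var unused uint32
//		return client.Call("Lifecycle.StartContainer", &args, &unused)
//	}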