gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/control/lifecycle.go (about)

     1  // Copyright 2021 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package control
    16  
    17  import (
    18  	"encoding/json"
    19  	"fmt"
    20  	"time"
    21  
    22  	"google.golang.org/protobuf/types/known/timestamppb"
    23  	"gvisor.dev/gvisor/pkg/abi/linux"
    24  	"gvisor.dev/gvisor/pkg/eventchannel"
    25  	"gvisor.dev/gvisor/pkg/fd"
    26  	"gvisor.dev/gvisor/pkg/log"
    27  	pb "gvisor.dev/gvisor/pkg/sentry/control/control_go_proto"
    28  	"gvisor.dev/gvisor/pkg/sentry/fdimport"
    29  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/user"
    30  	"gvisor.dev/gvisor/pkg/sentry/kernel"
    31  	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
    32  	"gvisor.dev/gvisor/pkg/sentry/limits"
    33  	"gvisor.dev/gvisor/pkg/sentry/vfs"
    34  	"gvisor.dev/gvisor/pkg/sync"
    35  	"gvisor.dev/gvisor/pkg/urpc"
    36  )
    37  
// Lifecycle provides functions related to starting and stopping tasks.
//
// Its exported methods follow the (args, result) error shape used by the
// control RPC handlers in this file.
type Lifecycle struct {
	// Kernel is the kernel where the tasks belong to.
	Kernel *kernel.Kernel

	// ShutdownCh is the channel used to signal the sentry to shutdown
	// the sentry/sandbox. Shutdown closes this channel.
	ShutdownCh chan struct{}

	// mu protects the fields below.
	mu sync.RWMutex

	// MountNamespacesMap is a map of container id/names and the mount
	// namespaces. StartContainer requires an entry for the container it is
	// asked to start.
	MountNamespacesMap map[string]*vfs.MountNamespace

	// containerMap is a map of the container id and the container. It is
	// created lazily by StartContainer, so it may be nil before the first
	// container has been started.
	containerMap map[string]*Container
}
    57  
// containerState is the state of the container.
//
// updateContainerState only accepts the transitions created => running and
// any state => stopped.
type containerState int

const (
	// stateCreated is the state when the container was created. It is the
	// initial state (and the zero value of containerState).
	stateCreated containerState = iota

	// stateRunning is the state when the container/application is running.
	stateRunning

	// stateStopped is the state when the container has exited.
	stateStopped
)
    72  
// Container contains the set of parameters to represent a container.
type Container struct {
	// containerID is the ID the container was started with; it is also the
	// container's key in Lifecycle.containerMap.
	containerID string

	// tg is the init(PID 1) threadgroup of the container.
	tg *kernel.ThreadGroup

	// state is the current state of the container. It is read and written
	// with Lifecycle.mu held.
	state containerState
}
    84  
// StartContainerArgs is the set of arguments to start a container.
type StartContainerArgs struct {
	// Filename is the filename to load.
	//
	// If this is provided as "", then the file will be guessed via Argv[0].
	Filename string `json:"filename"`

	// Argv is a list of arguments.
	Argv []string `json:"argv"`

	// Envv is a list of environment variables.
	Envv []string `json:"envv"`

	// SecretEnvv is a list of secret environment variables. They are
	// appended after Envv when the process environment is built.
	//
	// NOTE: This field must never be logged! String replaces every entry
	// with a placeholder for that reason.
	SecretEnvv []string `json:"secret_envv"`

	// WorkingDirectory defines the working directory for the new process.
	WorkingDirectory string `json:"wd"`

	// KUID is the UID to run with in the root user namespace. Defaults to
	// root if not set explicitly.
	KUID auth.KUID `json:"KUID"`

	// KGID is the GID to run with in the root user namespace. Defaults to
	// the root group if not set explicitly.
	KGID auth.KGID `json:"KGID"`

	// User is the user string used to retrieve UID/GID. It may only be set
	// when KUID and KGID are both zero.
	User string `json:"user"`

	// ContainerID is the container for the process being executed.
	ContainerID string `json:"container_id"`

	// InitialCgroups is the set of cgroup controllers the container needs to
	// be initialised to. Each value is a cgroup path relative to the
	// container's cgroup controller of the given type.
	InitialCgroups map[kernel.CgroupControllerType]string `json:"initial_cgroups"`

	// Limits is the limit set for the process being executed, keyed by
	// Linux resource name.
	Limits map[string]limits.Limit `json:"limits"`

	// If HOME environment variable is not provided, and this flag is set,
	// then the HOME environment variable will be set inside the container
	// based on the user's home directory in /etc/passwd.
	ResolveHome bool `json:"resolve_home"`

	// If set, attempt to resolve the binary path via the following procedure:
	// 1) If the binary path is absolute, it is used directly.
	// 2) If the binary path contains a slash, then it is resolved relative to
	//    the working directory (or the root if the working directory is not
	//    set).
	// 3) Otherwise, search the PATH environment variable for the first
	//    directory that contains an executable file with the binary's name.
	ResolveBinaryPath bool `json:"resolve_binary_path"`

	// DonatedFDs is the list of sentry-internal file descriptors that will
	// be donated. They correspond to the donated files in FilePayload, so
	// both must have the same number of elements.
	DonatedFDs []int `json:"donated_fds"`

	// FilePayload determines the files to give to the new process.
	urpc.FilePayload
}
   146  
   147  // String formats the StartContainerArgs without the SecretEnvv field.
   148  func (sca StartContainerArgs) String() string {
   149  	sca.SecretEnvv = make([]string, len(sca.SecretEnvv))
   150  	for i := range sca.SecretEnvv {
   151  		sca.SecretEnvv[i] = "(hidden)"
   152  	}
   153  	b, err := json.Marshal(sca)
   154  	if err != nil {
   155  		return fmt.Sprintf("error marshaling: %s", err)
   156  	}
   157  	return string(b)
   158  }
   159  
   160  func (l *Lifecycle) updateContainerState(containerID string, newState containerState) error {
   161  	l.mu.Lock()
   162  	defer l.mu.Unlock()
   163  
   164  	c, ok := l.containerMap[containerID]
   165  	if !ok {
   166  		return fmt.Errorf("container %v not started", containerID)
   167  	}
   168  
   169  	switch newState {
   170  	case stateCreated:
   171  		// Impossible.
   172  		panic(fmt.Sprintf("invalid state transition: %v => %v", c.state, newState))
   173  
   174  	case stateRunning:
   175  		if c.state != stateCreated {
   176  			// Impossible.
   177  			panic(fmt.Sprintf("invalid state transition: %v => %v", c.state, newState))
   178  		}
   179  
   180  	case stateStopped:
   181  		// Valid state transition.
   182  
   183  	default:
   184  		// Invalid new state.
   185  		panic(fmt.Sprintf("invalid new state: %v", newState))
   186  	}
   187  
   188  	c.state = newState
   189  	return nil
   190  }
   191  
   192  // StartContainer will start a new container in the sandbox.
   193  func (l *Lifecycle) StartContainer(args *StartContainerArgs, _ *uint32) error {
   194  	timeRequested := time.Now()
   195  	timeRequestReceived := &timestamppb.Timestamp{
   196  		Seconds: timeRequested.Unix(),
   197  		Nanos:   int32(timeRequested.Nanosecond()),
   198  	}
   199  	log.Infof("StartContainer: %v", args)
   200  	if len(args.Files) != len(args.DonatedFDs) {
   201  		return fmt.Errorf("FilePayload.Files and DonatedFDs must have same number of elements (%d != %d)", len(args.Files), len(args.DonatedFDs))
   202  	}
   203  
   204  	l.mu.RLock()
   205  	mntns, ok := l.MountNamespacesMap[args.ContainerID]
   206  	if !ok {
   207  		l.mu.RUnlock()
   208  		return fmt.Errorf("mount namespace is nil for %s", args.ContainerID)
   209  	}
   210  	l.mu.RUnlock()
   211  
   212  	uid := args.KUID
   213  	gid := args.KGID
   214  	if args.User != "" {
   215  		if uid != 0 || gid != 0 {
   216  			return fmt.Errorf("container spec specified both an explicit UID/GID and a user name, only one or the other may be provided")
   217  		}
   218  		var err error
   219  		uid, gid, err = user.GetExecUIDGIDFromUser(l.Kernel.SupervisorContext(), mntns, args.User)
   220  		if err != nil {
   221  			return fmt.Errorf("couldn't retrieve UID and GID for user %v, err: %v", args.User, err)
   222  		}
   223  	}
   224  
   225  	creds := auth.NewUserCredentials(
   226  		uid,
   227  		gid,
   228  		nil, /* extraKGIDs */
   229  		nil, /* capabilities */
   230  		l.Kernel.RootUserNamespace())
   231  
   232  	ls, err := limits.NewLinuxDistroLimitSet()
   233  	if err != nil {
   234  		return fmt.Errorf("error creating default limit set: %w", err)
   235  	}
   236  	for name, limit := range args.Limits {
   237  		lt, ok := limits.FromLinuxResourceName[name]
   238  		if !ok {
   239  			return fmt.Errorf("unknown limit %q", name)
   240  		}
   241  		ls.SetUnchecked(lt, limit)
   242  	}
   243  
   244  	// Create a new pid namespace for the container. Each container must run
   245  	// in its own pid namespace.
   246  	pidNs := l.Kernel.RootPIDNamespace().NewChild(l.Kernel.RootUserNamespace())
   247  
   248  	initArgs := kernel.CreateProcessArgs{
   249  		Filename: args.Filename,
   250  		Argv:     args.Argv,
   251  		// Order Envv before SecretEnvv.
   252  		Envv:                 append(args.Envv, args.SecretEnvv...),
   253  		WorkingDirectory:     args.WorkingDirectory,
   254  		Credentials:          creds,
   255  		Umask:                0022,
   256  		Limits:               ls,
   257  		MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
   258  		UTSNamespace:         l.Kernel.RootUTSNamespace(),
   259  		IPCNamespace:         l.Kernel.RootIPCNamespace(),
   260  		ContainerID:          args.ContainerID,
   261  		PIDNamespace:         pidNs,
   262  	}
   263  
   264  	ctx := initArgs.NewContext(l.Kernel)
   265  
   266  	// Import file descriptors.
   267  	fdTable := l.Kernel.NewFDTable()
   268  	defer fdTable.DecRef(ctx)
   269  	hostFDs, err := fd.NewFromFiles(args.Files)
   270  	if err != nil {
   271  		return fmt.Errorf("error donating host files: %w", err)
   272  	}
   273  	defer func() {
   274  		for _, hfd := range hostFDs {
   275  			_ = hfd.Close()
   276  		}
   277  	}()
   278  	fdMap := make(map[int]*fd.FD, len(args.DonatedFDs))
   279  	for i, appFD := range args.DonatedFDs {
   280  		fdMap[appFD] = hostFDs[i]
   281  	}
   282  	// Use ContainerID since containers don't have names here.
   283  	if _, err := fdimport.Import(ctx, fdTable, false, args.KUID, args.KGID, fdMap, initArgs.ContainerID); err != nil {
   284  		return fmt.Errorf("error importing host files: %w", err)
   285  	}
   286  	initArgs.FDTable = fdTable
   287  
   288  	initArgs.MountNamespace = mntns
   289  	initArgs.MountNamespace.IncRef()
   290  
   291  	if args.ResolveBinaryPath {
   292  		resolved, err := user.ResolveExecutablePath(ctx, &initArgs)
   293  		if err != nil {
   294  			return fmt.Errorf("failed to resolve binary path: %w", err)
   295  		}
   296  		initArgs.Filename = resolved
   297  	}
   298  
   299  	if args.ResolveHome {
   300  		envVars, err := user.MaybeAddExecUserHome(ctx, initArgs.MountNamespace, creds.RealKUID, initArgs.Envv)
   301  		if err != nil {
   302  			return fmt.Errorf("failed to get user home dir: %w", err)
   303  		}
   304  		initArgs.Envv = envVars
   305  	}
   306  
   307  	fds, err := fd.NewFromFiles(args.Files)
   308  	if err != nil {
   309  		return fmt.Errorf("duplicating payload files: %w", err)
   310  	}
   311  	defer func() {
   312  		for _, fd := range fds {
   313  			_ = fd.Close()
   314  		}
   315  	}()
   316  
   317  	initialCgroups := make(map[kernel.Cgroup]struct{}, len(args.InitialCgroups))
   318  	cgroupRegistry := l.Kernel.CgroupRegistry()
   319  	// path is relative to the container's cgroup controller of specified type.
   320  	for initialCgroupController, path := range args.InitialCgroups {
   321  		cg, err := cgroupRegistry.FindCgroup(ctx, initialCgroupController, path)
   322  		if err != nil {
   323  			return fmt.Errorf("FindCgroup can't locate cgroup controller: %v err: %v", initialCgroupController, err)
   324  		}
   325  		initialCgroups[cg] = struct{}{}
   326  	}
   327  	initArgs.InitialCgroups = initialCgroups
   328  
   329  	tg, _, err := l.Kernel.CreateProcess(initArgs)
   330  	if err != nil {
   331  		return err
   332  	}
   333  
   334  	c := &Container{
   335  		containerID: initArgs.ContainerID,
   336  		tg:          tg,
   337  		state:       stateCreated,
   338  	}
   339  
   340  	l.mu.Lock()
   341  	if l.containerMap == nil {
   342  		l.containerMap = make(map[string]*Container)
   343  	}
   344  
   345  	if _, ok := l.containerMap[initArgs.ContainerID]; ok {
   346  		l.mu.Unlock()
   347  		return fmt.Errorf("container id: %v already exists", initArgs.ContainerID)
   348  	}
   349  
   350  	l.containerMap[initArgs.ContainerID] = c
   351  	l.mu.Unlock()
   352  
   353  	// Start the newly created process.
   354  	l.Kernel.StartProcess(tg)
   355  	log.Infof("Started the new container %v ", initArgs.ContainerID)
   356  
   357  	if err := l.updateContainerState(initArgs.ContainerID, stateRunning); err != nil {
   358  		// Sanity check: shouldn't fail to update the state at this point.
   359  		panic(fmt.Sprintf("Failed to set running state: %v", err))
   360  
   361  	}
   362  
   363  	timeRequestCompleted := time.Now()
   364  	eventchannel.LogEmit(&pb.ContainerStartedEvent{
   365  		Started:         true,
   366  		ContainerId:     initArgs.ContainerID,
   367  		RequestReceived: timeRequestReceived,
   368  		RequestCompleted: &timestamppb.Timestamp{
   369  			Seconds: timeRequestCompleted.Unix(),
   370  			Nanos:   int32(timeRequestCompleted.Nanosecond()),
   371  		},
   372  	})
   373  
   374  	// TODO(b/251490950): reap thread needs to synchronize with Save, so the
   375  	// container state update doesn't race with state serialization.
   376  	go l.reap(initArgs.ContainerID, tg) // S/R-SAFE: see above.
   377  
   378  	return nil
   379  }
   380  
   381  func (l *Lifecycle) reap(containerID string, tg *kernel.ThreadGroup) {
   382  	tg.WaitExited()
   383  	if err := l.updateContainerState(containerID, stateStopped); err != nil {
   384  		panic(err)
   385  	}
   386  	eventchannel.LogEmit(&pb.ContainerExitEvent{
   387  		ContainerId: containerID,
   388  		ExitStatus:  uint32(tg.ExitStatus()),
   389  	})
   390  }
   391  
   392  // Pause pauses all tasks, blocking until they are stopped.
   393  func (l *Lifecycle) Pause(_, _ *struct{}) error {
   394  	l.Kernel.Pause()
   395  	return nil
   396  }
   397  
   398  // Resume resumes all tasks.
   399  func (l *Lifecycle) Resume(_, _ *struct{}) error {
   400  	l.Kernel.Unpause()
   401  	return nil
   402  }
   403  
   404  // Shutdown sends signal to destroy the sentry/sandbox.
   405  func (l *Lifecycle) Shutdown(_, _ *struct{}) error {
   406  	close(l.ShutdownCh)
   407  	return nil
   408  }
   409  
   410  func (l *Lifecycle) getInitContainerProcess(containerID string) (*kernel.ThreadGroup, error) {
   411  	l.mu.Lock()
   412  	defer l.mu.Unlock()
   413  
   414  	c, ok := l.containerMap[containerID]
   415  	if !ok {
   416  		return nil, fmt.Errorf("container %v not started", containerID)
   417  	}
   418  	return c.tg, nil
   419  }
   420  
// ContainerArgs is the set of arguments for container related APIs after
// starting the container.
type ContainerArgs struct {
	// ContainerID is the ID of the container the call refers to.
	ContainerID string `json:"container_id"`
}
   426  
   427  // GetExitStatus returns the container exit status if it has stopped.
   428  func (l *Lifecycle) GetExitStatus(args *ContainerArgs, status *uint32) error {
   429  	l.mu.Lock()
   430  	defer l.mu.Unlock()
   431  
   432  	c, ok := l.containerMap[args.ContainerID]
   433  	if !ok {
   434  		return fmt.Errorf("container %q doesn't exist, or has not been started", args.ContainerID)
   435  	}
   436  
   437  	if c.state != stateStopped {
   438  		return fmt.Errorf("container %q hasn't exited yet", args.ContainerID)
   439  	}
   440  
   441  	*status = uint32(c.tg.ExitStatus())
   442  	eventchannel.LogEmit(&pb.ContainerExitEvent{
   443  		ContainerId: args.ContainerID,
   444  		ExitStatus:  *status,
   445  	})
   446  	return nil
   447  }
   448  
   449  // Reap notifies the sandbox that the caller is interested in the exit status via
   450  // an exit event. The caller is responsible for handling any corresponding exit
   451  // events, especially if they're interested in waiting for the exit.
   452  func (l *Lifecycle) Reap(args *ContainerArgs, _ *struct{}) error {
   453  	// Check if there are any real emitters registered. If there are no
   454  	// emitters, the caller will never be notified, so fail immediately.
   455  	if !eventchannel.HaveEmitters() {
   456  		return fmt.Errorf("no event emitters configured")
   457  	}
   458  
   459  	l.mu.Lock()
   460  
   461  	c, ok := l.containerMap[args.ContainerID]
   462  	if !ok {
   463  		l.mu.Unlock()
   464  		return fmt.Errorf("no container with id %q", args.ContainerID)
   465  	}
   466  
   467  	// Once a container enters the stop state, the state never changes. It's
   468  	// safe to cache a stopped state outside a l.mu critical section.
   469  	isStopped := c.state == stateStopped
   470  	l.mu.Unlock()
   471  
   472  	if isStopped {
   473  		// Already stopped, emit stop to ensure any callbacks registered after
   474  		// the actual stop is called. This may be a duplicate event, but is
   475  		// necessary in case the reap goroutine transitions the container to the
   476  		// stop state before the caller starts observing the event channel.
   477  		eventchannel.LogEmit(&pb.ContainerExitEvent{
   478  			ContainerId: args.ContainerID,
   479  			ExitStatus:  uint32(c.tg.ExitStatus()),
   480  		})
   481  	}
   482  
   483  	// Caller now responsible for blocking on the exit event.
   484  	return nil
   485  }
   486  
   487  // IsContainerRunning returns true if the container is running.
   488  func (l *Lifecycle) IsContainerRunning(args *ContainerArgs, isRunning *bool) error {
   489  	l.mu.Lock()
   490  	defer l.mu.Unlock()
   491  
   492  	c, ok := l.containerMap[args.ContainerID]
   493  	// We may be racing with the reaper goroutine updating c.state, so also
   494  	// check the number non-exited tasks.
   495  	if !ok || c.state != stateRunning || c.tg.Count() == 0 {
   496  		return nil
   497  	}
   498  
   499  	*isRunning = true
   500  	return nil
   501  }
   502  
// SignalContainerArgs is the set of arguments for signalling a container.
type SignalContainerArgs struct {
	// ContainerID is the ID of the container to signal.
	ContainerID string `json:"container_id"`

	// Signo is the number of the signal to deliver.
	Signo int32 `json:"signo"`

	// SignalAll selects between signalling the whole container (true) and
	// signalling only the container's init process (false).
	SignalAll bool `json:"signalAll"`
}
   509  
   510  // SignalContainer signals the container in multi-container mode. It returns error if the
   511  // container hasn't started or has exited.
   512  func (l *Lifecycle) SignalContainer(args *SignalContainerArgs, _ *struct{}) error {
   513  	tg, err := l.getInitContainerProcess(args.ContainerID)
   514  	if err != nil {
   515  		return err
   516  	}
   517  
   518  	l.mu.Lock()
   519  	c, ok := l.containerMap[args.ContainerID]
   520  	if !ok || c.state != stateRunning {
   521  		l.mu.Unlock()
   522  		return fmt.Errorf("%v container not running", args.ContainerID)
   523  	}
   524  	l.mu.Unlock()
   525  
   526  	// Signalling a single process is supported only for the init process.
   527  	if !args.SignalAll {
   528  		if tg == nil {
   529  			return fmt.Errorf("no process exists in %v", tg)
   530  		}
   531  		return l.Kernel.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: args.Signo})
   532  	}
   533  
   534  	l.Kernel.Pause()
   535  	defer l.Kernel.Unpause()
   536  	return l.Kernel.SendContainerSignal(args.ContainerID, &linux.SignalInfo{Signo: args.Signo})
   537  }