github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/control/lifecycle.go (about)

     1  // Copyright 2021 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package control
    16  
    17  import (
    18  	"encoding/json"
    19  	"fmt"
    20  	"time"
    21  
    22  	"google.golang.org/protobuf/types/known/timestamppb"
    23  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    24  	"github.com/MerlinKodo/gvisor/pkg/eventchannel"
    25  	"github.com/MerlinKodo/gvisor/pkg/fd"
    26  	"github.com/MerlinKodo/gvisor/pkg/log"
    27  	pb "github.com/MerlinKodo/gvisor/pkg/sentry/control/control_go_proto"
    28  	"github.com/MerlinKodo/gvisor/pkg/sentry/fdimport"
    29  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/user"
    30  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel"
    31  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
    32  	"github.com/MerlinKodo/gvisor/pkg/sentry/limits"
    33  	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
    34  	"github.com/MerlinKodo/gvisor/pkg/sync"
    35  	"github.com/MerlinKodo/gvisor/pkg/urpc"
    36  )
    37  
    38  // Lifecycle provides functions related to starting and stopping tasks.
    39  type Lifecycle struct {
    40  	// Kernel is the kernel where the tasks belong to.
    41  	Kernel *kernel.Kernel
    42  
    43  	// ShutdownCh is the channel used to signal the sentry to shutdown
    44  	// the sentry/sandbox.
    45  	ShutdownCh chan struct{}
    46  
    47  	// mu protects the fields below.
    48  	mu sync.RWMutex
    49  
    50  	// MountNamespacesMap is a map of container id/names and the mount
    51  	// namespaces.
    52  	MountNamespacesMap map[string]*vfs.MountNamespace
    53  
    54  	// containerMap is a map of the container id and the container.
    55  	containerMap map[string]*Container
    56  }
    57  
    58  // containerState is the state of the container.
    59  type containerState int
    60  
    61  const (
    62  	// stateCreated is the state when the container was created. It is the
    63  	// initial state.
    64  	stateCreated containerState = iota
    65  
    66  	// stateRunning is the state when the container/application is running.
    67  	stateRunning
    68  
    69  	// stateStopped is the state when the container has exited.
    70  	stateStopped
    71  )
    72  
    73  // Container contains the set of parameters to represent a container.
    74  type Container struct {
    75  	// containerID.
    76  	containerID string
    77  
    78  	// tg is the init(PID 1) threadgroup of the container.
    79  	tg *kernel.ThreadGroup
    80  
    81  	// state is the current state of the container.
    82  	state containerState
    83  }
    84  
    85  // StartContainerArgs is the set of arguments to start a container.
    86  type StartContainerArgs struct {
    87  	// Filename is the filename to load.
    88  	//
    89  	// If this is provided as "", then the file will be guessed via Argv[0].
    90  	Filename string `json:"filename"`
    91  
    92  	// Argv is a list of arguments.
    93  	Argv []string `json:"argv"`
    94  
    95  	// Envv is a list of environment variables.
    96  	Envv []string `json:"envv"`
    97  
    98  	// Secret_envv is a list of secret environment variables.
    99  	//
   100  	// NOTE: This field must never be logged!
   101  	SecretEnvv []string `json:"secret_envv"`
   102  
   103  	// WorkingDirectory defines the working directory for the new process.
   104  	WorkingDirectory string `json:"wd"`
   105  
   106  	// KUID is the UID to run with in the root user namespace. Defaults to
   107  	// root if not set explicitly.
   108  	KUID auth.KUID `json:"KUID"`
   109  
   110  	// KGID is the GID to run with in the root user namespace. Defaults to
   111  	// the root group if not set explicitly.
   112  	KGID auth.KGID `json:"KGID"`
   113  
   114  	// ContainerID is the container for the process being executed.
   115  	ContainerID string `json:"container_id"`
   116  
   117  	// InitialCgroups is the set of cgroup controllers container needs to be initialised to.
   118  	InitialCgroups map[kernel.CgroupControllerType]string `json:"initial_cgroups"`
   119  
   120  	// Limits is the limit set for the process being executed.
   121  	Limits map[string]limits.Limit `json:"limits"`
   122  
   123  	// If HOME environment variable is not provided, and this flag is set,
   124  	// then the HOME environment variable will be set inside the container
   125  	// based on the user's home directory in /etc/passwd.
   126  	ResolveHome bool `json:"resolve_home"`
   127  
   128  	// If set, attempt to resolve the binary_path via the following procedure:
   129  	// 1) If binary_path is absolute, it is used directly.
   130  	// 2) If binary_path contains a slash, then it is resolved relative to the
   131  	//    working_directory (or the root it working_directory is not set).
   132  	// 3) Otherwise, search the PATH environment variable for the first directory
   133  	//    that contains an executable file with name in binary_path.
   134  	ResolveBinaryPath bool `json:"resolve_binary_path"`
   135  
   136  	// DonatedFDs is the list of sentry-intrenal file descriptors that will
   137  	// donated. They correspond to the donated files in FilePayload.
   138  	DonatedFDs []int `json:"donated_fds"`
   139  
   140  	// FilePayload determines the files to give to the new process.
   141  	urpc.FilePayload
   142  }
   143  
   144  // String formats the StartContainerArgs without the SecretEnvv field.
   145  func (sca StartContainerArgs) String() string {
   146  	sca.SecretEnvv = make([]string, len(sca.SecretEnvv))
   147  	for i := range sca.SecretEnvv {
   148  		sca.SecretEnvv[i] = "(hidden)"
   149  	}
   150  	b, err := json.Marshal(sca)
   151  	if err != nil {
   152  		return fmt.Sprintf("error marshaling: %s", err)
   153  	}
   154  	return string(b)
   155  }
   156  
   157  func (l *Lifecycle) updateContainerState(containerID string, newState containerState) error {
   158  	l.mu.Lock()
   159  	defer l.mu.Unlock()
   160  
   161  	c, ok := l.containerMap[containerID]
   162  	if !ok {
   163  		return fmt.Errorf("container %v not started", containerID)
   164  	}
   165  
   166  	switch newState {
   167  	case stateCreated:
   168  		// Impossible.
   169  		panic(fmt.Sprintf("invalid state transition: %v => %v", c.state, newState))
   170  
   171  	case stateRunning:
   172  		if c.state != stateCreated {
   173  			// Impossible.
   174  			panic(fmt.Sprintf("invalid state transition: %v => %v", c.state, newState))
   175  		}
   176  
   177  	case stateStopped:
   178  		// Valid state transition.
   179  
   180  	default:
   181  		// Invalid new state.
   182  		panic(fmt.Sprintf("invalid new state: %v", newState))
   183  	}
   184  
   185  	c.state = newState
   186  	return nil
   187  }
   188  
   189  // StartContainer will start a new container in the sandbox.
   190  func (l *Lifecycle) StartContainer(args *StartContainerArgs, _ *uint32) error {
   191  	timeRequested := time.Now()
   192  	timeRequestReceived := &timestamppb.Timestamp{
   193  		Seconds: timeRequested.Unix(),
   194  		Nanos:   int32(timeRequested.Nanosecond()),
   195  	}
   196  	log.Infof("StartContainer: %v", args)
   197  	if len(args.Files) != len(args.DonatedFDs) {
   198  		return fmt.Errorf("FilePayload.Files and DonatedFDs must have same number of elements (%d != %d)", len(args.Files), len(args.DonatedFDs))
   199  	}
   200  
   201  	creds := auth.NewUserCredentials(
   202  		args.KUID,
   203  		args.KGID,
   204  		nil, /* extraKGIDs */
   205  		nil, /* capabilities */
   206  		l.Kernel.RootUserNamespace())
   207  
   208  	ls, err := limits.NewLinuxDistroLimitSet()
   209  	if err != nil {
   210  		return fmt.Errorf("error creating default limit set: %w", err)
   211  	}
   212  	for name, limit := range args.Limits {
   213  		lt, ok := limits.FromLinuxResourceName[name]
   214  		if !ok {
   215  			return fmt.Errorf("unknown limit %q", name)
   216  		}
   217  		ls.SetUnchecked(lt, limit)
   218  	}
   219  
   220  	// Create a new pid namespace for the container. Each container must run
   221  	// in its own pid namespace.
   222  	pidNs := l.Kernel.RootPIDNamespace().NewChild(l.Kernel.RootUserNamespace())
   223  
   224  	initArgs := kernel.CreateProcessArgs{
   225  		Filename: args.Filename,
   226  		Argv:     args.Argv,
   227  		// Order Envv before SecretEnvv.
   228  		Envv:                    append(args.Envv, args.SecretEnvv...),
   229  		WorkingDirectory:        args.WorkingDirectory,
   230  		Credentials:             creds,
   231  		Umask:                   0022,
   232  		Limits:                  ls,
   233  		MaxSymlinkTraversals:    linux.MaxSymlinkTraversals,
   234  		UTSNamespace:            l.Kernel.RootUTSNamespace(),
   235  		IPCNamespace:            l.Kernel.RootIPCNamespace(),
   236  		AbstractSocketNamespace: l.Kernel.RootAbstractSocketNamespace(),
   237  		ContainerID:             args.ContainerID,
   238  		PIDNamespace:            pidNs,
   239  	}
   240  
   241  	ctx := initArgs.NewContext(l.Kernel)
   242  
   243  	// Import file descriptors.
   244  	fdTable := l.Kernel.NewFDTable()
   245  	defer fdTable.DecRef(ctx)
   246  	hostFDs, err := fd.NewFromFiles(args.Files)
   247  	if err != nil {
   248  		return fmt.Errorf("error donating host files: %w", err)
   249  	}
   250  	defer func() {
   251  		for _, hfd := range hostFDs {
   252  			_ = hfd.Close()
   253  		}
   254  	}()
   255  	fdMap := make(map[int]*fd.FD, len(args.DonatedFDs))
   256  	for i, appFD := range args.DonatedFDs {
   257  		fdMap[appFD] = hostFDs[i]
   258  	}
   259  	if _, err := fdimport.Import(ctx, fdTable, false, args.KUID, args.KGID, fdMap); err != nil {
   260  		return fmt.Errorf("error importing host files: %w", err)
   261  	}
   262  	initArgs.FDTable = fdTable
   263  
   264  	l.mu.RLock()
   265  	mntns, ok := l.MountNamespacesMap[initArgs.ContainerID]
   266  	if !ok {
   267  		l.mu.RUnlock()
   268  		return fmt.Errorf("mount namespace is nil for %s", initArgs.ContainerID)
   269  	}
   270  	initArgs.MountNamespace = mntns
   271  	l.mu.RUnlock()
   272  	initArgs.MountNamespace.IncRef()
   273  
   274  	if args.ResolveBinaryPath {
   275  		resolved, err := user.ResolveExecutablePath(ctx, &initArgs)
   276  		if err != nil {
   277  			return fmt.Errorf("failed to resolve binary path: %w", err)
   278  		}
   279  		initArgs.Filename = resolved
   280  	}
   281  
   282  	if args.ResolveHome {
   283  		envVars, err := user.MaybeAddExecUserHome(ctx, initArgs.MountNamespace, creds.RealKUID, initArgs.Envv)
   284  		if err != nil {
   285  			return fmt.Errorf("failed to get user home dir: %w", err)
   286  		}
   287  		initArgs.Envv = envVars
   288  	}
   289  
   290  	fds, err := fd.NewFromFiles(args.Files)
   291  	if err != nil {
   292  		return fmt.Errorf("duplicating payload files: %w", err)
   293  	}
   294  	defer func() {
   295  		for _, fd := range fds {
   296  			_ = fd.Close()
   297  		}
   298  	}()
   299  
   300  	initialCgroups := make(map[kernel.Cgroup]struct{}, len(args.InitialCgroups))
   301  	cgroupRegistry := l.Kernel.CgroupRegistry()
   302  	// path is relative to the container's cgroup controller of specified type.
   303  	for initialCgroupController, path := range args.InitialCgroups {
   304  		cg, err := cgroupRegistry.FindCgroup(ctx, initialCgroupController, path)
   305  		if err != nil {
   306  			return fmt.Errorf("FindCgroup can't locate cgroup controller: %v err: %v", initialCgroupController, err)
   307  		}
   308  		initialCgroups[cg] = struct{}{}
   309  	}
   310  	initArgs.InitialCgroups = initialCgroups
   311  
   312  	tg, _, err := l.Kernel.CreateProcess(initArgs)
   313  	if err != nil {
   314  		return err
   315  	}
   316  
   317  	c := &Container{
   318  		containerID: initArgs.ContainerID,
   319  		tg:          tg,
   320  		state:       stateCreated,
   321  	}
   322  
   323  	l.mu.Lock()
   324  	if l.containerMap == nil {
   325  		l.containerMap = make(map[string]*Container)
   326  	}
   327  
   328  	if _, ok := l.containerMap[initArgs.ContainerID]; ok {
   329  		l.mu.Unlock()
   330  		return fmt.Errorf("container id: %v already exists", initArgs.ContainerID)
   331  	}
   332  
   333  	l.containerMap[initArgs.ContainerID] = c
   334  	l.mu.Unlock()
   335  
   336  	// Start the newly created process.
   337  	l.Kernel.StartProcess(tg)
   338  	log.Infof("Started the new container %v ", initArgs.ContainerID)
   339  
   340  	if err := l.updateContainerState(initArgs.ContainerID, stateRunning); err != nil {
   341  		// Sanity check: shouldn't fail to update the state at this point.
   342  		panic(fmt.Sprintf("Failed to set running state: %v", err))
   343  
   344  	}
   345  
   346  	timeRequestCompleted := time.Now()
   347  	eventchannel.LogEmit(&pb.ContainerStartedEvent{
   348  		Started:         true,
   349  		ContainerId:     initArgs.ContainerID,
   350  		RequestReceived: timeRequestReceived,
   351  		RequestCompleted: &timestamppb.Timestamp{
   352  			Seconds: timeRequestCompleted.Unix(),
   353  			Nanos:   int32(timeRequestCompleted.Nanosecond()),
   354  		},
   355  	})
   356  
   357  	// TODO(b/251490950): reap thread needs to synchronize with Save, so the
   358  	// container state update doesn't race with state serialization.
   359  	go l.reap(initArgs.ContainerID, tg) // S/R-SAFE: see above.
   360  
   361  	return nil
   362  }
   363  
   364  func (l *Lifecycle) reap(containerID string, tg *kernel.ThreadGroup) {
   365  	tg.WaitExited()
   366  	if err := l.updateContainerState(containerID, stateStopped); err != nil {
   367  		panic(err)
   368  	}
   369  	eventchannel.LogEmit(&pb.ContainerExitEvent{
   370  		ContainerId: containerID,
   371  		ExitStatus:  uint32(tg.ExitStatus()),
   372  	})
   373  }
   374  
   375  // Pause pauses all tasks, blocking until they are stopped.
   376  func (l *Lifecycle) Pause(_, _ *struct{}) error {
   377  	l.Kernel.Pause()
   378  	return nil
   379  }
   380  
   381  // Resume resumes all tasks.
   382  func (l *Lifecycle) Resume(_, _ *struct{}) error {
   383  	l.Kernel.Unpause()
   384  	return nil
   385  }
   386  
   387  // Shutdown sends signal to destroy the sentry/sandbox.
   388  func (l *Lifecycle) Shutdown(_, _ *struct{}) error {
   389  	close(l.ShutdownCh)
   390  	return nil
   391  }
   392  
   393  func (l *Lifecycle) getInitContainerProcess(containerID string) (*kernel.ThreadGroup, error) {
   394  	l.mu.Lock()
   395  	defer l.mu.Unlock()
   396  
   397  	c, ok := l.containerMap[containerID]
   398  	if !ok {
   399  		return nil, fmt.Errorf("container %v not started", containerID)
   400  	}
   401  	return c.tg, nil
   402  }
   403  
   404  // ContainerArgs is the set of arguments for container related APIs after
   405  // starting the container.
   406  type ContainerArgs struct {
   407  	ContainerID string `json:"container_id"`
   408  }
   409  
   410  // GetExitStatus returns the container exit status if it has stopped.
   411  func (l *Lifecycle) GetExitStatus(args *ContainerArgs, status *uint32) error {
   412  	l.mu.Lock()
   413  	defer l.mu.Unlock()
   414  
   415  	c, ok := l.containerMap[args.ContainerID]
   416  	if !ok {
   417  		return fmt.Errorf("container %q doesn't exist, or has not been started", args.ContainerID)
   418  	}
   419  
   420  	if c.state != stateStopped {
   421  		return fmt.Errorf("container %q hasn't exited yet", args.ContainerID)
   422  	}
   423  
   424  	*status = uint32(c.tg.ExitStatus())
   425  	eventchannel.LogEmit(&pb.ContainerExitEvent{
   426  		ContainerId: args.ContainerID,
   427  		ExitStatus:  *status,
   428  	})
   429  	return nil
   430  }
   431  
   432  // Reap notifies the sandbox that the caller is interested in the exit status via
   433  // an exit event. The caller is responsible for handling any corresponding exit
   434  // events, especially if they're interested in waiting for the exit.
   435  func (l *Lifecycle) Reap(args *ContainerArgs, _ *struct{}) error {
   436  	// Check if there are any real emitters registered. If there are no
   437  	// emitters, the caller will never be notified, so fail immediately.
   438  	if !eventchannel.HaveEmitters() {
   439  		return fmt.Errorf("no event emitters configured")
   440  	}
   441  
   442  	l.mu.Lock()
   443  
   444  	c, ok := l.containerMap[args.ContainerID]
   445  	if !ok {
   446  		l.mu.Unlock()
   447  		return fmt.Errorf("no container with id %q", args.ContainerID)
   448  	}
   449  
   450  	// Once a container enters the stop state, the state never changes. It's
   451  	// safe to cache a stopped state outside a l.mu critical section.
   452  	isStopped := c.state == stateStopped
   453  	l.mu.Unlock()
   454  
   455  	if isStopped {
   456  		// Already stopped, emit stop to ensure any callbacks registered after
   457  		// the actual stop is called. This may be a duplicate event, but is
   458  		// necessary in case the reap goroutine transitions the container to the
   459  		// stop state before the caller starts observing the event channel.
   460  		eventchannel.LogEmit(&pb.ContainerExitEvent{
   461  			ContainerId: args.ContainerID,
   462  			ExitStatus:  uint32(c.tg.ExitStatus()),
   463  		})
   464  	}
   465  
   466  	// Caller now responsible for blocking on the exit event.
   467  	return nil
   468  }
   469  
   470  // IsContainerRunning returns true if the container is running.
   471  func (l *Lifecycle) IsContainerRunning(args *ContainerArgs, isRunning *bool) error {
   472  	l.mu.Lock()
   473  	defer l.mu.Unlock()
   474  
   475  	c, ok := l.containerMap[args.ContainerID]
   476  	// We may be racing with the reaper goroutine updating c.state, so also
   477  	// check the number non-exited tasks.
   478  	if !ok || c.state != stateRunning || c.tg.Count() == 0 {
   479  		return nil
   480  	}
   481  
   482  	*isRunning = true
   483  	return nil
   484  }
   485  
   486  // SignalContainerArgs is the set of arguments for signalling a container.
   487  type SignalContainerArgs struct {
   488  	ContainerID string `json:"container_id"`
   489  	Signo       int32  `json:"signo"`
   490  	SignalAll   bool   `json:"signalAll"`
   491  }
   492  
   493  // SignalContainer signals the container in multi-container mode. It returns error if the
   494  // container hasn't started or has exited.
   495  func (l *Lifecycle) SignalContainer(args *SignalContainerArgs, _ *struct{}) error {
   496  	tg, err := l.getInitContainerProcess(args.ContainerID)
   497  	if err != nil {
   498  		return err
   499  	}
   500  
   501  	l.mu.Lock()
   502  	c, ok := l.containerMap[args.ContainerID]
   503  	if !ok || c.state != stateRunning {
   504  		l.mu.Unlock()
   505  		return fmt.Errorf("%v container not running", args.ContainerID)
   506  	}
   507  	l.mu.Unlock()
   508  
   509  	// Signalling a single process is supported only for the init process.
   510  	if !args.SignalAll {
   511  		if tg == nil {
   512  			return fmt.Errorf("no process exists in %v", tg)
   513  		}
   514  		return l.Kernel.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: args.Signo})
   515  	}
   516  
   517  	l.Kernel.Pause()
   518  	defer l.Kernel.Unpause()
   519  	return l.Kernel.SendContainerSignal(args.ContainerID, &linux.SignalInfo{Signo: args.Signo})
   520  }