github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/runsc/boot/controller.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package boot
    16  
    17  import (
    18  	"errors"
    19  	"fmt"
    20  	"os"
    21  	gtime "time"
    22  
    23  	specs "github.com/opencontainers/runtime-spec/specs-go"
    24  	"golang.org/x/sys/unix"
    25  	"github.com/SagerNet/gvisor/pkg/control/server"
    26  	"github.com/SagerNet/gvisor/pkg/fd"
    27  	"github.com/SagerNet/gvisor/pkg/log"
    28  	"github.com/SagerNet/gvisor/pkg/sentry/control"
    29  	"github.com/SagerNet/gvisor/pkg/sentry/fs"
    30  	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
    31  	"github.com/SagerNet/gvisor/pkg/sentry/socket/netstack"
    32  	"github.com/SagerNet/gvisor/pkg/sentry/state"
    33  	"github.com/SagerNet/gvisor/pkg/sentry/time"
    34  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    35  	"github.com/SagerNet/gvisor/pkg/sentry/watchdog"
    36  	"github.com/SagerNet/gvisor/pkg/tcpip/stack"
    37  	"github.com/SagerNet/gvisor/pkg/urpc"
    38  	"github.com/SagerNet/gvisor/runsc/boot/pprof"
    39  	"github.com/SagerNet/gvisor/runsc/config"
    40  	"github.com/SagerNet/gvisor/runsc/specutils"
    41  )
    42  
    43  const (
    44  	// ContainerCheckpoint checkpoints a container.
    45  	ContainerCheckpoint = "containerManager.Checkpoint"
    46  
    47  	// ContainerCreate creates a container.
    48  	ContainerCreate = "containerManager.Create"
    49  
    50  	// ContainerDestroy is used to stop a non-root container and free all
    51  	// associated resources in the sandbox.
    52  	ContainerDestroy = "containerManager.Destroy"
    53  
    54  	// ContainerEvent is the URPC endpoint for getting stats about the
    55  	// container used by "runsc events".
    56  	ContainerEvent = "containerManager.Event"
    57  
    58  	// ContainerExecuteAsync is the URPC endpoint for executing a command in a
    59  	// container.
    60  	ContainerExecuteAsync = "containerManager.ExecuteAsync"
    61  
    62  	// ContainerPause pauses the container.
    63  	ContainerPause = "containerManager.Pause"
    64  
    65  	// ContainerProcesses is the URPC endpoint for getting the list of
    66  	// processes running in a container.
    67  	ContainerProcesses = "containerManager.Processes"
    68  
    69  	// ContainerRestore restores a container from a statefile.
    70  	ContainerRestore = "containerManager.Restore"
    71  
    72  	// ContainerResume unpauses the paused container.
    73  	ContainerResume = "containerManager.Resume"
    74  
    75  	// ContainerSignal is used to send a signal to a container.
    76  	ContainerSignal = "containerManager.Signal"
    77  
    78  	// ContainerSignalProcess is used to send a signal to a particular
    79  	// process in a container.
    80  	ContainerSignalProcess = "containerManager.SignalProcess"
    81  
    82  	// ContainerStart is the URPC endpoint for running a non-root container
    83  	// within a sandbox.
    84  	ContainerStart = "containerManager.Start"
    85  
    86  	// ContainerWait is used to wait on the init process of the container
    87  	// and return its ExitStatus.
    88  	ContainerWait = "containerManager.Wait"
    89  
    90  	// ContainerWaitPID is used to wait on a process with a certain PID in
    91  	// the sandbox and return its ExitStatus.
    92  	ContainerWaitPID = "containerManager.WaitPID"
    93  
    94  	// NetworkCreateLinksAndRoutes is the URPC endpoint for creating links
    95  	// and routes in a network stack.
    96  	NetworkCreateLinksAndRoutes = "Network.CreateLinksAndRoutes"
    97  
    98  	// RootContainerStart is the URPC endpoint for starting a new sandbox
    99  	// with root container.
   100  	RootContainerStart = "containerManager.StartRoot"
   101  
   102  	// SandboxStacks collects sandbox stacks for debugging.
   103  	SandboxStacks = "debug.Stacks"
   104  )
   105  
   106  // Profiling related commands (see pprof.go for more details).
   107  const (
   108  	CPUProfile   = "Profile.CPU"
   109  	HeapProfile  = "Profile.Heap"
   110  	BlockProfile = "Profile.Block"
   111  	MutexProfile = "Profile.Mutex"
   112  	Trace        = "Profile.Trace"
   113  )
   114  
   115  // Logging related commands (see logging.go for more details).
   116  const (
   117  	ChangeLogging = "Logging.Change"
   118  )
   119  
   120  // ControlSocketAddr generates an abstract unix socket name for the given ID.
   121  func ControlSocketAddr(id string) string {
   122  	return fmt.Sprintf("\x00runsc-sandbox.%s", id)
   123  }
   124  
   125  // controller holds the control server, and is used for communication into the
   126  // sandbox.
   127  type controller struct {
   128  	// srv is the control server.
   129  	srv *server.Server
   130  
   131  	// manager holds the containerManager methods.
   132  	manager *containerManager
   133  }
   134  
   135  // newController creates a new controller. The caller must call
   136  // controller.srv.StartServing() to start the controller.
   137  func newController(fd int, l *Loader) (*controller, error) {
   138  	ctrl := &controller{}
   139  	var err error
   140  	ctrl.srv, err = server.CreateFromFD(fd)
   141  	if err != nil {
   142  		return nil, err
   143  	}
   144  
   145  	ctrl.manager = &containerManager{
   146  		startChan:       make(chan struct{}),
   147  		startResultChan: make(chan error),
   148  		l:               l,
   149  	}
   150  	ctrl.srv.Register(ctrl.manager)
   151  
   152  	if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok {
   153  		net := &Network{
   154  			Stack: eps.Stack,
   155  		}
   156  		ctrl.srv.Register(net)
   157  	}
   158  
   159  	ctrl.srv.Register(&debug{})
   160  	ctrl.srv.Register(&control.Logging{})
   161  
   162  	if l.root.conf.ProfileEnable {
   163  		ctrl.srv.Register(control.NewProfile(l.k))
   164  	}
   165  
   166  	return ctrl, nil
   167  }
   168  
   169  // stopRPCTimeout is the time for clients to complete ongoing RPCs.
   170  const stopRPCTimeout = 15 * gtime.Second
   171  
   172  func (c *controller) stop() {
   173  	c.srv.Stop(stopRPCTimeout)
   174  }
   175  
   176  // containerManager manages sandbox containers.
   177  type containerManager struct {
   178  	// startChan is used to signal when the root container process should
   179  	// be started.
   180  	startChan chan struct{}
   181  
   182  	// startResultChan is used to signal when the root container  has
   183  	// started. Any errors encountered during startup will be sent to the
   184  	// channel. A nil value indicates success.
   185  	startResultChan chan error
   186  
   187  	// l is the loader that creates containers and sandboxes.
   188  	l *Loader
   189  }
   190  
   191  // StartRoot will start the root container process.
   192  func (cm *containerManager) StartRoot(cid *string, _ *struct{}) error {
   193  	log.Debugf("containerManager.StartRoot, cid: %s", *cid)
   194  	// Tell the root container to start and wait for the result.
   195  	cm.startChan <- struct{}{}
   196  	if err := <-cm.startResultChan; err != nil {
   197  		return fmt.Errorf("starting sandbox: %v", err)
   198  	}
   199  	return nil
   200  }
   201  
   202  // Processes retrieves information about processes running in the sandbox.
   203  func (cm *containerManager) Processes(cid *string, out *[]*control.Process) error {
   204  	log.Debugf("containerManager.Processes, cid: %s", *cid)
   205  	return control.Processes(cm.l.k, *cid, out)
   206  }
   207  
   208  // CreateArgs contains arguments to the Create method.
   209  type CreateArgs struct {
   210  	// CID is the ID of the container to start.
   211  	CID string
   212  
   213  	// FilePayload may contain a TTY file for the terminal, if enabled.
   214  	urpc.FilePayload
   215  }
   216  
   217  // Create creates a container within a sandbox.
   218  func (cm *containerManager) Create(args *CreateArgs, _ *struct{}) error {
   219  	log.Debugf("containerManager.Create: %s", args.CID)
   220  
   221  	if len(args.Files) > 1 {
   222  		return fmt.Errorf("start arguments must have at most 1 files for TTY")
   223  	}
   224  	var tty *fd.FD
   225  	if len(args.Files) == 1 {
   226  		var err error
   227  		tty, err = fd.NewFromFile(args.Files[0])
   228  		if err != nil {
   229  			return fmt.Errorf("error dup'ing TTY file: %w", err)
   230  		}
   231  	}
   232  	return cm.l.createContainer(args.CID, tty)
   233  }
   234  
   235  // StartArgs contains arguments to the Start method.
   236  type StartArgs struct {
   237  	// Spec is the spec of the container to start.
   238  	Spec *specs.Spec
   239  
   240  	// Config is the runsc-specific configuration for the sandbox.
   241  	Conf *config.Config
   242  
   243  	// CID is the ID of the container to start.
   244  	CID string
   245  
   246  	// FilePayload contains, in order:
   247  	//   * stdin, stdout, and stderr (optional: if terminal is disabled).
   248  	//   * file descriptors to connect to gofer to serve the root filesystem.
   249  	urpc.FilePayload
   250  }
   251  
   252  // Start runs a created container within a sandbox.
   253  func (cm *containerManager) Start(args *StartArgs, _ *struct{}) error {
   254  	// Validate arguments.
   255  	if args == nil {
   256  		return errors.New("start missing arguments")
   257  	}
   258  	log.Debugf("containerManager.Start, cid: %s, args: %+v", args.CID, args)
   259  	if args.Spec == nil {
   260  		return errors.New("start arguments missing spec")
   261  	}
   262  	if args.Conf == nil {
   263  		return errors.New("start arguments missing config")
   264  	}
   265  	if args.CID == "" {
   266  		return errors.New("start argument missing container ID")
   267  	}
   268  	if len(args.Files) < 1 {
   269  		return fmt.Errorf("start arguments must contain at least one file for the container root gofer")
   270  	}
   271  
   272  	// All validation passed, logs the spec for debugging.
   273  	specutils.LogSpec(args.Spec)
   274  
   275  	goferFiles := args.Files
   276  	var stdios []*fd.FD
   277  	if !args.Spec.Process.Terminal {
   278  		// When not using a terminal, stdios come as the first 3 files in the
   279  		// payload.
   280  		if l := len(args.Files); l < 4 {
   281  			return fmt.Errorf("start arguments (len: %d) must contain stdios and files for the container root gofer", l)
   282  		}
   283  		var err error
   284  		stdios, err = fd.NewFromFiles(goferFiles[:3])
   285  		if err != nil {
   286  			return fmt.Errorf("error dup'ing stdio files: %w", err)
   287  		}
   288  		goferFiles = goferFiles[3:]
   289  	}
   290  	defer func() {
   291  		for _, fd := range stdios {
   292  			_ = fd.Close()
   293  		}
   294  	}()
   295  
   296  	goferFDs, err := fd.NewFromFiles(goferFiles)
   297  	if err != nil {
   298  		return fmt.Errorf("error dup'ing gofer files: %w", err)
   299  	}
   300  	defer func() {
   301  		for _, fd := range goferFDs {
   302  			_ = fd.Close()
   303  		}
   304  	}()
   305  
   306  	if err := cm.l.startContainer(args.Spec, args.Conf, args.CID, stdios, goferFDs); err != nil {
   307  		log.Debugf("containerManager.Start failed, cid: %s, args: %+v, err: %v", args.CID, args, err)
   308  		return err
   309  	}
   310  	log.Debugf("Container started, cid: %s", args.CID)
   311  	return nil
   312  }
   313  
   314  // Destroy stops a container if it is still running and cleans up its
   315  // filesystem.
   316  func (cm *containerManager) Destroy(cid *string, _ *struct{}) error {
   317  	log.Debugf("containerManager.destroy, cid: %s", *cid)
   318  	return cm.l.destroyContainer(*cid)
   319  }
   320  
   321  // ExecuteAsync starts running a command on a created or running sandbox. It
   322  // returns the PID of the new process.
   323  func (cm *containerManager) ExecuteAsync(args *control.ExecArgs, pid *int32) error {
   324  	log.Debugf("containerManager.ExecuteAsync, cid: %s, args: %+v", args.ContainerID, args)
   325  	tgid, err := cm.l.executeAsync(args)
   326  	if err != nil {
   327  		log.Debugf("containerManager.ExecuteAsync failed, cid: %s, args: %+v, err: %v", args.ContainerID, args, err)
   328  		return err
   329  	}
   330  	*pid = int32(tgid)
   331  	return nil
   332  }
   333  
   334  // Checkpoint pauses a sandbox and saves its state.
   335  func (cm *containerManager) Checkpoint(o *control.SaveOpts, _ *struct{}) error {
   336  	log.Debugf("containerManager.Checkpoint")
   337  	// TODO(github.com/SagerNet/issues/6243): save/restore not supported w/ hostinet
   338  	if cm.l.root.conf.Network == config.NetworkHost {
   339  		return errors.New("checkpoint not supported when using hostinet")
   340  	}
   341  
   342  	state := control.State{
   343  		Kernel:   cm.l.k,
   344  		Watchdog: cm.l.watchdog,
   345  	}
   346  	return state.Save(o, nil)
   347  }
   348  
   349  // Pause suspends a container.
   350  func (cm *containerManager) Pause(_, _ *struct{}) error {
   351  	log.Debugf("containerManager.Pause")
   352  	// TODO(github.com/SagerNet/issues/6243): save/restore not supported w/ hostinet
   353  	if cm.l.root.conf.Network == config.NetworkHost {
   354  		return errors.New("pause not supported when using hostinet")
   355  	}
   356  	cm.l.k.Pause()
   357  	return nil
   358  }
   359  
   360  // RestoreOpts contains options related to restoring a container's file system.
   361  type RestoreOpts struct {
   362  	// FilePayload contains the state file to be restored, followed by the
   363  	// platform device file if necessary.
   364  	urpc.FilePayload
   365  
   366  	// SandboxID contains the ID of the sandbox.
   367  	SandboxID string
   368  }
   369  
   370  // Restore loads a container from a statefile.
   371  // The container's current kernel is destroyed, a restore environment is
   372  // created, and the kernel is recreated with the restore state file. The
   373  // container then sends the signal to start.
   374  func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
   375  	log.Debugf("containerManager.Restore")
   376  
   377  	var specFile, deviceFile *os.File
   378  	switch numFiles := len(o.Files); numFiles {
   379  	case 2:
   380  		// The device file is donated to the platform.
   381  		// Can't take ownership away from os.File. dup them to get a new FD.
   382  		fd, err := unix.Dup(int(o.Files[1].Fd()))
   383  		if err != nil {
   384  			return fmt.Errorf("failed to dup file: %v", err)
   385  		}
   386  		deviceFile = os.NewFile(uintptr(fd), "platform device")
   387  		fallthrough
   388  	case 1:
   389  		specFile = o.Files[0]
   390  	case 0:
   391  		return fmt.Errorf("at least one file must be passed to Restore")
   392  	default:
   393  		return fmt.Errorf("at most two files may be passed to Restore")
   394  	}
   395  
   396  	// Pause the kernel while we build a new one.
   397  	cm.l.k.Pause()
   398  
   399  	p, err := createPlatform(cm.l.root.conf, deviceFile)
   400  	if err != nil {
   401  		return fmt.Errorf("creating platform: %v", err)
   402  	}
   403  	k := &kernel.Kernel{
   404  		Platform: p,
   405  	}
   406  	mf, err := createMemoryFile()
   407  	if err != nil {
   408  		return fmt.Errorf("creating memory file: %v", err)
   409  	}
   410  	k.SetMemoryFile(mf)
   411  	networkStack := cm.l.k.RootNetworkNamespace().Stack()
   412  	cm.l.k = k
   413  
   414  	// Set up the restore environment.
   415  	ctx := k.SupervisorContext()
   416  	mntr := newContainerMounter(&cm.l.root, cm.l.k, cm.l.mountHints, kernel.VFS2Enabled)
   417  	if kernel.VFS2Enabled {
   418  		ctx, err = mntr.configureRestore(ctx)
   419  		if err != nil {
   420  			return fmt.Errorf("configuring filesystem restore: %v", err)
   421  		}
   422  	} else {
   423  		renv, err := mntr.createRestoreEnvironment(cm.l.root.conf)
   424  		if err != nil {
   425  			return fmt.Errorf("creating RestoreEnvironment: %v", err)
   426  		}
   427  		fs.SetRestoreEnvironment(*renv)
   428  	}
   429  
   430  	// Prepare to load from the state file.
   431  	if eps, ok := networkStack.(*netstack.Stack); ok {
   432  		stack.StackFromEnv = eps.Stack // FIXME(b/36201077)
   433  	}
   434  	info, err := specFile.Stat()
   435  	if err != nil {
   436  		return err
   437  	}
   438  	if info.Size() == 0 {
   439  		return fmt.Errorf("file cannot be empty")
   440  	}
   441  
   442  	if cm.l.root.conf.ProfileEnable {
   443  		// pprof.Initialize opens /proc/self/maps, so has to be called before
   444  		// installing seccomp filters.
   445  		pprof.Initialize()
   446  	}
   447  
   448  	// Seccomp filters have to be applied before parsing the state file.
   449  	if err := cm.l.installSeccompFilters(); err != nil {
   450  		return err
   451  	}
   452  
   453  	// Load the state.
   454  	loadOpts := state.LoadOpts{Source: specFile}
   455  	if err := loadOpts.Load(ctx, k, nil, networkStack, time.NewCalibratedClocks(), &vfs.CompleteRestoreOptions{}); err != nil {
   456  		return err
   457  	}
   458  
   459  	// Since we have a new kernel we also must make a new watchdog.
   460  	dogOpts := watchdog.DefaultOpts
   461  	dogOpts.TaskTimeoutAction = cm.l.root.conf.WatchdogAction
   462  	dog := watchdog.New(k, dogOpts)
   463  
   464  	// Change the loader fields to reflect the changes made when restoring.
   465  	cm.l.k = k
   466  	cm.l.watchdog = dog
   467  	cm.l.root.procArgs = kernel.CreateProcessArgs{}
   468  	cm.l.restore = true
   469  
   470  	// Reinitialize the sandbox ID and processes map. Note that it doesn't
   471  	// restore the state of multiple containers, nor exec processes.
   472  	cm.l.sandboxID = o.SandboxID
   473  	cm.l.mu.Lock()
   474  	eid := execID{cid: o.SandboxID}
   475  	cm.l.processes = map[execID]*execProcess{
   476  		eid: {
   477  			tg: cm.l.k.GlobalInit(),
   478  		},
   479  	}
   480  	cm.l.mu.Unlock()
   481  
   482  	// Tell the root container to start and wait for the result.
   483  	cm.startChan <- struct{}{}
   484  	if err := <-cm.startResultChan; err != nil {
   485  		return fmt.Errorf("starting sandbox: %v", err)
   486  	}
   487  
   488  	return nil
   489  }
   490  
   491  // Resume unpauses a container.
   492  func (cm *containerManager) Resume(_, _ *struct{}) error {
   493  	log.Debugf("containerManager.Resume")
   494  	cm.l.k.Unpause()
   495  	return nil
   496  }
   497  
   498  // Wait waits for the init process in the given container.
   499  func (cm *containerManager) Wait(cid *string, waitStatus *uint32) error {
   500  	log.Debugf("containerManager.Wait, cid: %s", *cid)
   501  	err := cm.l.waitContainer(*cid, waitStatus)
   502  	log.Debugf("containerManager.Wait returned, cid: %s, waitStatus: %#x, err: %v", *cid, *waitStatus, err)
   503  	return err
   504  }
   505  
   506  // WaitPIDArgs are arguments to the WaitPID method.
   507  type WaitPIDArgs struct {
   508  	// PID is the PID in the container's PID namespace.
   509  	PID int32
   510  
   511  	// CID is the container ID.
   512  	CID string
   513  }
   514  
   515  // WaitPID waits for the process with PID 'pid' in the sandbox.
   516  func (cm *containerManager) WaitPID(args *WaitPIDArgs, waitStatus *uint32) error {
   517  	log.Debugf("containerManager.Wait, cid: %s, pid: %d", args.CID, args.PID)
   518  	err := cm.l.waitPID(kernel.ThreadID(args.PID), args.CID, waitStatus)
   519  	log.Debugf("containerManager.Wait, cid: %s, pid: %d, waitStatus: %#x, err: %v", args.CID, args.PID, *waitStatus, err)
   520  	return err
   521  }
   522  
   523  // SignalDeliveryMode enumerates different signal delivery modes.
   524  type SignalDeliveryMode int
   525  
   526  const (
   527  	// DeliverToProcess delivers the signal to the container process with
   528  	// the specified PID. If PID is 0, then the container init process is
   529  	// signaled.
   530  	DeliverToProcess SignalDeliveryMode = iota
   531  
   532  	// DeliverToAllProcesses delivers the signal to all processes in the
   533  	// container. PID must be 0.
   534  	DeliverToAllProcesses
   535  
   536  	// DeliverToForegroundProcessGroup delivers the signal to the
   537  	// foreground process group in the same TTY session as the specified
   538  	// process. If PID is 0, then the signal is delivered to the foreground
   539  	// process group for the TTY for the init process.
   540  	DeliverToForegroundProcessGroup
   541  )
   542  
   543  func (s SignalDeliveryMode) String() string {
   544  	switch s {
   545  	case DeliverToProcess:
   546  		return "Process"
   547  	case DeliverToAllProcesses:
   548  		return "All"
   549  	case DeliverToForegroundProcessGroup:
   550  		return "Foreground Process Group"
   551  	}
   552  	return fmt.Sprintf("unknown signal delivery mode: %d", s)
   553  }
   554  
   555  // SignalArgs are arguments to the Signal method.
   556  type SignalArgs struct {
   557  	// CID is the container ID.
   558  	CID string
   559  
   560  	// Signo is the signal to send to the process.
   561  	Signo int32
   562  
   563  	// PID is the process ID in the given container that will be signaled,
   564  	// relative to the root PID namespace, not the container's.
   565  	// If 0, the root container will be signalled.
   566  	PID int32
   567  
   568  	// Mode is the signal delivery mode.
   569  	Mode SignalDeliveryMode
   570  }
   571  
   572  // Signal sends a signal to one or more processes in a container. If args.PID
   573  // is 0, then the container init process is used. Depending on the
   574  // args.SignalDeliveryMode option, the signal may be sent directly to the
   575  // indicated process, to all processes in the container, or to the foreground
   576  // process group.
   577  func (cm *containerManager) Signal(args *SignalArgs, _ *struct{}) error {
   578  	log.Debugf("containerManager.Signal: cid: %s, PID: %d, signal: %d, mode: %v", args.CID, args.PID, args.Signo, args.Mode)
   579  	return cm.l.signal(args.CID, args.PID, args.Signo, args.Mode)
   580  }