gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/boot/restore.go

// Copyright 2023 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package boot

import (
	"errors"
	"fmt"
	"io"
	"strconv"
	time2 "time"

	specs "github.com/opencontainers/runtime-spec/specs-go"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/cleanup"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/devutil"
	"gvisor.dev/gvisor/pkg/fd"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/control"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
	"gvisor.dev/gvisor/pkg/sentry/inet"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
	"gvisor.dev/gvisor/pkg/sentry/state"
	"gvisor.dev/gvisor/pkg/sentry/time"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sentry/watchdog"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
	"gvisor.dev/gvisor/runsc/boot/pprof"
	"gvisor.dev/gvisor/runsc/config"
)

const (
	// CheckpointStateFileName is the file within the given image-path's
	// directory which contains the container's saved state.
	CheckpointStateFileName = "checkpoint.img"
	// CheckpointPagesMetadataFileName is the file within the given image-path's
	// directory containing the container's MemoryFile metadata.
	CheckpointPagesMetadataFileName = "pages_meta.img"
	// CheckpointPagesFileName is the file within the given image-path's
	// directory containing the container's MemoryFile pages.
	CheckpointPagesFileName = "pages.img"
)

// restorer manages a restore session for a sandbox. It stores information about
// all containers and triggers the full sandbox restore after the last
// container is restored.
type restorer struct {
	mu sync.Mutex

	// totalContainers is the number of containers expected to be restored in
	// the sandbox. Sandbox restore can only happen after all containers have
	// been restored.
	totalContainers int

	// containers is the list of containers restored so far.
	containers []*containerInfo

	// Files used by restore to rehydrate the state.
	stateFile     io.ReadCloser
	pagesMetadata *fd.FD
	pagesFile     *fd.FD

	// deviceFile is required to start the platform.
	deviceFile *fd.FD

	// restoreDone is a callback triggered when restore is successful.
	restoreDone func() error
}

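// restoreSubcontainer registers a subcontainer with the loader and records
// the information needed to restore it. The restore of the entire sandbox is
// triggered once the last expected container has been registered.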
func (r *restorer) restoreSubcontainer(spec *specs.Spec, conf *config.Config, l *Loader, cid string, stdioFDs, goferFDs, goferFilestoreFDs []*fd.FD, devGoferFD *fd.FD, goferMountConfs []GoferMountConf) error {
	containerName := l.registerContainer(spec, cid)
	info := &containerInfo{
		cid:               cid,
		containerName:     containerName,
		conf:              conf,
		spec:              spec,
		stdioFDs:          stdioFDs,
		goferFDs:          goferFDs,
		devGoferFD:        devGoferFD,
		goferFilestoreFDs: goferFilestoreFDs,
		goferMountConfs:   goferMountConfs,
	}
	return r.restoreContainerInfo(l, info)
}

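// restoreContainerInfo adds a container to the set of containers to be
// restored. It fails if a container with the same name or CID has already
// been added. Once the last expected container is added, it triggers the
// restore of the whole sandbox.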
func (r *restorer) restoreContainerInfo(l *Loader, info *containerInfo) error {
	r.mu.Lock()
	defer r.mu.Unlock()

	for _, container := range r.containers {
		if container.containerName == info.containerName {
			return fmt.Errorf("container %q already restored", info.containerName)
		}
		if container.cid == info.cid {
			return fmt.Errorf("container CID %q already belongs to container %q", info.cid, container.containerName)
		}
	}

	r.containers = append(r.containers, info)

	log.Infof("Restored container %d of %d", len(r.containers), r.totalContainers)
	if log.IsLogging(log.Debug) {
		for i, fd := range info.stdioFDs {
			log.Debugf("Restore app FD: %d host FD: %d", i, fd.FD())
		}
	}

	if len(r.containers) == r.totalContainers {
		// Trigger the restore if this is the last container.
		return r.restore(l)
	}
	return nil
}

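// createNetworkNamespaceForRestore creates the root network namespace for the
// restored kernel. If the sandbox uses netstack, the current stack is also
// returned so that it can be reused during restore; with hostinet, a
// hostinet-backed namespace is returned and the stack result is nil.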
func createNetworkNamespaceForRestore(l *Loader) (*stack.Stack, *inet.Namespace, error) {
	creds := getRootCredentials(l.root.spec, l.root.conf, nil /* UserNamespace */)
	if creds == nil {
		return nil, nil, fmt.Errorf("getting root credentials")
	}

	// Save the current network stack to slap on top of the one that was restored.
	curNetwork := l.k.RootNetworkNamespace().Stack()
	eps, ok := curNetwork.(*netstack.Stack)
	if !ok {
		return nil, inet.NewRootNamespace(hostinet.NewStack(), nil, creds.UserNamespace), nil
	}

	creator := &sandboxNetstackCreator{
		clock:                    l.k.Timekeeper(),
		uniqueID:                 l.k,
		allowPacketEndpointWrite: l.root.conf.AllowPacketEndpointWrite,
	}
	return eps.Stack, inet.NewRootNamespace(curNetwork, creator, creds.UserNamespace), nil
}

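// restore replaces the sandbox's kernel with a new one and loads the saved
// state into it: it creates a new platform, MemoryFile, and watchdog, restores
// the VFS and network state, remaps restored tasks to their new container IDs,
// and kills processes that were created via exec before the checkpoint, since
// their callers are no longer connected. It must only be called after all
// containers have been registered.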
func (r *restorer) restore(l *Loader) error {
	log.Infof("Starting to restore %d containers", len(r.containers))

	// Create a new root network namespace with the network stack of the
	// old kernel to preserve the existing network configuration.
	oldStack, netns, err := createNetworkNamespaceForRestore(l)
	if err != nil {
		return fmt.Errorf("creating network: %w", err)
	}

	// Reset the network stack in the network namespace to nil before
	// replacing the kernel, so that the network stack is not freed when the
	// old kernel is released.
	l.k.RootNetworkNamespace().ResetStack()

	p, err := createPlatform(l.root.conf, r.deviceFile)
	if err != nil {
		return fmt.Errorf("creating platform: %v", err)
	}

	// Start the old watchdog before replacing it with a new one below.
	l.watchdog.Start()

	// Release the kernel and replace it with a new one that will be restored into.
	if l.k != nil {
		l.k.Release()
	}
	l.k = &kernel.Kernel{
		Platform: p,
	}

	mf, err := createMemoryFile()
	if err != nil {
		return fmt.Errorf("creating memory file: %v", err)
	}
	l.k.SetMemoryFile(mf)

	if l.root.conf.ProfileEnable {
		// pprof.Initialize opens /proc/self/maps, so has to be called before
		// installing seccomp filters.
		pprof.Initialize()
	}

	// Seccomp filters have to be applied before vfs restore and before parsing
	// the state file.
	if err := l.installSeccompFilters(); err != nil {
		return err
	}

	// Set up the restore environment.
	ctx := l.k.SupervisorContext()
	if oldStack != nil {
		ctx = context.WithValue(ctx, stack.CtxRestoreStack, oldStack)
	}

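	// Build the maps consumed during state load: fdmap maps VFS restore IDs
	// to host FDs (gofer mounts, stdio, and user-passed FDs), and mfmap
	// collects MemoryFiles registered by configureRestore. Both are attached
	// to the restore context below.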
	fdmap := make(map[vfs.RestoreID]int)
	mfmap := make(map[string]*pgalloc.MemoryFile)
	for _, cont := range r.containers {
		// TODO(b/298078576): Need to process hints here probably
		mntr := newContainerMounter(cont, l.k, l.mountHints, l.sharedMounts, l.productName, cont.cid)
		if err = mntr.configureRestore(fdmap, mfmap); err != nil {
			return fmt.Errorf("configuring filesystem restore: %v", err)
		}

		for i, fd := range cont.stdioFDs {
			key := host.MakeRestoreID(cont.containerName, i)
			fdmap[key] = fd.Release()
		}
		for _, customFD := range cont.passFDs {
			key := host.MakeRestoreID(cont.containerName, customFD.guest)
			fdmap[key] = customFD.host.FD()
		}
	}

	log.Debugf("Restore using fdmap: %v", fdmap)
	ctx = context.WithValue(ctx, vfs.CtxRestoreFilesystemFDMap, fdmap)
	log.Debugf("Restore using mfmap: %v", mfmap)
	ctx = context.WithValue(ctx, pgalloc.CtxMemoryFileMap, mfmap)
	ctx = context.WithValue(ctx, devutil.CtxDevGoferClientProvider, l.k)

	// Load the state.
	loadOpts := state.LoadOpts{Source: r.stateFile, PagesMetadata: r.pagesMetadata, PagesFile: r.pagesFile}
	if err := loadOpts.Load(ctx, l.k, nil, netns.Stack(), time.NewCalibratedClocks(), &vfs.CompleteRestoreOptions{}); err != nil {
		return err
	}

	// Since we have a new kernel, we also must make a new watchdog.
	dogOpts := watchdog.DefaultOpts
	dogOpts.TaskTimeoutAction = l.root.conf.WatchdogAction
	dogOpts.StartupTimeout = 3 * time2.Minute // Give extra time for all containers to restore.
	dog := watchdog.New(l.k, dogOpts)

	// Change the loader fields to reflect the changes made when restoring.
	l.watchdog.Stop()
	l.watchdog = dog
	l.root.procArgs = kernel.CreateProcessArgs{}
	l.restore = true

	l.sandboxID = l.root.cid

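	// Hold l.mu while the restored tasks and the loader's process table are
	// updated below; it is released (via cu) before restoreDone is called.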
	l.mu.Lock()
	cu := cleanup.Make(func() {
		l.mu.Unlock()
	})
	defer cu.Clean()

	// Update all tasks in the system with their respective new container IDs.
	for _, task := range l.k.TaskSet().Root.Tasks() {
		oldCid := task.ContainerID()
		name := l.k.ContainerName(oldCid)
		newCid, ok := l.containerIDs[name]
		if !ok {
			return fmt.Errorf("unable to remap task with CID %q (name: %q). Available names: %v", task.ContainerID(), name, l.containerIDs)
		}
		task.RestoreContainerID(newCid)
	}

	// Rebuild `processes` map with containers' root process from the restored kernel.
	for _, tg := range l.k.RootPIDNamespace().ThreadGroups() {
		// Find all processes with no parent (root of execution) that were not
		// started via a call to `exec`.
		if tg.Leader().Parent() == nil && tg.Leader().Origin != kernel.OriginExec {
			cid := tg.Leader().ContainerID()
			proc := l.processes[execID{cid: cid}]
			if proc == nil {
				return fmt.Errorf("unable to find container root process with CID %q, processes: %v", cid, l.processes)
			}
			proc.tg = tg
		}
	}

	// Kill all processes that have been exec'd since they cannot be properly
	// restored -- the caller is no longer connected.
	for _, tg := range l.k.RootPIDNamespace().ThreadGroups() {
		if tg.Leader().Origin == kernel.OriginExec {
			if err := l.k.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: int32(linux.SIGKILL)}); err != nil {
				log.Warningf("Failed to kill exec process after restore: %v", err)
			}
		}
	}

	l.k.RestoreContainerMapping(l.containerIDs)

	// Release `l.mu` before calling into callbacks.
	cu.Clean()

	if err := r.restoreDone(); err != nil {
		return err
	}

	r.stateFile.Close()
	if r.pagesFile != nil {
		r.pagesFile.Close()
	}

	log.Infof("Restore successful")
	return nil
}

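// save checkpoints the sandbox state using the given options. Checkpointing
// is not supported when hostinet networking is in use.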
func (l *Loader) save(o *control.SaveOpts) error {
	// TODO(gvisor.dev/issues/6243): save/restore not supported w/ hostinet
	if l.root.conf.Network == config.NetworkHost {
		return errors.New("checkpoint not supported when using hostinet")
	}

	if o.Metadata == nil {
		o.Metadata = make(map[string]string)
	}
	o.Metadata["container_count"] = strconv.Itoa(l.containerCount())

	state := control.State{
		Kernel:   l.k,
		Watchdog: l.watchdog,
	}

	return state.Save(o, nil)
}