github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/runsc/boot/restore.go (about)

     1  // Copyright 2023 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package boot
    16  
    17  import (
    18  	"fmt"
    19  	"os"
    20  
    21  	"github.com/metacubex/gvisor/pkg/context"
    22  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/host"
    23  	"github.com/metacubex/gvisor/pkg/sentry/inet"
    24  	"github.com/metacubex/gvisor/pkg/sentry/kernel"
    25  	"github.com/metacubex/gvisor/pkg/sentry/socket/hostinet"
    26  	"github.com/metacubex/gvisor/pkg/sentry/socket/netstack"
    27  	"github.com/metacubex/gvisor/pkg/sentry/state"
    28  	"github.com/metacubex/gvisor/pkg/sentry/time"
    29  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    30  	"github.com/metacubex/gvisor/pkg/sentry/watchdog"
    31  	"github.com/metacubex/gvisor/pkg/tcpip/stack"
    32  	"github.com/metacubex/gvisor/runsc/boot/pprof"
    33  )
    34  
    35  type restorer struct {
    36  	container  *containerInfo
    37  	stateFile  *os.File
    38  	deviceFile *os.File
    39  }
    40  
    41  func createNetworkNamespaceForRestore(l *Loader) (*stack.Stack, *inet.Namespace, error) {
    42  	creds := getRootCredentials(l.root.spec, l.root.conf, nil /* UserNamespace */)
    43  	if creds == nil {
    44  		return nil, nil, fmt.Errorf("getting root credentials")
    45  	}
    46  
    47  	// Save the current network stack to slap on top of the one that was restored.
    48  	curNetwork := l.k.RootNetworkNamespace().Stack()
    49  	eps, ok := curNetwork.(*netstack.Stack)
    50  	if !ok {
    51  		return nil, inet.NewRootNamespace(hostinet.NewStack(), nil, creds.UserNamespace), nil
    52  	}
    53  
    54  	creator := &sandboxNetstackCreator{
    55  		clock:                    l.k.Timekeeper(),
    56  		uniqueID:                 l.k,
    57  		allowPacketEndpointWrite: l.root.conf.AllowPacketEndpointWrite,
    58  	}
    59  	return eps.Stack, inet.NewRootNamespace(curNetwork, creator, creds.UserNamespace), nil
    60  }
    61  
    62  func (r *restorer) restore(l *Loader) error {
    63  	// Create a new root network namespace with the network stack of the
    64  	// old kernel to preserve the existing network configuration.
    65  	oldStack, netns, err := createNetworkNamespaceForRestore(l)
    66  	if err != nil {
    67  		return fmt.Errorf("creating network: %w", err)
    68  	}
    69  
    70  	// Reset the network stack in the network namespace to nil before
    71  	// replacing the kernel. This will not free the network stack when this
    72  	// old kernel is released.
    73  	l.k.RootNetworkNamespace().ResetStack()
    74  
    75  	p, err := createPlatform(l.root.conf, r.deviceFile)
    76  	if err != nil {
    77  		return fmt.Errorf("creating platform: %v", err)
    78  	}
    79  
    80  	// Start the old watchdog before replacing it with a new one below.
    81  	l.watchdog.Start()
    82  
    83  	// Release the kernel and replace it with a new one that will be restored into.
    84  	if l.k != nil {
    85  		l.k.Release()
    86  	}
    87  	l.k = &kernel.Kernel{
    88  		Platform: p,
    89  	}
    90  
    91  	mf, err := createMemoryFile()
    92  	if err != nil {
    93  		return fmt.Errorf("creating memory file: %v", err)
    94  	}
    95  	l.k.SetMemoryFile(mf)
    96  
    97  	if l.root.conf.ProfileEnable {
    98  		// pprof.Initialize opens /proc/self/maps, so has to be called before
    99  		// installing seccomp filters.
   100  		pprof.Initialize()
   101  	}
   102  
   103  	// Seccomp filters have to be applied before vfs restore and before parsing
   104  	// the state file.
   105  	if err := l.installSeccompFilters(); err != nil {
   106  		return err
   107  	}
   108  
   109  	// Set up the restore environment.
   110  	ctx := l.k.SupervisorContext()
   111  	if oldStack != nil {
   112  		ctx = context.WithValue(ctx, stack.CtxRestoreStack, oldStack)
   113  	}
   114  
   115  	// TODO(b/298078576): Need to process hints here probably
   116  	mntr := newContainerMounter(&l.root, l.k, l.mountHints, l.sharedMounts, l.productName, l.sandboxID)
   117  	ctx, err = mntr.configureRestore(ctx)
   118  	if err != nil {
   119  		return fmt.Errorf("configuring filesystem restore: %v", err)
   120  	}
   121  
   122  	fdmap := vfs.RestoreFilesystemFDMapFromContext(ctx)
   123  	for appFD, fd := range r.container.stdioFDs {
   124  		key := host.MakeRestoreID(r.container.containerName, appFD)
   125  		fdmap[key] = fd.Release()
   126  	}
   127  	for _, customFD := range r.container.passFDs {
   128  		key := host.MakeRestoreID(r.container.containerName, customFD.guest)
   129  		fdmap[key] = customFD.host.FD()
   130  	}
   131  
   132  	// Load the state.
   133  	loadOpts := state.LoadOpts{Source: r.stateFile}
   134  	if err := loadOpts.Load(ctx, l.k, nil, netns.Stack(), time.NewCalibratedClocks(), &vfs.CompleteRestoreOptions{}); err != nil {
   135  		return err
   136  	}
   137  
   138  	// Since we have a new kernel we also must make a new watchdog.
   139  	dogOpts := watchdog.DefaultOpts
   140  	dogOpts.TaskTimeoutAction = l.root.conf.WatchdogAction
   141  	dog := watchdog.New(l.k, dogOpts)
   142  
   143  	// Change the loader fields to reflect the changes made when restoring.
   144  	l.watchdog = dog
   145  	l.root.procArgs = kernel.CreateProcessArgs{}
   146  	l.restore = true
   147  
   148  	// Reinitialize the sandbox ID and processes map. Note that it doesn't
   149  	// restore the state of multiple containers, nor exec processes.
   150  	l.sandboxID = r.container.cid
   151  
   152  	l.mu.Lock()
   153  	defer l.mu.Unlock()
   154  
   155  	// Set new container ID if it has changed.
   156  	tasks := l.k.TaskSet().Root.Tasks()
   157  	if tasks[0].ContainerID() != l.sandboxID { // There must be at least 1 task.
   158  		for _, task := range tasks {
   159  			task.RestoreContainerID(l.sandboxID)
   160  		}
   161  	}
   162  
   163  	eid := execID{cid: l.sandboxID}
   164  	l.processes = map[execID]*execProcess{
   165  		eid: {
   166  			tg: l.k.GlobalInit(),
   167  		},
   168  	}
   169  
   170  	return nil
   171  }