gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/boot/restore.go

// Copyright 2023 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package boot

import (
	"errors"
	"fmt"
	"io"
	"strconv"
	time2 "time"

	specs "github.com/opencontainers/runtime-spec/specs-go"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/cleanup"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/devutil"
	"gvisor.dev/gvisor/pkg/fd"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/control"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
	"gvisor.dev/gvisor/pkg/sentry/inet"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
	"gvisor.dev/gvisor/pkg/sentry/state"
	"gvisor.dev/gvisor/pkg/sentry/time"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sentry/watchdog"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
	"gvisor.dev/gvisor/runsc/boot/pprof"
	"gvisor.dev/gvisor/runsc/config"
)

const (
	// CheckpointStateFileName is the file within the given image-path's
	// directory which contains the container's saved state.
	CheckpointStateFileName = "checkpoint.img"
	// CheckpointPagesMetadataFileName is the file within the given image-path's
	// directory containing the container's MemoryFile metadata.
	CheckpointPagesMetadataFileName = "pages_meta.img"
	// CheckpointPagesFileName is the file within the given image-path's
	// directory containing the container's MemoryFile pages.
	CheckpointPagesFileName = "pages.img"
)
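
// A minimal sketch of how the file names above might be composed into full
// paths when opening a checkpoint image directory for restore. The helper is
// hypothetical (it is not part of this package) and only assumes the layout
// described in the constants' comments; note that the restorer below treats
// the pages files as optional, so callers may tolerate their absence.
//
//	// openCheckpointFiles opens the three checkpoint artifacts under imageDir.
//	func openCheckpointFiles(imageDir string) (stateF, pagesMetaF, pagesF *os.File, err error) {
//		if stateF, err = os.Open(filepath.Join(imageDir, CheckpointStateFileName)); err != nil {
//			return nil, nil, nil, err
//		}
//		if pagesMetaF, err = os.Open(filepath.Join(imageDir, CheckpointPagesMetadataFileName)); err != nil {
//			stateF.Close()
//			return nil, nil, nil, err
//		}
//		if pagesF, err = os.Open(filepath.Join(imageDir, CheckpointPagesFileName)); err != nil {
//			stateF.Close()
//			pagesMetaF.Close()
//			return nil, nil, nil, err
//		}
//		return stateF, pagesMetaF, pagesF, nil
//	}
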
// restorer manages a restore session for a sandbox. It stores information
// about all containers and triggers the full sandbox restore after the last
// container is restored.
type restorer struct {
	mu sync.Mutex

	// totalContainers is the number of containers expected to be restored in
	// the sandbox. Sandbox restore can only happen after all containers have
	// been restored.
	totalContainers int

	// containers is the list of containers restored so far.
	containers []*containerInfo

	// Files used by restore to rehydrate the state.
	stateFile     io.ReadCloser
	pagesMetadata *fd.FD
	pagesFile     *fd.FD

	// deviceFile is required to start the platform.
	deviceFile *fd.FD

	// restoreDone is a callback triggered when restore is successful.
	restoreDone func() error
}

func (r *restorer) restoreSubcontainer(spec *specs.Spec, conf *config.Config, l *Loader, cid string, stdioFDs, goferFDs, goferFilestoreFDs []*fd.FD, devGoferFD *fd.FD, goferMountConfs []GoferMountConf) error {
	containerName := l.registerContainer(spec, cid)
	info := &containerInfo{
		cid:               cid,
		containerName:     containerName,
		conf:              conf,
		spec:              spec,
		stdioFDs:          stdioFDs,
		goferFDs:          goferFDs,
		devGoferFD:        devGoferFD,
		goferFilestoreFDs: goferFilestoreFDs,
		goferMountConfs:   goferMountConfs,
	}
	return r.restoreContainerInfo(l, info)
}

func (r *restorer) restoreContainerInfo(l *Loader, info *containerInfo) error {
	r.mu.Lock()
	defer r.mu.Unlock()

	for _, container := range r.containers {
		if container.containerName == info.containerName {
			return fmt.Errorf("container %q already restored", info.containerName)
		}
		if container.cid == info.cid {
			return fmt.Errorf("container CID %q already belongs to container %q", info.cid, container.containerName)
		}
	}

	r.containers = append(r.containers, info)

	log.Infof("Restored container %d of %d", len(r.containers), r.totalContainers)
	if log.IsLogging(log.Debug) {
		for i, fd := range info.stdioFDs {
			log.Debugf("Restore app FD: %d host FD: %d", i, fd.FD())
		}
	}

	if len(r.containers) == r.totalContainers {
		// Trigger the restore if this is the last container.
		return r.restore(l)
	}
	return nil
}

func createNetworkNamespaceForRestore(l *Loader) (*stack.Stack, *inet.Namespace, error) {
	creds := getRootCredentials(l.root.spec, l.root.conf, nil /* UserNamespace */)
	if creds == nil {
		return nil, nil, fmt.Errorf("getting root credentials")
	}

	// Save the current network stack to slap on top of the one that was restored.
	curNetwork := l.k.RootNetworkNamespace().Stack()
	eps, ok := curNetwork.(*netstack.Stack)
	if !ok {
		return nil, inet.NewRootNamespace(hostinet.NewStack(), nil, creds.UserNamespace), nil
	}

	creator := &sandboxNetstackCreator{
		clock:                    l.k.Timekeeper(),
		uniqueID:                 l.k,
		allowPacketEndpointWrite: l.root.conf.AllowPacketEndpointWrite,
	}
	return eps.Stack, inet.NewRootNamespace(curNetwork, creator, creds.UserNamespace), nil
}

func (r *restorer) restore(l *Loader) error {
	log.Infof("Starting to restore %d containers", len(r.containers))

	// Create a new root network namespace with the network stack of the
	// old kernel to preserve the existing network configuration.
	oldStack, netns, err := createNetworkNamespaceForRestore(l)
	if err != nil {
		return fmt.Errorf("creating network: %w", err)
	}

	// Reset the network stack in the network namespace to nil before
	// replacing the kernel. This will not free the network stack when this
	// old kernel is released.
	l.k.RootNetworkNamespace().ResetStack()

	p, err := createPlatform(l.root.conf, r.deviceFile)
	if err != nil {
		return fmt.Errorf("creating platform: %v", err)
	}

	// Start the old watchdog before replacing it with a new one below.
	l.watchdog.Start()

	// Release the kernel and replace it with a new one that will be restored into.
	if l.k != nil {
		l.k.Release()
	}
	l.k = &kernel.Kernel{
		Platform: p,
	}

	mf, err := createMemoryFile()
	if err != nil {
		return fmt.Errorf("creating memory file: %v", err)
	}
	l.k.SetMemoryFile(mf)

	if l.root.conf.ProfileEnable {
		// pprof.Initialize opens /proc/self/maps, so has to be called before
		// installing seccomp filters.
		pprof.Initialize()
	}

	// Seccomp filters have to be applied before vfs restore and before parsing
	// the state file.
	if err := l.installSeccompFilters(); err != nil {
		return err
	}

	// Set up the restore environment.
	ctx := l.k.SupervisorContext()
	if oldStack != nil {
		ctx = context.WithValue(ctx, stack.CtxRestoreStack, oldStack)
	}

	fdmap := make(map[vfs.RestoreID]int)
	mfmap := make(map[string]*pgalloc.MemoryFile)
	for _, cont := range r.containers {
		// TODO(b/298078576): Need to process hints here probably
		mntr := newContainerMounter(cont, l.k, l.mountHints, l.sharedMounts, l.productName, cont.cid)
		if err = mntr.configureRestore(fdmap, mfmap); err != nil {
			return fmt.Errorf("configuring filesystem restore: %v", err)
		}

		for i, fd := range cont.stdioFDs {
			key := host.MakeRestoreID(cont.containerName, i)
			fdmap[key] = fd.Release()
		}
		for _, customFD := range cont.passFDs {
			key := host.MakeRestoreID(cont.containerName, customFD.guest)
			fdmap[key] = customFD.host.FD()
		}
	}

	log.Debugf("Restore using fdmap: %v", fdmap)
	ctx = context.WithValue(ctx, vfs.CtxRestoreFilesystemFDMap, fdmap)
	log.Debugf("Restore using mfmap: %v", mfmap)
	ctx = context.WithValue(ctx, pgalloc.CtxMemoryFileMap, mfmap)
	ctx = context.WithValue(ctx, devutil.CtxDevGoferClientProvider, l.k)

	// Load the state.
	loadOpts := state.LoadOpts{Source: r.stateFile, PagesMetadata: r.pagesMetadata, PagesFile: r.pagesFile}
	if err := loadOpts.Load(ctx, l.k, nil, netns.Stack(), time.NewCalibratedClocks(), &vfs.CompleteRestoreOptions{}); err != nil {
		return err
	}

	// Since we have a new kernel we also must make a new watchdog.
	dogOpts := watchdog.DefaultOpts
	dogOpts.TaskTimeoutAction = l.root.conf.WatchdogAction
	dogOpts.StartupTimeout = 3 * time2.Minute // Give extra time for all containers to restore.
	dog := watchdog.New(l.k, dogOpts)

	// Change the loader fields to reflect the changes made when restoring.
	l.watchdog.Stop()
	l.watchdog = dog
	l.root.procArgs = kernel.CreateProcessArgs{}
	l.restore = true

	l.sandboxID = l.root.cid

	l.mu.Lock()
	cu := cleanup.Make(func() {
		l.mu.Unlock()
	})
	defer cu.Clean()

	// Update all tasks in the system with their respective new container IDs.
	for _, task := range l.k.TaskSet().Root.Tasks() {
		oldCid := task.ContainerID()
		name := l.k.ContainerName(oldCid)
		newCid, ok := l.containerIDs[name]
		if !ok {
			return fmt.Errorf("unable to remap task with CID %q (name: %q). Available names: %v", task.ContainerID(), name, l.containerIDs)
		}
		task.RestoreContainerID(newCid)
	}

	// Rebuild `processes` map with containers' root process from the restored kernel.
	for _, tg := range l.k.RootPIDNamespace().ThreadGroups() {
		// Find all processes with no parent (root of execution), that were not started
		// via a call to `exec`.
		if tg.Leader().Parent() == nil && tg.Leader().Origin != kernel.OriginExec {
			cid := tg.Leader().ContainerID()
			proc := l.processes[execID{cid: cid}]
			if proc == nil {
				return fmt.Errorf("unable to find container root process with CID %q, processes: %v", cid, l.processes)
			}
			proc.tg = tg
		}
	}

	// Kill all processes that have been exec'd since they cannot be properly
	// restored -- the caller is no longer connected.
	for _, tg := range l.k.RootPIDNamespace().ThreadGroups() {
		if tg.Leader().Origin == kernel.OriginExec {
			if err := l.k.SendExternalSignalThreadGroup(tg, &linux.SignalInfo{Signo: int32(linux.SIGKILL)}); err != nil {
				log.Warningf("Failed to kill exec process after restore: %v", err)
			}
		}
	}

	l.k.RestoreContainerMapping(l.containerIDs)

	// Release `l.mu` before calling into callbacks.
	cu.Clean()

	if err := r.restoreDone(); err != nil {
		return err
	}

	r.stateFile.Close()
	if r.pagesFile != nil {
		r.pagesFile.Close()
	}

	log.Infof("Restore successful")
	return nil
}

func (l *Loader) save(o *control.SaveOpts) error {
	// TODO(gvisor.dev/issues/6243): save/restore not supported w/ hostinet
	if l.root.conf.Network == config.NetworkHost {
		return errors.New("checkpoint not supported when using hostinet")
	}

	if o.Metadata == nil {
		o.Metadata = make(map[string]string)
	}
	o.Metadata["container_count"] = strconv.Itoa(l.containerCount())

	state := control.State{
		Kernel:   l.k,
		Watchdog: l.watchdog,
	}

	return state.Save(o, nil)
}
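
// save records the number of containers in the checkpoint metadata, and the
// restorer's totalContainers field expects the same count so that the sandbox
// restore is only triggered once every container has been registered. A
// minimal sketch of how the saved value could be turned back into that count
// on the restore side (hypothetical helper; the real plumbing lives elsewhere
// in runsc):
//
//	// totalContainersFromMetadata parses the "container_count" value written
//	// by (*Loader).save back into the count a restorer expects.
//	func totalContainersFromMetadata(metadata map[string]string) (int, error) {
//		raw, ok := metadata["container_count"]
//		if !ok {
//			return 0, fmt.Errorf("checkpoint metadata missing container_count")
//		}
//		return strconv.Atoi(raw)
//	}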