gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/pgalloc/save_restore.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pgalloc

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"runtime"

	"gvisor.dev/gvisor/pkg/atomicbitops"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/usage"
	"gvisor.dev/gvisor/pkg/state"
	"gvisor.dev/gvisor/pkg/state/statefile"
	"gvisor.dev/gvisor/pkg/sync"
)

// SaveOpts provides options to MemoryFile.SaveTo().
type SaveOpts struct {
	// If ExcludeCommittedZeroPages is true, SaveTo() will scan both committed
	// and possibly-committed pages to find zero pages, whose contents are
	// saved implicitly rather than explicitly to reduce checkpoint size. If
	// ExcludeCommittedZeroPages is false, SaveTo() will scan only
	// possibly-committed pages to find zero pages.
	//
	// Enabling ExcludeCommittedZeroPages will usually increase the time taken
	// by SaveTo() (due to the larger number of pages that must be scanned),
	// but may instead improve SaveTo() and LoadFrom() time, and checkpoint
	// size, if the application has many committed zero pages.
	ExcludeCommittedZeroPages bool
}
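
// For example (an illustrative usage sketch; whether the tradeoff pays off
// depends on how many committed zero pages the workload has):
//
//	// Favor smaller checkpoints for zero-heavy workloads, at the cost of
//	// a longer scan during save:
//	opts := SaveOpts{ExcludeCommittedZeroPages: true}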

// SaveTo writes f's state to the given stream, writing the contents of
// committed pages to pw.
func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer, pw io.Writer, opts SaveOpts) error {
	// Wait for reclaim: repeatedly wake the reclaimer and yield until there
	// is nothing left to reclaim, so that the usage set scanned below is
	// stable.
	f.mu.Lock()
	defer f.mu.Unlock()
	for f.reclaimable {
		f.reclaimCond.Signal()
		f.mu.Unlock()
		runtime.Gosched()
		f.mu.Lock()
	}

	// Ensure that there are no pending evictions.
	if len(f.evictable) != 0 {
		panic(fmt.Sprintf("evictions still pending for %d users; call StartEvictions and WaitForEvictions before SaveTo", len(f.evictable)))
	}

	// Ensure that all pages that contain non-zero bytes have knownCommitted
	// set, since we only store knownCommitted pages below.
	zeroPage := make([]byte, hostarch.PageSize)
	var (
		decommitWarnOnce  sync.Once
		decommitPendingFR memmap.FileRange
		scanTotal         uint64
		decommitTotal     uint64
		decommitCount     uint64
	)
	decommitNow := func(fr memmap.FileRange) {
		decommitTotal += fr.Length()
		decommitCount++
		if err := f.decommitFile(fr); err != nil {
			// This doesn't impact the correctness of saved memory, it just
			// means that we're incrementally more likely to OOM. Complain, but
			// don't abort saving.
			decommitWarnOnce.Do(func() {
				log.Warningf("Decommitting MemoryFile offsets %v while saving failed: %v", fr, err)
			})
		}
	}
	decommitAddPage := func(off uint64) {
		// Invariants:
		// (1) All of decommitPendingFR lies within a single huge page.
		// (2) decommitPendingFR.End is hugepage-aligned iff
		// decommitPendingFR.Length() == 0.
		end := off + hostarch.PageSize
		if decommitPendingFR.End == off {
			// Merge with the existing range. By invariants, the page {off,
			// end} must be within the same huge page as the rest of
			// decommitPendingFR.
			decommitPendingFR.End = end
		} else {
			// Decommit the existing range and start a new one.
			if decommitPendingFR.Length() != 0 {
				decommitNow(decommitPendingFR)
			}
			decommitPendingFR = memmap.FileRange{off, end}
		}
		// Maintain invariants by decommitting if we've reached the end of the
		// containing huge page.
		if hostarch.IsHugePageAligned(end) {
			decommitNow(decommitPendingFR)
			decommitPendingFR = memmap.FileRange{}
		}
	}
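	// Worked example of the invariants above (illustrative, assuming 4KiB
	// pages and 2MiB huge pages): adding offsets 0x0000 and then 0x1000
	// merges into a pending range [0x0000, 0x2000). Adding 0x3000 next is
	// discontiguous, so [0x0000, 0x2000) is decommitted immediately and
	// [0x3000, 0x4000) becomes the new pending range. A pending range is
	// also flushed as soon as it reaches a huge page boundary, so each
	// decommitFile call covers at most one huge page.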
	err := f.updateUsageLocked(0, nil, opts.ExcludeCommittedZeroPages, func(bs []byte, committed []byte, off uint64, wasCommitted bool) error {
		scanTotal += uint64(len(bs))
		for pgoff := 0; pgoff < len(bs); pgoff += hostarch.PageSize {
			i := pgoff / hostarch.PageSize
			pg := bs[pgoff : pgoff+hostarch.PageSize]
			if !bytes.Equal(pg, zeroPage) {
				committed[i] = 1
				continue
			}
			committed[i] = 0
			if !wasCommitted {
				// Reading the page may have caused it to be committed;
				// decommit it to reduce memory usage.
				decommitAddPage(off + uint64(pgoff))
			}
		}
		return nil
	})
	if decommitPendingFR.Length() != 0 {
		decommitNow(decommitPendingFR)
		decommitPendingFR = memmap.FileRange{}
	}
	if err != nil {
		return err
	}
	log.Debugf("MemoryFile.SaveTo: scanned %d bytes, decommitted %d bytes in %d syscalls", scanTotal, decommitTotal, decommitCount)

	// Save metadata.
	if _, err := state.Save(ctx, w, &f.fileSize); err != nil {
		return err
	}
	if _, err := state.Save(ctx, w, &f.usage); err != nil {
		return err
	}

	// Dump out committed pages.
	for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
		if !seg.Value().knownCommitted {
			continue
		}
		// Write a header to distinguish from objects.
		if err := state.WriteHeader(w, uint64(seg.Range().Length()), false); err != nil {
			return err
		}
		// Write out data. forEachMappingSlice's callback can't return an
		// error, so capture the first I/O error in ioErr and make subsequent
		// calls no-ops.
		var ioErr error
		err := f.forEachMappingSlice(seg.Range(), func(s []byte) {
			if ioErr != nil {
				return
			}
			_, ioErr = pw.Write(s)
		})
		if ioErr != nil {
			return ioErr
		}
		if err != nil {
			return err
		}
	}

	return nil
}
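
// A minimal caller sketch (hypothetical wiring; the method names come from
// the panic in SaveTo above, but their exact signatures are assumed):
//
//	func checkpoint(ctx context.Context, f *MemoryFile, w, pw io.Writer, opts SaveOpts) error {
//		f.StartEvictions()   // kick off pending evictions...
//		f.WaitForEvictions() // ...and drain them; otherwise SaveTo panics
//		f.MarkSavable()
//		return f.SaveTo(ctx, w, pw, opts)
//	}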

// MarkSavable marks f as savable.
func (f *MemoryFile) MarkSavable() {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.savable = true
}

// IsSavable returns true if f is savable.
func (f *MemoryFile) IsSavable() bool {
	f.mu.Lock()
	defer f.mu.Unlock()
	return f.savable
}

// RestoreID returns the restore ID for f.
func (f *MemoryFile) RestoreID() string {
	return f.opts.RestoreID
}

// LoadFrom loads MemoryFile state from the given stream. If pr is non-nil,
// page contents are read asynchronously through it.
func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader, pr *statefile.AsyncReader) error {
	// Load metadata.
	if _, err := state.Load(ctx, r, &f.fileSize); err != nil {
		return err
	}
	if err := f.file.Truncate(f.fileSize); err != nil {
		return err
	}
	newMappings := make([]uintptr, f.fileSize>>chunkShift)
	f.mappings.Store(&newMappings)
	if _, err := state.Load(ctx, r, &f.usage); err != nil {
		return err
	}

	// Try to map committed chunks concurrently: For any given chunk, either
	// this loop or the following one will mmap the chunk first and cache it in
	// f.mappings for the other, but this loop is likely to run ahead of the
	// other since it doesn't do any work between mmaps. The rest of this
	// function doesn't mutate f.usage, so it's safe to iterate concurrently.
	mapperDone := make(chan struct{})
	mapperCanceled := atomicbitops.FromInt32(0)
	go func() { // S/R-SAFE: see comment
		defer func() { close(mapperDone) }()
		for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
			if mapperCanceled.Load() != 0 {
				return
			}
			if seg.Value().knownCommitted {
				f.forEachMappingSlice(seg.Range(), func(s []byte) {})
			}
		}
	}()
	defer func() {
		mapperCanceled.Store(1)
		<-mapperDone
	}()

	// Load committed pages.
	for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
		if !seg.Value().knownCommitted {
			continue
		}
		// Verify header.
		length, object, err := state.ReadHeader(r)
		if err != nil {
			return err
		}
		if object {
			// Not expected.
			return fmt.Errorf("unexpected object")
		}
		if expected := uint64(seg.Range().Length()); length != expected {
			// Size mismatch.
			return fmt.Errorf("mismatched segment: expected %d, got %d", expected, length)
		}
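		// Note: each knownCommitted segment was framed by SaveTo above as a
		// WriteHeader(length, object=false) record followed by exactly
		// length bytes of raw page data, and f.usage was restored from the
		// same checkpoint, so the lengths must line up segment by segment.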
		// Read data. As in SaveTo, the first I/O error is captured in ioErr
		// since forEachMappingSlice's callback can't return one; with a
		// non-nil AsyncReader, reads are enqueued rather than performed
		// synchronously.
		var ioErr error
		err = f.forEachMappingSlice(seg.Range(), func(s []byte) {
			if ioErr != nil {
				return
			}
			if pr != nil {
				pr.ReadAsync(s)
			} else {
				_, ioErr = io.ReadFull(r, s)
			}
		})
		if ioErr != nil {
			return ioErr
		}
		if err != nil {
			return err
		}

		// Update accounting for restored pages. We need to do this here since
		// these segments are marked as "known committed", and will be skipped
		// over on accounting scans.
		amount := seg.Range().Length()
		usage.MemoryAccounting.Inc(amount, seg.Value().kind, seg.Value().memCgID)
		f.usageExpected += amount
	}

	return nil
}
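
// prefetchUntilCanceled is an illustrative sketch (a hypothetical helper,
// not part of the original file) of the cancellation pattern used by the
// mapping goroutine in LoadFrom: a background worker runs ahead of its
// consumer, is stopped via an atomic flag, and is joined on a done channel
// so that it never outlives its spawner. Calling the returned cancel
// function stops the worker and waits for it to exit.
func prefetchUntilCanceled(work []func()) (cancel func()) {
	done := make(chan struct{})
	canceled := atomicbitops.FromInt32(0)
	go func() {
		defer close(done)
		for _, w := range work {
			if canceled.Load() != 0 {
				return // stop promptly once canceled
			}
			w()
		}
	}()
	return func() {
		canceled.Store(1) // ask the worker to stop...
		<-done            // ...and wait until it has
	}
}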