gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/pgalloc/save_restore.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pgalloc

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"runtime"

	"gvisor.dev/gvisor/pkg/atomicbitops"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/usage"
	"gvisor.dev/gvisor/pkg/state"
	"gvisor.dev/gvisor/pkg/state/statefile"
	"gvisor.dev/gvisor/pkg/sync"
)

// SaveOpts provides options to MemoryFile.SaveTo().
type SaveOpts struct {
	// If ExcludeCommittedZeroPages is true, SaveTo() will scan both committed
	// and possibly-committed pages to find zero pages, whose contents are
	// saved implicitly rather than explicitly to reduce checkpoint size. If
	// ExcludeCommittedZeroPages is false, SaveTo() will scan only
	// possibly-committed pages to find zero pages.
	//
	// Enabling ExcludeCommittedZeroPages will usually increase the time taken
	// by SaveTo() (due to the larger number of pages that must be scanned),
	// but may instead improve SaveTo() and LoadFrom() time, and checkpoint
	// size, if the application has many committed zero pages.
	ExcludeCommittedZeroPages bool
}
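// The helper below is not part of the original file: it is a minimal sketch
// of the calling sequence that SaveTo expects, inferred from the panic
// message in SaveTo ("call StartEvictions and WaitForEvictions before
// SaveTo"). The helper name saveMemoryFile and the option choice are
// hypothetical; w receives metadata and pw receives raw page contents.
func saveMemoryFile(ctx context.Context, f *MemoryFile, w, pw io.Writer) error {
	// Drain pending evictions first; SaveTo panics if any remain.
	f.StartEvictions()
	f.WaitForEvictions()
	// ExcludeCommittedZeroPages trades scan time for checkpoint size; see
	// the SaveOpts documentation above.
	return f.SaveTo(ctx, w, pw, SaveOpts{ExcludeCommittedZeroPages: true})
}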
// SaveTo writes f's state to the given stream.
func (f *MemoryFile) SaveTo(ctx context.Context, w io.Writer, pw io.Writer, opts SaveOpts) error {
	// Wait for reclaim.
	f.mu.Lock()
	defer f.mu.Unlock()
	for f.reclaimable {
		f.reclaimCond.Signal()
		f.mu.Unlock()
		runtime.Gosched()
		f.mu.Lock()
	}

	// Ensure that there are no pending evictions.
	if len(f.evictable) != 0 {
		panic(fmt.Sprintf("evictions still pending for %d users; call StartEvictions and WaitForEvictions before SaveTo", len(f.evictable)))
	}

	// Ensure that all pages that contain non-zero bytes have knownCommitted
	// set, since we only store knownCommitted pages below.
	zeroPage := make([]byte, hostarch.PageSize)
	var (
		decommitWarnOnce  sync.Once
		decommitPendingFR memmap.FileRange
		scanTotal         uint64
		decommitTotal     uint64
		decommitCount     uint64
	)
	decommitNow := func(fr memmap.FileRange) {
		decommitTotal += fr.Length()
		decommitCount++
		if err := f.decommitFile(fr); err != nil {
			// This doesn't impact the correctness of saved memory, it just
			// means that we're incrementally more likely to OOM. Complain, but
			// don't abort saving.
			decommitWarnOnce.Do(func() {
				log.Warningf("Decommitting MemoryFile offsets %v while saving failed: %v", fr, err)
			})
		}
	}
	decommitAddPage := func(off uint64) {
		// Invariants:
		// (1) All of decommitPendingFR lies within a single huge page.
		// (2) decommitPendingFR.End is hugepage-aligned iff
		//     decommitPendingFR.Length() == 0.
		end := off + hostarch.PageSize
		if decommitPendingFR.End == off {
			// Merge with the existing range. By invariants, the page
			// [off, end) must be within the same huge page as the rest of
			// decommitPendingFR.
			decommitPendingFR.End = end
		} else {
			// Decommit the existing range and start a new one.
			if decommitPendingFR.Length() != 0 {
				decommitNow(decommitPendingFR)
			}
			decommitPendingFR = memmap.FileRange{off, end}
		}
		// Maintain invariants by decommitting if we've reached the end of the
		// containing huge page.
		if hostarch.IsHugePageAligned(end) {
			decommitNow(decommitPendingFR)
			decommitPendingFR = memmap.FileRange{}
		}
	}
	err := f.updateUsageLocked(0, nil, opts.ExcludeCommittedZeroPages, func(bs []byte, committed []byte, off uint64, wasCommitted bool) error {
		scanTotal += uint64(len(bs))
		for pgoff := 0; pgoff < len(bs); pgoff += hostarch.PageSize {
			i := pgoff / hostarch.PageSize
			pg := bs[pgoff : pgoff+hostarch.PageSize]
			if !bytes.Equal(pg, zeroPage) {
				committed[i] = 1
				continue
			}
			committed[i] = 0
			if !wasCommitted {
				// Reading the page may have caused it to be committed;
				// decommit it to reduce memory usage.
				decommitAddPage(off + uint64(pgoff))
			}
		}
		return nil
	})
	if decommitPendingFR.Length() != 0 {
		decommitNow(decommitPendingFR)
		decommitPendingFR = memmap.FileRange{}
	}
	if err != nil {
		return err
	}
	log.Debugf("MemoryFile.SaveTo: scanned %d bytes, decommitted %d bytes in %d syscalls", scanTotal, decommitTotal, decommitCount)

	// Save metadata.
	if _, err := state.Save(ctx, w, &f.fileSize); err != nil {
		return err
	}
	if _, err := state.Save(ctx, w, &f.usage); err != nil {
		return err
	}

	// Dump out committed pages.
	for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
		if !seg.Value().knownCommitted {
			continue
		}
		// Write a header to distinguish from objects.
		if err := state.WriteHeader(w, uint64(seg.Range().Length()), false); err != nil {
			return err
		}
		// Write out data.
		var ioErr error
		err := f.forEachMappingSlice(seg.Range(), func(s []byte) {
			if ioErr != nil {
				return
			}
			_, ioErr = pw.Write(s)
		})
		if ioErr != nil {
			return ioErr
		}
		if err != nil {
			return err
		}
	}

	return nil
}
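// Summary (not part of the original file) of the checkpoint stream that
// SaveTo produces and LoadFrom below consumes, as read from the code above:
//
//	state.Save(f.fileSize)                // metadata, on w
//	state.Save(f.usage)                   // segment set with per-segment flags
//	for each segment with knownCommitted:
//	    state.WriteHeader(length, false)  // on w; false = pages, not an object
//	    <length raw bytes of page data>   // on pw; read back via pr or r
//
// Zero pages are excluded from the stream by clearing their committed bits
// during the scan, so they are reproduced implicitly at restore time rather
// than stored explicitly.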
// MarkSavable marks f as savable.
func (f *MemoryFile) MarkSavable() {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.savable = true
}

// IsSavable returns true if f is savable.
func (f *MemoryFile) IsSavable() bool {
	f.mu.Lock()
	defer f.mu.Unlock()
	return f.savable
}

// RestoreID returns the restore ID for f.
func (f *MemoryFile) RestoreID() string {
	return f.opts.RestoreID
}

// LoadFrom loads MemoryFile state from the given stream.
func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader, pr *statefile.AsyncReader) error {
	// Load metadata.
	if _, err := state.Load(ctx, r, &f.fileSize); err != nil {
		return err
	}
	if err := f.file.Truncate(f.fileSize); err != nil {
		return err
	}
	newMappings := make([]uintptr, f.fileSize>>chunkShift)
	f.mappings.Store(&newMappings)
	if _, err := state.Load(ctx, r, &f.usage); err != nil {
		return err
	}

	// Try to map committed chunks concurrently: For any given chunk, either
	// this loop or the following one will mmap the chunk first and cache it in
	// f.mappings for the other, but this loop is likely to run ahead of the
	// other since it doesn't do any work between mmaps. The rest of this
	// function doesn't mutate f.usage, so it's safe to iterate concurrently.
	mapperDone := make(chan struct{})
	mapperCanceled := atomicbitops.FromInt32(0)
	go func() { // S/R-SAFE: see comment
		defer func() { close(mapperDone) }()
		for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
			if mapperCanceled.Load() != 0 {
				return
			}
			if seg.Value().knownCommitted {
				f.forEachMappingSlice(seg.Range(), func(s []byte) {})
			}
		}
	}()
	defer func() {
		mapperCanceled.Store(1)
		<-mapperDone
	}()

	// Load committed pages.
	for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
		if !seg.Value().knownCommitted {
			continue
		}
		// Verify header.
		length, object, err := state.ReadHeader(r)
		if err != nil {
			return err
		}
		if object {
			// Not expected.
			return fmt.Errorf("unexpected object")
		}
		if expected := uint64(seg.Range().Length()); length != expected {
			// Size mismatch.
			return fmt.Errorf("mismatched segment: expected %d, got %d", expected, length)
		}
		// Read data.
		var ioErr error
		err = f.forEachMappingSlice(seg.Range(), func(s []byte) {
			if ioErr != nil {
				return
			}
			if pr != nil {
				pr.ReadAsync(s)
			} else {
				_, ioErr = io.ReadFull(r, s)
			}
		})
		if ioErr != nil {
			return ioErr
		}
		if err != nil {
			return err
		}

		// Update accounting for restored pages. We need to do this here since
		// these segments are marked as "known committed", and will be skipped
		// over on accounting scans.
		amount := seg.Range().Length()
		usage.MemoryAccounting.Inc(amount, seg.Value().kind, seg.Value().memCgID)
		f.usageExpected += amount
	}

	return nil
}
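// The mapper goroutine in LoadFrom above follows a common cancellable-worker
// shape: a done channel closed by the worker plus an atomic flag checked on
// each iteration, with cancellation blocking until the worker exits. A
// standalone sketch of the same pattern (hypothetical, not part of the
// original file); work returns false once it runs out of items:
func runCancellableWorker(work func() bool) (cancel func()) {
	done := make(chan struct{})
	canceled := atomicbitops.FromInt32(0)
	go func() {
		defer close(done)
		for canceled.Load() == 0 && work() {
		}
	}()
	// cancel requests a stop and waits for the worker to exit, mirroring the
	// deferred mapperCanceled.Store(1); <-mapperDone sequence in LoadFrom.
	return func() {
		canceled.Store(1)
		<-done
	}
}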