github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/vfs/disk_full.go (about) 1 // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package vfs 6 7 import ( 8 "io" 9 "os" 10 "sync" 11 "sync/atomic" 12 "syscall" 13 14 "github.com/cockroachdb/errors" 15 ) 16 17 // OnDiskFull wraps the provided FS with an FS that examines returned errors, 18 // looking for ENOSPC errors. It invokes the provided callback when the 19 // underlying filesystem returns an error signifying the storage is out of 20 // disk space. 21 // 22 // All new writes to the filesystem are blocked while the callback executes, 23 // so care must be taken to avoid expensive work from within the callback. 24 // 25 // Once the callback completes, any write-oriented operations that encountered 26 // ENOSPC are retried exactly once. Once the callback completes, it will not 27 // be invoked again until a new operation that began after the callback 28 // returned encounters an ENOSPC error. 29 // 30 // OnDiskFull may be used to automatically manage a ballast file, which is 31 // removed from the filesystem from within the callback. Note that if managing 32 // a ballast, the caller should maintain a reference to the inner FS and 33 // remove the ballast on the unwrapped FS. 34 func OnDiskFull(fs FS, fn func()) FS { 35 newFS := &enospcFS{inner: fs} 36 newFS.mu.Cond.L = &newFS.mu.Mutex 37 newFS.mu.onDiskFull = fn 38 return newFS 39 } 40 41 type enospcFS struct { 42 inner FS 43 // generation is a monotonically increasing number that encodes the 44 // current state of ENOSPC error handling. Incoming writes are 45 // organized into generations to provide strong guarantees on when the 46 // disk full callback is invoked. The callback is invoked once per 47 // write generation. 48 // 49 // Special significance is given to the parity of this generation 50 // field to optimize incoming writes in the normal state, which only 51 // need to perform a single atomic load. If generation is odd, an 52 // ENOSPC error is being actively handled. The generations associated 53 // with writes are always even. 54 // 55 // The lifecycle of a write is: 56 // 57 // 1. Atomically load the current generation. 58 // a. If it's even, this is the write's generation number. 59 // b. If it's odd, an ENOSPC was recently encountered and the 60 // corresponding invocation of the disk full callback has not 61 // yet completed. The write must wait until the callback has 62 // completed and generation is updated to an even number, which 63 // becomes the write's generation number. 64 // 2. Perform the write. If it encounters no error or an error other 65 // than ENOSPC, the write returns and proceeds no further in this 66 // lifecycle. 67 // 3. Handle ENOSPC. If the write encounters ENOSPC, the callback must 68 // be invoked for the write's generation. The write's goroutine 69 // acquires the FS's mutex. 70 // a. If the FS's current generation is still equal to the write's 71 // generation, the write is the first write of its generation to 72 // encounter ENOSPC. It increments the FS's current generation 73 // to an odd number, signifying that an ENOSPC is being handled 74 // and invokes the callback. 75 // b. If the FS's current generation has changed, some other write 76 // from the same generation encountered an ENOSPC first. This 77 // write waits on the condition variable until the FS's current 78 // generation is updated indicating that the generation's 79 // callback invocation has completed. 80 // 3. Retry the write once. The callback for the write's generation 81 // has completed, either by this write's goroutine or another's. 82 // The write may proceed with the expectation that the callback 83 // remedied the full disk by freeing up disk space and an ENOSPC 84 // should not be encountered again for at least a few minutes. If 85 // we do encounter another ENOSPC on the retry, the callback was 86 // unable to remedy the full disk and another retry won't be 87 // useful. Any error, including ENOSPC, during the retry is 88 // returned without further handling. None of the retries invoke 89 // the callback. 90 // 91 // This scheme has a few nice properties: 92 // * Once the disk-full callback completes, it won't be invoked 93 // again unless a write that started strictly later encounters an 94 // ENOSPC. This is convenient if the callback strives to 'fix' the 95 // full disk, for example, by removing a ballast file. A new 96 // invocation of the callback guarantees a new problem. 97 // * Incoming writes block if there's an unhandled ENOSPC. Some 98 // writes, like WAL or MANIFEST fsyncs, are fatal if they encounter 99 // an ENOSPC. 100 generation atomic.Uint32 101 mu struct { 102 sync.Mutex 103 sync.Cond 104 onDiskFull func() 105 } 106 } 107 108 // Unwrap returns the underlying FS. This may be called by vfs.Root to access 109 // the underlying filesystem. 110 func (fs *enospcFS) Unwrap() FS { 111 return fs.inner 112 } 113 114 // waitUntilReady is called before every FS or File operation that 115 // might return ENOSPC. If an ENOSPC was encountered and the corresponding 116 // invocation of the `onDiskFull` callback has not yet returned, 117 // waitUntilReady blocks until the callback returns. The returned generation 118 // is always even. 119 func (fs *enospcFS) waitUntilReady() uint32 { 120 gen := fs.generation.Load() 121 if gen%2 == 0 { 122 // An even generation indicates that we're not currently handling an 123 // ENOSPC. Allow the write to proceed. 124 return gen 125 } 126 127 // We're currently handling an ENOSPC error. Wait on the condition 128 // variable until we're not handling an ENOSPC. 129 fs.mu.Lock() 130 defer fs.mu.Unlock() 131 132 // Load the generation again with fs.mu locked. 133 gen = fs.generation.Load() 134 for gen%2 == 1 { 135 fs.mu.Wait() 136 gen = fs.generation.Load() 137 } 138 return gen 139 } 140 141 func (fs *enospcFS) handleENOSPC(gen uint32) { 142 fs.mu.Lock() 143 defer fs.mu.Unlock() 144 145 currentGeneration := fs.generation.Load() 146 147 // If the current generation is still `gen`, this is the first goroutine 148 // to hit an ENOSPC within this write generation, so this goroutine is 149 // responsible for invoking the callback. 150 if currentGeneration == gen { 151 // Increment the generation to an odd number, indicating that the FS 152 // is out-of-disk space and incoming writes should pause and wait for 153 // the next generation before continuing. 154 fs.generation.Store(gen + 1) 155 156 func() { 157 // Drop the mutex while we invoke the callback, re-acquiring 158 // afterwards. 159 fs.mu.Unlock() 160 defer fs.mu.Lock() 161 fs.mu.onDiskFull() 162 }() 163 164 // Update the current generation again to an even number, indicating 165 // that the callback has completed for the write generation `gen`. 166 fs.generation.Store(gen + 2) 167 fs.mu.Broadcast() 168 return 169 } 170 171 // The current generation has already been incremented, so either the 172 // callback is currently being run by another goroutine or it's already 173 // completed. Wait for it complete if it hasn't already. 174 // 175 // The current generation may be updated multiple times, including to an 176 // odd number signifying a later write generation has already encountered 177 // ENOSPC. In that case, the callback was not able to remedy the full disk 178 // and waiting is unlikely to be helpful. Continuing to wait risks 179 // blocking an unbounded number of generations. Retrying and bubbling the 180 // ENOSPC up might be helpful if we can abort a large compaction that 181 // started before we became more selective about compaction picking, so 182 // this loop only waits for this write generation's callback and no 183 // subsequent generations' callbacks. 184 for currentGeneration == gen+1 { 185 fs.mu.Wait() 186 currentGeneration = fs.generation.Load() 187 } 188 } 189 190 func (fs *enospcFS) Create(name string) (File, error) { 191 gen := fs.waitUntilReady() 192 193 f, err := fs.inner.Create(name) 194 195 if err != nil && isENOSPC(err) { 196 fs.handleENOSPC(gen) 197 f, err = fs.inner.Create(name) 198 } 199 if f != nil { 200 f = &enospcFile{ 201 fs: fs, 202 inner: f, 203 } 204 } 205 return f, err 206 } 207 208 func (fs *enospcFS) Link(oldname, newname string) error { 209 gen := fs.waitUntilReady() 210 211 err := fs.inner.Link(oldname, newname) 212 213 if err != nil && isENOSPC(err) { 214 fs.handleENOSPC(gen) 215 err = fs.inner.Link(oldname, newname) 216 } 217 return err 218 } 219 220 func (fs *enospcFS) Open(name string, opts ...OpenOption) (File, error) { 221 f, err := fs.inner.Open(name, opts...) 222 if f != nil { 223 f = &enospcFile{ 224 fs: fs, 225 inner: f, 226 } 227 } 228 return f, err 229 } 230 231 func (fs *enospcFS) OpenReadWrite(name string, opts ...OpenOption) (File, error) { 232 f, err := fs.inner.OpenReadWrite(name, opts...) 233 if f != nil { 234 f = &enospcFile{ 235 fs: fs, 236 inner: f, 237 } 238 } 239 return f, err 240 } 241 242 func (fs *enospcFS) OpenDir(name string) (File, error) { 243 f, err := fs.inner.OpenDir(name) 244 if f != nil { 245 f = &enospcFile{ 246 fs: fs, 247 inner: f, 248 } 249 } 250 return f, err 251 } 252 253 func (fs *enospcFS) Remove(name string) error { 254 gen := fs.waitUntilReady() 255 256 err := fs.inner.Remove(name) 257 258 if err != nil && isENOSPC(err) { 259 fs.handleENOSPC(gen) 260 err = fs.inner.Remove(name) 261 } 262 return err 263 } 264 265 func (fs *enospcFS) RemoveAll(name string) error { 266 gen := fs.waitUntilReady() 267 268 err := fs.inner.RemoveAll(name) 269 270 if err != nil && isENOSPC(err) { 271 fs.handleENOSPC(gen) 272 err = fs.inner.RemoveAll(name) 273 } 274 return err 275 } 276 277 func (fs *enospcFS) Rename(oldname, newname string) error { 278 gen := fs.waitUntilReady() 279 280 err := fs.inner.Rename(oldname, newname) 281 282 if err != nil && isENOSPC(err) { 283 fs.handleENOSPC(gen) 284 err = fs.inner.Rename(oldname, newname) 285 } 286 return err 287 } 288 289 func (fs *enospcFS) ReuseForWrite(oldname, newname string) (File, error) { 290 gen := fs.waitUntilReady() 291 292 f, err := fs.inner.ReuseForWrite(oldname, newname) 293 294 if err != nil && isENOSPC(err) { 295 fs.handleENOSPC(gen) 296 f, err = fs.inner.ReuseForWrite(oldname, newname) 297 } 298 299 if f != nil { 300 f = &enospcFile{ 301 fs: fs, 302 inner: f, 303 } 304 } 305 return f, err 306 } 307 308 func (fs *enospcFS) MkdirAll(dir string, perm os.FileMode) error { 309 gen := fs.waitUntilReady() 310 311 err := fs.inner.MkdirAll(dir, perm) 312 313 if err != nil && isENOSPC(err) { 314 fs.handleENOSPC(gen) 315 err = fs.inner.MkdirAll(dir, perm) 316 } 317 return err 318 } 319 320 func (fs *enospcFS) Lock(name string) (io.Closer, error) { 321 gen := fs.waitUntilReady() 322 323 closer, err := fs.inner.Lock(name) 324 325 if err != nil && isENOSPC(err) { 326 fs.handleENOSPC(gen) 327 closer, err = fs.inner.Lock(name) 328 } 329 return closer, err 330 } 331 332 func (fs *enospcFS) List(dir string) ([]string, error) { 333 return fs.inner.List(dir) 334 } 335 336 func (fs *enospcFS) Stat(name string) (os.FileInfo, error) { 337 return fs.inner.Stat(name) 338 } 339 340 func (fs *enospcFS) PathBase(path string) string { 341 return fs.inner.PathBase(path) 342 } 343 344 func (fs *enospcFS) PathJoin(elem ...string) string { 345 return fs.inner.PathJoin(elem...) 346 } 347 348 func (fs *enospcFS) PathDir(path string) string { 349 return fs.inner.PathDir(path) 350 } 351 352 func (fs *enospcFS) GetDiskUsage(path string) (DiskUsage, error) { 353 return fs.inner.GetDiskUsage(path) 354 } 355 356 type enospcFile struct { 357 fs *enospcFS 358 inner File 359 } 360 361 var _ File = (*enospcFile)(nil) 362 363 func (f *enospcFile) Close() error { 364 return f.inner.Close() 365 } 366 367 func (f *enospcFile) Read(p []byte) (n int, err error) { 368 return f.inner.Read(p) 369 } 370 371 func (f *enospcFile) ReadAt(p []byte, off int64) (n int, err error) { 372 return f.inner.ReadAt(p, off) 373 } 374 375 func (f *enospcFile) Write(p []byte) (n int, err error) { 376 gen := f.fs.waitUntilReady() 377 378 n, err = f.inner.Write(p) 379 380 if err != nil && isENOSPC(err) { 381 f.fs.handleENOSPC(gen) 382 var n2 int 383 n2, err = f.inner.Write(p[n:]) 384 n += n2 385 } 386 return n, err 387 } 388 389 func (f *enospcFile) WriteAt(p []byte, ofs int64) (n int, err error) { 390 gen := f.fs.waitUntilReady() 391 392 n, err = f.inner.WriteAt(p, ofs) 393 394 if err != nil && isENOSPC(err) { 395 f.fs.handleENOSPC(gen) 396 var n2 int 397 n2, err = f.inner.WriteAt(p[n:], ofs+int64(n)) 398 n += n2 399 } 400 return n, err 401 } 402 403 func (f *enospcFile) Prefetch(offset, length int64) error { 404 return f.inner.Prefetch(offset, length) 405 } 406 407 func (f *enospcFile) Preallocate(offset, length int64) error { 408 return f.inner.Preallocate(offset, length) 409 } 410 411 func (f *enospcFile) Stat() (os.FileInfo, error) { 412 return f.inner.Stat() 413 } 414 415 func (f *enospcFile) Sync() error { 416 gen := f.fs.waitUntilReady() 417 418 err := f.inner.Sync() 419 420 if err != nil && isENOSPC(err) { 421 f.fs.handleENOSPC(gen) 422 423 // NB: It is NOT safe to retry the Sync. See the PostgreSQL 424 // 'fsyncgate' discussion. A successful Sync after a failed one does 425 // not provide any guarantees and (always?) loses the unsynced writes. 426 // We need to bubble the error up and hope we weren't syncing a WAL or 427 // MANIFEST, because we'll have no choice but to crash. Errors while 428 // syncing an sstable will result in a failed flush/compaction, and 429 // the relevant sstable(s) will be marked as obsolete and deleted. 430 // See: https://lwn.net/Articles/752063/ 431 } 432 return err 433 } 434 435 func (f *enospcFile) SyncData() error { 436 return f.inner.SyncData() 437 } 438 439 func (f *enospcFile) SyncTo(length int64) (fullSync bool, err error) { 440 return f.inner.SyncTo(length) 441 } 442 443 func (f *enospcFile) Fd() uintptr { 444 return f.inner.Fd() 445 } 446 447 var _ FS = (*enospcFS)(nil) 448 449 func isENOSPC(err error) bool { 450 err = errors.UnwrapAll(err) 451 e, ok := err.(syscall.Errno) 452 return ok && e == syscall.ENOSPC 453 }