github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/vfs/disk_full.go (about)

     1  // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package vfs
     6  
     7  import (
     8  	"io"
     9  	"os"
    10  	"sync"
    11  	"sync/atomic"
    12  	"syscall"
    13  
    14  	"github.com/cockroachdb/errors"
    15  )
    16  
    17  // OnDiskFull wraps the provided FS with an FS that examines returned errors,
    18  // looking for ENOSPC errors. It invokes the provided callback when the
    19  // underlying filesystem returns an error signifying the storage is out of
    20  // disk space.
    21  //
    22  // All new writes to the filesystem are blocked while the callback executes,
    23  // so care must be taken to avoid expensive work from within the callback.
    24  //
    25  // Once the callback completes, any write-oriented operations that encountered
    26  // ENOSPC are retried exactly once. Once the callback completes, it will not
    27  // be invoked again until a new operation that began after the callback
    28  // returned encounters an ENOSPC error.
    29  //
    30  // OnDiskFull may be used to automatically manage a ballast file, which is
    31  // removed from the filesystem from within the callback. Note that if managing
    32  // a ballast, the caller should maintain a reference to the inner FS and
    33  // remove the ballast on the unwrapped FS.
    34  func OnDiskFull(fs FS, fn func()) FS {
    35  	newFS := &enospcFS{inner: fs}
    36  	newFS.mu.Cond.L = &newFS.mu.Mutex
    37  	newFS.mu.onDiskFull = fn
    38  	return newFS
    39  }
    40  
// enospcFS is the FS implementation constructed by OnDiskFull. It forwards
// operations to inner and coordinates the handling of ENOSPC errors through
// generation and mu.
type enospcFS struct {
	// inner is the wrapped FS to which operations are forwarded.
	inner FS
	// generation is a monotonically increasing number that encodes the
	// current state of ENOSPC error handling. Incoming writes are
	// organized into generations to provide strong guarantees on when the
	// disk full callback is invoked. The callback is invoked once per
	// write generation.
	//
	// Special significance is given to the parity of this generation
	// field to optimize incoming writes in the normal state, which only
	// need to perform a single atomic load. If generation is odd, an
	// ENOSPC error is being actively handled. The generations associated
	// with writes are always even.
	//
	// The lifecycle of a write is:
	//
	// 1. Atomically load the current generation.
	//    a. If it's even, this is the write's generation number.
	//    b. If it's odd, an ENOSPC was recently encountered and the
	//       corresponding invocation of the disk full callback has not
	//       yet completed. The write must wait until the callback has
	//       completed and generation is updated to an even number, which
	//       becomes the write's generation number.
	// 2. Perform the write. If it encounters no error or an error other
	//    than ENOSPC, the write returns and proceeds no further in this
	//    lifecycle.
	// 3. Handle ENOSPC. If the write encounters ENOSPC, the callback must
	//    be invoked for the write's generation. The write's goroutine
	//    acquires the FS's mutex.
	//    a. If the FS's current generation is still equal to the write's
	//       generation, the write is the first write of its generation to
	//       encounter ENOSPC. It increments the FS's current generation
	//       to an odd number, signifying that an ENOSPC is being handled
	//       and invokes the callback.
	//    b. If the FS's current generation has changed, some other write
	//       from the same generation encountered an ENOSPC first. This
	//       write waits on the condition variable until the FS's current
	//       generation is updated indicating that the generation's
	//       callback invocation has completed.
	// 4. Retry the write once. The callback for the write's generation
	//    has completed, either by this write's goroutine or another's.
	//    The write may proceed with the expectation that the callback
	//    remedied the full disk by freeing up disk space and an ENOSPC
	//    should not be encountered again for at least a few minutes. If
	//    we do encounter another ENOSPC on the retry, the callback was
	//    unable to remedy the full disk and another retry won't be
	//    useful. Any error, including ENOSPC, during the retry is
	//    returned without further handling.  None of the retries invoke
	//    the callback.
	//
	// This scheme has a few nice properties:
	// * Once the disk-full callback completes, it won't be invoked
	//   again unless a write that started strictly later encounters an
	//   ENOSPC. This is convenient if the callback strives to 'fix' the
	//   full disk, for example, by removing a ballast file. A new
	//   invocation of the callback guarantees a new problem.
	// * Incoming writes block if there's an unhandled ENOSPC. Some
	//   writes, like WAL or MANIFEST fsyncs, are fatal if they encounter
	//   an ENOSPC.
	generation atomic.Uint32
	// mu groups the mutex, the condition variable that goroutines wait on
	// for generation changes, and the disk-full callback. OnDiskFull sets
	// Cond.L to point at the embedded Mutex.
	mu         struct {
		sync.Mutex
		sync.Cond
		// onDiskFull is the user-supplied callback invoked by
		// handleENOSPC.
		onDiskFull func()
	}
}
   107  
   108  // Unwrap returns the underlying FS. This may be called by vfs.Root to access
   109  // the underlying filesystem.
   110  func (fs *enospcFS) Unwrap() FS {
   111  	return fs.inner
   112  }
   113  
   114  // waitUntilReady is called before every FS or File operation that
   115  // might return ENOSPC. If an ENOSPC was encountered and the corresponding
   116  // invocation of the `onDiskFull` callback has not yet returned,
   117  // waitUntilReady blocks until the callback returns. The returned generation
   118  // is always even.
   119  func (fs *enospcFS) waitUntilReady() uint32 {
   120  	gen := fs.generation.Load()
   121  	if gen%2 == 0 {
   122  		// An even generation indicates that we're not currently handling an
   123  		// ENOSPC. Allow the write to proceed.
   124  		return gen
   125  	}
   126  
   127  	// We're currently handling an ENOSPC error. Wait on the condition
   128  	// variable until we're not handling an ENOSPC.
   129  	fs.mu.Lock()
   130  	defer fs.mu.Unlock()
   131  
   132  	// Load the generation again with fs.mu locked.
   133  	gen = fs.generation.Load()
   134  	for gen%2 == 1 {
   135  		fs.mu.Wait()
   136  		gen = fs.generation.Load()
   137  	}
   138  	return gen
   139  }
   140  
// handleENOSPC is called by an operation that obtained generation gen from
// waitUntilReady and then failed with ENOSPC. The first caller for a given
// generation invokes the onDiskFull callback; any other caller from the same
// generation waits until that callback invocation has completed.
func (fs *enospcFS) handleENOSPC(gen uint32) {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	currentGeneration := fs.generation.Load()

	// If the current generation is still `gen`, this is the first goroutine
	// to hit an ENOSPC within this write generation, so this goroutine is
	// responsible for invoking the callback.
	if currentGeneration == gen {
		// Increment the generation to an odd number, indicating that the FS
		// is out-of-disk space and incoming writes should pause and wait for
		// the next generation before continuing.
		fs.generation.Store(gen + 1)

		func() {
			// Drop the mutex while we invoke the callback, re-acquiring
			// afterwards.
			fs.mu.Unlock()
			defer fs.mu.Lock()
			fs.mu.onDiskFull()
		}()

		// Update the current generation again to an even number, indicating
		// that the callback has completed for the write generation `gen`.
		// Wake every goroutine waiting on the condition variable.
		fs.generation.Store(gen + 2)
		fs.mu.Broadcast()
		return
	}

	// The current generation has already been incremented, so either the
	// callback is currently being run by another goroutine or it's already
	// completed. Wait for it to complete if it hasn't already.
	//
	// The current generation may be updated multiple times, including to an
	// odd number signifying a later write generation has already encountered
	// ENOSPC. In that case, the callback was not able to remedy the full disk
	// and waiting is unlikely to be helpful.  Continuing to wait risks
	// blocking an unbounded number of generations.  Retrying and bubbling the
	// ENOSPC up might be helpful if we can abort a large compaction that
	// started before we became more selective about compaction picking, so
	// this loop only waits for this write generation's callback and no
	// subsequent generations' callbacks.
	for currentGeneration == gen+1 {
		fs.mu.Wait()
		currentGeneration = fs.generation.Load()
	}
}
   189  
   190  func (fs *enospcFS) Create(name string) (File, error) {
   191  	gen := fs.waitUntilReady()
   192  
   193  	f, err := fs.inner.Create(name)
   194  
   195  	if err != nil && isENOSPC(err) {
   196  		fs.handleENOSPC(gen)
   197  		f, err = fs.inner.Create(name)
   198  	}
   199  	if f != nil {
   200  		f = &enospcFile{
   201  			fs:    fs,
   202  			inner: f,
   203  		}
   204  	}
   205  	return f, err
   206  }
   207  
   208  func (fs *enospcFS) Link(oldname, newname string) error {
   209  	gen := fs.waitUntilReady()
   210  
   211  	err := fs.inner.Link(oldname, newname)
   212  
   213  	if err != nil && isENOSPC(err) {
   214  		fs.handleENOSPC(gen)
   215  		err = fs.inner.Link(oldname, newname)
   216  	}
   217  	return err
   218  }
   219  
   220  func (fs *enospcFS) Open(name string, opts ...OpenOption) (File, error) {
   221  	f, err := fs.inner.Open(name, opts...)
   222  	if f != nil {
   223  		f = &enospcFile{
   224  			fs:    fs,
   225  			inner: f,
   226  		}
   227  	}
   228  	return f, err
   229  }
   230  
   231  func (fs *enospcFS) OpenReadWrite(name string, opts ...OpenOption) (File, error) {
   232  	f, err := fs.inner.OpenReadWrite(name, opts...)
   233  	if f != nil {
   234  		f = &enospcFile{
   235  			fs:    fs,
   236  			inner: f,
   237  		}
   238  	}
   239  	return f, err
   240  }
   241  
   242  func (fs *enospcFS) OpenDir(name string) (File, error) {
   243  	f, err := fs.inner.OpenDir(name)
   244  	if f != nil {
   245  		f = &enospcFile{
   246  			fs:    fs,
   247  			inner: f,
   248  		}
   249  	}
   250  	return f, err
   251  }
   252  
   253  func (fs *enospcFS) Remove(name string) error {
   254  	gen := fs.waitUntilReady()
   255  
   256  	err := fs.inner.Remove(name)
   257  
   258  	if err != nil && isENOSPC(err) {
   259  		fs.handleENOSPC(gen)
   260  		err = fs.inner.Remove(name)
   261  	}
   262  	return err
   263  }
   264  
   265  func (fs *enospcFS) RemoveAll(name string) error {
   266  	gen := fs.waitUntilReady()
   267  
   268  	err := fs.inner.RemoveAll(name)
   269  
   270  	if err != nil && isENOSPC(err) {
   271  		fs.handleENOSPC(gen)
   272  		err = fs.inner.RemoveAll(name)
   273  	}
   274  	return err
   275  }
   276  
   277  func (fs *enospcFS) Rename(oldname, newname string) error {
   278  	gen := fs.waitUntilReady()
   279  
   280  	err := fs.inner.Rename(oldname, newname)
   281  
   282  	if err != nil && isENOSPC(err) {
   283  		fs.handleENOSPC(gen)
   284  		err = fs.inner.Rename(oldname, newname)
   285  	}
   286  	return err
   287  }
   288  
   289  func (fs *enospcFS) ReuseForWrite(oldname, newname string) (File, error) {
   290  	gen := fs.waitUntilReady()
   291  
   292  	f, err := fs.inner.ReuseForWrite(oldname, newname)
   293  
   294  	if err != nil && isENOSPC(err) {
   295  		fs.handleENOSPC(gen)
   296  		f, err = fs.inner.ReuseForWrite(oldname, newname)
   297  	}
   298  
   299  	if f != nil {
   300  		f = &enospcFile{
   301  			fs:    fs,
   302  			inner: f,
   303  		}
   304  	}
   305  	return f, err
   306  }
   307  
   308  func (fs *enospcFS) MkdirAll(dir string, perm os.FileMode) error {
   309  	gen := fs.waitUntilReady()
   310  
   311  	err := fs.inner.MkdirAll(dir, perm)
   312  
   313  	if err != nil && isENOSPC(err) {
   314  		fs.handleENOSPC(gen)
   315  		err = fs.inner.MkdirAll(dir, perm)
   316  	}
   317  	return err
   318  }
   319  
   320  func (fs *enospcFS) Lock(name string) (io.Closer, error) {
   321  	gen := fs.waitUntilReady()
   322  
   323  	closer, err := fs.inner.Lock(name)
   324  
   325  	if err != nil && isENOSPC(err) {
   326  		fs.handleENOSPC(gen)
   327  		closer, err = fs.inner.Lock(name)
   328  	}
   329  	return closer, err
   330  }
   331  
   332  func (fs *enospcFS) List(dir string) ([]string, error) {
   333  	return fs.inner.List(dir)
   334  }
   335  
   336  func (fs *enospcFS) Stat(name string) (os.FileInfo, error) {
   337  	return fs.inner.Stat(name)
   338  }
   339  
   340  func (fs *enospcFS) PathBase(path string) string {
   341  	return fs.inner.PathBase(path)
   342  }
   343  
   344  func (fs *enospcFS) PathJoin(elem ...string) string {
   345  	return fs.inner.PathJoin(elem...)
   346  }
   347  
   348  func (fs *enospcFS) PathDir(path string) string {
   349  	return fs.inner.PathDir(path)
   350  }
   351  
   352  func (fs *enospcFS) GetDiskUsage(path string) (DiskUsage, error) {
   353  	return fs.inner.GetDiskUsage(path)
   354  }
   355  
// enospcFile is the File wrapper that enospcFS returns from Create, Open,
// OpenReadWrite, OpenDir and ReuseForWrite. Its Write, WriteAt and Sync
// methods go through the owning enospcFS's ENOSPC handling; the remaining
// methods forward straight to inner.
type enospcFile struct {
	// fs is the enospcFS that produced this file.
	fs    *enospcFS
	// inner is the wrapped File that operations are forwarded to.
	inner File
}
   360  
// Compile-time assertion that *enospcFile implements File.
var _ File = (*enospcFile)(nil)
   362  
   363  func (f *enospcFile) Close() error {
   364  	return f.inner.Close()
   365  }
   366  
   367  func (f *enospcFile) Read(p []byte) (n int, err error) {
   368  	return f.inner.Read(p)
   369  }
   370  
   371  func (f *enospcFile) ReadAt(p []byte, off int64) (n int, err error) {
   372  	return f.inner.ReadAt(p, off)
   373  }
   374  
   375  func (f *enospcFile) Write(p []byte) (n int, err error) {
   376  	gen := f.fs.waitUntilReady()
   377  
   378  	n, err = f.inner.Write(p)
   379  
   380  	if err != nil && isENOSPC(err) {
   381  		f.fs.handleENOSPC(gen)
   382  		var n2 int
   383  		n2, err = f.inner.Write(p[n:])
   384  		n += n2
   385  	}
   386  	return n, err
   387  }
   388  
   389  func (f *enospcFile) WriteAt(p []byte, ofs int64) (n int, err error) {
   390  	gen := f.fs.waitUntilReady()
   391  
   392  	n, err = f.inner.WriteAt(p, ofs)
   393  
   394  	if err != nil && isENOSPC(err) {
   395  		f.fs.handleENOSPC(gen)
   396  		var n2 int
   397  		n2, err = f.inner.WriteAt(p[n:], ofs+int64(n))
   398  		n += n2
   399  	}
   400  	return n, err
   401  }
   402  
   403  func (f *enospcFile) Prefetch(offset, length int64) error {
   404  	return f.inner.Prefetch(offset, length)
   405  }
   406  
   407  func (f *enospcFile) Preallocate(offset, length int64) error {
   408  	return f.inner.Preallocate(offset, length)
   409  }
   410  
   411  func (f *enospcFile) Stat() (os.FileInfo, error) {
   412  	return f.inner.Stat()
   413  }
   414  
   415  func (f *enospcFile) Sync() error {
   416  	gen := f.fs.waitUntilReady()
   417  
   418  	err := f.inner.Sync()
   419  
   420  	if err != nil && isENOSPC(err) {
   421  		f.fs.handleENOSPC(gen)
   422  
   423  		// NB: It is NOT safe to retry the Sync. See the PostgreSQL
   424  		// 'fsyncgate' discussion. A successful Sync after a failed one does
   425  		// not provide any guarantees and (always?) loses the unsynced writes.
   426  		// We need to bubble the error up and hope we weren't syncing a WAL or
   427  		// MANIFEST, because we'll have no choice but to crash. Errors while
   428  		// syncing an sstable will result in a failed flush/compaction, and
   429  		// the relevant sstable(s) will be marked as obsolete and deleted.
   430  		// See: https://lwn.net/Articles/752063/
   431  	}
   432  	return err
   433  }
   434  
   435  func (f *enospcFile) SyncData() error {
   436  	return f.inner.SyncData()
   437  }
   438  
   439  func (f *enospcFile) SyncTo(length int64) (fullSync bool, err error) {
   440  	return f.inner.SyncTo(length)
   441  }
   442  
   443  func (f *enospcFile) Fd() uintptr {
   444  	return f.inner.Fd()
   445  }
   446  
// Compile-time assertion that *enospcFS implements FS.
var _ FS = (*enospcFS)(nil)
   448  
   449  func isENOSPC(err error) bool {
   450  	err = errors.UnwrapAll(err)
   451  	e, ok := err.(syscall.Errno)
   452  	return ok && e == syscall.ENOSPC
   453  }