github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/vfs/disk_full.go (about)

     1  // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package vfs
     6  
     7  import (
     8  	"io"
     9  	"os"
    10  	"sync"
    11  	"sync/atomic"
    12  	"syscall"
    13  
    14  	"github.com/cockroachdb/errors"
    15  )
    16  
    17  // OnDiskFull wraps the provided FS with an FS that examines returned errors,
    18  // looking for ENOSPC errors. It invokes the provided callback when the
    19  // underlying filesystem returns an error signifying the storage is out of
    20  // disk space.
    21  //
    22  // All new writes to the filesystem are blocked while the callback executes,
    23  // so care must be taken to avoid expensive work from within the callback.
    24  //
    25  // Once the callback completes, any write-oriented operations that encountered
    26  // ENOSPC are retried exactly once. Once the callback completes, it will not
    27  // be invoked again until a new operation that began after the callback
    28  // returned encounters an ENOSPC error.
    29  //
    30  // OnDiskFull may be used to automatically manage a ballast file, which is
    31  // removed from the filesystem from within the callback. Note that if managing
    32  // a ballast, the caller should maintain a reference to the inner FS and
    33  // remove the ballast on the unwrapped FS.
    34  func OnDiskFull(fs FS, fn func()) FS {
    35  	newFS := &enospcFS{inner: fs}
    36  	newFS.mu.Cond.L = &newFS.mu.Mutex
    37  	newFS.mu.onDiskFull = fn
    38  	return newFS
    39  }
    40  
// enospcFS is the FS returned by OnDiskFull. It forwards all operations to
// inner, watching for ENOSPC errors and coordinating invocation of the
// disk-full callback via the generation scheme described below.
type enospcFS struct {
	inner  FS
	atomic struct {
		// generation is a monotonically increasing number that encodes the
		// current state of ENOSPC error handling. Incoming writes are
		// organized into generations to provide strong guarantees on when the
		// disk full callback is invoked. The callback is invoked once per
		// write generation.
		//
		// Special significance is given to the parity of this generation
		// field to optimize incoming writes in the normal state, which only
		// need to perform a single atomic load. If generation is odd, an
		// ENOSPC error is being actively handled. The generations associated
		// with writes are always even.
		//
		// The lifecycle of a write is:
		//
		// 1. Atomically load the current generation.
		//    a. If it's even, this is the write's generation number.
		//    b. If it's odd, an ENOSPC was recently encountered and the
		//       corresponding invocation of the disk full callback has not
		//       yet completed. The write must wait until the callback has
		//       completed and generation is updated to an even number, which
		//       becomes the write's generation number.
		// 2. Perform the write. If it encounters no error or an error other
		//    than ENOSPC, the write returns and proceeds no further in this
		//    lifecycle.
		// 3. Handle ENOSPC. If the write encounters ENOSPC, the callback must
		//    be invoked for the write's generation. The write's goroutine
		//    acquires the FS's mutex.
		//    a. If the FS's current generation is still equal to the write's
		//       generation, the write is the first write of its generation to
		//       encounter ENOSPC. It increments the FS's current generation
		//       to an odd number, signifying that an ENOSPC is being handled
		//       and invokes the callback.
		//    b. If the FS's current generation has changed, some other write
		//       from the same generation encountered an ENOSPC first. This
		//       write waits on the condition variable until the FS's current
		//       generation is updated indicating that the generation's
		//       callback invocation has completed.
		// 4. Retry the write once. The callback for the write's generation
		//    has completed, either by this write's goroutine or another's.
		//    The write may proceed with the expectation that the callback
		//    remedied the full disk by freeing up disk space and an ENOSPC
		//    should not be encountered again for at least a few minutes. If
		//    we do encounter another ENOSPC on the retry, the callback was
		//    unable to remedy the full disk and another retry won't be
		//    useful. Any error, including ENOSPC, during the retry is
		//    returned without further handling.  None of the retries invoke
		//    the callback.
		//
		// This scheme has a few nice properties:
		// * Once the disk-full callback completes, it won't be invoked
		//   again unless a write that started strictly later encounters an
		//   ENOSPC. This is convenient if the callback strives to 'fix' the
		//   full disk, for example, by removing a ballast file. A new
		//   invocation of the callback guarantees a new problem.
		// * Incoming writes block if there's an unhandled ENOSPC. Some
		//   writes, like WAL or MANIFEST fsyncs, are fatal if they encounter
		//   an ENOSPC.
		generation uint32
	}
	mu struct {
		sync.Mutex
		sync.Cond
		// onDiskFull is the user-provided callback. handleENOSPC invokes it
		// with mu released, at most once per write generation.
		onDiskFull func()
	}
}
   109  
   110  // Unwrap returns the underlying FS. This may be called by vfs.Root to access
   111  // the underlying filesystem.
   112  func (fs *enospcFS) Unwrap() FS {
   113  	return fs.inner
   114  }
   115  
   116  // waitUntilReady is called before every FS or File operation that
   117  // might return ENOSPC. If an ENOSPC was encountered and the corresponding
   118  // invocation of the `onDiskFull` callback has not yet returned,
   119  // waitUntilReady blocks until the callback returns. The returned generation
   120  // is always even.
   121  func (fs *enospcFS) waitUntilReady() uint32 {
   122  	gen := atomic.LoadUint32(&fs.atomic.generation)
   123  	if gen%2 == 0 {
   124  		// An even generation indicates that we're not currently handling an
   125  		// ENOSPC. Allow the write to proceed.
   126  		return gen
   127  	}
   128  
   129  	// We're currently handling an ENOSPC error. Wait on the condition
   130  	// variable until we're not handling an ENOSPC.
   131  	fs.mu.Lock()
   132  	defer fs.mu.Unlock()
   133  
   134  	// Load the generation again with fs.mu locked.
   135  	gen = atomic.LoadUint32(&fs.atomic.generation)
   136  	for gen%2 == 1 {
   137  		fs.mu.Wait()
   138  		gen = atomic.LoadUint32(&fs.atomic.generation)
   139  	}
   140  	return gen
   141  }
   142  
// handleENOSPC is called by a write that observed ENOSPC, passing the
// (even) generation number it obtained from waitUntilReady. It guarantees
// the disk-full callback runs exactly once for that generation: the first
// goroutine to arrive bumps the generation to an odd number, runs the
// callback with fs.mu released, then advances the generation to the next
// even number and wakes all waiters. Later arrivals from the same
// generation simply wait for that to happen.
func (fs *enospcFS) handleENOSPC(gen uint32) {
	fs.mu.Lock()
	defer fs.mu.Unlock()

	currentGeneration := atomic.LoadUint32(&fs.atomic.generation)

	// If the current generation is still `gen`, this is the first goroutine
	// to hit an ENOSPC within this write generation, so this goroutine is
	// responsible for invoking the callback.
	if currentGeneration == gen {
		// Increment the generation to an odd number, indicating that the FS
		// is out-of-disk space and incoming writes should pause and wait for
		// the next generation before continuing.
		atomic.StoreUint32(&fs.atomic.generation, gen+1)

		func() {
			// Drop the mutex while we invoke the callback, re-acquiring
			// afterwards.
			fs.mu.Unlock()
			defer fs.mu.Lock()
			fs.mu.onDiskFull()
		}()

		// Update the current generation again to an even number, indicating
		// that the callback has completed for the write generation `gen`.
		atomic.StoreUint32(&fs.atomic.generation, gen+2)
		fs.mu.Broadcast()
		return
	}

	// The current generation has already been incremented, so either the
	// callback is currently being run by another goroutine or it's already
	// completed. Wait for it to complete if it hasn't already.
	//
	// The current generation may be updated multiple times, including to an
	// odd number signifying a later write generation has already encountered
	// ENOSPC. In that case, the callback was not able to remedy the full disk
	// and waiting is unlikely to be helpful.  Continuing to wait risks
	// blocking an unbounded number of generations.  Retrying and bubbling the
	// ENOSPC up might be helpful if we can abort a large compaction that
	// started before we became more selective about compaction picking, so
	// this loop only waits for this write generation's callback and no
	// subsequent generations' callbacks.
	for currentGeneration == gen+1 {
		fs.mu.Wait()
		currentGeneration = atomic.LoadUint32(&fs.atomic.generation)
	}
}
   191  
   192  func (fs *enospcFS) Create(name string) (File, error) {
   193  	gen := fs.waitUntilReady()
   194  
   195  	f, err := fs.inner.Create(name)
   196  
   197  	if err != nil && isENOSPC(err) {
   198  		fs.handleENOSPC(gen)
   199  		f, err = fs.inner.Create(name)
   200  	}
   201  	if f != nil {
   202  		f = WithFd(f, enospcFile{
   203  			fs:    fs,
   204  			inner: f,
   205  		})
   206  	}
   207  	return f, err
   208  }
   209  
   210  func (fs *enospcFS) Link(oldname, newname string) error {
   211  	gen := fs.waitUntilReady()
   212  
   213  	err := fs.inner.Link(oldname, newname)
   214  
   215  	if err != nil && isENOSPC(err) {
   216  		fs.handleENOSPC(gen)
   217  		err = fs.inner.Link(oldname, newname)
   218  	}
   219  	return err
   220  }
   221  
   222  func (fs *enospcFS) Open(name string, opts ...OpenOption) (File, error) {
   223  	f, err := fs.inner.Open(name, opts...)
   224  	if f != nil {
   225  		f = WithFd(f, enospcFile{
   226  			fs:    fs,
   227  			inner: f,
   228  		})
   229  	}
   230  	return f, err
   231  }
   232  
   233  func (fs *enospcFS) OpenDir(name string) (File, error) {
   234  	f, err := fs.inner.OpenDir(name)
   235  	if f != nil {
   236  		f = WithFd(f, enospcFile{
   237  			fs:    fs,
   238  			inner: f,
   239  		})
   240  	}
   241  	return f, err
   242  }
   243  
   244  func (fs *enospcFS) Remove(name string) error {
   245  	gen := fs.waitUntilReady()
   246  
   247  	err := fs.inner.Remove(name)
   248  
   249  	if err != nil && isENOSPC(err) {
   250  		fs.handleENOSPC(gen)
   251  		err = fs.inner.Remove(name)
   252  	}
   253  	return err
   254  }
   255  
   256  func (fs *enospcFS) RemoveAll(name string) error {
   257  	gen := fs.waitUntilReady()
   258  
   259  	err := fs.inner.RemoveAll(name)
   260  
   261  	if err != nil && isENOSPC(err) {
   262  		fs.handleENOSPC(gen)
   263  		err = fs.inner.RemoveAll(name)
   264  	}
   265  	return err
   266  }
   267  
   268  func (fs *enospcFS) Rename(oldname, newname string) error {
   269  	gen := fs.waitUntilReady()
   270  
   271  	err := fs.inner.Rename(oldname, newname)
   272  
   273  	if err != nil && isENOSPC(err) {
   274  		fs.handleENOSPC(gen)
   275  		err = fs.inner.Rename(oldname, newname)
   276  	}
   277  	return err
   278  }
   279  
   280  func (fs *enospcFS) ReuseForWrite(oldname, newname string) (File, error) {
   281  	gen := fs.waitUntilReady()
   282  
   283  	f, err := fs.inner.ReuseForWrite(oldname, newname)
   284  
   285  	if err != nil && isENOSPC(err) {
   286  		fs.handleENOSPC(gen)
   287  		f, err = fs.inner.ReuseForWrite(oldname, newname)
   288  	}
   289  
   290  	if f != nil {
   291  		f = WithFd(f, enospcFile{
   292  			fs:    fs,
   293  			inner: f,
   294  		})
   295  	}
   296  	return f, err
   297  }
   298  
   299  func (fs *enospcFS) MkdirAll(dir string, perm os.FileMode) error {
   300  	gen := fs.waitUntilReady()
   301  
   302  	err := fs.inner.MkdirAll(dir, perm)
   303  
   304  	if err != nil && isENOSPC(err) {
   305  		fs.handleENOSPC(gen)
   306  		err = fs.inner.MkdirAll(dir, perm)
   307  	}
   308  	return err
   309  }
   310  
   311  func (fs *enospcFS) Lock(name string) (io.Closer, error) {
   312  	gen := fs.waitUntilReady()
   313  
   314  	closer, err := fs.inner.Lock(name)
   315  
   316  	if err != nil && isENOSPC(err) {
   317  		fs.handleENOSPC(gen)
   318  		closer, err = fs.inner.Lock(name)
   319  	}
   320  	return closer, err
   321  }
   322  
   323  func (fs *enospcFS) List(dir string) ([]string, error) {
   324  	return fs.inner.List(dir)
   325  }
   326  
   327  func (fs *enospcFS) Stat(name string) (os.FileInfo, error) {
   328  	return fs.inner.Stat(name)
   329  }
   330  
   331  func (fs *enospcFS) PathBase(path string) string {
   332  	return fs.inner.PathBase(path)
   333  }
   334  
   335  func (fs *enospcFS) PathJoin(elem ...string) string {
   336  	return fs.inner.PathJoin(elem...)
   337  }
   338  
   339  func (fs *enospcFS) PathDir(path string) string {
   340  	return fs.inner.PathDir(path)
   341  }
   342  
   343  func (fs *enospcFS) GetDiskUsage(path string) (DiskUsage, error) {
   344  	return fs.inner.GetDiskUsage(path)
   345  }
   346  
// enospcFile wraps a File opened through an enospcFS, routing Write and
// Sync errors through the FS's ENOSPC handling. All other operations are
// forwarded to the inner File unchanged.
type enospcFile struct {
	fs    *enospcFS
	inner File
}
   351  
   352  func (f enospcFile) Close() error {
   353  	return f.inner.Close()
   354  }
   355  
   356  func (f enospcFile) Read(p []byte) (n int, err error) {
   357  	return f.inner.Read(p)
   358  }
   359  
   360  func (f enospcFile) ReadAt(p []byte, off int64) (n int, err error) {
   361  	return f.inner.ReadAt(p, off)
   362  }
   363  
   364  func (f enospcFile) Write(p []byte) (n int, err error) {
   365  	gen := f.fs.waitUntilReady()
   366  
   367  	n, err = f.inner.Write(p)
   368  
   369  	if err != nil && isENOSPC(err) {
   370  		f.fs.handleENOSPC(gen)
   371  		var n2 int
   372  		n2, err = f.inner.Write(p[n:])
   373  		n += n2
   374  	}
   375  	return n, err
   376  }
   377  
   378  func (f enospcFile) Stat() (os.FileInfo, error) {
   379  	return f.inner.Stat()
   380  }
   381  
   382  func (f enospcFile) Sync() error {
   383  	gen := f.fs.waitUntilReady()
   384  
   385  	err := f.inner.Sync()
   386  
   387  	if err != nil && isENOSPC(err) {
   388  		f.fs.handleENOSPC(gen)
   389  
   390  		// NB: It is NOT safe to retry the Sync. See the PostgreSQL
   391  		// 'fsyncgate' discussion. A successful Sync after a failed one does
   392  		// not provide any guarantees and (always?) loses the unsynced writes.
   393  		// We need to bubble the error up and hope we weren't syncing a WAL or
   394  		// MANIFEST, because we'll have no choice but to crash. Errors while
   395  		// syncing an sstable will result in a failed flush/compaction, and
   396  		// the relevant sstable(s) will be marked as obsolete and deleted.
   397  		// See: https://lwn.net/Articles/752063/
   398  	}
   399  	return err
   400  }
   401  
// Compile-time check that *enospcFS satisfies the FS interface.
var _ FS = (*enospcFS)(nil)
   404  
   405  func isENOSPC(err error) bool {
   406  	err = errors.UnwrapAll(err)
   407  	e, ok := err.(syscall.Errno)
   408  	return ok && e == syscall.ENOSPC
   409  }