github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/vfs/disk_health.go

// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package vfs

import (
	"io"
	"os"
	"sync"
	"sync/atomic"
	"time"
)

const (
	// defaultTickInterval is the default interval between consecutive ticks
	// of each diskHealthCheckingFile's monitoring loop.
	defaultTickInterval = 2 * time.Second
	// preallocatedSlotCount is the default number of slots available for
	// concurrent filesystem operations. The slot count may be exceeded, but
	// each additional slot will incur an additional allocation. We choose 16
	// here with the expectation that it is significantly more than required in
	// practice. See the comment above the diskHealthCheckingFS type definition.
	preallocatedSlotCount = 16
)

// diskHealthCheckingFile is a File wrapper that detects slow disk operations
// and calls onSlowDisk when a disk operation is observed to exceed
// diskSlowThreshold.
//
// This struct creates a goroutine (in startTicker()) that, at every tick
// interval, checks whether there is a disk operation that has been running
// longer than the specified duration. This setup is preferable to creating a
// new timer at every disk operation, as it reduces overhead per disk
// operation.
type diskHealthCheckingFile struct {
	File

	onSlowDisk        func(time.Duration)
	diskSlowThreshold time.Duration
	tickInterval      time.Duration

	stopper        chan struct{}
	lastWriteNanos int64
}

// newDiskHealthCheckingFile instantiates a new diskHealthCheckingFile with the
// specified time threshold and event listener.
func newDiskHealthCheckingFile(
	file File, diskSlowThreshold time.Duration, onSlowDisk func(time.Duration),
) *diskHealthCheckingFile {
	return &diskHealthCheckingFile{
		File:              file,
		onSlowDisk:        onSlowDisk,
		diskSlowThreshold: diskSlowThreshold,
		tickInterval:      defaultTickInterval,

		stopper: make(chan struct{}),
	}
}
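
// The sketch below is illustrative only and is not part of the package API.
// It shows the intended lifecycle of the wrapper described above: construct
// it around an existing File, start the monitoring goroutine, perform writes
// and syncs through the wrapper, and let Close stop the goroutine. The file,
// callback, and payload are assumptions supplied by the caller.
func exampleWrapFileWithHealthChecks(file File, onSlow func(time.Duration)) error {
	d := newDiskHealthCheckingFile(file, 100*time.Millisecond, onSlow)
	d.startTicker()
	// Writes and Syncs issued through d are timed by timeDiskOp; if one is
	// still in flight after the 100ms threshold, onSlow is invoked with the
	// elapsed duration.
	if _, err := d.Write([]byte("payload")); err != nil {
		return err
	}
	if err := d.Sync(); err != nil {
		return err
	}
	// Close stops the ticker goroutine and closes the underlying file.
	return d.Close()
}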

// startTicker starts a new goroutine with a ticker to monitor disk operations.
// Can only be called if the ticker goroutine isn't running already.
func (d *diskHealthCheckingFile) startTicker() {
	if d.diskSlowThreshold == 0 {
		return
	}

	go func() {
		ticker := time.NewTicker(d.tickInterval)
		defer ticker.Stop()

		for {
			select {
			case <-d.stopper:
				return

			case <-ticker.C:
				lastWriteNanos := atomic.LoadInt64(&d.lastWriteNanos)
				if lastWriteNanos == 0 {
					continue
				}
				lastWrite := time.Unix(0, lastWriteNanos)
				now := time.Now()
				if lastWrite.Add(d.diskSlowThreshold).Before(now) {
					// diskSlowThreshold was exceeded. Call the passed-in
					// listener.
					d.onSlowDisk(now.Sub(lastWrite))
				}
			}
		}
	}()
}

// stopTicker stops the goroutine started in startTicker.
func (d *diskHealthCheckingFile) stopTicker() {
	close(d.stopper)
}

// Write implements the io.Writer interface.
func (d *diskHealthCheckingFile) Write(p []byte) (n int, err error) {
	d.timeDiskOp(func() {
		n, err = d.File.Write(p)
	})
	return n, err
}

// Close implements the io.Closer interface.
func (d *diskHealthCheckingFile) Close() error {
	d.stopTicker()
	return d.File.Close()
}

// Sync implements the File interface.
func (d *diskHealthCheckingFile) Sync() (err error) {
	d.timeDiskOp(func() {
		err = d.File.Sync()
	})
	return err
}

// timeDiskOp runs the specified closure and makes its timing visible to the
// monitoring goroutine, in case it exceeds the slow-disk threshold.
func (d *diskHealthCheckingFile) timeDiskOp(op func()) {
	if d == nil {
		op()
		return
	}

	atomic.StoreInt64(&d.lastWriteNanos, time.Now().UnixNano())
	defer func() {
		atomic.StoreInt64(&d.lastWriteNanos, 0)
	}()
	op()
}
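
// The sketch below is illustrative only and is not part of the package API.
// It shows how the ticker goroutine detects a stalled operation: the
// threshold and tick interval are shortened so that a deliberately slow
// closure is reported. The slowOp closure and wrapped file are assumptions
// supplied by the caller; the function returns the last reported stall
// duration (zero if no stall was observed).
func exampleDetectSlowOp(file File, slowOp func()) time.Duration {
	var mu sync.Mutex
	var reported time.Duration
	d := newDiskHealthCheckingFile(file, 10*time.Millisecond, func(dur time.Duration) {
		mu.Lock()
		defer mu.Unlock()
		reported = dur
	})
	// Tick more often than the default so a stall is noticed promptly.
	d.tickInterval = 5 * time.Millisecond
	d.startTicker()
	defer d.stopTicker()

	// timeDiskOp publishes the operation's start time; on every tick the
	// monitoring goroutine compares it against the threshold and invokes the
	// listener if the operation is still running past the threshold.
	d.timeDiskOp(slowOp)

	mu.Lock()
	defer mu.Unlock()
	return reported
}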

// diskHealthCheckingFS adds disk-health checking facilities to a VFS.
// It times disk write operations in two ways:
//
// 1. Wrapping vfs.Files.
//
// The bulk of write I/O activity is file writing and syncing, invoked through
// the `vfs.File` interface. This VFS wraps all files open for writing with a
// special diskHealthCheckingFile implementation of the vfs.File interface. See
// above for the implementation.
//
// 2. Monitoring filesystem metadata operations.
//
// Filesystem metadata operations (create, link, remove, rename, etc) are also
// sources of disk writes. Unlike a vfs.File which requires Write and Sync calls
// to be sequential, a vfs.FS may receive these filesystem metadata operations
// in parallel. To accommodate this parallelism, the diskHealthCheckingFS's
// write-oriented filesystem operations record their start times into a 'slot'
// on the filesystem. A single long-running goroutine periodically scans the
// slots looking for slow operations.
//
// The number of slots on a diskHealthCheckingFS grows to a working set of the
// maximum concurrent filesystem operations. This is expected to be very few
// for these reasons:
//  1. Pebble has limited write concurrency. Flushes, compactions and WAL
//     rotations are the primary sources of filesystem metadata operations. With
//     the default max-compaction concurrency, these operations require at most 5
//     concurrent slots if all 5 perform a filesystem metadata operation
//     simultaneously.
//  2. Pebble's limited concurrent I/O writers spend most of their time
//     performing file I/O, not performing the filesystem metadata operations that
//     require recording a slot on the diskHealthCheckingFS.
//  3. In CockroachDB, each additional store/Pebble instance has its own vfs.FS
//     which provides a separate goroutine and set of slots.
//  4. In CockroachDB, many of the additional sources of filesystem metadata
//     operations (like encryption-at-rest) are sequential with respect to Pebble's
//     threads.
type diskHealthCheckingFS struct {
	tickInterval      time.Duration
	diskSlowThreshold time.Duration
	onSlowDisk        func(string, time.Duration)
	fs                FS
	mu                struct {
		sync.Mutex
		tickerRunning bool
		stopper       chan struct{}
		inflight      []*slot
	}
	// prealloc preallocates the memory for mu.inflight slots and the slice
	// itself. The contained fields are not accessed directly except by
	// WithDiskHealthChecks when initializing mu.inflight. The number of slots
	// in d.mu.inflight will grow to the maximum number of concurrent file
	// metadata operations (create, remove, link, etc). If the number of
	// concurrent operations never exceeds preallocatedSlotCount, we'll never
	// incur an additional allocation.
	prealloc struct {
		slots        [preallocatedSlotCount]slot
		slotPtrSlice [preallocatedSlotCount]*slot
	}
}

type slot struct {
	name       string
	startNanos int64
}

// diskHealthCheckingFS implements FS.
var _ FS = (*diskHealthCheckingFS)(nil)

// WithDiskHealthChecks wraps an FS and ensures that all write-oriented
// operations on the FS are wrapped with disk health detection checks. Disk
// operations that are observed to take longer than diskSlowThreshold trigger an
// onSlowDisk call.
//
// A threshold of zero disables disk-health checking.
func WithDiskHealthChecks(
	innerFS FS, diskSlowThreshold time.Duration, onSlowDisk func(string, time.Duration),
) (FS, io.Closer) {
	if diskSlowThreshold == 0 {
		return innerFS, noopCloser{}
	}

	fs := &diskHealthCheckingFS{
		fs:                innerFS,
		tickInterval:      defaultTickInterval,
		diskSlowThreshold: diskSlowThreshold,
		onSlowDisk:        onSlowDisk,
	}
	fs.mu.stopper = make(chan struct{})
	// The fs holds preallocated slots and a preallocated array of slot pointers
	// with equal length. Initialize the inflight slice to use a slice backed by
	// the preallocated array with each slot initialized to a preallocated slot.
	fs.mu.inflight = fs.prealloc.slotPtrSlice[:]
	for i := range fs.mu.inflight {
		fs.mu.inflight[i] = &fs.prealloc.slots[i]
	}
	return fs, fs
}
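
// The sketch below is illustrative only and is not part of the package API.
// It shows how a caller might wire a health-checking FS: every write-oriented
// filesystem operation and every file opened for writing is then monitored,
// and stalls beyond the threshold are reported with the affected path. The
// callback body and file name are assumptions; the returned io.Closer stops
// the monitoring goroutine.
func exampleWithDiskHealthChecks(innerFS FS, report func(string, time.Duration)) error {
	fs, closer := WithDiskHealthChecks(innerFS, 100*time.Millisecond, report)
	defer closer.Close()

	// Create goes through timeFilesystemOp, and the returned File is wrapped
	// with a diskHealthCheckingFile so slow Writes/Syncs are reported too.
	f, err := fs.Create("example-wal")
	if err != nil {
		return err
	}
	if _, err := f.Write([]byte("record")); err != nil {
		return err
	}
	if err := f.Sync(); err != nil {
		return err
	}
	return f.Close()
}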

func (d *diskHealthCheckingFS) timeFilesystemOp(name string, op func()) {
	if d == nil {
		op()
		return
	}

	// Record this operation's start time on the FS, so that the long-running
	// goroutine can monitor the filesystem operation.
	//
	// The diskHealthCheckingFile implementation uses a single field that is
	// atomically updated, taking advantage of the fact that writes to a single
	// vfs.File handle are not performed in parallel. The vfs.FS however may
	// receive write filesystem operations in parallel. To accommodate this
	// parallelism, each writing goroutine records its start time in a slot of
	// a mutex-protected slice, appending a new slot only if none is free. On
	// ticks, the long-running goroutine scans the slice searching for start
	// times older than the slow-disk threshold. When a writing goroutine
	// completes its operation, it atomically zeroes its slot's start time to
	// signal completion.
	var s *slot
	func() {
		d.mu.Lock()
		defer d.mu.Unlock()

		// If there's no long-running goroutine to monitor this filesystem
		// operation, start one.
		if !d.mu.tickerRunning {
			d.startTickerLocked()
		}

		startNanos := time.Now().UnixNano()
		for i := 0; i < len(d.mu.inflight); i++ {
			if atomic.LoadInt64(&d.mu.inflight[i].startNanos) == 0 {
				// This slot is not in use. Claim it.
				s = d.mu.inflight[i]
				s.name = name
				atomic.StoreInt64(&s.startNanos, startNanos)
				break
			}
		}
		// If we didn't find any unused slots, create a new slot and append it.
		// This slot will exist forever. The number of slots will grow to the
		// maximum number of concurrent filesystem operations over the lifetime
		// of the process. Only operations that grow the number of slots must
		// incur an allocation.
		if s == nil {
			s = &slot{
				name:       name,
				startNanos: startNanos,
			}
			d.mu.inflight = append(d.mu.inflight, s)
		}
	}()

	op()

	// Signal completion by zeroing the start time.
	atomic.StoreInt64(&s.startNanos, 0)
}
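
// The sketch below is illustrative only and is not part of the package API.
// It shows the parallelism the slot mechanism is designed for: several
// goroutines issuing write-oriented metadata operations at once, each
// occupying its own inflight slot for the duration of its call. The paths are
// assumptions; fs is expected to be an FS returned by WithDiskHealthChecks.
func exampleConcurrentMetadataOps(fs FS, paths []string) {
	var wg sync.WaitGroup
	for _, path := range paths {
		wg.Add(1)
		go func(path string) {
			defer wg.Done()
			// Each Remove runs under timeFilesystemOp: it claims (or appends)
			// a slot, performs the operation, and zeroes the slot on return.
			_ = fs.Remove(path)
		}(path)
	}
	wg.Wait()
}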

// startTickerLocked starts a new goroutine with a ticker to monitor filesystem
// operations for slowness. Requires that d.mu is held and that the ticker
// goroutine is not already running (!d.mu.tickerRunning).
func (d *diskHealthCheckingFS) startTickerLocked() {
	d.mu.tickerRunning = true
	stopper := d.mu.stopper
	go func() {
		ticker := time.NewTicker(d.tickInterval)
		defer ticker.Stop()

		for {
			select {
			case <-ticker.C:
				// Scan the inflight slots for any slots recording a start
				// time older than the diskSlowThreshold.
				d.mu.Lock()
				now := time.Now()
				for i := range d.mu.inflight {
					nanos := atomic.LoadInt64(&d.mu.inflight[i].startNanos)
					if nanos != 0 && time.Unix(0, nanos).Add(d.diskSlowThreshold).Before(now) {
						// diskSlowThreshold was exceeded. Invoke the provided
						// callback.
						d.onSlowDisk(d.mu.inflight[i].name, now.Sub(time.Unix(0, nanos)))
					}
				}
				d.mu.Unlock()
			case <-stopper:
				return
			}
		}
	}()
}

// Close implements io.Closer. Close stops the long-running goroutine that
// monitors for slow filesystem metadata operations. Close may be called
// multiple times. If the filesystem is used after Close has been called, a new
// long-running goroutine will be created.
func (d *diskHealthCheckingFS) Close() error {
	d.mu.Lock()
	if !d.mu.tickerRunning {
		// Nothing to stop.
		d.mu.Unlock()
		return nil
	}

	// Grab the stopper so we can request the long-running goroutine to stop.
	// Replace the stopper in case this FS is reused. It's possible to Close and
	// reuse a disk-health checking FS. This is to accommodate the on-by-default
	// behavior in Pebble, and the possibility that users may continue to use
	// the Pebble default FS beyond the lifetime of a single DB.
	stopper := d.mu.stopper
	d.mu.stopper = make(chan struct{})
	d.mu.tickerRunning = false
	d.mu.Unlock()

	// Ask the long-running goroutine to stop. This is a synchronous channel
	// send.
	stopper <- struct{}{}
	close(stopper)
	return nil
}
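
// The sketch below is illustrative only and is not part of the package API.
// It demonstrates the Close-and-reuse behavior described above: Close stops
// the monitoring goroutine, and a later operation on the same FS lazily starts
// a new one. The path is an assumption; fs is expected to be an FS returned by
// WithDiskHealthChecks, with closer its paired io.Closer.
func exampleCloseAndReuse(fs FS, closer io.Closer, path string) error {
	// Stop the monitoring goroutine (a no-op if it was never started).
	if err := closer.Close(); err != nil {
		return err
	}
	// Using the FS again restarts monitoring: timeFilesystemOp sees that no
	// ticker goroutine is running and starts one via startTickerLocked.
	return fs.Remove(path)
}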

// Create implements the FS interface.
func (d *diskHealthCheckingFS) Create(name string) (File, error) {
	var f File
	var err error
	d.timeFilesystemOp(name, func() {
		f, err = d.fs.Create(name)
	})
	if err != nil {
		return f, err
	}
	if d.diskSlowThreshold == 0 {
		return f, nil
	}
	checkingFile := newDiskHealthCheckingFile(f, d.diskSlowThreshold, func(duration time.Duration) {
		d.onSlowDisk(name, duration)
	})
	checkingFile.startTicker()
	return WithFd(f, checkingFile), nil
}

// GetDiskUsage implements the FS interface.
func (d *diskHealthCheckingFS) GetDiskUsage(path string) (DiskUsage, error) {
	return d.fs.GetDiskUsage(path)
}

// Link implements the FS interface.
func (d *diskHealthCheckingFS) Link(oldname, newname string) error {
	var err error
	d.timeFilesystemOp(newname, func() {
		err = d.fs.Link(oldname, newname)
	})
	return err
}

// List implements the FS interface.
func (d *diskHealthCheckingFS) List(dir string) ([]string, error) {
	return d.fs.List(dir)
}

// Lock implements the FS interface.
func (d *diskHealthCheckingFS) Lock(name string) (io.Closer, error) {
	return d.fs.Lock(name)
}

// MkdirAll implements the FS interface.
func (d *diskHealthCheckingFS) MkdirAll(dir string, perm os.FileMode) error {
	var err error
	d.timeFilesystemOp(dir, func() {
		err = d.fs.MkdirAll(dir, perm)
	})
	return err
}

// Open implements the FS interface.
func (d *diskHealthCheckingFS) Open(name string, opts ...OpenOption) (File, error) {
	return d.fs.Open(name, opts...)
}

// OpenDir implements the FS interface.
func (d *diskHealthCheckingFS) OpenDir(name string) (File, error) {
	f, err := d.fs.OpenDir(name)
	if err != nil {
		return f, err
	}
	// Directories opened with OpenDir must be opened with health checking,
	// because they may be explicitly synced.
	checkingFile := newDiskHealthCheckingFile(f, d.diskSlowThreshold, func(duration time.Duration) {
		d.onSlowDisk(name, duration)
	})
	checkingFile.startTicker()
	return WithFd(f, checkingFile), nil
}
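
// The sketch below is illustrative only and is not part of the package API.
// It shows why directory handles are health-checked: after creating a file, a
// caller may open and sync the parent directory to make the new directory
// entry durable, and that Sync goes through the same slow-disk detection as
// file writes. The directory and file names are assumptions.
func exampleSyncParentDir(fs FS, dir, filename string) error {
	f, err := fs.Create(fs.PathJoin(dir, filename))
	if err != nil {
		return err
	}
	if err := f.Close(); err != nil {
		return err
	}
	d, err := fs.OpenDir(dir)
	if err != nil {
		return err
	}
	// Sync on the health-checked directory handle is timed; a stall longer
	// than the threshold is reported via onSlowDisk with the directory name.
	if err := d.Sync(); err != nil {
		return err
	}
	return d.Close()
}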

// PathBase implements the FS interface.
func (d *diskHealthCheckingFS) PathBase(path string) string {
	return d.fs.PathBase(path)
}

// PathJoin implements the FS interface.
func (d *diskHealthCheckingFS) PathJoin(elem ...string) string {
	return d.fs.PathJoin(elem...)
}

// PathDir implements the FS interface.
func (d *diskHealthCheckingFS) PathDir(path string) string {
	return d.fs.PathDir(path)
}

// Remove implements the FS interface.
func (d *diskHealthCheckingFS) Remove(name string) error {
	var err error
	d.timeFilesystemOp(name, func() {
		err = d.fs.Remove(name)
	})
	return err
}

// RemoveAll implements the FS interface.
func (d *diskHealthCheckingFS) RemoveAll(name string) error {
	var err error
	d.timeFilesystemOp(name, func() {
		err = d.fs.RemoveAll(name)
	})
	return err
}

// Rename implements the FS interface.
func (d *diskHealthCheckingFS) Rename(oldname, newname string) error {
	var err error
	d.timeFilesystemOp(newname, func() {
		err = d.fs.Rename(oldname, newname)
	})
	return err
}

// ReuseForWrite implements the FS interface.
func (d *diskHealthCheckingFS) ReuseForWrite(oldname, newname string) (File, error) {
	var f File
	var err error
	d.timeFilesystemOp(newname, func() {
		f, err = d.fs.ReuseForWrite(oldname, newname)
	})
	if err != nil {
		return f, err
	}
	if d.diskSlowThreshold == 0 {
		return f, nil
	}
	checkingFile := newDiskHealthCheckingFile(f, d.diskSlowThreshold, func(duration time.Duration) {
		d.onSlowDisk(newname, duration)
	})
	checkingFile.startTicker()
	return WithFd(f, checkingFile), nil
}

// Stat implements the FS interface.
func (d *diskHealthCheckingFS) Stat(name string) (os.FileInfo, error) {
	return d.fs.Stat(name)
}

type noopCloser struct{}

func (noopCloser) Close() error { return nil }