github.com/cockroachdb/pebble@v1.1.2/vfs/disk_health.go

// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package vfs

import (
	"fmt"
	"io"
	"os"
	"path/filepath"
	"sync"
	"sync/atomic"
	"time"

	"github.com/cockroachdb/redact"
)

const (
	// preallocatedSlotCount is the default number of slots available for
	// concurrent filesystem operations. The slot count may be exceeded, but
	// each additional slot will incur an additional allocation. We choose 16
	// here with the expectation that it is significantly more than required in
	// practice. See the comment above the diskHealthCheckingFS type definition.
	preallocatedSlotCount = 16
	// deltaBits is the number of bits in the packed 64-bit integer used for
	// identifying a delta from the file creation time in milliseconds.
	deltaBits = 40
	// writeSizeBits is the number of bits in the packed 64-bit integer used for
	// identifying the size of the write operation, if the operation is sized. See
	// writeSizePrecision below for precision of size.
	writeSizeBits = 20
	// Track size of writes at kilobyte precision. See comment above lastWritePacked for more.
	writeSizePrecision = 1024
)
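
// For reference, the bit budget above works out to:
//
//	2^40 ms  ≈ 34.8 years of delta before the packed counter wraps around
//	2^20 KiB = 1 GiB, the largest representable write size
//	4 bits   = 16 distinct OpType values (opTypeMax must stay at or below 16)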

// Variables to enable testing.
var (
	// defaultTickInterval is the default interval between two ticks of the
	// monitoring loop in each diskHealthCheckingFile.
	defaultTickInterval = 2 * time.Second
)

// OpType is the type of IO operation being monitored by a
// diskHealthCheckingFile.
type OpType uint8

// The following OpTypes are limited to the subset of file system operations
// that a diskHealthCheckingFile supports (namely writes and syncs).
const (
	OpTypeUnknown OpType = iota
	OpTypeWrite
	OpTypeSync
	OpTypeSyncData
	OpTypeSyncTo
	OpTypeCreate
	OpTypeLink
	OpTypeMkdirAll
	OpTypePreallocate
	OpTypeRemove
	OpTypeRemoveAll
	OpTypeRename
	OpTypeReuseForWrite
	// Note: opTypeMax is just used in tests. It must appear last in the list
	// of OpTypes.
	opTypeMax
)

// String implements fmt.Stringer.
func (o OpType) String() string {
	switch o {
	case OpTypeWrite:
		return "write"
	case OpTypeSync:
		return "sync"
	case OpTypeSyncData:
		return "syncdata"
	case OpTypeSyncTo:
		return "syncto"
	case OpTypeCreate:
		return "create"
	case OpTypeLink:
		return "link"
	case OpTypeMkdirAll:
		return "mkdirall"
	case OpTypePreallocate:
		return "preallocate"
	case OpTypeRemove:
		return "remove"
	case OpTypeRemoveAll:
		return "removeall"
	case OpTypeRename:
		return "rename"
	case OpTypeReuseForWrite:
		return "reuseforwrite"
	case OpTypeUnknown:
		return "unknown"
	default:
		panic(fmt.Sprintf("vfs: unknown op type: %d", o))
	}
}

// diskHealthCheckingFile is a File wrapper to detect slow disk operations, and
// call onSlowDisk if a disk operation is seen to exceed diskSlowThreshold.
//
// This struct creates a goroutine (in startTicker()) that, at every tick
// interval, sees if there's a disk operation taking longer than the specified
// duration. This setup is preferable to creating a new timer at every disk
// operation, as it reduces overhead per disk operation.
type diskHealthCheckingFile struct {
	file              File
	onSlowDisk        func(opType OpType, writeSizeInBytes int, duration time.Duration)
	diskSlowThreshold time.Duration
	tickInterval      time.Duration

	stopper chan struct{}
	// lastWritePacked is a 64-bit unsigned int. The most significant
	// 40 bits represent a delta (in milliseconds) from the creation
	// time of the diskHealthCheckingFile. The next most significant 20 bits
	// represent the size of the write in KBs, if the write has a size. (If
	// it doesn't, the 20 bits are zeroed). The least significant four bits
	// contain the OpType.
	//
	// The use of 40 bits for a delta provides ~34 years of effective
	// monitoring time before the uint wraps around, at millisecond precision.
	// ~34 years of process uptime "ought to be enough for anybody". Millisecond
	// precision is sufficient, given that we are monitoring for writes that take
	// longer than one millisecond.
	//
	// The use of 20 bits for the size in KBs allows representing sizes up
	// to nearly one GB. If the write is larger than that, we round down to ~one GB.
	//
	// The use of four bits for OpType allows for 16 operation types.
	//
	// NB: this packing scheme is not persisted, and is therefore safe to adjust
	// across process boundaries.
	lastWritePacked atomic.Uint64
	createTime      time.Time
}

// newDiskHealthCheckingFile instantiates a new diskHealthCheckingFile, with the
// specified time threshold and event listener.
func newDiskHealthCheckingFile(
	file File,
	diskSlowThreshold time.Duration,
	onSlowDisk func(opType OpType, writeSizeInBytes int, duration time.Duration),
) *diskHealthCheckingFile {
	return &diskHealthCheckingFile{
		file:              file,
		onSlowDisk:        onSlowDisk,
		diskSlowThreshold: diskSlowThreshold,
		tickInterval:      defaultTickInterval,

		stopper:    make(chan struct{}),
		createTime: time.Now(),
	}
}

// startTicker starts a new goroutine with a ticker to monitor disk operations.
// Can only be called if the ticker goroutine isn't running already.
func (d *diskHealthCheckingFile) startTicker() {
	if d.diskSlowThreshold == 0 {
		return
	}

	go func() {
		ticker := time.NewTicker(d.tickInterval)
		defer ticker.Stop()

		for {
			select {
			case <-d.stopper:
				return

			case <-ticker.C:
				packed := d.lastWritePacked.Load()
				if packed == 0 {
					continue
				}
				delta, writeSize, op := unpack(packed)
				lastWrite := d.createTime.Add(delta)
				now := time.Now()
				if lastWrite.Add(d.diskSlowThreshold).Before(now) {
					// diskSlowThreshold was exceeded. Call the passed-in
					// listener.
					d.onSlowDisk(op, writeSize, now.Sub(lastWrite))
				}
			}
		}
	}()
}
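
// Note that because checks happen only on ticker boundaries, a slow operation
// is first reported between diskSlowThreshold and diskSlowThreshold+tickInterval
// after it began, and is re-reported on every subsequent tick until it
// completes.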

// stopTicker stops the goroutine started in startTicker.
func (d *diskHealthCheckingFile) stopTicker() {
	close(d.stopper)
}

// Fd implements (vfs.File).Fd.
func (d *diskHealthCheckingFile) Fd() uintptr {
	return d.file.Fd()
}

// Read implements (vfs.File).Read.
func (d *diskHealthCheckingFile) Read(p []byte) (int, error) {
	return d.file.Read(p)
}

// ReadAt implements (vfs.File).ReadAt.
func (d *diskHealthCheckingFile) ReadAt(p []byte, off int64) (int, error) {
	return d.file.ReadAt(p, off)
}

// Write implements the io.Writer interface.
func (d *diskHealthCheckingFile) Write(p []byte) (n int, err error) {
	d.timeDiskOp(OpTypeWrite, int64(len(p)), func() {
		n, err = d.file.Write(p)
	})
	return n, err
}

// WriteAt implements the io.WriterAt interface.
func (d *diskHealthCheckingFile) WriteAt(p []byte, ofs int64) (n int, err error) {
	d.timeDiskOp(OpTypeWrite, int64(len(p)), func() {
		n, err = d.file.WriteAt(p, ofs)
	})
	return n, err
}

// Close implements the io.Closer interface.
func (d *diskHealthCheckingFile) Close() error {
	d.stopTicker()
	return d.file.Close()
}

// Prefetch implements (vfs.File).Prefetch.
func (d *diskHealthCheckingFile) Prefetch(offset, length int64) error {
	return d.file.Prefetch(offset, length)
}

// Preallocate implements (vfs.File).Preallocate.
func (d *diskHealthCheckingFile) Preallocate(off, n int64) (err error) {
	d.timeDiskOp(OpTypePreallocate, n, func() {
		err = d.file.Preallocate(off, n)
	})
	return err
}

// Stat implements (vfs.File).Stat.
func (d *diskHealthCheckingFile) Stat() (os.FileInfo, error) {
	return d.file.Stat()
}

// Sync implements (vfs.File).Sync.
func (d *diskHealthCheckingFile) Sync() (err error) {
	d.timeDiskOp(OpTypeSync, 0, func() {
		err = d.file.Sync()
	})
	return err
}

// SyncData implements (vfs.File).SyncData.
func (d *diskHealthCheckingFile) SyncData() (err error) {
	d.timeDiskOp(OpTypeSyncData, 0, func() {
		err = d.file.SyncData()
	})
	return err
}

// SyncTo implements (vfs.File).SyncTo.
func (d *diskHealthCheckingFile) SyncTo(length int64) (fullSync bool, err error) {
	d.timeDiskOp(OpTypeSyncTo, length, func() {
		fullSync, err = d.file.SyncTo(length)
	})
	return fullSync, err
}

// timeDiskOp runs the specified closure and makes its timing visible to the
// monitoring goroutine, in case it exceeds one of the slow disk durations.
// opType should always be set. writeSizeInBytes should be set if the write
// operation is sized. If not, it should be set to zero.
func (d *diskHealthCheckingFile) timeDiskOp(opType OpType, writeSizeInBytes int64, op func()) {
	if d == nil {
		op()
		return
	}

	delta := time.Since(d.createTime)
	packed := pack(delta, writeSizeInBytes, opType)
	if d.lastWritePacked.Swap(packed) != 0 {
		panic("concurrent write operations detected on file")
	}
	defer func() {
		if d.lastWritePacked.Swap(0) != packed {
			panic("concurrent write operations detected on file")
		}
	}()
	op()
}
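
// The pair of Swap calls in timeDiskOp doubles as a cheap assertion that at
// most one write operation is ever in flight on a file: claiming the field
// must observe zero, and releasing it must observe the value this operation
// wrote, so a second concurrent writer trips one of the two panics.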

// Note the slight lack of symmetry between pack & unpack. pack takes an int64 for writeSizeInBytes, since
// callers of pack use an int64. This is dictated by the vfs interface. unpack OTOH returns an int. This is
// safe because the packing scheme implies we only actually need 32 bits.
func pack(delta time.Duration, writeSizeInBytes int64, opType OpType) uint64 {
	// We have no guarantee of clock monotonicity. If we have a small regression
	// in the clock, we set deltaMillis to zero, so we can still catch the
	// operation if it happens to be slow.
	deltaMillis := delta.Milliseconds()
	if deltaMillis < 0 {
		deltaMillis = 0
	}
	// As of 3/7/2023, the use of 40 bits for a delta provides ~34 years
	// of effective monitoring time before the uint wraps around, at millisecond
	// precision.
	if deltaMillis > 1<<deltaBits-1 {
		panic("vfs: last write delta would result in integer wraparound")
	}

	// See writeSizePrecision to get the unit of writeSize. As of 1/26/2023, the unit is KBs.
	writeSize := writeSizeInBytes / writeSizePrecision
	// If the size of the write is larger than we can store in the packed int, store the max
	// value we can store in the packed int.
	const writeSizeCeiling = 1<<writeSizeBits - 1
	if writeSize > writeSizeCeiling {
		writeSize = writeSizeCeiling
	}

	return uint64(deltaMillis)<<(64-deltaBits) | uint64(writeSize)<<(64-deltaBits-writeSizeBits) | uint64(opType)
}

func unpack(packed uint64) (delta time.Duration, writeSizeInBytes int, opType OpType) {
	delta = time.Duration(packed>>(64-deltaBits)) * time.Millisecond
	wz := (int64(packed>>(64-deltaBits-writeSizeBits)) & ((1 << writeSizeBits) - 1)) * writeSizePrecision
	// Given the packing scheme, converting wz to an int will not truncate anything.
	writeSizeInBytes = int(wz)
	opType = OpType(packed & 0xf)
	return delta, writeSizeInBytes, opType
}
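
// As a concrete round trip through pack and unpack: a 3-second delta and a
// 4 MiB write survive intact, since 4 MiB is a multiple of writeSizePrecision:
//
//	p := pack(3*time.Second, 4<<20, OpTypeWrite)
//	delta, size, op := unpack(p)
//	// delta == 3*time.Second, size == 4<<20, op == OpTypeWrite
//
// Sizes that are not a multiple of writeSizePrecision round down: packing a
// 1536-byte write unpacks as 1024 bytes.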

// diskHealthCheckingDir implements disk-health checking for directories. Unlike
// other files, we allow directories to receive concurrent write operations
// (Syncs are the only write operations supported by a directory.) Since the
// diskHealthCheckingFile's timeDiskOp can only track a single in-flight
// operation at a time, we time the operation using the filesystem-level
// timeFilesystemOp function instead.
type diskHealthCheckingDir struct {
	File
	name string
	fs   *diskHealthCheckingFS
}

// Sync implements (vfs.File).Sync.
func (d *diskHealthCheckingDir) Sync() (err error) {
	d.fs.timeFilesystemOp(d.name, OpTypeSync, func() {
		err = d.File.Sync()
	})
	return err
}

// DiskSlowInfo captures info about detected slow operations on the vfs.
type DiskSlowInfo struct {
	// Path of file being written to.
	Path string
	// Operation being performed on the file.
	OpType OpType
	// Size of write in bytes, if the write is sized.
	WriteSize int
	// Duration that has elapsed since this disk operation started.
	Duration time.Duration
}

func (i DiskSlowInfo) String() string {
	return redact.StringWithoutMarkers(i)
}

// SafeFormat implements redact.SafeFormatter.
func (i DiskSlowInfo) SafeFormat(w redact.SafePrinter, _ rune) {
	switch i.OpType {
	// Operations for which i.WriteSize is meaningful.
	case OpTypeWrite, OpTypeSyncTo, OpTypePreallocate:
		w.Printf("disk slowness detected: %s on file %s (%d bytes) has been ongoing for %0.1fs",
			redact.Safe(i.OpType.String()), redact.Safe(filepath.Base(i.Path)),
			redact.Safe(i.WriteSize), redact.Safe(i.Duration.Seconds()))
	default:
		w.Printf("disk slowness detected: %s on file %s has been ongoing for %0.1fs",
			redact.Safe(i.OpType.String()), redact.Safe(filepath.Base(i.Path)),
			redact.Safe(i.Duration.Seconds()))
	}
}
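
// For a sized operation, SafeFormat renders output of the form (the file name
// here is hypothetical):
//
//	disk slowness detected: write on file 000123.sst (1048576 bytes) has been ongoing for 10.0s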

// diskHealthCheckingFS adds disk-health checking facilities to a VFS.
// It times disk write operations in two ways:
//
// 1. Wrapping vfs.Files.
//
// The bulk of write I/O activity is file writing and syncing, invoked through
// the `vfs.File` interface. This VFS wraps all files open for writing with a
// special diskHealthCheckingFile implementation of the vfs.File interface. See
// above for the implementation.
//
// 2. Monitoring filesystem metadata operations.
//
// Filesystem metadata operations (create, link, remove, rename, etc) are also
// sources of disk writes. Unlike a vfs.File which requires Write and Sync calls
// to be sequential, a vfs.FS may receive these filesystem metadata operations
// in parallel. To accommodate this parallelism, the diskHealthCheckingFS's
// write-oriented filesystem operations record their start times into a 'slot'
// on the filesystem. A single long-running goroutine periodically scans the
// slots looking for slow operations.
//
// The number of slots on a diskHealthCheckingFS grows to a working set of the
// maximum concurrent filesystem operations. This is expected to be very few
// for these reasons:
//  1. Pebble has limited write concurrency. Flushes, compactions and WAL
//     rotations are the primary sources of filesystem metadata operations. With
//     the default max-compaction concurrency, these operations require at most 5
//     concurrent slots if all 5 perform a filesystem metadata operation
//     simultaneously.
//  2. Pebble's limited concurrent I/O writers spend most of their time
//     performing file I/O, not performing the filesystem metadata operations that
//     require recording a slot on the diskHealthCheckingFS.
//  3. In CockroachDB, each additional store/Pebble instance has its own vfs.FS
//     which provides a separate goroutine and set of slots.
//  4. In CockroachDB, many of the additional sources of filesystem metadata
//     operations (like encryption-at-rest) are sequential with respect to Pebble's
//     threads.
type diskHealthCheckingFS struct {
	tickInterval      time.Duration
	diskSlowThreshold time.Duration
	onSlowDisk        func(DiskSlowInfo)
	fs                FS
	mu                struct {
		sync.Mutex
		tickerRunning bool
		stopper       chan struct{}
		inflight      []*slot
	}
	// prealloc preallocates the memory for mu.inflight slots and the slice
	// itself. The contained fields are not accessed directly except by
	// WithDiskHealthChecks when initializing mu.inflight. The number of slots
	// in d.mu.inflight will grow to the maximum number of concurrent file
	// metadata operations (create, remove, link, etc). If the number of
	// concurrent operations never exceeds preallocatedSlotCount, we'll never
	// incur an additional allocation.
	prealloc struct {
		slots        [preallocatedSlotCount]slot
		slotPtrSlice [preallocatedSlotCount]*slot
	}
}

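// slot records the start of a single in-flight filesystem operation. The
// startNanos field is atomic because it is claimed and scanned while holding
// d.mu but zeroed by the completing goroutine without the mutex; a zero value
// marks the slot as free.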
type slot struct {
	name       string
	opType     OpType
	startNanos atomic.Int64
}

// diskHealthCheckingFS implements FS.
var _ FS = (*diskHealthCheckingFS)(nil)

// WithDiskHealthChecks wraps an FS and ensures that all write-oriented
// operations on the FS are wrapped with disk health detection checks. Disk
// operations that are observed to take longer than diskSlowThreshold trigger an
// onSlowDisk call.
//
// A threshold of zero disables disk-health checking.
func WithDiskHealthChecks(
	innerFS FS, diskSlowThreshold time.Duration, onSlowDisk func(info DiskSlowInfo),
) (FS, io.Closer) {
	if diskSlowThreshold == 0 {
		return innerFS, noopCloser{}
	}

	fs := &diskHealthCheckingFS{
		fs:                innerFS,
		tickInterval:      defaultTickInterval,
		diskSlowThreshold: diskSlowThreshold,
		onSlowDisk:        onSlowDisk,
	}
	fs.mu.stopper = make(chan struct{})
	// The fs holds preallocated slots and a preallocated array of slot pointers
	// with equal length. Initialize the inflight slice to use a slice backed by
	// the preallocated array with each slot initialized to a preallocated slot.
	fs.mu.inflight = fs.prealloc.slotPtrSlice[:]
	for i := range fs.mu.inflight {
		fs.mu.inflight[i] = &fs.prealloc.slots[i]
	}
	return fs, fs
}
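
// A minimal usage sketch (the in-memory FS and the threshold are illustrative
// choices, not requirements): wrap a filesystem, perform I/O through the
// wrapper, and close the returned io.Closer when the FS is no longer needed.
//
//	fs, closer := WithDiskHealthChecks(NewMem(), 100*time.Millisecond,
//		func(info DiskSlowInfo) {
//			fmt.Println(info) // e.g. log the event or bump a metric
//		})
//	defer closer.Close()
//	f, err := fs.Create("000001.log")
//	if err != nil {
//		// handle the error
//	}
//	_, _ = f.Write([]byte("payload")) // timed via timeDiskOp
//	_ = f.Sync()                      // timed via timeDiskOp
//	_ = f.Close()                     // stops the file's ticker goroutine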

func (d *diskHealthCheckingFS) timeFilesystemOp(name string, opType OpType, op func()) {
	if d == nil {
		op()
		return
	}

	// Record this operation's start time on the FS, so that the long-running
	// goroutine can monitor the filesystem operation.
	//
	// The diskHealthCheckingFile implementation uses a single field that is
	// atomically updated, taking advantage of the fact that writes to a single
	// vfs.File handle are not performed in parallel. The vfs.FS however may
	// receive write filesystem operations in parallel. To accommodate this
	// parallelism, writing goroutines append their start time to a
	// mutex-protected vector. On ticks, the long-running goroutine scans the
	// vector searching for start times older than the slow-disk threshold. When
	// a writing goroutine completes its operation, it atomically overwrites its
	// slot to signal completion.
	var s *slot
	func() {
		d.mu.Lock()
		defer d.mu.Unlock()

		// If there's no long-running goroutine to monitor this filesystem
		// operation, start one.
		if !d.mu.tickerRunning {
			d.startTickerLocked()
		}

		startNanos := time.Now().UnixNano()
		for i := 0; i < len(d.mu.inflight); i++ {
			if d.mu.inflight[i].startNanos.Load() == 0 {
				// This slot is not in use. Claim it.
				s = d.mu.inflight[i]
				s.name = name
				s.opType = opType
				s.startNanos.Store(startNanos)
				break
			}
		}
		// If we didn't find any unused slots, create a new slot and append it.
		// This slot will exist forever. The number of slots will grow to the
		// maximum number of concurrent filesystem operations over the lifetime
		// of the process. Only operations that grow the number of slots must
		// incur an allocation.
		if s == nil {
			s = &slot{
				name:   name,
				opType: opType,
			}
			s.startNanos.Store(startNanos)
			d.mu.inflight = append(d.mu.inflight, s)
		}
	}()

	op()

	// Signal completion by zeroing the start time.
	s.startNanos.Store(0)
}

// startTickerLocked starts a new goroutine with a ticker to monitor disk
// filesystem operations. Requires d.mu and !d.mu.tickerRunning.
func (d *diskHealthCheckingFS) startTickerLocked() {
	d.mu.tickerRunning = true
	stopper := d.mu.stopper
	go func() {
		ticker := time.NewTicker(d.tickInterval)
		defer ticker.Stop()
		type exceededSlot struct {
			name       string
			opType     OpType
			startNanos int64
		}
		var exceededSlots []exceededSlot

		for {
			select {
			case <-ticker.C:
				// Scan the inflight slots for any slots recording a start
				// time older than the diskSlowThreshold.
				exceededSlots = exceededSlots[:0]
				d.mu.Lock()
				now := time.Now()
				for i := range d.mu.inflight {
					nanos := d.mu.inflight[i].startNanos.Load()
					if nanos != 0 && time.Unix(0, nanos).Add(d.diskSlowThreshold).Before(now) {
						// diskSlowThreshold was exceeded. Copy this inflightOp into
						// exceededSlots and call d.onSlowDisk after dropping the mutex.
						inflightOp := exceededSlot{
							name:       d.mu.inflight[i].name,
							opType:     d.mu.inflight[i].opType,
							startNanos: nanos,
						}
						exceededSlots = append(exceededSlots, inflightOp)
					}
				}
				d.mu.Unlock()
				for i := range exceededSlots {
					d.onSlowDisk(
						DiskSlowInfo{
							Path:      exceededSlots[i].name,
							OpType:    exceededSlots[i].opType,
							WriteSize: 0, // writes at the fs level are not sized
							Duration:  now.Sub(time.Unix(0, exceededSlots[i].startNanos)),
						})
				}
			case <-stopper:
				return
			}
		}
	}()
}

// Close implements io.Closer. Close stops the long-running goroutine that
// monitors for slow filesystem metadata operations. Close may be called
// multiple times. If the filesystem is used after Close has been called, a new
// long-running goroutine will be created.
func (d *diskHealthCheckingFS) Close() error {
	d.mu.Lock()
	if !d.mu.tickerRunning {
		// Nothing to stop.
		d.mu.Unlock()
		return nil
	}

	// Grab the stopper so we can request the long-running goroutine to stop.
	// Replace the stopper in case this FS is reused. It's possible to Close and
	// reuse a disk-health checking FS. This is to accommodate the on-by-default
	// behavior in Pebble, and the possibility that users may continue to use
	// the Pebble default FS beyond the lifetime of a single DB.
	stopper := d.mu.stopper
	d.mu.stopper = make(chan struct{})
	d.mu.tickerRunning = false
	d.mu.Unlock()

	// Ask the long-running goroutine to stop. This is a synchronous channel
	// send.
	stopper <- struct{}{}
	close(stopper)
	return nil
}

// Create implements the FS interface.
func (d *diskHealthCheckingFS) Create(name string) (File, error) {
	var f File
	var err error
	d.timeFilesystemOp(name, OpTypeCreate, func() {
		f, err = d.fs.Create(name)
	})
	if err != nil {
		return f, err
	}
	if d.diskSlowThreshold == 0 {
		return f, nil
	}
	checkingFile := newDiskHealthCheckingFile(f, d.diskSlowThreshold, func(opType OpType, writeSizeInBytes int, duration time.Duration) {
		d.onSlowDisk(
			DiskSlowInfo{
				Path:      name,
				OpType:    opType,
				WriteSize: writeSizeInBytes,
				Duration:  duration,
			})
	})
	checkingFile.startTicker()
	return checkingFile, nil
}

// GetDiskUsage implements the FS interface.
func (d *diskHealthCheckingFS) GetDiskUsage(path string) (DiskUsage, error) {
	return d.fs.GetDiskUsage(path)
}

// Link implements the FS interface.
func (d *diskHealthCheckingFS) Link(oldname, newname string) error {
	var err error
	d.timeFilesystemOp(newname, OpTypeLink, func() {
		err = d.fs.Link(oldname, newname)
	})
	return err
}

// List implements the FS interface.
func (d *diskHealthCheckingFS) List(dir string) ([]string, error) {
	return d.fs.List(dir)
}

// Lock implements the FS interface.
func (d *diskHealthCheckingFS) Lock(name string) (io.Closer, error) {
	return d.fs.Lock(name)
}

// MkdirAll implements the FS interface.
func (d *diskHealthCheckingFS) MkdirAll(dir string, perm os.FileMode) error {
	var err error
	d.timeFilesystemOp(dir, OpTypeMkdirAll, func() {
		err = d.fs.MkdirAll(dir, perm)
	})
	return err
}

// Open implements the FS interface.
func (d *diskHealthCheckingFS) Open(name string, opts ...OpenOption) (File, error) {
	return d.fs.Open(name, opts...)
}

// OpenReadWrite implements the FS interface.
func (d *diskHealthCheckingFS) OpenReadWrite(name string, opts ...OpenOption) (File, error) {
	return d.fs.OpenReadWrite(name, opts...)
}

// OpenDir implements the FS interface.
func (d *diskHealthCheckingFS) OpenDir(name string) (File, error) {
	f, err := d.fs.OpenDir(name)
	if err != nil {
		return f, err
	}
	// Directories opened with OpenDir must be opened with health checking,
	// because they may be explicitly synced.
	return &diskHealthCheckingDir{
		File: f,
		name: name,
		fs:   d,
	}, nil
}

// PathBase implements the FS interface.
func (d *diskHealthCheckingFS) PathBase(path string) string {
	return d.fs.PathBase(path)
}

// PathJoin implements the FS interface.
func (d *diskHealthCheckingFS) PathJoin(elem ...string) string {
	return d.fs.PathJoin(elem...)
}

// PathDir implements the FS interface.
func (d *diskHealthCheckingFS) PathDir(path string) string {
	return d.fs.PathDir(path)
}

// Remove implements the FS interface.
func (d *diskHealthCheckingFS) Remove(name string) error {
	var err error
	d.timeFilesystemOp(name, OpTypeRemove, func() {
		err = d.fs.Remove(name)
	})
	return err
}

// RemoveAll implements the FS interface.
func (d *diskHealthCheckingFS) RemoveAll(name string) error {
	var err error
	d.timeFilesystemOp(name, OpTypeRemoveAll, func() {
		err = d.fs.RemoveAll(name)
	})
	return err
}

// Rename implements the FS interface.
func (d *diskHealthCheckingFS) Rename(oldname, newname string) error {
	var err error
	d.timeFilesystemOp(newname, OpTypeRename, func() {
		err = d.fs.Rename(oldname, newname)
	})
	return err
}

// ReuseForWrite implements the FS interface.
func (d *diskHealthCheckingFS) ReuseForWrite(oldname, newname string) (File, error) {
	var f File
	var err error
	d.timeFilesystemOp(newname, OpTypeReuseForWrite, func() {
		f, err = d.fs.ReuseForWrite(oldname, newname)
	})
	if err != nil {
		return f, err
	}
	if d.diskSlowThreshold == 0 {
		return f, nil
	}
	checkingFile := newDiskHealthCheckingFile(f, d.diskSlowThreshold, func(opType OpType, writeSizeInBytes int, duration time.Duration) {
		d.onSlowDisk(
			DiskSlowInfo{
				Path:      newname,
				OpType:    opType,
				WriteSize: writeSizeInBytes,
				Duration:  duration,
			})
	})
	checkingFile.startTicker()
	return checkingFile, nil
}

// Stat implements the FS interface.
func (d *diskHealthCheckingFS) Stat(name string) (os.FileInfo, error) {
	return d.fs.Stat(name)
}

type noopCloser struct{}

func (noopCloser) Close() error { return nil }