github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/vfs/disk_health.go

// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package vfs

import (
	"fmt"
	"io"
	"os"
	"path/filepath"
	"sync"
	"sync/atomic"
	"time"

	"github.com/cockroachdb/redact"
)

const (
	// preallocatedSlotCount is the default number of slots available for
	// concurrent filesystem operations. The slot count may be exceeded, but
	// each additional slot will incur an additional allocation. We choose 16
	// here with the expectation that it is significantly more than required in
	// practice. See the comment above the diskHealthCheckingFS type definition.
	preallocatedSlotCount = 16
	// deltaBits is the number of bits in the packed 64-bit integer used for
	// identifying a delta from the file creation time in milliseconds.
	deltaBits = 40
	// writeSizeBits is the number of bits in the packed 64-bit integer used for
	// identifying the size of the write operation, if the operation is sized. See
	// writeSizePrecision below for precision of size.
	writeSizeBits = 20
	// Track size of writes at kilobyte precision. See comment above lastWritePacked for more.
	writeSizePrecision = 1024
)

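// Illustrative sketch (editor's addition, not part of the original file): the
// ranges implied by the constants above. 40 bits of millisecond deltas cover
// roughly 34.8 years, and 20 bits of kilobyte-granularity sizes cover just
// under one GB. exampleBitBudget is a hypothetical helper for exposition only.
func exampleBitBudget() (maxDelta time.Duration, maxWriteBytes int64) {
	maxDeltaMillis := int64(1)<<deltaBits - 1 // ~1.1e12 ms
	maxDelta = time.Duration(maxDeltaMillis) * time.Millisecond
	// Sizes are stored in units of writeSizePrecision (1 KB), so the largest
	// representable write is (2^20 - 1) KB, i.e. one KB shy of a GB.
	maxWriteBytes = (int64(1)<<writeSizeBits - 1) * writeSizePrecision
	return maxDelta, maxWriteBytes // ≈34.8 years, ≈1 GB
}
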
// Variables to enable testing.
var (
	// defaultTickInterval is the default interval between consecutive ticks of
	// each diskHealthCheckingFile's monitoring loop.
	defaultTickInterval = 2 * time.Second
)

// OpType is the type of IO operation being monitored by a
// diskHealthCheckingFile.
type OpType uint8

// The following OpTypes are limited to the subset of file system operations that
// a diskHealthCheckingFile supports (namely writes and syncs).
const (
	OpTypeUnknown OpType = iota
	OpTypeWrite
	OpTypeSync
	OpTypeSyncData
	OpTypeSyncTo
	OpTypeCreate
	OpTypeLink
	OpTypeMkdirAll
	OpTypePreallocate
	OpTypeRemove
	OpTypeRemoveAll
	OpTypeRename
	OpTypeReuseForWrite
	// Note: opTypeMax is just used in tests. It must appear last in the list
	// of OpTypes.
	opTypeMax
)

// String implements fmt.Stringer.
func (o OpType) String() string {
	switch o {
	case OpTypeWrite:
		return "write"
	case OpTypeSync:
		return "sync"
	case OpTypeSyncData:
		return "syncdata"
	case OpTypeSyncTo:
		return "syncto"
	case OpTypeCreate:
		return "create"
	case OpTypeLink:
		return "link"
	case OpTypeMkdirAll:
		return "mkdirall"
	case OpTypePreallocate:
		return "preallocate"
	case OpTypeRemove:
		return "remove"
	case OpTypeRemoveAll:
		return "removeall"
	case OpTypeRename:
		return "rename"
	case OpTypeReuseForWrite:
		return "reuseforwrite"
	case OpTypeUnknown:
		return "unknown"
	default:
		panic(fmt.Sprintf("vfs: unknown op type: %d", o))
	}
}

// diskHealthCheckingFile is a File wrapper to detect slow disk operations, and
// call onSlowDisk if a disk operation is seen to exceed diskSlowThreshold.
//
// This struct creates a goroutine (in startTicker()) that, at every tick
// interval, sees if there's a disk operation taking longer than the specified
// duration. This setup is preferable to creating a new timer at every disk
// operation, as it reduces overhead per disk operation.
type diskHealthCheckingFile struct {
	file              File
	onSlowDisk        func(opType OpType, writeSizeInBytes int, duration time.Duration)
	diskSlowThreshold time.Duration
	tickInterval      time.Duration

	stopper chan struct{}
	// lastWritePacked is a 64-bit unsigned int. The most significant
	// 40 bits represent a delta (in milliseconds) from the creation
	// time of the diskHealthCheckingFile. The next most significant 20 bits
	// represent the size of the write in KBs, if the write has a size. (If
	// it doesn't, the 20 bits are zeroed). The least significant four bits
	// contain the OpType.
	//
	// The use of 40 bits for a delta provides ~34 years of effective
	// monitoring time before the uint wraps around, at millisecond precision.
	// ~34 years of process uptime "ought to be enough for anybody". Millisecond
	// precision is sufficient, given that we are monitoring for writes that take
	// longer than one millisecond.
	//
	// The use of 20 bits for the size in KBs allows representing sizes up
	// to nearly one GB. If the write is larger than that, we round down to ~one GB.
	//
	// The use of four bits for OpType allows for 16 operation types.
	//
	// NB: this packing scheme is not persisted, and is therefore safe to adjust
	// across process boundaries.
	lastWritePacked atomic.Uint64
	createTimeNanos int64
}

// newDiskHealthCheckingFile instantiates a new diskHealthCheckingFile, with the
// specified time threshold and event listener.
func newDiskHealthCheckingFile(
	file File,
	diskSlowThreshold time.Duration,
	onSlowDisk func(opType OpType, writeSizeInBytes int, duration time.Duration),
) *diskHealthCheckingFile {
	return &diskHealthCheckingFile{
		file:              file,
		onSlowDisk:        onSlowDisk,
		diskSlowThreshold: diskSlowThreshold,
		tickInterval:      defaultTickInterval,

		stopper:         make(chan struct{}),
		createTimeNanos: time.Now().UnixNano(),
	}
}

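// Hypothetical usage sketch (editor's addition): wrap an already-open File so
// that any write or sync stalled beyond 100ms invokes the callback. The helper
// name and threshold are illustrative, not part of the package.
func exampleWrapFile(f File) File {
	chf := newDiskHealthCheckingFile(f, 100*time.Millisecond,
		func(opType OpType, writeSizeInBytes int, duration time.Duration) {
			fmt.Printf("slow %s of %d bytes: stalled for %s\n", opType, writeSizeInBytes, duration)
		})
	chf.startTicker() // Close on the returned file stops the ticker.
	return chf
}
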
// startTicker starts a new goroutine with a ticker to monitor disk operations.
// Can only be called if the ticker goroutine isn't running already.
func (d *diskHealthCheckingFile) startTicker() {
	if d.diskSlowThreshold == 0 {
		return
	}

	go func() {
		ticker := time.NewTicker(d.tickInterval)
		defer ticker.Stop()

		for {
			select {
			case <-d.stopper:
				return

			case <-ticker.C:
				packed := d.lastWritePacked.Load()
				if packed == 0 {
					continue
				}
				delta, writeSize, op := unpack(packed)
				lastWrite := time.Unix(0, d.createTimeNanos+delta.Nanoseconds())
				now := time.Now()
				if lastWrite.Add(d.diskSlowThreshold).Before(now) {
					// diskSlowThreshold was exceeded. Call the passed-in
					// listener.
					d.onSlowDisk(op, writeSize, now.Sub(lastWrite))
				}
			}
		}
	}()
}

// stopTicker stops the goroutine started in startTicker.
func (d *diskHealthCheckingFile) stopTicker() {
	close(d.stopper)
}

// Fd implements (vfs.File).Fd.
func (d *diskHealthCheckingFile) Fd() uintptr {
	return d.file.Fd()
}

// Read implements (vfs.File).Read.
func (d *diskHealthCheckingFile) Read(p []byte) (int, error) {
	return d.file.Read(p)
}

// ReadAt implements (vfs.File).ReadAt.
func (d *diskHealthCheckingFile) ReadAt(p []byte, off int64) (int, error) {
	return d.file.ReadAt(p, off)
}

// Write implements the io.Writer interface.
func (d *diskHealthCheckingFile) Write(p []byte) (n int, err error) {
	d.timeDiskOp(OpTypeWrite, int64(len(p)), func() {
		n, err = d.file.Write(p)
	}, time.Now().UnixNano())
	return n, err
}

// WriteAt implements the io.WriterAt interface.
func (d *diskHealthCheckingFile) WriteAt(p []byte, ofs int64) (n int, err error) {
	d.timeDiskOp(OpTypeWrite, int64(len(p)), func() {
		n, err = d.file.WriteAt(p, ofs)
	}, time.Now().UnixNano())
	return n, err
}

// Close implements the io.Closer interface.
func (d *diskHealthCheckingFile) Close() error {
	d.stopTicker()
	return d.file.Close()
}

// Prefetch implements (vfs.File).Prefetch.
func (d *diskHealthCheckingFile) Prefetch(offset, length int64) error {
	return d.file.Prefetch(offset, length)
}

// Preallocate implements (vfs.File).Preallocate.
func (d *diskHealthCheckingFile) Preallocate(off, n int64) (err error) {
	d.timeDiskOp(OpTypePreallocate, n, func() {
		err = d.file.Preallocate(off, n)
	}, time.Now().UnixNano())
	return err
}

// Stat implements (vfs.File).Stat.
func (d *diskHealthCheckingFile) Stat() (os.FileInfo, error) {
	return d.file.Stat()
}

// Sync implements (vfs.File).Sync.
func (d *diskHealthCheckingFile) Sync() (err error) {
	d.timeDiskOp(OpTypeSync, 0, func() {
		err = d.file.Sync()
	}, time.Now().UnixNano())
	return err
}

// SyncData implements (vfs.File).SyncData.
func (d *diskHealthCheckingFile) SyncData() (err error) {
	d.timeDiskOp(OpTypeSyncData, 0, func() {
		err = d.file.SyncData()
	}, time.Now().UnixNano())
	return err
}

// SyncTo implements (vfs.File).SyncTo.
func (d *diskHealthCheckingFile) SyncTo(length int64) (fullSync bool, err error) {
	d.timeDiskOp(OpTypeSyncTo, length, func() {
		fullSync, err = d.file.SyncTo(length)
	}, time.Now().UnixNano())
	return fullSync, err
}

// timeDiskOp runs the specified closure and makes its timing visible to the
// monitoring goroutine, in case it exceeds one of the slow disk durations.
// opType should always be set. writeSizeInBytes should be set if the write
// operation is sized. If not, it should be set to zero.
//
// The start time is taken as a parameter in the form of nanoseconds since the
// unix epoch so that it appears in stack traces during crashes (if GOTRACEBACK
// is set appropriately), aiding postmortem debugging.
func (d *diskHealthCheckingFile) timeDiskOp(
	opType OpType, writeSizeInBytes int64, op func(), startNanos int64,
) {
	if d == nil {
		op()
		return
	}

	delta := time.Duration(startNanos - d.createTimeNanos)
	packed := pack(delta, writeSizeInBytes, opType)
	if d.lastWritePacked.Swap(packed) != 0 {
		panic("concurrent write operations detected on file")
	}
	defer func() {
		if d.lastWritePacked.Swap(0) != packed {
			panic("concurrent write operations detected on file")
		}
	}()
	op()
}

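// Illustrative sketch (editor's addition): the Swap-based guard in timeDiskOp
// is why a single diskHealthCheckingFile must never receive overlapping write
// operations; two concurrent calls would observe each other's nonzero packed
// value and panic. Sequential calls, as below, are always safe. The helper
// name is hypothetical.
func exampleSequentialWrites(d *diskHealthCheckingFile, a, b []byte) error {
	if _, err := d.Write(a); err != nil { // packs a slot, runs, then zeroes it
		return err
	}
	_, err := d.Write(b) // safe: the previous op restored zero before returning
	return err
}
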
// Note the slight lack of symmetry between pack & unpack. pack takes an int64 for writeSizeInBytes, since
// callers of pack use an int64. This is dictated by the vfs interface. unpack OTOH returns an int. This is
// safe because the packing scheme implies we only actually need 32 bits.
func pack(delta time.Duration, writeSizeInBytes int64, opType OpType) uint64 {
	// We have no guarantee of clock monotonicity. If we have a small regression
	// in the clock, we set deltaMillis to zero, so we can still catch the
	// operation if it happens to be slow.
	deltaMillis := delta.Milliseconds()
	if deltaMillis < 0 {
		deltaMillis = 0
	}
	// As of 3/7/2023, the use of 40 bits for a delta provides ~34 years
	// of effective monitoring time before the uint wraps around, at millisecond
	// precision.
	if deltaMillis > 1<<deltaBits-1 {
		panic("vfs: last write delta would result in integer wraparound")
	}

	// See writeSizePrecision to get the unit of writeSize. As of 1/26/2023, the unit is KBs.
	writeSize := writeSizeInBytes / writeSizePrecision
	// If the size of the write is larger than we can store in the packed int, store the max
	// value we can store in the packed int.
	const writeSizeCeiling = 1<<writeSizeBits - 1
	if writeSize > writeSizeCeiling {
		writeSize = writeSizeCeiling
	}

	return uint64(deltaMillis)<<(64-deltaBits) | uint64(writeSize)<<(64-deltaBits-writeSizeBits) | uint64(opType)
}

func unpack(packed uint64) (delta time.Duration, writeSizeInBytes int, opType OpType) {
	delta = time.Duration(packed>>(64-deltaBits)) * time.Millisecond
	wz := int64(packed>>(64-deltaBits-writeSizeBits)) & ((1 << writeSizeBits) - 1) * writeSizePrecision
	// Given the packing scheme, converting wz to an int will not truncate anything.
	writeSizeInBytes = int(wz)
	opType = OpType(packed & 0xf)
	return delta, writeSizeInBytes, opType
}

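// Illustrative round-trip (editor's addition): pack truncates deltas to
// millisecond precision and sizes to KB precision, so only values on those
// boundaries survive a round trip exactly. A hypothetical check:
func examplePackUnpack() {
	packed := pack(1500*time.Millisecond, 1<<20, OpTypeWrite)
	delta, writeSizeInBytes, opType := unpack(packed)
	_ = delta            // 1.5s
	_ = writeSizeInBytes // 1 << 20 (1 MB is exactly representable in KB units)
	_ = opType           // OpTypeWrite
}
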
// diskHealthCheckingDir implements disk-health checking for directories. Unlike
// other files, we allow directories to receive concurrent write operations
// (Syncs are the only write operations supported by a directory.) Since the
// diskHealthCheckingFile's timeDiskOp can only track a single in-flight
// operation at a time, we time the operation using the filesystem-level
// timeFilesystemOp function instead.
type diskHealthCheckingDir struct {
	File
	name string
	fs   *diskHealthCheckingFS
}

// Sync implements (vfs.File).Sync.
func (d *diskHealthCheckingDir) Sync() (err error) {
	d.fs.timeFilesystemOp(d.name, OpTypeSync, func() {
		err = d.File.Sync()
	}, time.Now().UnixNano())
	return err
}

// DiskSlowInfo captures info about detected slow operations on the vfs.
type DiskSlowInfo struct {
	// Path of file being written to.
	Path string
	// Operation being performed on the file.
	OpType OpType
	// Size of write in bytes, if the write is sized.
	WriteSize int
	// Duration that has elapsed since this disk operation started.
	Duration time.Duration
}

func (i DiskSlowInfo) String() string {
	return redact.StringWithoutMarkers(i)
}

// SafeFormat implements redact.SafeFormatter.
func (i DiskSlowInfo) SafeFormat(w redact.SafePrinter, _ rune) {
	switch i.OpType {
	// Operations for which i.WriteSize is meaningful.
	case OpTypeWrite, OpTypeSyncTo, OpTypePreallocate:
		w.Printf("disk slowness detected: %s on file %s (%d bytes) has been ongoing for %0.1fs",
			redact.Safe(i.OpType.String()), redact.Safe(filepath.Base(i.Path)),
			redact.Safe(i.WriteSize), redact.Safe(i.Duration.Seconds()))
	default:
		w.Printf("disk slowness detected: %s on file %s has been ongoing for %0.1fs",
			redact.Safe(i.OpType.String()), redact.Safe(filepath.Base(i.Path)),
			redact.Safe(i.Duration.Seconds()))
	}
}

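// Hypothetical rendering sketch (editor's addition): a sized op includes the
// byte count, an unsized op omits it. The DiskSlowInfo below formats as
// "disk slowness detected: write on file 000042.sst (4096 bytes) has been
// ongoing for 10.0s"; the path and values are invented for illustration.
func exampleDiskSlowInfoString() string {
	info := DiskSlowInfo{
		Path:      "/data/000042.sst",
		OpType:    OpTypeWrite,
		WriteSize: 4096,
		Duration:  10 * time.Second,
	}
	return info.String()
}
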
// diskHealthCheckingFS adds disk-health checking facilities to a VFS.
// It times disk write operations in two ways:
//
// 1. Wrapping vfs.Files.
//
// The bulk of write I/O activity is file writing and syncing, invoked through
// the `vfs.File` interface. This VFS wraps all files open for writing with a
// special diskHealthCheckingFile implementation of the vfs.File interface. See
// above for the implementation.
//
// 2. Monitoring filesystem metadata operations.
//
// Filesystem metadata operations (create, link, remove, rename, etc) are also
// sources of disk writes. Unlike a vfs.File which requires Write and Sync calls
// to be sequential, a vfs.FS may receive these filesystem metadata operations
// in parallel. To accommodate this parallelism, the diskHealthCheckingFS's
// write-oriented filesystem operations record their start times into a 'slot'
// on the filesystem. A single long-running goroutine periodically scans the
// slots looking for slow operations.
//
// The number of slots on a diskHealthCheckingFS grows to a working set of the
// maximum concurrent filesystem operations. This is expected to be very few
// for these reasons:
//  1. Pebble has limited write concurrency. Flushes, compactions and WAL
//     rotations are the primary sources of filesystem metadata operations. With
//     the default max-compaction concurrency, these operations require at most 5
//     concurrent slots if all 5 perform a filesystem metadata operation
//     simultaneously.
//  2. Pebble's limited concurrent I/O writers spend most of their time
//     performing file I/O, not performing the filesystem metadata operations that
//     require recording a slot on the diskHealthCheckingFS.
//  3. In CockroachDB, each additional store/Pebble instance has its own vfs.FS
//     which provides a separate goroutine and set of slots.
//  4. In CockroachDB, many of the additional sources of filesystem metadata
//     operations (like encryption-at-rest) are sequential with respect to Pebble's
//     threads.
type diskHealthCheckingFS struct {
	tickInterval      time.Duration
	diskSlowThreshold time.Duration
	onSlowDisk        func(DiskSlowInfo)
	fs                FS
	mu                struct {
		sync.Mutex
		tickerRunning bool
		stopper       chan struct{}
		inflight      []*slot
	}
	// prealloc preallocates the memory for mu.inflight slots and the slice
	// itself. The contained fields are not accessed directly except by
	// WithDiskHealthChecks when initializing mu.inflight. The number of slots
	// in d.mu.inflight will grow to the maximum number of concurrent file
	// metadata operations (create, remove, link, etc). If the number of
	// concurrent operations never exceeds preallocatedSlotCount, we'll never
	// incur an additional allocation.
	prealloc struct {
		slots        [preallocatedSlotCount]slot
		slotPtrSlice [preallocatedSlotCount]*slot
	}
}

type slot struct {
	name       string
	opType     OpType
	startNanos atomic.Int64
}

// diskHealthCheckingFS implements FS.
var _ FS = (*diskHealthCheckingFS)(nil)

// WithDiskHealthChecks wraps an FS and ensures that all write-oriented
// operations on the FS are wrapped with disk health detection checks. Disk
// operations that are observed to take longer than diskSlowThreshold trigger an
// onSlowDisk call.
//
// A threshold of zero disables disk-health checking.
func WithDiskHealthChecks(
	innerFS FS, diskSlowThreshold time.Duration, onSlowDisk func(info DiskSlowInfo),
) (FS, io.Closer) {
	if diskSlowThreshold == 0 {
		return innerFS, noopCloser{}
	}

	fs := &diskHealthCheckingFS{
		fs:                innerFS,
		tickInterval:      defaultTickInterval,
		diskSlowThreshold: diskSlowThreshold,
		onSlowDisk:        onSlowDisk,
	}
	fs.mu.stopper = make(chan struct{})
	// The fs holds preallocated slots and a preallocated array of slot pointers
	// with equal length. Initialize the inflight slice to use a slice backed by
	// the preallocated array with each slot initialized to a preallocated slot.
	fs.mu.inflight = fs.prealloc.slotPtrSlice[:]
	for i := range fs.mu.inflight {
		fs.mu.inflight[i] = &fs.prealloc.slots[i]
	}
	return fs, fs
}

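// Hypothetical usage sketch (editor's addition): wrap vfs.Default so that any
// write-oriented operation stalled beyond 10s is reported. Closing the
// returned io.Closer stops the monitoring goroutine; the threshold and
// callback here are illustrative.
func exampleWithDiskHealthChecks() (FS, io.Closer) {
	fs, closer := WithDiskHealthChecks(Default, 10*time.Second,
		func(info DiskSlowInfo) { fmt.Println(info) })
	// Callers use fs in place of Default, and closer.Close() when done.
	return fs, closer
}
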
// timeFilesystemOp executes the provided closure, which should perform a
// singular filesystem operation of a type matching opType on the named file. It
// records the provided start time such that the long-lived disk-health checking
// goroutine can observe if the operation is blocked for an inordinate time.
//
// The start time is taken as a parameter in the form of nanoseconds since the
// unix epoch so that it appears in stack traces during crashes (if GOTRACEBACK
// is set appropriately), aiding postmortem debugging.
func (d *diskHealthCheckingFS) timeFilesystemOp(
	name string, opType OpType, op func(), startNanos int64,
) {
	if d == nil {
		op()
		return
	}

	// Record this operation's start time on the FS, so that the long-running
	// goroutine can monitor the filesystem operation.
	//
	// The diskHealthCheckingFile implementation uses a single field that is
	// atomically updated, taking advantage of the fact that writes to a single
	// vfs.File handle are not performed in parallel. The vfs.FS however may
	// receive write filesystem operations in parallel. To accommodate this
	// parallelism, writing goroutines append their start time to a
	// mutex-protected vector. On ticks, the long-running goroutine scans the
	// vector searching for start times older than the slow-disk threshold. When
	// a writing goroutine completes its operation, it atomically overwrites its
	// slot to signal completion.
	var s *slot
	func() {
		d.mu.Lock()
		defer d.mu.Unlock()

		// If there's no long-running goroutine to monitor this filesystem
		// operation, start one.
		if !d.mu.tickerRunning {
			d.startTickerLocked()
		}

		for i := 0; i < len(d.mu.inflight); i++ {
			if d.mu.inflight[i].startNanos.Load() == 0 {
				// This slot is not in use. Claim it.
				s = d.mu.inflight[i]
				s.name = name
				s.opType = opType
				s.startNanos.Store(startNanos)
				break
			}
		}
		// If we didn't find any unused slots, create a new slot and append it.
		// This slot will exist forever. The number of slots will grow to the
		// maximum number of concurrent filesystem operations over the lifetime
		// of the process. Only operations that grow the number of slots must
		// incur an allocation.
		if s == nil {
			s = &slot{
				name:   name,
				opType: opType,
			}
			s.startNanos.Store(startNanos)
			d.mu.inflight = append(d.mu.inflight, s)
		}
	}()

	op()

	// Signal completion by zeroing the start time.
	s.startNanos.Store(0)
}

// startTickerLocked starts a new goroutine with a ticker to monitor disk
// filesystem operations. Requires d.mu and !d.mu.tickerRunning.
func (d *diskHealthCheckingFS) startTickerLocked() {
	d.mu.tickerRunning = true
	stopper := d.mu.stopper
	go func() {
		ticker := time.NewTicker(d.tickInterval)
		defer ticker.Stop()
		type exceededSlot struct {
			name       string
			opType     OpType
			startNanos int64
		}
		var exceededSlots []exceededSlot

		for {
			select {
			case <-ticker.C:
				// Scan the inflight slots for any slots recording a start
				// time older than the diskSlowThreshold.
				exceededSlots = exceededSlots[:0]
				d.mu.Lock()
				now := time.Now()
				for i := range d.mu.inflight {
					nanos := d.mu.inflight[i].startNanos.Load()
					if nanos != 0 && time.Unix(0, nanos).Add(d.diskSlowThreshold).Before(now) {
						// diskSlowThreshold was exceeded. Copy this inflightOp into
						// exceededSlots and call d.onSlowDisk after dropping the mutex.
						inflightOp := exceededSlot{
							name:       d.mu.inflight[i].name,
							opType:     d.mu.inflight[i].opType,
							startNanos: nanos,
						}
						exceededSlots = append(exceededSlots, inflightOp)
					}
				}
				d.mu.Unlock()
				for i := range exceededSlots {
					d.onSlowDisk(
						DiskSlowInfo{
							Path:      exceededSlots[i].name,
							OpType:    exceededSlots[i].opType,
							WriteSize: 0, // writes at the fs level are not sized
							Duration:  now.Sub(time.Unix(0, exceededSlots[i].startNanos)),
						})
				}
			case <-stopper:
				return
			}
		}
	}()
}

// Close implements io.Closer. Close stops the long-running goroutine that
// monitors for slow filesystem metadata operations. Close may be called
// multiple times. If the filesystem is used after Close has been called, a new
// long-running goroutine will be created.
func (d *diskHealthCheckingFS) Close() error {
	d.mu.Lock()
	if !d.mu.tickerRunning {
		// Nothing to stop.
		d.mu.Unlock()
		return nil
	}

	// Grab the stopper so we can request the long-running goroutine to stop.
	// Replace the stopper in case this FS is reused. It's possible to Close and
	// reuse a disk-health checking FS. This is to accommodate the on-by-default
	// behavior in Pebble, and the possibility that users may continue to use
	// the Pebble default FS beyond the lifetime of a single DB.
	stopper := d.mu.stopper
	d.mu.stopper = make(chan struct{})
	d.mu.tickerRunning = false
	d.mu.Unlock()

	// Ask the long-running goroutine to stop. This is a synchronous channel
	// send.
	stopper <- struct{}{}
	close(stopper)
	return nil
}

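// Hypothetical sketch (editor's addition): Close and reuse, as described in
// the comment above. A later filesystem operation lazily restarts the monitor
// via startTickerLocked; the directory path is invented for illustration.
func exampleCloseAndReuse(fs FS, closer io.Closer) error {
	if err := closer.Close(); err != nil { // stops the current goroutine
		return err
	}
	// Using fs again transparently spins up a new monitoring goroutine.
	return fs.MkdirAll("/tmp/example-dir", 0755)
}
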
// Create implements the FS interface.
func (d *diskHealthCheckingFS) Create(name string) (File, error) {
	var f File
	var err error
	d.timeFilesystemOp(name, OpTypeCreate, func() {
		f, err = d.fs.Create(name)
	}, time.Now().UnixNano())
	if err != nil {
		return f, err
	}
	if d.diskSlowThreshold == 0 {
		return f, nil
	}
	checkingFile := newDiskHealthCheckingFile(f, d.diskSlowThreshold, func(opType OpType, writeSizeInBytes int, duration time.Duration) {
		d.onSlowDisk(
			DiskSlowInfo{
				Path:      name,
				OpType:    opType,
				WriteSize: writeSizeInBytes,
				Duration:  duration,
			})
	})
	checkingFile.startTicker()
	return checkingFile, nil
}

// GetDiskUsage implements the FS interface.
func (d *diskHealthCheckingFS) GetDiskUsage(path string) (DiskUsage, error) {
	return d.fs.GetDiskUsage(path)
}

// Link implements the FS interface.
func (d *diskHealthCheckingFS) Link(oldname, newname string) error {
	var err error
	d.timeFilesystemOp(newname, OpTypeLink, func() {
		err = d.fs.Link(oldname, newname)
	}, time.Now().UnixNano())
	return err
}

// List implements the FS interface.
func (d *diskHealthCheckingFS) List(dir string) ([]string, error) {
	return d.fs.List(dir)
}

// Lock implements the FS interface.
func (d *diskHealthCheckingFS) Lock(name string) (io.Closer, error) {
	return d.fs.Lock(name)
}

// MkdirAll implements the FS interface.
func (d *diskHealthCheckingFS) MkdirAll(dir string, perm os.FileMode) error {
	var err error
	d.timeFilesystemOp(dir, OpTypeMkdirAll, func() {
		err = d.fs.MkdirAll(dir, perm)
	}, time.Now().UnixNano())
	return err
}

// Open implements the FS interface.
func (d *diskHealthCheckingFS) Open(name string, opts ...OpenOption) (File, error) {
	return d.fs.Open(name, opts...)
}

// OpenReadWrite implements the FS interface.
func (d *diskHealthCheckingFS) OpenReadWrite(name string, opts ...OpenOption) (File, error) {
	return d.fs.OpenReadWrite(name, opts...)
}

// OpenDir implements the FS interface.
func (d *diskHealthCheckingFS) OpenDir(name string) (File, error) {
	f, err := d.fs.OpenDir(name)
	if err != nil {
		return f, err
	}
	// Directories opened with OpenDir must be opened with health checking,
	// because they may be explicitly synced.
	return &diskHealthCheckingDir{
		File: f,
		name: name,
		fs:   d,
	}, nil
}

// PathBase implements the FS interface.
func (d *diskHealthCheckingFS) PathBase(path string) string {
	return d.fs.PathBase(path)
}

// PathJoin implements the FS interface.
func (d *diskHealthCheckingFS) PathJoin(elem ...string) string {
	return d.fs.PathJoin(elem...)
}

// PathDir implements the FS interface.
func (d *diskHealthCheckingFS) PathDir(path string) string {
	return d.fs.PathDir(path)
}

// Remove implements the FS interface.
func (d *diskHealthCheckingFS) Remove(name string) error {
	var err error
	d.timeFilesystemOp(name, OpTypeRemove, func() {
		err = d.fs.Remove(name)
	}, time.Now().UnixNano())
	return err
}

// RemoveAll implements the FS interface.
func (d *diskHealthCheckingFS) RemoveAll(name string) error {
	var err error
	d.timeFilesystemOp(name, OpTypeRemoveAll, func() {
		err = d.fs.RemoveAll(name)
	}, time.Now().UnixNano())
	return err
}

// Rename implements the FS interface.
func (d *diskHealthCheckingFS) Rename(oldname, newname string) error {
	var err error
	d.timeFilesystemOp(newname, OpTypeRename, func() {
		err = d.fs.Rename(oldname, newname)
	}, time.Now().UnixNano())
	return err
}

// ReuseForWrite implements the FS interface.
func (d *diskHealthCheckingFS) ReuseForWrite(oldname, newname string) (File, error) {
	var f File
	var err error
	d.timeFilesystemOp(newname, OpTypeReuseForWrite, func() {
		f, err = d.fs.ReuseForWrite(oldname, newname)
	}, time.Now().UnixNano())
	if err != nil {
		return f, err
	}
	if d.diskSlowThreshold == 0 {
		return f, nil
	}
	checkingFile := newDiskHealthCheckingFile(f, d.diskSlowThreshold, func(opType OpType, writeSizeInBytes int, duration time.Duration) {
		d.onSlowDisk(
			DiskSlowInfo{
				Path:      newname,
				OpType:    opType,
				WriteSize: writeSizeInBytes,
				Duration:  duration,
			})
	})
	checkingFile.startTicker()
	return checkingFile, nil
}

// Stat implements the FS interface.
func (d *diskHealthCheckingFS) Stat(name string) (os.FileInfo, error) {
	return d.fs.Stat(name)
}

type noopCloser struct{}

func (noopCloser) Close() error { return nil }