github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/vfs/disk_health_test.go (about)

     1  // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package vfs
     6  
     7  import (
     8  	"io"
     9  	"math"
    10  	"os"
    11  	"runtime"
    12  	"sync"
    13  	"sync/atomic"
    14  	"testing"
    15  	"time"
    16  
    17  	"github.com/cockroachdb/errors"
    18  	"github.com/stretchr/testify/require"
    19  )
    20  
    21  type mockFile struct {
    22  	syncAndWriteDuration time.Duration
    23  }
    24  
    25  func (m mockFile) Close() error {
    26  	return nil
    27  }
    28  
    29  func (m mockFile) Read(p []byte) (n int, err error) {
    30  	panic("unimplemented")
    31  }
    32  
    33  func (m mockFile) ReadAt(p []byte, off int64) (n int, err error) {
    34  	panic("unimplemented")
    35  }
    36  
    37  func (m mockFile) Write(p []byte) (n int, err error) {
    38  	time.Sleep(m.syncAndWriteDuration)
    39  	return len(p), nil
    40  }
    41  
    42  func (m mockFile) WriteAt(p []byte, ofs int64) (n int, err error) {
    43  	time.Sleep(m.syncAndWriteDuration)
    44  	return len(p), nil
    45  }
    46  
    47  func (m mockFile) Prefetch(offset, length int64) error {
    48  	panic("unimplemented")
    49  }
    50  
    51  func (m mockFile) Preallocate(int64, int64) error {
    52  	time.Sleep(m.syncAndWriteDuration)
    53  	return nil
    54  }
    55  
    56  func (m mockFile) Stat() (os.FileInfo, error) {
    57  	panic("unimplemented")
    58  }
    59  
    60  func (m mockFile) Fd() uintptr {
    61  	return InvalidFd
    62  }
    63  
    64  func (m mockFile) Sync() error {
    65  	time.Sleep(m.syncAndWriteDuration)
    66  	return nil
    67  }
    68  
    69  func (m mockFile) SyncData() error {
    70  	time.Sleep(m.syncAndWriteDuration)
    71  	return nil
    72  }
    73  
    74  func (m mockFile) SyncTo(int64) (fullSync bool, err error) {
    75  	time.Sleep(m.syncAndWriteDuration)
    76  	return false, nil
    77  }
    78  
    79  var _ File = &mockFile{}
    80  
    81  type mockFS struct {
    82  	create        func(string) (File, error)
    83  	link          func(string, string) error
    84  	list          func(string) ([]string, error)
    85  	lock          func(string) (io.Closer, error)
    86  	mkdirAll      func(string, os.FileMode) error
    87  	open          func(string, ...OpenOption) (File, error)
    88  	openDir       func(string) (File, error)
    89  	pathBase      func(string) string
    90  	pathJoin      func(...string) string
    91  	pathDir       func(string) string
    92  	remove        func(string) error
    93  	removeAll     func(string) error
    94  	rename        func(string, string) error
    95  	reuseForWrite func(string, string) (File, error)
    96  	stat          func(string) (os.FileInfo, error)
    97  	getDiskUsage  func(string) (DiskUsage, error)
    98  }
    99  
   100  func (m mockFS) Create(name string) (File, error) {
   101  	if m.create == nil {
   102  		panic("unimplemented")
   103  	}
   104  	return m.create(name)
   105  }
   106  
   107  func (m mockFS) Link(oldname, newname string) error {
   108  	if m.link == nil {
   109  		panic("unimplemented")
   110  	}
   111  	return m.link(oldname, newname)
   112  }
   113  
   114  func (m mockFS) Open(name string, opts ...OpenOption) (File, error) {
   115  	if m.open == nil {
   116  		panic("unimplemented")
   117  	}
   118  	return m.open(name, opts...)
   119  }
   120  
   121  func (m mockFS) OpenReadWrite(name string, opts ...OpenOption) (File, error) {
   122  	panic("unimplemented")
   123  }
   124  
   125  func (m mockFS) OpenDir(name string) (File, error) {
   126  	if m.openDir == nil {
   127  		panic("unimplemented")
   128  	}
   129  	return m.openDir(name)
   130  }
   131  
   132  func (m mockFS) Remove(name string) error {
   133  	if m.remove == nil {
   134  		panic("unimplemented")
   135  	}
   136  	return m.remove(name)
   137  }
   138  
   139  func (m mockFS) RemoveAll(name string) error {
   140  	if m.removeAll == nil {
   141  		panic("unimplemented")
   142  	}
   143  	return m.removeAll(name)
   144  }
   145  
   146  func (m mockFS) Rename(oldname, newname string) error {
   147  	if m.rename == nil {
   148  		panic("unimplemented")
   149  	}
   150  	return m.rename(oldname, newname)
   151  }
   152  
   153  func (m mockFS) ReuseForWrite(oldname, newname string) (File, error) {
   154  	if m.reuseForWrite == nil {
   155  		panic("unimplemented")
   156  	}
   157  	return m.reuseForWrite(oldname, newname)
   158  }
   159  
   160  func (m mockFS) MkdirAll(dir string, perm os.FileMode) error {
   161  	if m.mkdirAll == nil {
   162  		panic("unimplemented")
   163  	}
   164  	return m.mkdirAll(dir, perm)
   165  }
   166  
   167  func (m mockFS) Lock(name string) (io.Closer, error) {
   168  	if m.lock == nil {
   169  		panic("unimplemented")
   170  	}
   171  	return m.lock(name)
   172  }
   173  
   174  func (m mockFS) List(dir string) ([]string, error) {
   175  	if m.list == nil {
   176  		panic("unimplemented")
   177  	}
   178  	return m.list(dir)
   179  }
   180  
   181  func (m mockFS) Stat(name string) (os.FileInfo, error) {
   182  	if m.stat == nil {
   183  		panic("unimplemented")
   184  	}
   185  	return m.stat(name)
   186  }
   187  
   188  func (m mockFS) PathBase(path string) string {
   189  	if m.pathBase == nil {
   190  		panic("unimplemented")
   191  	}
   192  	return m.pathBase(path)
   193  }
   194  
   195  func (m mockFS) PathJoin(elem ...string) string {
   196  	if m.pathJoin == nil {
   197  		panic("unimplemented")
   198  	}
   199  	return m.pathJoin(elem...)
   200  }
   201  
   202  func (m mockFS) PathDir(path string) string {
   203  	if m.pathDir == nil {
   204  		panic("unimplemented")
   205  	}
   206  	return m.pathDir(path)
   207  }
   208  
   209  func (m mockFS) GetDiskUsage(path string) (DiskUsage, error) {
   210  	if m.getDiskUsage == nil {
   211  		panic("unimplemented")
   212  	}
   213  	return m.getDiskUsage(path)
   214  }
   215  
   216  var _ FS = &mockFS{}
   217  
   218  func TestDiskHealthChecking_File(t *testing.T) {
   219  	oldTickInterval := defaultTickInterval
   220  	defaultTickInterval = time.Millisecond
   221  	if runtime.GOOS == "windows" {
   222  		t.Skipf("skipped on windows due to unreliable runtimes")
   223  	}
   224  
   225  	defer func() { defaultTickInterval = oldTickInterval }()
   226  
   227  	const (
   228  		slowThreshold = 50 * time.Millisecond
   229  	)
   230  
   231  	fiveKB := make([]byte, 5*writeSizePrecision)
   232  	testCases := []struct {
   233  		op               OpType
   234  		writeSize        int
   235  		writeDuration    time.Duration
   236  		fn               func(f File)
   237  		createWriteDelta time.Duration
   238  	}{
   239  		{
   240  			op:            OpTypeWrite,
   241  			writeSize:     5 * writeSizePrecision, // five KB
   242  			writeDuration: 100 * time.Millisecond,
   243  			fn:            func(f File) { f.Write(fiveKB) },
   244  		},
   245  		{
   246  			op:            OpTypeSync,
   247  			writeSize:     0,
   248  			writeDuration: 100 * time.Millisecond,
   249  			fn:            func(f File) { f.Sync() },
   250  		},
   251  	}
   252  	for _, tc := range testCases {
   253  		t.Run(tc.op.String(), func(t *testing.T) {
   254  			diskSlow := make(chan DiskSlowInfo, 3)
   255  			mockFS := &mockFS{create: func(name string) (File, error) {
   256  				return mockFile{syncAndWriteDuration: tc.writeDuration}, nil
   257  			}}
   258  			fs, closer := WithDiskHealthChecks(mockFS, slowThreshold,
   259  				func(info DiskSlowInfo) {
   260  					diskSlow <- info
   261  				})
   262  			defer closer.Close()
   263  			dhFile, _ := fs.Create("test")
   264  			defer dhFile.Close()
   265  
   266  			// Writing after file creation tests computation of delta between file
   267  			// creation time & write time.
   268  			time.Sleep(tc.createWriteDelta)
   269  
   270  			tc.fn(dhFile)
   271  
   272  			select {
   273  			case i := <-diskSlow:
   274  				d := i.Duration
   275  				if d.Seconds() < slowThreshold.Seconds() {
   276  					t.Fatalf("expected %0.1f to be greater than threshold %0.1f", d.Seconds(), slowThreshold.Seconds())
   277  				}
   278  				require.Equal(t, tc.writeSize, i.WriteSize)
   279  				require.Equal(t, tc.op, i.OpType)
   280  			case <-time.After(10 * time.Second):
   281  				t.Fatal("disk stall detector did not detect slow disk operation")
   282  			}
   283  		})
   284  	}
   285  }
   286  
   287  func TestDiskHealthChecking_NotTooManyOps(t *testing.T) {
   288  	numBitsForOpType := 64 - deltaBits - writeSizeBits
   289  	numOpTypesAllowed := int(math.Pow(2, float64(numBitsForOpType)))
   290  	numOpTypes := int(opTypeMax)
   291  	require.LessOrEqual(t, numOpTypes, numOpTypesAllowed)
   292  }
   293  
   294  func TestDiskHealthChecking_File_PackingAndUnpacking(t *testing.T) {
   295  	testCases := []struct {
   296  		desc          string
   297  		delta         time.Duration
   298  		writeSize     int64
   299  		opType        OpType
   300  		wantDelta     time.Duration
   301  		wantWriteSize int
   302  	}{
   303  		// Write op with write size in bytes.
   304  		{
   305  			desc:          "write, sized op",
   306  			delta:         3000 * time.Millisecond,
   307  			writeSize:     1024, // 1 KB.
   308  			opType:        OpTypeWrite,
   309  			wantDelta:     3000 * time.Millisecond,
   310  			wantWriteSize: 1024,
   311  		},
   312  		// Sync op. No write size. Max-ish delta that packing scheme can handle.
   313  		{
   314  			desc:          "sync, no write size",
   315  			delta:         34 * time.Hour * 24 * 365,
   316  			writeSize:     0,
   317  			opType:        OpTypeSync,
   318  			wantDelta:     34 * time.Hour * 24 * 365,
   319  			wantWriteSize: 0,
   320  		},
   321  		// Delta is negative (e.g. due to clock sync). Set to
   322  		// zero.
   323  		{
   324  			desc:          "delta negative",
   325  			delta:         -5,
   326  			writeSize:     5120, // 5 KB
   327  			opType:        OpTypeWrite,
   328  			wantDelta:     0,
   329  			wantWriteSize: 5120,
   330  		},
   331  		// Write size in bytes is larger than can fit in 20 bits.
   332  		// Round down to max that can fit in 20 bits.
   333  		{
   334  			desc:          "write size truncated",
   335  			delta:         231 * time.Millisecond,
   336  			writeSize:     2097152000, // too big!
   337  			opType:        OpTypeWrite,
   338  			wantDelta:     231 * time.Millisecond,
   339  			wantWriteSize: 1073740800, // (2^20-1) * writeSizePrecision ~= a bit less than one GB
   340  		},
   341  		// Write size in bytes is max representable less than the ceiling.
   342  		{
   343  			desc:          "write size barely not truncated",
   344  			delta:         231 * time.Millisecond,
   345  			writeSize:     1073739776, // max representable less than the ceiling
   346  			opType:        OpTypeWrite,
   347  			wantDelta:     231 * time.Millisecond,
   348  			wantWriteSize: 1073739776, // since can fit, unchanged
   349  		},
   350  	}
   351  	for _, tc := range testCases {
   352  		t.Run(tc.desc, func(t *testing.T) {
   353  			packed := pack(tc.delta, tc.writeSize, tc.opType)
   354  			gotDelta, gotWriteSize, gotOpType := unpack(packed)
   355  
   356  			require.Equal(t, tc.wantDelta, gotDelta)
   357  			require.Equal(t, tc.wantWriteSize, gotWriteSize)
   358  			require.Equal(t, tc.opType, gotOpType)
   359  		})
   360  	}
   361  }
   362  
   363  func TestDiskHealthChecking_File_Underflow(t *testing.T) {
   364  	f := &mockFile{}
   365  	hcFile := newDiskHealthCheckingFile(f, 1*time.Second, func(opType OpType, writeSizeInBytes int, duration time.Duration) {
   366  		// We expect to panic before sending the event.
   367  		t.Fatalf("unexpected slow disk event")
   368  	})
   369  	defer hcFile.Close()
   370  
   371  	t.Run("too large delta leads to panic", func(t *testing.T) {
   372  		// Given the packing scheme, 35 years of process uptime will lead to a delta
   373  		// that is too large to fit in the packed int64.
   374  		tCreate := time.Now().Add(-35 * time.Hour * 24 * 365)
   375  		hcFile.createTime = tCreate
   376  
   377  		// Assert that the time since tCreate (in milliseconds) is indeed greater
   378  		// than the max delta that can fit.
   379  		require.True(t, time.Since(tCreate).Milliseconds() > 1<<deltaBits-1)
   380  
   381  		// Attempting to start the clock for a new operation on the file should
   382  		// trigger a panic, as the calculated delta from the file creation time would
   383  		// result in integer overflow.
   384  		require.Panics(t, func() { _, _ = hcFile.Write([]byte("uh oh")) })
   385  	})
   386  	t.Run("pretty large delta but not too large leads to no panic", func(t *testing.T) {
   387  		// Given the packing scheme, 34 years of process uptime will lead to a delta
   388  		// that is just small enough to fit in the packed int64.
   389  		tCreate := time.Now().Add(-34 * time.Hour * 24 * 365)
   390  		hcFile.createTime = tCreate
   391  
   392  		require.True(t, time.Since(tCreate).Milliseconds() < 1<<deltaBits-1)
   393  		require.NotPanics(t, func() { _, _ = hcFile.Write([]byte("should be fine")) })
   394  	})
   395  }
   396  
   397  var (
   398  	errInjected = errors.New("injected error")
   399  )
   400  
   401  // filesystemOpsMockFS returns a filesystem that will block until it reads from
   402  // the provided channel on filesystem operations.
   403  func filesystemOpsMockFS(ch chan struct{}) *mockFS {
   404  	return &mockFS{
   405  		create: func(name string) (File, error) {
   406  			<-ch
   407  			return nil, errInjected
   408  		},
   409  		link: func(oldname, newname string) error {
   410  			<-ch
   411  			return errInjected
   412  		},
   413  		mkdirAll: func(string, os.FileMode) error {
   414  			<-ch
   415  			return errInjected
   416  		},
   417  		remove: func(name string) error {
   418  			<-ch
   419  			return errInjected
   420  		},
   421  		removeAll: func(name string) error {
   422  			<-ch
   423  			return errInjected
   424  		},
   425  		rename: func(oldname, newname string) error {
   426  			<-ch
   427  			return errInjected
   428  		},
   429  		reuseForWrite: func(oldname, newname string) (File, error) {
   430  			<-ch
   431  			return nil, errInjected
   432  		},
   433  	}
   434  }
   435  
   436  func stallFilesystemOperations(fs FS) []filesystemOperation {
   437  	return []filesystemOperation{
   438  		{
   439  			"create", OpTypeCreate, func() {
   440  				f, _ := fs.Create("foo")
   441  				if f != nil {
   442  					f.Close()
   443  				}
   444  			},
   445  		},
   446  		{
   447  			"link", OpTypeLink, func() { _ = fs.Link("foo", "bar") },
   448  		},
   449  		{
   450  			"mkdirall", OpTypeMkdirAll, func() { _ = fs.MkdirAll("foo", os.ModePerm) },
   451  		},
   452  		{
   453  			"remove", OpTypeRemove, func() { _ = fs.Remove("foo") },
   454  		},
   455  		{
   456  			"removeall", OpTypeRemoveAll, func() { _ = fs.RemoveAll("foo") },
   457  		},
   458  		{
   459  			"rename", OpTypeRename, func() { _ = fs.Rename("foo", "bar") },
   460  		},
   461  		{
   462  			"reuseforwrite", OpTypeReuseForWrite, func() { _, _ = fs.ReuseForWrite("foo", "bar") },
   463  		},
   464  	}
   465  }
   466  
   467  type filesystemOperation struct {
   468  	name   string
   469  	opType OpType
   470  	f      func()
   471  }
   472  
   473  func TestDiskHealthChecking_Filesystem(t *testing.T) {
   474  	const stallThreshold = 10 * time.Millisecond
   475  	if runtime.GOOS == "windows" {
   476  		t.Skipf("skipped on windows due to unreliable runtimes")
   477  	}
   478  
   479  	// Wrap with disk-health checking, counting each stall via stallCount.
   480  	var expectedOpType OpType
   481  	var stallCount atomic.Uint64
   482  	unstall := make(chan struct{})
   483  	var lastOpType OpType
   484  	fs, closer := WithDiskHealthChecks(filesystemOpsMockFS(unstall), stallThreshold,
   485  		func(info DiskSlowInfo) {
   486  			require.Equal(t, 0, info.WriteSize)
   487  			stallCount.Add(1)
   488  			if lastOpType != info.OpType {
   489  				require.Equal(t, expectedOpType, info.OpType)
   490  				lastOpType = info.OpType
   491  				// Sending on `unstall` releases the blocked filesystem
   492  				// operation, allowing the test to proceed.
   493  				unstall <- struct{}{}
   494  			}
   495  		})
   496  
   497  	defer closer.Close()
   498  	fs.(*diskHealthCheckingFS).tickInterval = 5 * time.Millisecond
   499  	ops := stallFilesystemOperations(fs)
   500  	for _, o := range ops {
   501  		t.Run(o.name, func(t *testing.T) {
   502  			expectedOpType = o.opType
   503  			before := stallCount.Load()
   504  			// o.f() will perform the filesystem operation and block within the
   505  			// mock filesystem until the disk stall detector notices the stall
   506  			// and sends to the `unstall` channel.
   507  			o.f()
   508  			after := stallCount.Load()
   509  			require.Greater(t, int(after-before), 0)
   510  		})
   511  	}
   512  }
   513  
   514  // TestDiskHealthChecking_Filesystem_Close tests the behavior of repeatedly
   515  // closing and reusing a filesystem wrapped by WithDiskHealthChecks. This is a
   516  // permitted usage because it allows (*pebble.Options).EnsureDefaults to wrap
   517  // with disk-health checking by default, and to clean up the long-running
   518  // goroutine on (*pebble.DB).Close, while still allowing the FS to be used
   519  // multiple times.
   520  func TestDiskHealthChecking_Filesystem_Close(t *testing.T) {
   521  	const stallThreshold = 10 * time.Millisecond
   522  	stallChan := make(chan struct{}, 1)
   523  	mockFS := &mockFS{
   524  		create: func(name string) (File, error) {
   525  			<-stallChan
   526  			return &mockFile{}, nil
   527  		},
   528  	}
   529  
   530  	files := []string{"foo", "bar", "bax"}
   531  	var lastPath string
   532  	stalled := make(chan string)
   533  	fs, closer := WithDiskHealthChecks(mockFS, stallThreshold,
   534  		func(info DiskSlowInfo) {
   535  			if lastPath != info.Path {
   536  				lastPath = info.Path
   537  				stalled <- info.Path
   538  			}
   539  		})
   540  	fs.(*diskHealthCheckingFS).tickInterval = 5 * time.Millisecond
   541  
   542  	var wg sync.WaitGroup
   543  	for _, filename := range files {
   544  		filename := filename
   545  		// Create will stall, and the detector should write to the stalled channel
   546  		// with the filename.
   547  		wg.Add(1)
   548  		go func() {
   549  			defer wg.Done()
   550  			f, _ := fs.Create(filename)
   551  			if f != nil {
   552  				f.Close()
   553  			}
   554  		}()
   555  
   556  		select {
   557  		case stalledPath := <-stalled:
   558  			require.Equal(t, filename, stalledPath)
   559  		case <-time.After(10 * time.Second):
   560  			t.Fatalf("timed out waiting for stall")
   561  		}
   562  		// Unblock the call to Create().
   563  		stallChan <- struct{}{}
   564  
   565  		// Invoke the closer. This will cause the long-running goroutine to
   566  		// exit, but the fs should still be usable and should still detect
   567  		// subsequent stalls on the next iteration.
   568  		require.NoError(t, closer.Close())
   569  	}
   570  	wg.Wait()
   571  }