github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/vfs/disk_health_test.go (about) 1 // Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package vfs 6 7 import ( 8 "io" 9 "math" 10 "os" 11 "runtime" 12 "sync" 13 "sync/atomic" 14 "testing" 15 "time" 16 17 "github.com/cockroachdb/errors" 18 "github.com/stretchr/testify/require" 19 ) 20 21 type mockFile struct { 22 syncAndWriteDuration time.Duration 23 } 24 25 func (m mockFile) Close() error { 26 return nil 27 } 28 29 func (m mockFile) Read(p []byte) (n int, err error) { 30 panic("unimplemented") 31 } 32 33 func (m mockFile) ReadAt(p []byte, off int64) (n int, err error) { 34 panic("unimplemented") 35 } 36 37 func (m mockFile) Write(p []byte) (n int, err error) { 38 time.Sleep(m.syncAndWriteDuration) 39 return len(p), nil 40 } 41 42 func (m mockFile) WriteAt(p []byte, ofs int64) (n int, err error) { 43 time.Sleep(m.syncAndWriteDuration) 44 return len(p), nil 45 } 46 47 func (m mockFile) Prefetch(offset, length int64) error { 48 panic("unimplemented") 49 } 50 51 func (m mockFile) Preallocate(int64, int64) error { 52 time.Sleep(m.syncAndWriteDuration) 53 return nil 54 } 55 56 func (m mockFile) Stat() (os.FileInfo, error) { 57 panic("unimplemented") 58 } 59 60 func (m mockFile) Fd() uintptr { 61 return InvalidFd 62 } 63 64 func (m mockFile) Sync() error { 65 time.Sleep(m.syncAndWriteDuration) 66 return nil 67 } 68 69 func (m mockFile) SyncData() error { 70 time.Sleep(m.syncAndWriteDuration) 71 return nil 72 } 73 74 func (m mockFile) SyncTo(int64) (fullSync bool, err error) { 75 time.Sleep(m.syncAndWriteDuration) 76 return false, nil 77 } 78 79 var _ File = &mockFile{} 80 81 type mockFS struct { 82 create func(string) (File, error) 83 link func(string, string) error 84 list func(string) ([]string, error) 85 lock func(string) (io.Closer, error) 86 mkdirAll func(string, os.FileMode) error 87 open func(string, ...OpenOption) (File, error) 88 openDir func(string) (File, error) 89 pathBase func(string) string 90 pathJoin func(...string) string 91 pathDir func(string) string 92 remove func(string) error 93 removeAll func(string) error 94 rename func(string, string) error 95 reuseForWrite func(string, string) (File, error) 96 stat func(string) (os.FileInfo, error) 97 getDiskUsage func(string) (DiskUsage, error) 98 } 99 100 func (m mockFS) Create(name string) (File, error) { 101 if m.create == nil { 102 panic("unimplemented") 103 } 104 return m.create(name) 105 } 106 107 func (m mockFS) Link(oldname, newname string) error { 108 if m.link == nil { 109 panic("unimplemented") 110 } 111 return m.link(oldname, newname) 112 } 113 114 func (m mockFS) Open(name string, opts ...OpenOption) (File, error) { 115 if m.open == nil { 116 panic("unimplemented") 117 } 118 return m.open(name, opts...) 119 } 120 121 func (m mockFS) OpenReadWrite(name string, opts ...OpenOption) (File, error) { 122 panic("unimplemented") 123 } 124 125 func (m mockFS) OpenDir(name string) (File, error) { 126 if m.openDir == nil { 127 panic("unimplemented") 128 } 129 return m.openDir(name) 130 } 131 132 func (m mockFS) Remove(name string) error { 133 if m.remove == nil { 134 panic("unimplemented") 135 } 136 return m.remove(name) 137 } 138 139 func (m mockFS) RemoveAll(name string) error { 140 if m.removeAll == nil { 141 panic("unimplemented") 142 } 143 return m.removeAll(name) 144 } 145 146 func (m mockFS) Rename(oldname, newname string) error { 147 if m.rename == nil { 148 panic("unimplemented") 149 } 150 return m.rename(oldname, newname) 151 } 152 153 func (m mockFS) ReuseForWrite(oldname, newname string) (File, error) { 154 if m.reuseForWrite == nil { 155 panic("unimplemented") 156 } 157 return m.reuseForWrite(oldname, newname) 158 } 159 160 func (m mockFS) MkdirAll(dir string, perm os.FileMode) error { 161 if m.mkdirAll == nil { 162 panic("unimplemented") 163 } 164 return m.mkdirAll(dir, perm) 165 } 166 167 func (m mockFS) Lock(name string) (io.Closer, error) { 168 if m.lock == nil { 169 panic("unimplemented") 170 } 171 return m.lock(name) 172 } 173 174 func (m mockFS) List(dir string) ([]string, error) { 175 if m.list == nil { 176 panic("unimplemented") 177 } 178 return m.list(dir) 179 } 180 181 func (m mockFS) Stat(name string) (os.FileInfo, error) { 182 if m.stat == nil { 183 panic("unimplemented") 184 } 185 return m.stat(name) 186 } 187 188 func (m mockFS) PathBase(path string) string { 189 if m.pathBase == nil { 190 panic("unimplemented") 191 } 192 return m.pathBase(path) 193 } 194 195 func (m mockFS) PathJoin(elem ...string) string { 196 if m.pathJoin == nil { 197 panic("unimplemented") 198 } 199 return m.pathJoin(elem...) 200 } 201 202 func (m mockFS) PathDir(path string) string { 203 if m.pathDir == nil { 204 panic("unimplemented") 205 } 206 return m.pathDir(path) 207 } 208 209 func (m mockFS) GetDiskUsage(path string) (DiskUsage, error) { 210 if m.getDiskUsage == nil { 211 panic("unimplemented") 212 } 213 return m.getDiskUsage(path) 214 } 215 216 var _ FS = &mockFS{} 217 218 func TestDiskHealthChecking_File(t *testing.T) { 219 oldTickInterval := defaultTickInterval 220 defaultTickInterval = time.Millisecond 221 if runtime.GOOS == "windows" { 222 t.Skipf("skipped on windows due to unreliable runtimes") 223 } 224 225 defer func() { defaultTickInterval = oldTickInterval }() 226 227 const ( 228 slowThreshold = 50 * time.Millisecond 229 ) 230 231 fiveKB := make([]byte, 5*writeSizePrecision) 232 testCases := []struct { 233 op OpType 234 writeSize int 235 writeDuration time.Duration 236 fn func(f File) 237 createWriteDelta time.Duration 238 }{ 239 { 240 op: OpTypeWrite, 241 writeSize: 5 * writeSizePrecision, // five KB 242 writeDuration: 100 * time.Millisecond, 243 fn: func(f File) { f.Write(fiveKB) }, 244 }, 245 { 246 op: OpTypeSync, 247 writeSize: 0, 248 writeDuration: 100 * time.Millisecond, 249 fn: func(f File) { f.Sync() }, 250 }, 251 } 252 for _, tc := range testCases { 253 t.Run(tc.op.String(), func(t *testing.T) { 254 diskSlow := make(chan DiskSlowInfo, 3) 255 mockFS := &mockFS{create: func(name string) (File, error) { 256 return mockFile{syncAndWriteDuration: tc.writeDuration}, nil 257 }} 258 fs, closer := WithDiskHealthChecks(mockFS, slowThreshold, 259 func(info DiskSlowInfo) { 260 diskSlow <- info 261 }) 262 defer closer.Close() 263 dhFile, _ := fs.Create("test") 264 defer dhFile.Close() 265 266 // Writing after file creation tests computation of delta between file 267 // creation time & write time. 268 time.Sleep(tc.createWriteDelta) 269 270 tc.fn(dhFile) 271 272 select { 273 case i := <-diskSlow: 274 d := i.Duration 275 if d.Seconds() < slowThreshold.Seconds() { 276 t.Fatalf("expected %0.1f to be greater than threshold %0.1f", d.Seconds(), slowThreshold.Seconds()) 277 } 278 require.Equal(t, tc.writeSize, i.WriteSize) 279 require.Equal(t, tc.op, i.OpType) 280 case <-time.After(10 * time.Second): 281 t.Fatal("disk stall detector did not detect slow disk operation") 282 } 283 }) 284 } 285 } 286 287 func TestDiskHealthChecking_NotTooManyOps(t *testing.T) { 288 numBitsForOpType := 64 - deltaBits - writeSizeBits 289 numOpTypesAllowed := int(math.Pow(2, float64(numBitsForOpType))) 290 numOpTypes := int(opTypeMax) 291 require.LessOrEqual(t, numOpTypes, numOpTypesAllowed) 292 } 293 294 func TestDiskHealthChecking_File_PackingAndUnpacking(t *testing.T) { 295 testCases := []struct { 296 desc string 297 delta time.Duration 298 writeSize int64 299 opType OpType 300 wantDelta time.Duration 301 wantWriteSize int 302 }{ 303 // Write op with write size in bytes. 304 { 305 desc: "write, sized op", 306 delta: 3000 * time.Millisecond, 307 writeSize: 1024, // 1 KB. 308 opType: OpTypeWrite, 309 wantDelta: 3000 * time.Millisecond, 310 wantWriteSize: 1024, 311 }, 312 // Sync op. No write size. Max-ish delta that packing scheme can handle. 313 { 314 desc: "sync, no write size", 315 delta: 34 * time.Hour * 24 * 365, 316 writeSize: 0, 317 opType: OpTypeSync, 318 wantDelta: 34 * time.Hour * 24 * 365, 319 wantWriteSize: 0, 320 }, 321 // Delta is negative (e.g. due to clock sync). Set to 322 // zero. 323 { 324 desc: "delta negative", 325 delta: -5, 326 writeSize: 5120, // 5 KB 327 opType: OpTypeWrite, 328 wantDelta: 0, 329 wantWriteSize: 5120, 330 }, 331 // Write size in bytes is larger than can fit in 20 bits. 332 // Round down to max that can fit in 20 bits. 333 { 334 desc: "write size truncated", 335 delta: 231 * time.Millisecond, 336 writeSize: 2097152000, // too big! 337 opType: OpTypeWrite, 338 wantDelta: 231 * time.Millisecond, 339 wantWriteSize: 1073740800, // (2^20-1) * writeSizePrecision ~= a bit less than one GB 340 }, 341 // Write size in bytes is max representable less than the ceiling. 342 { 343 desc: "write size barely not truncated", 344 delta: 231 * time.Millisecond, 345 writeSize: 1073739776, // max representable less than the ceiling 346 opType: OpTypeWrite, 347 wantDelta: 231 * time.Millisecond, 348 wantWriteSize: 1073739776, // since can fit, unchanged 349 }, 350 } 351 for _, tc := range testCases { 352 t.Run(tc.desc, func(t *testing.T) { 353 packed := pack(tc.delta, tc.writeSize, tc.opType) 354 gotDelta, gotWriteSize, gotOpType := unpack(packed) 355 356 require.Equal(t, tc.wantDelta, gotDelta) 357 require.Equal(t, tc.wantWriteSize, gotWriteSize) 358 require.Equal(t, tc.opType, gotOpType) 359 }) 360 } 361 } 362 363 func TestDiskHealthChecking_File_Underflow(t *testing.T) { 364 f := &mockFile{} 365 hcFile := newDiskHealthCheckingFile(f, 1*time.Second, func(opType OpType, writeSizeInBytes int, duration time.Duration) { 366 // We expect to panic before sending the event. 367 t.Fatalf("unexpected slow disk event") 368 }) 369 defer hcFile.Close() 370 371 t.Run("too large delta leads to panic", func(t *testing.T) { 372 // Given the packing scheme, 35 years of process uptime will lead to a delta 373 // that is too large to fit in the packed int64. 374 tCreate := time.Now().Add(-35 * time.Hour * 24 * 365) 375 hcFile.createTime = tCreate 376 377 // Assert that the time since tCreate (in milliseconds) is indeed greater 378 // than the max delta that can fit. 379 require.True(t, time.Since(tCreate).Milliseconds() > 1<<deltaBits-1) 380 381 // Attempting to start the clock for a new operation on the file should 382 // trigger a panic, as the calculated delta from the file creation time would 383 // result in integer overflow. 384 require.Panics(t, func() { _, _ = hcFile.Write([]byte("uh oh")) }) 385 }) 386 t.Run("pretty large delta but not too large leads to no panic", func(t *testing.T) { 387 // Given the packing scheme, 34 years of process uptime will lead to a delta 388 // that is just small enough to fit in the packed int64. 389 tCreate := time.Now().Add(-34 * time.Hour * 24 * 365) 390 hcFile.createTime = tCreate 391 392 require.True(t, time.Since(tCreate).Milliseconds() < 1<<deltaBits-1) 393 require.NotPanics(t, func() { _, _ = hcFile.Write([]byte("should be fine")) }) 394 }) 395 } 396 397 var ( 398 errInjected = errors.New("injected error") 399 ) 400 401 // filesystemOpsMockFS returns a filesystem that will block until it reads from 402 // the provided channel on filesystem operations. 403 func filesystemOpsMockFS(ch chan struct{}) *mockFS { 404 return &mockFS{ 405 create: func(name string) (File, error) { 406 <-ch 407 return nil, errInjected 408 }, 409 link: func(oldname, newname string) error { 410 <-ch 411 return errInjected 412 }, 413 mkdirAll: func(string, os.FileMode) error { 414 <-ch 415 return errInjected 416 }, 417 remove: func(name string) error { 418 <-ch 419 return errInjected 420 }, 421 removeAll: func(name string) error { 422 <-ch 423 return errInjected 424 }, 425 rename: func(oldname, newname string) error { 426 <-ch 427 return errInjected 428 }, 429 reuseForWrite: func(oldname, newname string) (File, error) { 430 <-ch 431 return nil, errInjected 432 }, 433 } 434 } 435 436 func stallFilesystemOperations(fs FS) []filesystemOperation { 437 return []filesystemOperation{ 438 { 439 "create", OpTypeCreate, func() { 440 f, _ := fs.Create("foo") 441 if f != nil { 442 f.Close() 443 } 444 }, 445 }, 446 { 447 "link", OpTypeLink, func() { _ = fs.Link("foo", "bar") }, 448 }, 449 { 450 "mkdirall", OpTypeMkdirAll, func() { _ = fs.MkdirAll("foo", os.ModePerm) }, 451 }, 452 { 453 "remove", OpTypeRemove, func() { _ = fs.Remove("foo") }, 454 }, 455 { 456 "removeall", OpTypeRemoveAll, func() { _ = fs.RemoveAll("foo") }, 457 }, 458 { 459 "rename", OpTypeRename, func() { _ = fs.Rename("foo", "bar") }, 460 }, 461 { 462 "reuseforwrite", OpTypeReuseForWrite, func() { _, _ = fs.ReuseForWrite("foo", "bar") }, 463 }, 464 } 465 } 466 467 type filesystemOperation struct { 468 name string 469 opType OpType 470 f func() 471 } 472 473 func TestDiskHealthChecking_Filesystem(t *testing.T) { 474 const stallThreshold = 10 * time.Millisecond 475 if runtime.GOOS == "windows" { 476 t.Skipf("skipped on windows due to unreliable runtimes") 477 } 478 479 // Wrap with disk-health checking, counting each stall via stallCount. 480 var expectedOpType OpType 481 var stallCount atomic.Uint64 482 unstall := make(chan struct{}) 483 var lastOpType OpType 484 fs, closer := WithDiskHealthChecks(filesystemOpsMockFS(unstall), stallThreshold, 485 func(info DiskSlowInfo) { 486 require.Equal(t, 0, info.WriteSize) 487 stallCount.Add(1) 488 if lastOpType != info.OpType { 489 require.Equal(t, expectedOpType, info.OpType) 490 lastOpType = info.OpType 491 // Sending on `unstall` releases the blocked filesystem 492 // operation, allowing the test to proceed. 493 unstall <- struct{}{} 494 } 495 }) 496 497 defer closer.Close() 498 fs.(*diskHealthCheckingFS).tickInterval = 5 * time.Millisecond 499 ops := stallFilesystemOperations(fs) 500 for _, o := range ops { 501 t.Run(o.name, func(t *testing.T) { 502 expectedOpType = o.opType 503 before := stallCount.Load() 504 // o.f() will perform the filesystem operation and block within the 505 // mock filesystem until the disk stall detector notices the stall 506 // and sends to the `unstall` channel. 507 o.f() 508 after := stallCount.Load() 509 require.Greater(t, int(after-before), 0) 510 }) 511 } 512 } 513 514 // TestDiskHealthChecking_Filesystem_Close tests the behavior of repeatedly 515 // closing and reusing a filesystem wrapped by WithDiskHealthChecks. This is a 516 // permitted usage because it allows (*pebble.Options).EnsureDefaults to wrap 517 // with disk-health checking by default, and to clean up the long-running 518 // goroutine on (*pebble.DB).Close, while still allowing the FS to be used 519 // multiple times. 520 func TestDiskHealthChecking_Filesystem_Close(t *testing.T) { 521 const stallThreshold = 10 * time.Millisecond 522 stallChan := make(chan struct{}, 1) 523 mockFS := &mockFS{ 524 create: func(name string) (File, error) { 525 <-stallChan 526 return &mockFile{}, nil 527 }, 528 } 529 530 files := []string{"foo", "bar", "bax"} 531 var lastPath string 532 stalled := make(chan string) 533 fs, closer := WithDiskHealthChecks(mockFS, stallThreshold, 534 func(info DiskSlowInfo) { 535 if lastPath != info.Path { 536 lastPath = info.Path 537 stalled <- info.Path 538 } 539 }) 540 fs.(*diskHealthCheckingFS).tickInterval = 5 * time.Millisecond 541 542 var wg sync.WaitGroup 543 for _, filename := range files { 544 filename := filename 545 // Create will stall, and the detector should write to the stalled channel 546 // with the filename. 547 wg.Add(1) 548 go func() { 549 defer wg.Done() 550 f, _ := fs.Create(filename) 551 if f != nil { 552 f.Close() 553 } 554 }() 555 556 select { 557 case stalledPath := <-stalled: 558 require.Equal(t, filename, stalledPath) 559 case <-time.After(10 * time.Second): 560 t.Fatalf("timed out waiting for stall") 561 } 562 // Unblock the call to Create(). 563 stallChan <- struct{}{} 564 565 // Invoke the closer. This will cause the long-running goroutine to 566 // exit, but the fs should still be usable and should still detect 567 // subsequent stalls on the next iteration. 568 require.NoError(t, closer.Close()) 569 } 570 wg.Wait() 571 }