github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/vfs/disk_health.go

// Copyright 2020 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package vfs

import (
	"fmt"
	"io"
	"os"
	"sync"
	"sync/atomic"
	"time"
)

const (
	// defaultTickInterval is the default interval between ticks of each
	// diskHealthCheckingFile monitoring loop.
	defaultTickInterval = 2 * time.Second
	// preallocatedSlotCount is the default number of slots available for
	// concurrent filesystem operations. The slot count may be exceeded, but
	// each additional slot will incur an additional allocation. We choose 16
	// here with the expectation that it is significantly more than required in
	// practice. See the comment above the diskHealthCheckingFS type definition.
	preallocatedSlotCount = 16
)

// diskHealthCheckingFile is a File wrapper to detect slow disk operations, and
// call onSlowDisk if a disk operation is seen to exceed diskSlowThreshold.
//
// This struct creates a goroutine (in startTicker()) that, at every tick
// interval, sees if there's a disk operation taking longer than the specified
// duration. This setup is preferable to creating a new timer at every disk
// operation, as it reduces overhead per disk operation.
type diskHealthCheckingFile struct {
	File

	onSlowDisk        func(time.Duration)
	diskSlowThreshold time.Duration
	tickInterval      time.Duration

	stopper        chan struct{}
	lastWriteNanos int64
}

// newDiskHealthCheckingFile instantiates a new diskHealthCheckingFile, with the
// specified time threshold and event listener.
func newDiskHealthCheckingFile(
	file File, diskSlowThreshold time.Duration, onSlowDisk func(time.Duration),
) *diskHealthCheckingFile {
	return &diskHealthCheckingFile{
		File:              file,
		onSlowDisk:        onSlowDisk,
		diskSlowThreshold: diskSlowThreshold,
		tickInterval:      defaultTickInterval,

		stopper: make(chan struct{}),
	}
}

// startTicker starts a new goroutine with a ticker to monitor disk operations.
// It must only be called when the ticker goroutine isn't already running.
func (d *diskHealthCheckingFile) startTicker() {
	if d.diskSlowThreshold == 0 {
		return
	}

	go func() {
		ticker := time.NewTicker(d.tickInterval)
		defer ticker.Stop()

		for {
			select {
			case <-d.stopper:
				return

			case <-ticker.C:
				lastWriteNanos := atomic.LoadInt64(&d.lastWriteNanos)
				if lastWriteNanos == 0 {
					continue
				}
				lastWrite := time.Unix(0, lastWriteNanos)
				now := time.Now()
				if lastWrite.Add(d.diskSlowThreshold).Before(now) {
					// diskSlowThreshold was exceeded. Call the passed-in
					// listener.
					d.onSlowDisk(now.Sub(lastWrite))
				}
			}
		}
	}()
}

// stopTicker stops the goroutine started in startTicker.
func (d *diskHealthCheckingFile) stopTicker() {
	close(d.stopper)
}

// Write implements the io.Writer interface.
func (d *diskHealthCheckingFile) Write(p []byte) (n int, err error) {
	d.timeDiskOp(func() {
		n, err = d.File.Write(p)
	})
	return n, err
}

// Close implements the io.Closer interface.
func (d *diskHealthCheckingFile) Close() error {
	d.stopTicker()
	return d.File.Close()
}
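// The following is an illustrative sketch, not part of the original file: it
// shows how a File might be wrapped directly with newDiskHealthCheckingFile.
// The Default FS (the real-disk FS in this package, as in upstream Pebble),
// the 100ms threshold, and the printing callback are arbitrary choices, and
// exampleWrapFile is a hypothetical helper.
func exampleWrapFile() error {
	f, err := Default.Create("example-file")
	if err != nil {
		return err
	}
	checkingFile := newDiskHealthCheckingFile(f, 100*time.Millisecond,
		func(duration time.Duration) {
			fmt.Printf("disk write stalled for %s\n", duration)
		})
	// Monitoring only happens between startTicker and Close; the callback
	// fires only if a write is still in flight past the threshold when the
	// ticker goroutine wakes up.
	checkingFile.startTicker()
	defer checkingFile.Close()

	_, err = checkingFile.Write([]byte("hello"))
	return err
}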
// Sync implements the vfs.File interface.
func (d *diskHealthCheckingFile) Sync() (err error) {
	d.timeDiskOp(func() {
		err = d.File.Sync()
	})
	return err
}

// timeDiskOp runs the specified closure and makes its timing visible to the
// monitoring goroutine, in case it exceeds the slow disk threshold.
func (d *diskHealthCheckingFile) timeDiskOp(op func()) {
	if d == nil {
		op()
		return
	}

	atomic.StoreInt64(&d.lastWriteNanos, time.Now().UnixNano())
	defer func() {
		atomic.StoreInt64(&d.lastWriteNanos, 0)
	}()
	op()
}
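// For contrast, the per-operation timer approach that the doc comment above
// diskHealthCheckingFile deliberately avoids might look like the hypothetical
// sketch below. Arming and stopping a fresh timer around every Write or Sync
// allocates and schedules a runtime timer per operation; the shared-ticker
// design amortizes that cost across all operations on the file.
func examplePerOpTimer(op func(), threshold time.Duration, onSlow func(time.Duration)) {
	start := time.Now()
	// time.AfterFunc fires onSlow in its own goroutine once threshold
	// elapses, unless the operation completes first and Stop wins.
	t := time.AfterFunc(threshold, func() {
		onSlow(time.Since(start))
	})
	defer t.Stop()
	op()
}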
// diskHealthCheckingFS adds disk-health checking facilities to a VFS.
// It times disk write operations in two ways:
//
// 1. Wrapping vfs.Files.
//
// The bulk of write I/O activity is file writing and syncing, invoked through
// the `vfs.File` interface. This VFS wraps all files open for writing with a
// special diskHealthCheckingFile implementation of the vfs.File interface. See
// above for the implementation.
//
// 2. Monitoring filesystem metadata operations.
//
// Filesystem metadata operations (create, link, remove, rename, etc.) are also
// sources of disk writes. Unlike a vfs.File, which requires Write and Sync
// calls to be sequential, a vfs.FS may receive these filesystem metadata
// operations in parallel. To accommodate this parallelism, the
// diskHealthCheckingFS's write-oriented filesystem operations record their
// start times into a 'slot' on the filesystem. A single long-running goroutine
// periodically scans the slots looking for slow operations.
//
// The number of slots on a diskHealthCheckingFS grows to a working set of the
// maximum concurrent filesystem operations. This is expected to be very few
// for these reasons:
//  1. Pebble has limited write concurrency. Flushes, compactions and WAL
//     rotations are the primary sources of filesystem metadata operations.
//     With the default max-compaction concurrency, these operations require
//     at most 5 concurrent slots if all 5 perform a filesystem metadata
//     operation simultaneously.
//  2. Pebble's limited concurrent I/O writers spend most of their time
//     performing file I/O, not performing the filesystem metadata operations
//     that require recording a slot on the diskHealthCheckingFS.
//  3. In CockroachDB, each additional store/Pebble instance has its own
//     vfs.FS which provides a separate goroutine and set of slots.
//  4. In CockroachDB, many of the additional sources of filesystem metadata
//     operations (like encryption-at-rest) are sequential with respect to
//     Pebble's threads.
type diskHealthCheckingFS struct {
	tickInterval      time.Duration
	diskSlowThreshold time.Duration
	onSlowDisk        func(string, time.Duration)
	fs                FS
	mu                struct {
		sync.Mutex
		tickerRunning bool
		stopper       chan struct{}
		inflight      []*slot
	}
	// prealloc preallocates the memory for mu.inflight slots and the slice
	// itself. The contained fields are not accessed directly except by
	// WithDiskHealthChecks when initializing mu.inflight. The number of slots
	// in d.mu.inflight will grow to the maximum number of concurrent file
	// metadata operations (create, remove, link, etc.). If the number of
	// concurrent operations never exceeds preallocatedSlotCount, we'll never
	// incur an additional allocation.
	prealloc struct {
		slots        [preallocatedSlotCount]slot
		slotPtrSlice [preallocatedSlotCount]*slot
	}
}

type slot struct {
	name       string
	startNanos int64
}

// diskHealthCheckingFS implements FS.
var _ FS = (*diskHealthCheckingFS)(nil)

// WithDiskHealthChecks wraps an FS and ensures that all write-oriented
// operations on the FS are wrapped with disk health detection checks. Disk
// operations that are observed to take longer than diskSlowThreshold trigger
// an onSlowDisk call.
//
// A threshold of zero disables disk-health checking.
func WithDiskHealthChecks(
	innerFS FS, diskSlowThreshold time.Duration, onSlowDisk func(string, time.Duration),
) (FS, io.Closer) {
	if diskSlowThreshold == 0 {
		return innerFS, noopCloser{}
	}

	fs := &diskHealthCheckingFS{
		fs:                innerFS,
		tickInterval:      defaultTickInterval,
		diskSlowThreshold: diskSlowThreshold,
		onSlowDisk:        onSlowDisk,
	}
	fs.mu.stopper = make(chan struct{})
	// The fs holds preallocated slots and a preallocated array of slot
	// pointers with equal length. Initialize the inflight slice to use a
	// slice backed by the preallocated array with each slot initialized to a
	// preallocated slot.
	fs.mu.inflight = fs.prealloc.slotPtrSlice[:]
	for i := range fs.mu.inflight {
		fs.mu.inflight[i] = &fs.prealloc.slots[i]
	}
	return fs, fs
}

func (d *diskHealthCheckingFS) timeFilesystemOp(name string, op func()) {
	if d == nil {
		op()
		return
	}

	// Record this operation's start time on the FS, so that the long-running
	// goroutine can monitor the filesystem operation.
	//
	// The diskHealthCheckingFile implementation uses a single field that is
	// atomically updated, taking advantage of the fact that writes to a single
	// vfs.File handle are not performed in parallel. The vfs.FS however may
	// receive write filesystem operations in parallel. To accommodate this
	// parallelism, writing goroutines append their start time to a
	// mutex-protected vector. On ticks, the long-running goroutine scans the
	// vector searching for start times older than the slow-disk threshold.
	// When a writing goroutine completes its operation, it atomically
	// overwrites its slot to signal completion.
	var s *slot
	func() {
		d.mu.Lock()
		defer d.mu.Unlock()

		// If there's no long-running goroutine to monitor this filesystem
		// operation, start one.
		if !d.mu.tickerRunning {
			d.startTickerLocked()
		}

		startNanos := time.Now().UnixNano()
		for i := 0; i < len(d.mu.inflight); i++ {
			if atomic.LoadInt64(&d.mu.inflight[i].startNanos) == 0 {
				// This slot is not in use. Claim it.
				s = d.mu.inflight[i]
				s.name = name
				atomic.StoreInt64(&s.startNanos, startNanos)
				break
			}
		}
		// If we didn't find any unused slots, create a new slot and append
		// it. This slot will exist forever. The number of slots will grow to
		// the maximum number of concurrent filesystem operations over the
		// lifetime of the process. Only operations that grow the number of
		// slots must incur an allocation.
		if s == nil {
			s = &slot{
				name:       name,
				startNanos: startNanos,
			}
			d.mu.inflight = append(d.mu.inflight, s)
		}
	}()

	op()

	// Signal completion by zeroing the start time.
	atomic.StoreInt64(&s.startNanos, 0)
}
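// An illustrative sketch of the intended entry point, not part of the
// original file: WithDiskHealthChecks is the documented way to enable
// FS-level health checking. It assumes the package's Default FS; the 5s
// threshold, the file name, and the printing callback are arbitrary, and
// exampleWithDiskHealthChecks is a hypothetical helper.
func exampleWithDiskHealthChecks() error {
	fs, closer := WithDiskHealthChecks(Default, 5*time.Second,
		func(path string, duration time.Duration) {
			fmt.Printf("disk operation on %s stalled for %s\n", path, duration)
		})
	// Closing the returned io.Closer stops the monitoring goroutine.
	defer closer.Close()

	// All write-oriented operations are now timed; Create additionally wraps
	// the returned File with a diskHealthCheckingFile.
	f, err := fs.Create("example")
	if err != nil {
		return err
	}
	defer f.Close()
	_, err = f.Write([]byte("hello"))
	return err
}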
// startTickerLocked starts a new goroutine with a ticker to monitor disk
// filesystem operations. Requires d.mu and !d.mu.tickerRunning.
func (d *diskHealthCheckingFS) startTickerLocked() {
	d.mu.tickerRunning = true
	stopper := d.mu.stopper
	go func() {
		ticker := time.NewTicker(d.tickInterval)
		defer ticker.Stop()

		for {
			select {
			case <-ticker.C:
				// Scan the inflight slots for any slots recording a start
				// time older than the diskSlowThreshold.
				d.mu.Lock()
				now := time.Now()
				for i := range d.mu.inflight {
					nanos := atomic.LoadInt64(&d.mu.inflight[i].startNanos)
					if nanos != 0 && time.Unix(0, nanos).Add(d.diskSlowThreshold).Before(now) {
						// diskSlowThreshold was exceeded. Invoke the provided
						// callback.
						d.onSlowDisk(d.mu.inflight[i].name, now.Sub(time.Unix(0, nanos)))
					}
				}
				d.mu.Unlock()
			case <-stopper:
				return
			}
		}
	}()
}

// Close implements io.Closer. Close stops the long-running goroutine that
// monitors for slow filesystem metadata operations. Close may be called
// multiple times. If the filesystem is used after Close has been called, a new
// long-running goroutine will be created.
func (d *diskHealthCheckingFS) Close() error {
	d.mu.Lock()
	if !d.mu.tickerRunning {
		// Nothing to stop.
		d.mu.Unlock()
		return nil
	}

	// Grab the stopper so we can request the long-running goroutine to stop.
	// Replace the stopper in case this FS is reused. It's possible to Close
	// and reuse a disk-health checking FS. This is to accommodate the
	// on-by-default behavior in Pebble, and the possibility that users may
	// continue to use the Pebble default FS beyond the lifetime of a single
	// DB.
	stopper := d.mu.stopper
	d.mu.stopper = make(chan struct{})
	d.mu.tickerRunning = false
	d.mu.Unlock()

	// Ask the long-running goroutine to stop. This is a synchronous channel
	// send.
	stopper <- struct{}{}
	close(stopper)
	return nil
}

// Create implements the FS interface.
func (d *diskHealthCheckingFS) Create(name string) (File, error) {
	var f File
	var err error
	d.timeFilesystemOp(name, func() {
		f, err = d.fs.Create(name)
	})
	if err != nil {
		return f, err
	}
	if d.diskSlowThreshold == 0 {
		return f, nil
	}
	checkingFile := newDiskHealthCheckingFile(f, d.diskSlowThreshold, func(duration time.Duration) {
		d.onSlowDisk(name, duration)
	})
	checkingFile.startTicker()
	return WithFd(f, checkingFile), nil
}

// GetDiskUsage implements the FS interface.
func (d *diskHealthCheckingFS) GetDiskUsage(path string) (DiskUsage, error) {
	return d.fs.GetDiskUsage(path)
}

// Link implements the FS interface.
func (d *diskHealthCheckingFS) Link(oldname, newname string) error {
	var err error
	d.timeFilesystemOp(newname, func() {
		err = d.fs.Link(oldname, newname)
	})
	return err
}

// List implements the FS interface.
func (d *diskHealthCheckingFS) List(dir string) ([]string, error) {
	return d.fs.List(dir)
}

// Lock implements the FS interface.
func (d *diskHealthCheckingFS) Lock(name string) (io.Closer, error) {
	return d.fs.Lock(name)
}
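// An illustrative sketch of the Close-and-reuse behavior documented on Close
// above, not part of the original file. It assumes the package's NewMem
// in-memory FS (as in upstream Pebble); the names and threshold are
// arbitrary, and exampleCloseAndReuse is a hypothetical helper.
func exampleCloseAndReuse() error {
	fs, closer := WithDiskHealthChecks(NewMem(), time.Second,
		func(path string, duration time.Duration) {
			fmt.Printf("slow operation on %s: %s\n", path, duration)
		})
	// The first timed operation lazily starts the monitoring goroutine.
	f, err := fs.Create("a")
	if err != nil {
		return err
	}
	if err := f.Close(); err != nil {
		return err
	}
	// Close stops the goroutine; the FS itself remains usable.
	if err := closer.Close(); err != nil {
		return err
	}
	// Reusing the FS transparently starts a fresh monitoring goroutine.
	f, err = fs.ReuseForWrite("a", "b")
	if err != nil {
		return err
	}
	return f.Close()
}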
// MkdirAll implements the FS interface.
func (d *diskHealthCheckingFS) MkdirAll(dir string, perm os.FileMode) error {
	var err error
	d.timeFilesystemOp(dir, func() {
		err = d.fs.MkdirAll(dir, perm)
	})
	return err
}

// Open implements the FS interface.
func (d *diskHealthCheckingFS) Open(name string, opts ...OpenOption) (File, error) {
	return d.fs.Open(name, opts...)
}

// OpenDir implements the FS interface.
func (d *diskHealthCheckingFS) OpenDir(name string) (File, error) {
	f, err := d.fs.OpenDir(name)
	if err != nil {
		return f, err
	}
	// Directories opened with OpenDir must be opened with health checking,
	// because they may be explicitly synced.
	checkingFile := newDiskHealthCheckingFile(f, d.diskSlowThreshold, func(duration time.Duration) {
		d.onSlowDisk(name, duration)
	})
	checkingFile.startTicker()
	return WithFd(f, checkingFile), nil
}

// PathBase implements the FS interface.
func (d *diskHealthCheckingFS) PathBase(path string) string {
	return d.fs.PathBase(path)
}

// PathJoin implements the FS interface.
func (d *diskHealthCheckingFS) PathJoin(elem ...string) string {
	return d.fs.PathJoin(elem...)
}

// PathDir implements the FS interface.
func (d *diskHealthCheckingFS) PathDir(path string) string {
	return d.fs.PathDir(path)
}

// Remove implements the FS interface.
func (d *diskHealthCheckingFS) Remove(name string) error {
	var err error
	d.timeFilesystemOp(name, func() {
		err = d.fs.Remove(name)
	})
	return err
}

// RemoveAll implements the FS interface.
func (d *diskHealthCheckingFS) RemoveAll(name string) error {
	var err error
	d.timeFilesystemOp(name, func() {
		err = d.fs.RemoveAll(name)
	})
	return err
}

// Rename implements the FS interface.
func (d *diskHealthCheckingFS) Rename(oldname, newname string) error {
	var err error
	d.timeFilesystemOp(newname, func() {
		err = d.fs.Rename(oldname, newname)
	})
	return err
}

// ReuseForWrite implements the FS interface.
func (d *diskHealthCheckingFS) ReuseForWrite(oldname, newname string) (File, error) {
	var f File
	var err error
	d.timeFilesystemOp(newname, func() {
		f, err = d.fs.ReuseForWrite(oldname, newname)
	})
	if err != nil {
		return f, err
	}
	if d.diskSlowThreshold == 0 {
		return f, nil
	}
	checkingFile := newDiskHealthCheckingFile(f, d.diskSlowThreshold, func(duration time.Duration) {
		d.onSlowDisk(newname, duration)
	})
	checkingFile.startTicker()
	return WithFd(f, checkingFile), nil
}

// Stat implements the FS interface.
func (d *diskHealthCheckingFS) Stat(name string) (os.FileInfo, error) {
	return d.fs.Stat(name)
}

type noopCloser struct{}

func (noopCloser) Close() error { return nil }
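// An end-to-end detection sketch, not part of the original file. slowFS is a
// hypothetical FS wrapper that delays Remove to simulate a stalling disk; it
// assumes the package's NewMem in-memory FS, and all thresholds and names are
// arbitrary.
type slowFS struct {
	FS
}

func (s slowFS) Remove(name string) error {
	time.Sleep(3 * time.Second) // simulate a stall longer than a tick interval
	return s.FS.Remove(name)
}

// exampleDetectSlowOp shows how a stalled metadata operation surfaces through
// the onSlowDisk callback. Note that detection is tick-driven: with the 2s
// defaultTickInterval and a 1s threshold, the callback fires at the first
// tick that observes the still-inflight Remove, not at the moment the
// threshold is crossed.
func exampleDetectSlowOp() error {
	fs, closer := WithDiskHealthChecks(slowFS{FS: NewMem()}, time.Second,
		func(path string, duration time.Duration) {
			fmt.Printf("slow disk operation on %s: %s\n", path, duration)
		})
	defer closer.Close()

	f, err := fs.Create("x")
	if err != nil {
		return err
	}
	if err := f.Close(); err != nil {
		return err
	}
	// Remove stalls for 3s against a 1s threshold, so at least one 2s tick
	// lands while the operation is both in flight and past the threshold.
	return fs.Remove("x")
}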