github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/vfs/disk_full.go (about) 1 // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package vfs 6 7 import ( 8 "io" 9 "os" 10 "sync" 11 "sync/atomic" 12 "syscall" 13 14 "github.com/cockroachdb/errors" 15 ) 16 17 // OnDiskFull wraps the provided FS with an FS that examines returned errors, 18 // looking for ENOSPC errors. It invokes the provided callback when the 19 // underlying filesystem returns an error signifying the storage is out of 20 // disk space. 21 // 22 // All new writes to the filesystem are blocked while the callback executes, 23 // so care must be taken to avoid expensive work from within the callback. 24 // 25 // Once the callback completes, any write-oriented operations that encountered 26 // ENOSPC are retried exactly once. Once the callback completes, it will not 27 // be invoked again until a new operation that began after the callback 28 // returned encounters an ENOSPC error. 29 // 30 // OnDiskFull may be used to automatically manage a ballast file, which is 31 // removed from the filesystem from within the callback. Note that if managing 32 // a ballast, the caller should maintain a reference to the inner FS and 33 // remove the ballast on the unwrapped FS. 34 func OnDiskFull(fs FS, fn func()) FS { 35 newFS := &enospcFS{inner: fs} 36 newFS.mu.Cond.L = &newFS.mu.Mutex 37 newFS.mu.onDiskFull = fn 38 return newFS 39 } 40 41 type enospcFS struct { 42 inner FS 43 atomic struct { 44 // generation is a monotonically increasing number that encodes the 45 // current state of ENOSPC error handling. Incoming writes are 46 // organized into generations to provide strong guarantees on when the 47 // disk full callback is invoked. The callback is invoked once per 48 // write generation. 49 // 50 // Special significance is given to the parity of this generation 51 // field to optimize incoming writes in the normal state, which only 52 // need to perform a single atomic load. If generation is odd, an 53 // ENOSPC error is being actively handled. The generations associated 54 // with writes are always even. 55 // 56 // The lifecycle of a write is: 57 // 58 // 1. Atomically load the current generation. 59 // a. If it's even, this is the write's generation number. 60 // b. If it's odd, an ENOSPC was recently encountered and the 61 // corresponding invocation of the disk full callback has not 62 // yet completed. The write must wait until the callback has 63 // completed and generation is updated to an even number, which 64 // becomes the write's generation number. 65 // 2. Perform the write. If it encounters no error or an error other 66 // than ENOSPC, the write returns and proceeds no further in this 67 // lifecycle. 68 // 3. Handle ENOSPC. If the write encounters ENOSPC, the callback must 69 // be invoked for the write's generation. The write's goroutine 70 // acquires the FS's mutex. 71 // a. If the FS's current generation is still equal to the write's 72 // generation, the write is the first write of its generation to 73 // encounter ENOSPC. It increments the FS's current generation 74 // to an odd number, signifying that an ENOSPC is being handled 75 // and invokes the callback. 76 // b. If the FS's current generation has changed, some other write 77 // from the same generation encountered an ENOSPC first. This 78 // write waits on the condition variable until the FS's current 79 // generation is updated indicating that the generation's 80 // callback invocation has completed. 81 // 3. Retry the write once. The callback for the write's generation 82 // has completed, either by this write's goroutine or another's. 83 // The write may proceed with the expectation that the callback 84 // remedied the full disk by freeing up disk space and an ENOSPC 85 // should not be encountered again for at least a few minutes. If 86 // we do encounter another ENOSPC on the retry, the callback was 87 // unable to remedy the full disk and another retry won't be 88 // useful. Any error, including ENOSPC, during the retry is 89 // returned without further handling. None of the retries invoke 90 // the callback. 91 // 92 // This scheme has a few nice properties: 93 // * Once the disk-full callback completes, it won't be invoked 94 // again unless a write that started strictly later encounters an 95 // ENOSPC. This is convenient if the callback strives to 'fix' the 96 // full disk, for example, by removing a ballast file. A new 97 // invocation of the callback guarantees a new problem. 98 // * Incoming writes block if there's an unhandled ENOSPC. Some 99 // writes, like WAL or MANIFEST fsyncs, are fatal if they encounter 100 // an ENOSPC. 101 generation uint32 102 } 103 mu struct { 104 sync.Mutex 105 sync.Cond 106 onDiskFull func() 107 } 108 } 109 110 // Unwrap returns the underlying FS. This may be called by vfs.Root to access 111 // the underlying filesystem. 112 func (fs *enospcFS) Unwrap() FS { 113 return fs.inner 114 } 115 116 // waitUntilReady is called before every FS or File operation that 117 // might return ENOSPC. If an ENOSPC was encountered and the corresponding 118 // invocation of the `onDiskFull` callback has not yet returned, 119 // waitUntilReady blocks until the callback returns. The returned generation 120 // is always even. 121 func (fs *enospcFS) waitUntilReady() uint32 { 122 gen := atomic.LoadUint32(&fs.atomic.generation) 123 if gen%2 == 0 { 124 // An even generation indicates that we're not currently handling an 125 // ENOSPC. Allow the write to proceed. 126 return gen 127 } 128 129 // We're currently handling an ENOSPC error. Wait on the condition 130 // variable until we're not handling an ENOSPC. 131 fs.mu.Lock() 132 defer fs.mu.Unlock() 133 134 // Load the generation again with fs.mu locked. 135 gen = atomic.LoadUint32(&fs.atomic.generation) 136 for gen%2 == 1 { 137 fs.mu.Wait() 138 gen = atomic.LoadUint32(&fs.atomic.generation) 139 } 140 return gen 141 } 142 143 func (fs *enospcFS) handleENOSPC(gen uint32) { 144 fs.mu.Lock() 145 defer fs.mu.Unlock() 146 147 currentGeneration := atomic.LoadUint32(&fs.atomic.generation) 148 149 // If the current generation is still `gen`, this is the first goroutine 150 // to hit an ENOSPC within this write generation, so this goroutine is 151 // responsible for invoking the callback. 152 if currentGeneration == gen { 153 // Increment the generation to an odd number, indicating that the FS 154 // is out-of-disk space and incoming writes should pause and wait for 155 // the next generation before continuing. 156 atomic.StoreUint32(&fs.atomic.generation, gen+1) 157 158 func() { 159 // Drop the mutex while we invoke the callback, re-acquiring 160 // afterwards. 161 fs.mu.Unlock() 162 defer fs.mu.Lock() 163 fs.mu.onDiskFull() 164 }() 165 166 // Update the current generation again to an even number, indicating 167 // that the callback has completed for the write generation `gen`. 168 atomic.StoreUint32(&fs.atomic.generation, gen+2) 169 fs.mu.Broadcast() 170 return 171 } 172 173 // The current generation has already been incremented, so either the 174 // callback is currently being run by another goroutine or it's already 175 // completed. Wait for it complete if it hasn't already. 176 // 177 // The current generation may be updated multiple times, including to an 178 // odd number signifying a later write generation has already encountered 179 // ENOSPC. In that case, the callback was not able to remedy the full disk 180 // and waiting is unlikely to be helpful. Continuing to wait risks 181 // blocking an unbounded number of generations. Retrying and bubbling the 182 // ENOSPC up might be helpful if we can abort a large compaction that 183 // started before we became more selective about compaction picking, so 184 // this loop only waits for this write generation's callback and no 185 // subsequent generations' callbacks. 186 for currentGeneration == gen+1 { 187 fs.mu.Wait() 188 currentGeneration = atomic.LoadUint32(&fs.atomic.generation) 189 } 190 } 191 192 func (fs *enospcFS) Create(name string) (File, error) { 193 gen := fs.waitUntilReady() 194 195 f, err := fs.inner.Create(name) 196 197 if err != nil && isENOSPC(err) { 198 fs.handleENOSPC(gen) 199 f, err = fs.inner.Create(name) 200 } 201 if f != nil { 202 f = WithFd(f, enospcFile{ 203 fs: fs, 204 inner: f, 205 }) 206 } 207 return f, err 208 } 209 210 func (fs *enospcFS) Link(oldname, newname string) error { 211 gen := fs.waitUntilReady() 212 213 err := fs.inner.Link(oldname, newname) 214 215 if err != nil && isENOSPC(err) { 216 fs.handleENOSPC(gen) 217 err = fs.inner.Link(oldname, newname) 218 } 219 return err 220 } 221 222 func (fs *enospcFS) Open(name string, opts ...OpenOption) (File, error) { 223 f, err := fs.inner.Open(name, opts...) 224 if f != nil { 225 f = WithFd(f, enospcFile{ 226 fs: fs, 227 inner: f, 228 }) 229 } 230 return f, err 231 } 232 233 func (fs *enospcFS) OpenDir(name string) (File, error) { 234 f, err := fs.inner.OpenDir(name) 235 if f != nil { 236 f = WithFd(f, enospcFile{ 237 fs: fs, 238 inner: f, 239 }) 240 } 241 return f, err 242 } 243 244 func (fs *enospcFS) Remove(name string) error { 245 gen := fs.waitUntilReady() 246 247 err := fs.inner.Remove(name) 248 249 if err != nil && isENOSPC(err) { 250 fs.handleENOSPC(gen) 251 err = fs.inner.Remove(name) 252 } 253 return err 254 } 255 256 func (fs *enospcFS) RemoveAll(name string) error { 257 gen := fs.waitUntilReady() 258 259 err := fs.inner.RemoveAll(name) 260 261 if err != nil && isENOSPC(err) { 262 fs.handleENOSPC(gen) 263 err = fs.inner.RemoveAll(name) 264 } 265 return err 266 } 267 268 func (fs *enospcFS) Rename(oldname, newname string) error { 269 gen := fs.waitUntilReady() 270 271 err := fs.inner.Rename(oldname, newname) 272 273 if err != nil && isENOSPC(err) { 274 fs.handleENOSPC(gen) 275 err = fs.inner.Rename(oldname, newname) 276 } 277 return err 278 } 279 280 func (fs *enospcFS) ReuseForWrite(oldname, newname string) (File, error) { 281 gen := fs.waitUntilReady() 282 283 f, err := fs.inner.ReuseForWrite(oldname, newname) 284 285 if err != nil && isENOSPC(err) { 286 fs.handleENOSPC(gen) 287 f, err = fs.inner.ReuseForWrite(oldname, newname) 288 } 289 290 if f != nil { 291 f = WithFd(f, enospcFile{ 292 fs: fs, 293 inner: f, 294 }) 295 } 296 return f, err 297 } 298 299 func (fs *enospcFS) MkdirAll(dir string, perm os.FileMode) error { 300 gen := fs.waitUntilReady() 301 302 err := fs.inner.MkdirAll(dir, perm) 303 304 if err != nil && isENOSPC(err) { 305 fs.handleENOSPC(gen) 306 err = fs.inner.MkdirAll(dir, perm) 307 } 308 return err 309 } 310 311 func (fs *enospcFS) Lock(name string) (io.Closer, error) { 312 gen := fs.waitUntilReady() 313 314 closer, err := fs.inner.Lock(name) 315 316 if err != nil && isENOSPC(err) { 317 fs.handleENOSPC(gen) 318 closer, err = fs.inner.Lock(name) 319 } 320 return closer, err 321 } 322 323 func (fs *enospcFS) List(dir string) ([]string, error) { 324 return fs.inner.List(dir) 325 } 326 327 func (fs *enospcFS) Stat(name string) (os.FileInfo, error) { 328 return fs.inner.Stat(name) 329 } 330 331 func (fs *enospcFS) PathBase(path string) string { 332 return fs.inner.PathBase(path) 333 } 334 335 func (fs *enospcFS) PathJoin(elem ...string) string { 336 return fs.inner.PathJoin(elem...) 337 } 338 339 func (fs *enospcFS) PathDir(path string) string { 340 return fs.inner.PathDir(path) 341 } 342 343 func (fs *enospcFS) GetDiskUsage(path string) (DiskUsage, error) { 344 return fs.inner.GetDiskUsage(path) 345 } 346 347 type enospcFile struct { 348 fs *enospcFS 349 inner File 350 } 351 352 func (f enospcFile) Close() error { 353 return f.inner.Close() 354 } 355 356 func (f enospcFile) Read(p []byte) (n int, err error) { 357 return f.inner.Read(p) 358 } 359 360 func (f enospcFile) ReadAt(p []byte, off int64) (n int, err error) { 361 return f.inner.ReadAt(p, off) 362 } 363 364 func (f enospcFile) Write(p []byte) (n int, err error) { 365 gen := f.fs.waitUntilReady() 366 367 n, err = f.inner.Write(p) 368 369 if err != nil && isENOSPC(err) { 370 f.fs.handleENOSPC(gen) 371 var n2 int 372 n2, err = f.inner.Write(p[n:]) 373 n += n2 374 } 375 return n, err 376 } 377 378 func (f enospcFile) Stat() (os.FileInfo, error) { 379 return f.inner.Stat() 380 } 381 382 func (f enospcFile) Sync() error { 383 gen := f.fs.waitUntilReady() 384 385 err := f.inner.Sync() 386 387 if err != nil && isENOSPC(err) { 388 f.fs.handleENOSPC(gen) 389 390 // NB: It is NOT safe to retry the Sync. See the PostgreSQL 391 // 'fsyncgate' discussion. A successful Sync after a failed one does 392 // not provide any guarantees and (always?) loses the unsynced writes. 393 // We need to bubble the error up and hope we weren't syncing a WAL or 394 // MANIFEST, because we'll have no choice but to crash. Errors while 395 // syncing an sstable will result in a failed flush/compaction, and 396 // the relevant sstable(s) will be marked as obsolete and deleted. 397 // See: https://lwn.net/Articles/752063/ 398 } 399 return err 400 } 401 402 // Ensure that *enospcFS implements the FS interface. 403 var _ FS = (*enospcFS)(nil) 404 405 func isENOSPC(err error) bool { 406 err = errors.UnwrapAll(err) 407 e, ok := err.(syscall.Errno) 408 return ok && e == syscall.ENOSPC 409 }