github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/replay/workload_capture.go

// Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package replay

import (
	"fmt"
	"io"
	"sync"
	"sync/atomic"

	"github.com/cockroachdb/pebble"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/vfs"
)

type workloadCaptureState uint8

const (
	obsolete = workloadCaptureState(1) << iota
	readyForProcessing
	capturedSuccessfully
)

func (wcs workloadCaptureState) is(flag workloadCaptureState) bool { return wcs&flag != 0 }

type manifestDetails struct {
	sourceFilepath string
	sourceFile     vfs.File

	destFile vfs.File
}

// WorkloadCollector is designed to capture workloads by handling manifest
// files, flushed SSTs and ingested SSTs. The collector hooks into the
// pebble.EventListener and pebble.Cleaner in order to keep track of file
// states.
type WorkloadCollector struct {
	mu struct {
		sync.Mutex
		fileState map[string]workloadCaptureState
		// pendingSSTables holds a slice of file paths to sstables that need
		// to be copied but haven't yet. The `copyFiles` goroutine grabs these
		// files, and the flush and ingest event handlers append them.
		pendingSSTables []string
		// manifestIndex is an index into `manifests`, pointing to the
		// manifest currently being copied.
		manifestIndex int
		// Appending to manifests requires holding mu. Only the `copyFiles`
		// goroutine is permitted to read or edit the struct contents once
		// appended, so it does not need to hold mu while accessing the
		// structs' fields.
		manifests []*manifestDetails

		// The following condition variable and counts are used in tests to
		// synchronize with the copying goroutine.
		copyCond       sync.Cond
		tablesCopied   int
		tablesEnqueued int
	}
	// Stores the current manifest that is being used by the database.
	curManifest atomic.Uint64
	// Stores whether the workload collector is enabled.
	enabled atomic.Bool
	buffer  []byte
	// config contains information that is only set on the creation of the
	// WorkloadCollector.
	config struct {
		// srcFS and srcDir represent the location from which the workload
		// collector collects files.
		srcFS  vfs.FS
		srcDir string
		// destFS and destDir represent the location to which the workload
		// collector copies files.
		destFS  vfs.FS
		destDir string
		// cleaner stores the cleaner to use when files become obsolete and
		// need to be cleaned.
		cleaner base.Cleaner
	}
	// copier coordinates the lifecycle of the background `copyFiles`
	// goroutine: stop requests a shutdown, and done is closed when the
	// goroutine exits.
	copier struct {
		sync.Cond
		stop bool
		done chan struct{}
	}
}

// NewWorkloadCollector is used externally to create a new WorkloadCollector.
func NewWorkloadCollector(srcDir string) *WorkloadCollector {
	wc := &WorkloadCollector{}
	wc.buffer = make([]byte, 1<<10 /* 1KB */)
	wc.config.srcDir = srcDir
	wc.mu.copyCond.L = &wc.mu.Mutex
	wc.mu.fileState = make(map[string]workloadCaptureState)
	wc.copier.Cond.L = &wc.mu.Mutex
	return wc
}
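// A minimal usage sketch (hypothetical caller code; the directory paths are
// illustrative only): attach the collector to the Options before opening the
// database, Start it with a destination filesystem and path to begin
// capturing, and WaitAndStop it once the workload of interest has run.
//
//	opts := &pebble.Options{FS: vfs.Default}
//	wc := replay.NewWorkloadCollector("/data/db")
//	wc.Attach(opts)
//	db, err := pebble.Open("/data/db", opts)
//	if err != nil {
//		// handle the error
//	}
//	defer db.Close()
//	wc.Start(vfs.Default, "/data/captured-workload")
//	// ... run the workload of interest against db ...
//	wc.WaitAndStop()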
// Attach sets up the WorkloadCollector by attaching it to the provided
// pebble.Options' EventListener and Cleaner.
func (w *WorkloadCollector) Attach(opts *pebble.Options) {
	opts.AddEventListener(pebble.EventListener{
		FlushEnd:        w.onFlushEnd,
		ManifestCreated: w.onManifestCreated,
		TableIngested:   w.onTableIngest,
	})

	opts.EnsureDefaults()
	// Replace the original Cleaner with the workload collector's
	// implementation, which will invoke the original Cleaner, but only once
	// the collector has copied what it needs.
	c := cleaner{
		name:  fmt.Sprintf("replay.WorkloadCollector(%q)", opts.Cleaner),
		clean: w.clean,
	}
	w.config.cleaner, opts.Cleaner = opts.Cleaner, c
	w.config.srcFS = opts.FS
}

// enqueueCopyLocked enqueues the sstable with the provided file number to be
// copied in the background. Requires w.mu.
func (w *WorkloadCollector) enqueueCopyLocked(fileNum base.DiskFileNum) {
	fileName := base.MakeFilename(base.FileTypeTable, fileNum)
	w.mu.fileState[fileName] |= readyForProcessing
	w.mu.pendingSSTables = append(w.mu.pendingSSTables, w.srcFilepath(fileName))
	w.mu.tablesEnqueued++
}

// cleanFile calls the cleaner on the specified path and removes the path from
// the fileState map.
func (w *WorkloadCollector) cleanFile(fileType base.FileType, path string) error {
	err := w.config.cleaner.Clean(w.config.srcFS, fileType, path)
	if err == nil {
		w.mu.Lock()
		delete(w.mu.fileState, w.config.srcFS.PathBase(path))
		w.mu.Unlock()
	}
	return err
}

// clean deletes files only after they have been processed or are not required
// for the workload collection.
func (w *WorkloadCollector) clean(fs vfs.FS, fileType base.FileType, path string) error {
	if !w.IsRunning() {
		return w.cleanFile(fileType, path)
	}
	w.mu.Lock()
	fileName := fs.PathBase(path)
	if fileState, ok := w.mu.fileState[fileName]; !ok || fileState.is(capturedSuccessfully) {
		// The file has either already been captured or is not relevant to
		// the capture, so it can be deleted immediately.
		w.mu.Unlock()
		return w.cleanFile(fileType, path)
	}
	// Otherwise, mark the file as obsolete; it will be cleaned after it has
	// been copied.
	w.mu.fileState[fileName] |= obsolete
	w.mu.Unlock()
	return nil
}

// onTableIngest is attached to a pebble.DB as an EventListener.TableIngested
// func. It enqueues all ingested tables to be copied.
func (w *WorkloadCollector) onTableIngest(info pebble.TableIngestInfo) {
	if !w.IsRunning() {
		return
	}
	w.mu.Lock()
	defer w.mu.Unlock()
	for _, table := range info.Tables {
		w.enqueueCopyLocked(table.FileNum.DiskFileNum())
	}
	w.copier.Broadcast()
}

// onFlushEnd is attached to a pebble.DB as an EventListener.FlushEnd func. It
// enqueues all flushed tables to be copied.
func (w *WorkloadCollector) onFlushEnd(info pebble.FlushInfo) {
	if !w.IsRunning() {
		return
	}
	w.mu.Lock()
	defer w.mu.Unlock()
	for _, table := range info.Output {
		w.enqueueCopyLocked(table.FileNum.DiskFileNum())
	}
	w.copier.Broadcast()
}
// onManifestCreated is attached to a pebble.DB as an
// EventListener.ManifestCreated func. It records the new manifest so that
// it's copied asynchronously in the background.
func (w *WorkloadCollector) onManifestCreated(info pebble.ManifestCreateInfo) {
	w.curManifest.Store(uint64(info.FileNum))
	if !w.enabled.Load() {
		return
	}
	w.mu.Lock()
	defer w.mu.Unlock()

	// Mark the manifest file as ready for processing to prevent it from
	// being cleaned before we process it.
	fileName := base.MakeFilename(base.FileTypeManifest, info.FileNum)
	w.mu.fileState[fileName] |= readyForProcessing
	w.mu.manifests = append(w.mu.manifests, &manifestDetails{
		sourceFilepath: info.Path,
	})
}

// copyFiles is run in a separate goroutine, copying sstables and manifests.
func (w *WorkloadCollector) copyFiles() {
	w.mu.Lock()
	defer w.mu.Unlock()
	// NB: This loop must hold w.mu at the beginning of each iteration. It may
	// drop w.mu at times, but it must reacquire it before the next iteration.
	for !w.copier.stop {
		// The following performs the workload capture. It waits on a
		// condition variable (w.copier.Cond) to be notified when new files
		// are available to be collected.
		if len(w.mu.pendingSSTables) == 0 {
			w.copier.Wait()
		}
		// Grab the manifests to copy.
		index := w.mu.manifestIndex
		pendingManifests := w.mu.manifests[index:]
		var pending []string
		pending, w.mu.pendingSSTables = w.mu.pendingSSTables, nil
		func() {
			// Note the unusual pattern: temporarily unlock the mutex, but
			// re-acquire it before returning.
			w.mu.Unlock()
			defer w.mu.Lock()

			// Copy any updates to the manifest files.
			w.copyManifests(index, pendingManifests)
			// Copy the SSTables provided in pending. copySSTables takes
			// ownership of the pending slice.
			w.copySSTables(pending)
		}()

		// This helps in tests: tests can wait on the copyCond condition
		// variable until the necessary bits have been copied.
		w.mu.tablesCopied += len(pending)
		w.mu.copyCond.Broadcast()
	}

	// The collector is stopping; close any open manifest file descriptors.
	for idx := range w.mu.manifests {
		if f := w.mu.manifests[idx].sourceFile; f != nil {
			if err := f.Close(); err != nil {
				panic(err)
			}
			w.mu.manifests[idx].sourceFile = nil
		}
		if f := w.mu.manifests[idx].destFile; f != nil {
			if err := f.Close(); err != nil {
				panic(err)
			}
			w.mu.manifests[idx].destFile = nil
		}
	}
	close(w.copier.done)
}
// copyManifests copies any uncopied portions of the source manifests.
func (w *WorkloadCollector) copyManifests(startAtIndex int, manifests []*manifestDetails) {
	destFS := w.config.destFS

	for index, manifest := range manifests {
		if manifest.destFile == nil && manifest.sourceFile == nil {
			// This is the first time we've read from this manifest, and we
			// don't yet have open file descriptors for the src or dst files.
			// It is safe to write to manifest.{destFile,sourceFile} without
			// holding w.mu, because the copyFiles goroutine is the only
			// goroutine that accesses the fields of the `manifestDetails`
			// struct.
			var err error
			manifest.destFile, err = destFS.Create(w.destFilepath(destFS.PathBase(manifest.sourceFilepath)))
			if err != nil {
				panic(err)
			}
			manifest.sourceFile, err = w.config.srcFS.Open(manifest.sourceFilepath)
			if err != nil {
				panic(err)
			}
		}

		numBytesRead, err := io.CopyBuffer(manifest.destFile, manifest.sourceFile, w.buffer)
		if err != nil {
			panic(err)
		}

		// If we read zero bytes from the current manifest and it is not the
		// latest manifest, we have read it in its entirety. No new data will
		// be written to it, because only the latest manifest may receive
		// edits. Close the current source and destination files and advance
		// w.mu.manifestIndex to the next index in w.mu.manifests.
		if numBytesRead == 0 && index != len(manifests)-1 {
			if err := manifests[index].sourceFile.Close(); err != nil {
				panic(err)
			}
			manifests[index].sourceFile = nil
			if err := manifests[index].destFile.Close(); err != nil {
				panic(err)
			}
			manifests[index].destFile = nil
			w.mu.Lock()
			w.mu.manifestIndex = startAtIndex + index + 1
			w.mu.Unlock()
		}
	}
}

// copySSTables copies the provided sstables to the stored workload. If a file
// has already been marked as obsolete, the file will be cleaned by the
// w.config.cleaner after it is copied. The provided slice will be mutated and
// should not be used following the call to this function.
func (w *WorkloadCollector) copySSTables(pending []string) {
	for _, filePath := range pending {
		err := vfs.CopyAcrossFS(w.config.srcFS,
			filePath,
			w.config.destFS,
			w.destFilepath(w.config.srcFS.PathBase(filePath)))
		if err != nil {
			panic(err)
		}
	}

	// Identify the subset of `pending` files that should now be cleaned. The
	// WorkloadCollector intercepts Cleaner.Clean calls to defer cleaning
	// until copying has completed. If Cleaner.Clean has already been invoked
	// for any of the files we copied, we can now actually clean them.
	pendingClean := pending[:0]
	w.mu.Lock()
	for _, filePath := range pending {
		fileName := w.config.srcFS.PathBase(filePath)
		if w.mu.fileState[fileName].is(obsolete) {
			pendingClean = append(pendingClean, filePath)
		} else {
			w.mu.fileState[fileName] |= capturedSuccessfully
		}
	}
	w.mu.Unlock()

	for _, path := range pendingClean {
		_ = w.cleanFile(base.FileTypeTable, path)
	}
}
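// The incremental manifest copy in copyManifests relies on open file handles
// retaining their read offsets: each io.CopyBuffer call resumes reading from
// the previous EOF, so successive passes copy only newly appended manifest
// edits. A minimal sketch of that behavior, using an in-memory vfs (file
// names here are illustrative only):
//
//	fs := vfs.NewMem()
//	f, _ := fs.Create("MANIFEST-000001")
//	f.Write([]byte("edit-1"))
//	src, _ := fs.Open("MANIFEST-000001")
//	io.Copy(io.Discard, src) // copies "edit-1", leaving src at EOF
//	f.Write([]byte("edit-2"))
//	io.Copy(io.Discard, src) // a second pass copies only "edit-2"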
// Start begins collecting a workload. All flushed and ingested sstables, plus
// the corresponding manifests, are copied to the provided destination path on
// the provided FS.
func (w *WorkloadCollector) Start(destFS vfs.FS, destPath string) {
	w.mu.Lock()
	defer w.mu.Unlock()

	// If the collector is not running, w.enabled is false; swap it to true
	// and continue. Otherwise return, since the collector is already running.
	if !w.enabled.CompareAndSwap(false, true) {
		return
	}
	w.config.destFS = destFS
	w.config.destDir = destPath

	// Initialize the tracked manifests to the database's current manifest,
	// if the database has already started. Every database Open creates a new
	// manifest. There are two cases:
	// 1. The database has already been opened. Then `w.curManifest` contains
	//    the file number of the current manifest. We must initialize the
	//    w.mu.manifests slice to contain this first manifest.
	// 2. The database has not yet been opened. Then `w.curManifest` is still
	//    zero. Once the associated database is opened, it'll invoke
	//    onManifestCreated which will handle enqueuing the manifest on
	//    `w.mu.manifests`.
	fileNum := base.FileNum(w.curManifest.Load())
	if fileNum != 0 {
		fileName := base.MakeFilename(base.FileTypeManifest, fileNum.DiskFileNum())
		w.mu.manifests = append(w.mu.manifests[:0], &manifestDetails{sourceFilepath: w.srcFilepath(fileName)})
		w.mu.fileState[fileName] |= readyForProcessing
	}

	// Begin copying files asynchronously in the background.
	w.copier.done = make(chan struct{})
	w.copier.stop = false
	go w.copyFiles()
}

// WaitAndStop waits for all enqueued sstables to be copied over, and then
// calls Stop. Gracefully ensures that all sstables referenced in the
// collected manifest's latest version edit will exist in the copy directory.
func (w *WorkloadCollector) WaitAndStop() {
	w.mu.Lock()
	for w.mu.tablesEnqueued != w.mu.tablesCopied {
		w.mu.copyCond.Wait()
	}
	w.mu.Unlock()
	w.Stop()
}

// Stop stops collection of the workload.
func (w *WorkloadCollector) Stop() {
	w.mu.Lock()
	// If the collector is running, w.enabled is true; swap it to false and
	// continue. Otherwise return, since the collector is not running.
	if !w.enabled.CompareAndSwap(true, false) {
		w.mu.Unlock()
		return
	}
	w.copier.stop = true
	w.copier.Broadcast()
	w.mu.Unlock()
	<-w.copier.done
}

// IsRunning returns whether the WorkloadCollector is currently running.
func (w *WorkloadCollector) IsRunning() bool {
	return w.enabled.Load()
}

// srcFilepath returns the file path to the named file in the source directory
// on the source filesystem.
func (w *WorkloadCollector) srcFilepath(name string) string {
	return w.config.srcFS.PathJoin(w.config.srcDir, name)
}

// destFilepath returns the file path to the named file in the destination
// directory on the destination filesystem.
func (w *WorkloadCollector) destFilepath(name string) string {
	return w.config.destFS.PathJoin(w.config.destDir, name)
}

// cleaner wraps a clean func so that it implements the base.Cleaner
// interface, allowing the WorkloadCollector to intercept Clean calls.
type cleaner struct {
	name  string
	clean func(vfs.FS, base.FileType, string) error
}

func (c cleaner) String() string { return c.name }
func (c cleaner) Clean(fs vfs.FS, fileType base.FileType, path string) error {
	return c.clean(fs, fileType, path)
}
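// An illustrative sketch of the cleaner interception above (assuming the
// default pebble.DeleteCleaner as the wrapped cleaner):
//
//	opts := &pebble.Options{Cleaner: pebble.DeleteCleaner{}}
//	wc.Attach(opts)
//	// opts.Cleaner is now a replay cleaner whose Clean routes through
//	// wc.clean: files that have already been captured (or never needed
//	// capturing) are deleted via DeleteCleaner immediately, while files
//	// still pending capture are marked obsolete and deleted by
//	// copySSTables once their copies complete.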