github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/replay/workload_capture.go (about)

     1  // Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package replay
     6  
     7  import (
     8  	"fmt"
     9  	"io"
    10  	"sync"
    11  	"sync/atomic"
    12  
    13  	"github.com/cockroachdb/pebble"
    14  	"github.com/cockroachdb/pebble/internal/base"
    15  	"github.com/cockroachdb/pebble/vfs"
    16  )
    17  
    18  type workloadCaptureState uint8
    19  
    20  const (
    21  	obsolete = workloadCaptureState(1) << iota
    22  	readyForProcessing
    23  	capturedSuccessfully
    24  )
    25  
    26  func (wcs workloadCaptureState) is(flag workloadCaptureState) bool { return wcs&flag != 0 }
    27  
// manifestDetails tracks the copy progress of a single manifest file: where
// it lives on the source filesystem and the file handles used to stream it to
// the destination.
type manifestDetails struct {
	// sourceFilepath is the path of the manifest on the source filesystem.
	sourceFilepath string
	// sourceFile is the open handle the manifest is read from. It is nil
	// until the manifest is first copied and is reset to nil once closed.
	sourceFile vfs.File

	// destFile is the open handle of the manifest's copy on the destination
	// filesystem. It is nil until the manifest is first copied and is reset
	// to nil once closed.
	destFile vfs.File
}
    34  
// WorkloadCollector is designed to capture workloads by handling manifest
// files, flushed SSTs and ingested SSTs. The collector hooks into the
// pebble.EventListener and pebble.Cleaner in order to keep track of file
// states.
type WorkloadCollector struct {
	mu struct {
		sync.Mutex
		// fileState maps a file's base name to its capture state flags.
		fileState map[string]workloadCaptureState
		// pendingSSTables holds a slice of file paths to sstables that need to
		// be copied but haven't yet. The `copyFiles` goroutine grabs these
		// files, and the flush and ingest event handlers append them.
		pendingSSTables []string
		// manifestIndex is an index into `manifests`, pointing to the
		// manifest currently being copied.
		manifestIndex int
		// appending to manifests requires holding mu. Only the `copyFiles`
		// goroutine is permitted to read or edit the struct contents once
		// appended, so it does not need to hold mu while accessing the structs'
		// fields.
		manifests []*manifestDetails

		// The following condition variable and counts are used in tests to
		// synchronize with the copying goroutine. WaitAndStop also waits on
		// copyCond until tablesCopied catches up with tablesEnqueued.
		copyCond       sync.Cond
		tablesCopied   int
		tablesEnqueued int
	}
	// Stores the file number of the current manifest that is being used by
	// the database.
	curManifest atomic.Uint64
	// Stores whether the workload collector is enabled.
	enabled atomic.Bool
	// buffer is the scratch buffer handed to io.CopyBuffer when copying
	// manifests.
	buffer []byte
	// config contains information that is only set on the creation of the
	// WorkloadCollector.
	config struct {
		// srcFS and srcDir represent the location from which the workload collector
		// collects the files from.
		srcFS  vfs.FS
		srcDir string
		// destFS and destDir represent the location to which the workload collector
		// sends the files to.
		destFS  vfs.FS
		destDir string
		// cleaner stores the cleaner to use when files become obsolete and need to
		// be cleaned.
		cleaner base.Cleaner
	}
	// copier holds the state used to drive and shut down the `copyFiles`
	// goroutine.
	copier struct {
		// Cond is signalled when new sstables are enqueued or when the
		// collector is stopped. Its Locker is mu.
		sync.Cond
		// stop requests that the `copyFiles` goroutine exit its loop.
		stop bool
		// done is closed by the `copyFiles` goroutine right before it returns.
		done chan struct{}
	}
}
    87  
    88  // NewWorkloadCollector is used externally to create a New WorkloadCollector.
    89  func NewWorkloadCollector(srcDir string) *WorkloadCollector {
    90  	wc := &WorkloadCollector{}
    91  	wc.buffer = make([]byte, 1<<10 /* 1KB */)
    92  	wc.config.srcDir = srcDir
    93  	wc.mu.copyCond.L = &wc.mu.Mutex
    94  	wc.mu.fileState = make(map[string]workloadCaptureState)
    95  	wc.copier.Cond.L = &wc.mu.Mutex
    96  	return wc
    97  }
    98  
    99  // Attach is used to set up the WorkloadCollector by attaching itself to
   100  // pebble.Options EventListener and Cleaner.
   101  func (w *WorkloadCollector) Attach(opts *pebble.Options) {
   102  	opts.AddEventListener(pebble.EventListener{
   103  		FlushEnd:        w.onFlushEnd,
   104  		ManifestCreated: w.onManifestCreated,
   105  		TableIngested:   w.onTableIngest,
   106  	})
   107  
   108  	opts.EnsureDefaults()
   109  	// Replace the original Cleaner with the workload collector's implementation,
   110  	// which will invoke the original Cleaner, but only once the collector's copied
   111  	// what it needs.
   112  	c := cleaner{
   113  		name:  fmt.Sprintf("replay.WorkloadCollector(%q)", opts.Cleaner),
   114  		clean: w.clean,
   115  	}
   116  	w.config.cleaner, opts.Cleaner = opts.Cleaner, c
   117  	w.config.srcFS = opts.FS
   118  }
   119  
   120  // enqueueCopyLocked enqueues the sstable with the provided filenum be copied in
   121  // the background. Requires w.mu.
   122  func (w *WorkloadCollector) enqueueCopyLocked(fileNum base.DiskFileNum) {
   123  	fileName := base.MakeFilename(base.FileTypeTable, fileNum)
   124  	w.mu.fileState[fileName] |= readyForProcessing
   125  	w.mu.pendingSSTables = append(w.mu.pendingSSTables, w.srcFilepath(fileName))
   126  	w.mu.tablesEnqueued++
   127  }
   128  
   129  // cleanFile calls the cleaner on the specified path and removes the path from
   130  // the fileState map.
   131  func (w *WorkloadCollector) cleanFile(fileType base.FileType, path string) error {
   132  	err := w.config.cleaner.Clean(w.config.srcFS, fileType, path)
   133  	if err == nil {
   134  		w.mu.Lock()
   135  		delete(w.mu.fileState, w.config.srcFS.PathBase(path))
   136  		w.mu.Unlock()
   137  	}
   138  	return err
   139  }
   140  
   141  // clean deletes files only after they have been processed or are not required
   142  // for the workload collection.
   143  func (w *WorkloadCollector) clean(fs vfs.FS, fileType base.FileType, path string) error {
   144  	if !w.IsRunning() {
   145  		return w.cleanFile(fileType, path)
   146  	}
   147  	w.mu.Lock()
   148  	fileName := fs.PathBase(path)
   149  	if fileState, ok := w.mu.fileState[fileName]; !ok || fileState.is(capturedSuccessfully) {
   150  		// Delete the file if it has been captured or the file is not important
   151  		// to capture which means it can be deleted.
   152  		w.mu.Unlock()
   153  		return w.cleanFile(fileType, path)
   154  	}
   155  	w.mu.fileState[fileName] |= obsolete
   156  	w.mu.Unlock()
   157  	return nil
   158  }
   159  
   160  // onTableIngest is attached to a pebble.DB as an EventListener.TableIngested
   161  // func. It enqueues all ingested tables to be copied.
   162  func (w *WorkloadCollector) onTableIngest(info pebble.TableIngestInfo) {
   163  	if !w.IsRunning() {
   164  		return
   165  	}
   166  	w.mu.Lock()
   167  	defer w.mu.Unlock()
   168  	for _, table := range info.Tables {
   169  		w.enqueueCopyLocked(table.FileNum.DiskFileNum())
   170  	}
   171  	w.copier.Broadcast()
   172  }
   173  
   174  // onFlushEnd is attached to a pebble.DB as an EventListener.FlushEnd func. It
   175  // enqueues all flushed tables to be copied.
   176  func (w *WorkloadCollector) onFlushEnd(info pebble.FlushInfo) {
   177  	if !w.IsRunning() {
   178  		return
   179  	}
   180  	w.mu.Lock()
   181  	defer w.mu.Unlock()
   182  	for _, table := range info.Output {
   183  		w.enqueueCopyLocked(table.FileNum.DiskFileNum())
   184  	}
   185  	w.copier.Broadcast()
   186  }
   187  
   188  // onManifestCreated is attached to a pebble.DB as an
   189  // EventListener.ManifestCreated func. It records the the new manifest so that
   190  // it's copied asynchronously in the background.
   191  func (w *WorkloadCollector) onManifestCreated(info pebble.ManifestCreateInfo) {
   192  	w.curManifest.Store(uint64(info.FileNum))
   193  	if !w.enabled.Load() {
   194  		return
   195  	}
   196  	w.mu.Lock()
   197  	defer w.mu.Unlock()
   198  
   199  	// mark the manifest file as ready for processing to prevent it from being
   200  	// cleaned before we process it.
   201  	fileName := base.MakeFilename(base.FileTypeManifest, info.FileNum.DiskFileNum())
   202  	w.mu.fileState[fileName] |= readyForProcessing
   203  	w.mu.manifests = append(w.mu.manifests, &manifestDetails{
   204  		sourceFilepath: info.Path,
   205  	})
   206  }
   207  
// copyFiles is run in a separate goroutine, copying sstables and manifests.
// It loops until w.copier.stop is set, then closes any manifest file handles
// that are still open and closes w.copier.done to signal that it has exited.
func (w *WorkloadCollector) copyFiles() {
	w.mu.Lock()
	defer w.mu.Unlock()
	// NB: This loop must hold w.mu at the beginning of each iteration. It may
	// drop w.mu at times, but it must reacquire it before the next iteration.
	for !w.copier.stop {
		// The following performs the workload capture. It waits on the
		// w.copier condition variable to let it know when new files are
		// available to be collected. The wait is not re-checked in a loop:
		// whatever the reason for the wakeup (including the broadcast issued
		// by Stop), one more copy pass runs before the loop condition is
		// evaluated again.
		if len(w.mu.pendingSSTables) == 0 {
			w.copier.Wait()
		}
		// Grab the manifests to copy.
		index := w.mu.manifestIndex
		pendingManifests := w.mu.manifests[index:]
		// Take the pending sstables and leave an empty list behind for the
		// event handlers to append to.
		var pending []string
		pending, w.mu.pendingSSTables = w.mu.pendingSSTables, nil
		func() {
			// Note the unusual lock order; Temporarily unlock the
			// mutex, but re-acquire it before returning.
			w.mu.Unlock()
			defer w.mu.Lock()

			// Copy any updates to the manifests files.
			w.copyManifests(index, pendingManifests)
			// Copy the SSTables provided in pending. copySSTables takes
			// ownership of the pending slice.
			w.copySSTables(pending)
		}()

		// This helps in tests; Tests can wait on the copyCond condition
		// variable until the necessary bits have been copied. len(pending) is
		// still the number of tables copied: copySSTables only reuses the
		// backing array and cannot change this slice header.
		w.mu.tablesCopied += len(pending)
		w.mu.copyCond.Broadcast()
	}

	// Shutting down: close every manifest handle that is still open.
	for idx := range w.mu.manifests {
		if f := w.mu.manifests[idx].sourceFile; f != nil {
			if err := f.Close(); err != nil {
				panic(err)
			}
			w.mu.manifests[idx].sourceFile = nil
		}
		if f := w.mu.manifests[idx].destFile; f != nil {
			if err := f.Close(); err != nil {
				panic(err)
			}
			w.mu.manifests[idx].destFile = nil
		}
	}
	close(w.copier.done)
}
   261  
   262  // copyManifests copies any un-copied portions of the source manifests.
   263  func (w *WorkloadCollector) copyManifests(startAtIndex int, manifests []*manifestDetails) {
   264  	destFS := w.config.destFS
   265  
   266  	for index, manifest := range manifests {
   267  		if manifest.destFile == nil && manifest.sourceFile == nil {
   268  			// This is the first time we've read from this manifest, and we
   269  			// don't yet have open file descriptors for the src or dst files. It
   270  			// is safe to write to manifest.{destFile,sourceFile} without
   271  			// holding d.mu, because the copyFiles goroutine is the only
   272  			// goroutine that accesses the fields of the `manifestDetails`
   273  			// struct.
   274  			var err error
   275  			manifest.destFile, err = destFS.Create(w.destFilepath(destFS.PathBase(manifest.sourceFilepath)))
   276  			if err != nil {
   277  				panic(err)
   278  			}
   279  			manifest.sourceFile, err = w.config.srcFS.Open(manifest.sourceFilepath)
   280  			if err != nil {
   281  				panic(err)
   282  			}
   283  		}
   284  
   285  		numBytesRead, err := io.CopyBuffer(manifest.destFile, manifest.sourceFile, w.buffer)
   286  		if err != nil {
   287  			panic(err)
   288  		}
   289  
   290  		// Read 0 bytes from the current manifest and this is not the
   291  		// latest/newest manifest which means we have read its entirety. No new
   292  		// data will be written to it, because only the latest manifest may
   293  		// receive edits. Close the current source and destination files and
   294  		// move the manifest to start at the next index in w.mu.manifests.
   295  		if numBytesRead == 0 && index != len(manifests)-1 {
   296  			// Rotating the manifests so we can close the files.
   297  			if err := manifests[index].sourceFile.Close(); err != nil {
   298  				panic(err)
   299  			}
   300  			manifests[index].sourceFile = nil
   301  			if err := manifests[index].destFile.Close(); err != nil {
   302  				panic(err)
   303  			}
   304  			manifests[index].destFile = nil
   305  			w.mu.Lock()
   306  			w.mu.manifestIndex = startAtIndex + index + 1
   307  			w.mu.Unlock()
   308  		}
   309  	}
   310  }
   311  
   312  // copySSTables copies the provided sstables to the stored workload. If a file
   313  // has already been marked as obsolete, then file will be cleaned by the
   314  // w.config.cleaner after it is copied. The provided slice will be mutated and
   315  // should not be used following the call to this function.
   316  func (w *WorkloadCollector) copySSTables(pending []string) {
   317  	for _, filePath := range pending {
   318  		err := vfs.CopyAcrossFS(w.config.srcFS,
   319  			filePath,
   320  			w.config.destFS,
   321  			w.destFilepath(w.config.srcFS.PathBase(filePath)))
   322  		if err != nil {
   323  			panic(err)
   324  		}
   325  	}
   326  
   327  	// Identify the subset of `pending` files that should now be cleaned. The
   328  	// WorkloadCollector intercepts Cleaner.Clean calls to defer cleaning until
   329  	// copying has completed. If Cleaner.Clean has already been invoked for any
   330  	// of the files that copied, we can now actually Clean them.
   331  	pendingClean := pending[:0]
   332  	w.mu.Lock()
   333  	for _, filePath := range pending {
   334  		fileName := w.config.srcFS.PathBase(filePath)
   335  		if w.mu.fileState[fileName].is(obsolete) {
   336  			pendingClean = append(pendingClean, filePath)
   337  		} else {
   338  			w.mu.fileState[fileName] |= capturedSuccessfully
   339  		}
   340  	}
   341  	w.mu.Unlock()
   342  
   343  	for _, path := range pendingClean {
   344  		_ = w.cleanFile(base.FileTypeTable, path)
   345  	}
   346  }
   347  
   348  // Start begins collecting a workload. All flushed and ingested sstables, plus
   349  // corresponding manifests are copied to the provided destination path on the
   350  // provided FS.
   351  func (w *WorkloadCollector) Start(destFS vfs.FS, destPath string) {
   352  	w.mu.Lock()
   353  	defer w.mu.Unlock()
   354  
   355  	// If the collector not is running then that means w.enabled == 0 so swap it
   356  	// to 1 and continue else return since it is already running.
   357  	if !w.enabled.CompareAndSwap(false, true) {
   358  		return
   359  	}
   360  	w.config.destFS = destFS
   361  	w.config.destDir = destPath
   362  
   363  	// Initialize the tracked manifests to the database's current manifest, if
   364  	// the database has already started. Every database Open creates a new
   365  	// manifest. There are two cases:
   366  	//   1. The database has already been opened. Then `w.atomic.curManifest`
   367  	//      contains the file number of the current manifest. We must initialize
   368  	//      the w.mu.manifests slice to contain this first manifest.
   369  	//   2. The database has not yet been opened. Then `w.atomic.curManifest` is
   370  	//      still zero. Once the associated database is opened, it'll invoke
   371  	//      onManifestCreated which will handle enqueuing the manifest on
   372  	//      `w.mu.manifests`.
   373  	fileNum := base.FileNum(w.curManifest.Load())
   374  	if fileNum != 0 {
   375  		fileName := base.MakeFilename(base.FileTypeManifest, fileNum.DiskFileNum())
   376  		w.mu.manifests = append(w.mu.manifests[:0], &manifestDetails{sourceFilepath: w.srcFilepath(fileName)})
   377  		w.mu.fileState[fileName] |= readyForProcessing
   378  	}
   379  
   380  	// Begin copying files asynchronously in the background.
   381  	w.copier.done = make(chan struct{})
   382  	w.copier.stop = false
   383  	go w.copyFiles()
   384  }
   385  
   386  // WaitAndStop waits for all enqueued sstables to be copied over, and then
   387  // calls Stop. Gracefully ensures that all sstables referenced in the collected
   388  // manifest's latest version edit will exist in the copy directory.
   389  func (w *WorkloadCollector) WaitAndStop() {
   390  	w.mu.Lock()
   391  	for w.mu.tablesEnqueued != w.mu.tablesCopied {
   392  		w.mu.copyCond.Wait()
   393  	}
   394  	w.mu.Unlock()
   395  	w.Stop()
   396  }
   397  
   398  // Stop stops collection of the workload.
   399  func (w *WorkloadCollector) Stop() {
   400  	w.mu.Lock()
   401  	// If the collector is running then that means w.enabled == true so swap it to
   402  	// false and continue else return since it is not running.
   403  	if !w.enabled.CompareAndSwap(true, false) {
   404  		w.mu.Unlock()
   405  		return
   406  	}
   407  	w.copier.stop = true
   408  	w.copier.Broadcast()
   409  	w.mu.Unlock()
   410  	<-w.copier.done
   411  }
   412  
   413  // IsRunning returns whether the WorkloadCollector is currently running.
   414  func (w *WorkloadCollector) IsRunning() bool {
   415  	return w.enabled.Load()
   416  }
   417  
   418  // srcFilepath returns the file path to the named file in the source directory
   419  // on the source filesystem.
   420  func (w *WorkloadCollector) srcFilepath(name string) string {
   421  	return w.config.srcFS.PathJoin(w.config.srcDir, name)
   422  }
   423  
   424  // destFilepath returns the file path to the named file in the destination
   425  // directory on the destination filesystem.
   426  func (w *WorkloadCollector) destFilepath(name string) string {
   427  	return w.config.destFS.PathJoin(w.config.destDir, name)
   428  }
   429  
   430  type cleaner struct {
   431  	name  string
   432  	clean func(vfs.FS, base.FileType, string) error
   433  }
   434  
   435  func (c cleaner) String() string { return c.name }
   436  func (c cleaner) Clean(fs vfs.FS, fileType base.FileType, path string) error {
   437  	return c.clean(fs, fileType, path)
   438  }