github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/dbnode/storage/cleanup.go

// Copyright (c) 2016 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package storage

import (
	"fmt"
	"sort"
	"sync"

	"github.com/m3db/m3/src/dbnode/persist"
	"github.com/m3db/m3/src/dbnode/persist/fs"
	"github.com/m3db/m3/src/dbnode/persist/fs/commitlog"
	"github.com/m3db/m3/src/dbnode/retention"
	"github.com/m3db/m3/src/x/clock"
	xerrors "github.com/m3db/m3/src/x/errors"
	"github.com/m3db/m3/src/x/ident"
	xtime "github.com/m3db/m3/src/x/time"

	"github.com/pborman/uuid"
	"github.com/uber-go/tally"
	"go.uber.org/zap"
)

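// The filesystem listing and deletion operations below are modeled as function
// types; the cleanup manager holds them as fields so that tests can substitute
// stubs for the real fs implementations.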
type (
	commitLogFilesFn func(commitlog.Options) (
		persist.CommitLogFiles, []commitlog.ErrorWithPath, error,
	)
	snapshotMetadataFilesFn func(fs.Options) (
		[]fs.SnapshotMetadata, []fs.SnapshotMetadataErrorWithPaths, error,
	)
)

type snapshotFilesFn func(
	filePathPrefix string, namespace ident.ID, shard uint32,
) (fs.FileSetFilesSlice, error)

type deleteFilesFn func(files []string) error

type deleteInactiveDirectoriesFn func(parentDirPath string, activeDirNames []string) error

// Narrow interface so as not to expose all the functionality of the commitlog
// to the cleanup manager.
type activeCommitlogs interface {
	ActiveLogs() (persist.CommitLogFiles, error)
}

type cleanupManager struct {
	sync.RWMutex

	database         database
	activeCommitlogs activeCommitlogs

	opts                    Options
	nowFn                   clock.NowFn
	filePathPrefix          string
	commitLogsDir           string
	commitLogFilesFn        commitLogFilesFn
	snapshotMetadataFilesFn snapshotMetadataFilesFn
	snapshotFilesFn         snapshotFilesFn

	deleteFilesFn               deleteFilesFn
	deleteInactiveDirectoriesFn deleteInactiveDirectoriesFn
	warmFlushCleanupInProgress  bool
	coldFlushCleanupInProgress  bool
	metrics                     cleanupManagerMetrics
	logger                      *zap.Logger
}

type cleanupManagerMetrics struct {
	warmFlushCleanupStatus      tally.Gauge
	coldFlushCleanupStatus      tally.Gauge
	corruptCommitlogFile        tally.Counter
	corruptSnapshotFile         tally.Counter
	corruptSnapshotMetadataFile tally.Counter
	deletedCommitlogFile        tally.Counter
	deletedSnapshotFile         tally.Counter
	deletedSnapshotMetadataFile tally.Counter
}

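// newCleanupManagerMetrics constructs the cleanup metrics. The per-file-type
// counters are grouped via tally sub-scopes, so (assuming tally's default "."
// separator) the corrupt commitlog counter, for example, is emitted as
// "commitlog.corrupt" under the parent scope.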
func newCleanupManagerMetrics(scope tally.Scope) cleanupManagerMetrics {
	clScope := scope.SubScope("commitlog")
	sScope := scope.SubScope("snapshot")
	smScope := scope.SubScope("snapshot-metadata")
	return cleanupManagerMetrics{
		warmFlushCleanupStatus:      scope.Gauge("warm-flush-cleanup"),
		coldFlushCleanupStatus:      scope.Gauge("cold-flush-cleanup"),
		corruptCommitlogFile:        clScope.Counter("corrupt"),
		corruptSnapshotFile:         sScope.Counter("corrupt"),
		corruptSnapshotMetadataFile: smScope.Counter("corrupt"),
		deletedCommitlogFile:        clScope.Counter("deleted"),
		deletedSnapshotFile:         sScope.Counter("deleted"),
		deletedSnapshotMetadataFile: smScope.Counter("deleted"),
	}
}

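// newCleanupManager returns a databaseCleanupManager that derives its file paths
// from the database's commitlog filesystem options and defaults all listing and
// deletion hooks to the real fs implementations.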
func newCleanupManager(
	database database, activeLogs activeCommitlogs, scope tally.Scope,
) databaseCleanupManager {
	opts := database.Options()
	filePathPrefix := opts.CommitLogOptions().FilesystemOptions().FilePathPrefix()
	commitLogsDir := fs.CommitLogsDirPath(filePathPrefix)

	return &cleanupManager{
		database:         database,
		activeCommitlogs: activeLogs,

		opts:                        opts,
		nowFn:                       opts.ClockOptions().NowFn(),
		filePathPrefix:              filePathPrefix,
		commitLogsDir:               commitLogsDir,
		commitLogFilesFn:            commitlog.Files,
		snapshotMetadataFilesFn:     fs.SortedSnapshotMetadataFiles,
		snapshotFilesFn:             fs.SnapshotFiles,
		deleteFilesFn:               fs.DeleteFiles,
		deleteInactiveDirectoriesFn: fs.DeleteInactiveDirectories,
		metrics:                     newCleanupManagerMetrics(scope),
		logger:                      opts.InstrumentOptions().Logger(),
	}
}

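// WarmFlushCleanup removes files that become safe to delete once a warm flush
// has completed: expired, corrupted, and duplicate index filesets; snapshot and
// namespace files that are no longer active; and snapshot/commitlog files that
// are no longer required for recovery. Errors are accumulated into a multi-error
// so that a failure in one step does not prevent the remaining steps from running.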
func (m *cleanupManager) WarmFlushCleanup(t xtime.UnixNano) error {
	m.Lock()
	m.warmFlushCleanupInProgress = true
	m.Unlock()

	defer func() {
		m.Lock()
		m.warmFlushCleanupInProgress = false
		m.Unlock()
	}()

	namespaces, err := m.database.OwnedNamespaces()
	if err != nil {
		return err
	}

	multiErr := xerrors.NewMultiError()
	if err := m.cleanupExpiredIndexFiles(t, namespaces); err != nil {
		multiErr = multiErr.Add(fmt.Errorf(
			"encountered errors when cleaning up expired index files for %v: %w", t, err))
	}

	if err := m.cleanupCorruptedIndexFiles(namespaces); err != nil {
		multiErr = multiErr.Add(fmt.Errorf(
			"encountered errors when cleaning up corrupted index files for %v: %w", t, err))
	}

	if err := m.cleanupDuplicateIndexFiles(namespaces); err != nil {
		multiErr = multiErr.Add(fmt.Errorf(
			"encountered errors when cleaning up duplicate index files for %v: %w", t, err))
	}

	if err := m.deleteInactiveDataSnapshotFiles(namespaces); err != nil {
		multiErr = multiErr.Add(fmt.Errorf(
			"encountered errors when deleting inactive snapshot files for %v: %w", t, err))
	}

	if err := m.deleteInactiveNamespaceFiles(namespaces); err != nil {
		multiErr = multiErr.Add(fmt.Errorf(
			"encountered errors when deleting inactive namespace files for %v: %w", t, err))
	}

	if err := m.cleanupSnapshotsAndCommitlogs(namespaces); err != nil {
		multiErr = multiErr.Add(fmt.Errorf(
			"encountered errors when cleaning up snapshot and commitlog files: %w", err))
	}

	return multiErr.FinalError()
}

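// ColdFlushCleanup removes data filesets that have fallen out of retention or
// have been compacted away, along with data files for shards this node no
// longer owns. As with WarmFlushCleanup, errors are accumulated rather than
// aborting early.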
func (m *cleanupManager) ColdFlushCleanup(t xtime.UnixNano) error {
	m.Lock()
	m.coldFlushCleanupInProgress = true
	m.Unlock()

	defer func() {
		m.Lock()
		m.coldFlushCleanupInProgress = false
		m.Unlock()
	}()

	namespaces, err := m.database.OwnedNamespaces()
	if err != nil {
		return err
	}

	multiErr := xerrors.NewMultiError()
	if err := m.cleanupDataFiles(t, namespaces); err != nil {
		multiErr = multiErr.Add(fmt.Errorf(
			"encountered errors when cleaning up data files for %v: %w", t, err))
	}

	if err := m.deleteInactiveDataFiles(namespaces); err != nil {
		multiErr = multiErr.Add(fmt.Errorf(
			"encountered errors when deleting inactive data files for %v: %w", t, err))
	}

	return multiErr.FinalError()
}

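// Report emits gauges indicating whether a warm or cold flush cleanup is
// currently in progress; it is intended to be invoked periodically by whatever
// owns this manager's reporting loop.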
func (m *cleanupManager) Report() {
	m.RLock()
	coldFlushCleanupInProgress := m.coldFlushCleanupInProgress
	warmFlushCleanupInProgress := m.warmFlushCleanupInProgress
	m.RUnlock()

	if coldFlushCleanupInProgress {
		m.metrics.coldFlushCleanupStatus.Update(1)
	} else {
		m.metrics.coldFlushCleanupStatus.Update(0)
	}

	if warmFlushCleanupInProgress {
		m.metrics.warmFlushCleanupStatus.Update(1)
	} else {
		m.metrics.warmFlushCleanupStatus.Update(0)
	}
}

func (m *cleanupManager) deleteInactiveNamespaceFiles(namespaces []databaseNamespace) error {
	var namespaceDirNames []string
	filePathPrefix := m.database.Options().CommitLogOptions().FilesystemOptions().FilePathPrefix()
	dataDirPath := fs.DataDirPath(filePathPrefix)

	for _, n := range namespaces {
		namespaceDirNames = append(namespaceDirNames, n.ID().String())
	}

	return m.deleteInactiveDirectoriesFn(dataDirPath, namespaceDirNames)
}

// deleteInactiveDataFiles will delete data files for shards that the node no longer owns,
// which can occur in the case of topology changes.
func (m *cleanupManager) deleteInactiveDataFiles(namespaces []databaseNamespace) error {
	return m.deleteInactiveDataFileSetFiles(fs.NamespaceDataDirPath, namespaces)
}

// deleteInactiveDataSnapshotFiles will delete snapshot files for shards that the node no longer owns,
// which can occur in the case of topology changes.
func (m *cleanupManager) deleteInactiveDataSnapshotFiles(namespaces []databaseNamespace) error {
	return m.deleteInactiveDataFileSetFiles(fs.NamespaceSnapshotsDirPath, namespaces)
}

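// For reference, both helpers above resolve per-namespace directories that hold
// one sub-directory per shard; illustratively (assuming the default on-disk
// layout):
//
//	<filePathPrefix>/data/<namespace>/<shardID>/...
//	<filePathPrefix>/snapshots/<namespace>/<shardID>/...
//
// deleteInactiveDataFileSetFiles then removes every shard sub-directory whose
// name does not match a currently owned shard.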
func (m *cleanupManager) deleteInactiveDataFileSetFiles(
	filesetFilesDirPathFn func(string, ident.ID) string, namespaces []databaseNamespace,
) error {
	multiErr := xerrors.NewMultiError()
	filePathPrefix := m.database.Options().CommitLogOptions().FilesystemOptions().FilePathPrefix()
	for _, n := range namespaces {
		var activeShards []string
		namespaceDirPath := filesetFilesDirPathFn(filePathPrefix, n.ID())
		// NB(linasn) This should list ALL shards because it will delete
		// dirs for the shards NOT LISTED below.
		for _, s := range n.OwnedShards() {
			shard := fmt.Sprintf("%d", s.ID())
			activeShards = append(activeShards, shard)
		}
		multiErr = multiErr.Add(m.deleteInactiveDirectoriesFn(namespaceDirPath, activeShards))
	}

	return multiErr.FinalError()
}

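// cleanupDataFiles removes expired and compacted data filesets for every
// namespace with cleanup enabled. The earliest block to retain is derived from
// the namespace's retention options; for example, with a 48h retention period,
// filesets for blocks older than roughly t-48h become eligible for removal.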
func (m *cleanupManager) cleanupDataFiles(t xtime.UnixNano, namespaces []databaseNamespace) error {
	multiErr := xerrors.NewMultiError()
	for _, n := range namespaces {
		if !n.Options().CleanupEnabled() {
			continue
		}
		earliestToRetain := retention.FlushTimeStart(n.Options().RetentionOptions(), t)
		shards := n.OwnedShards()
		multiErr = multiErr.Add(m.cleanupExpiredNamespaceDataFiles(earliestToRetain, shards))
		multiErr = multiErr.Add(m.cleanupCompactedNamespaceDataFiles(shards))
	}
	return multiErr.FinalError()
}

func (m *cleanupManager) cleanupExpiredIndexFiles(
	t xtime.UnixNano, namespaces []databaseNamespace,
) error {
	multiErr := xerrors.NewMultiError()
	for _, n := range namespaces {
		if !n.Options().CleanupEnabled() || !n.Options().IndexOptions().Enabled() {
			continue
		}
		idx, err := n.Index()
		if err != nil {
			multiErr = multiErr.Add(err)
			continue
		}
		multiErr = multiErr.Add(idx.CleanupExpiredFileSets(t))
	}
	return multiErr.FinalError()
}

func (m *cleanupManager) cleanupCorruptedIndexFiles(namespaces []databaseNamespace) error {
	multiErr := xerrors.NewMultiError()
	for _, n := range namespaces {
		if !n.Options().CleanupEnabled() || !n.Options().IndexOptions().Enabled() {
			continue
		}
		idx, err := n.Index()
		if err != nil {
			multiErr = multiErr.Add(err)
			continue
		}
		multiErr = multiErr.Add(idx.CleanupCorruptedFileSets())
	}
	return multiErr.FinalError()
}

func (m *cleanupManager) cleanupDuplicateIndexFiles(namespaces []databaseNamespace) error {
	multiErr := xerrors.NewMultiError()
	for _, n := range namespaces {
		if !n.Options().CleanupEnabled() || !n.Options().IndexOptions().Enabled() {
			continue
		}
		idx, err := n.Index()
		if err != nil {
			multiErr = multiErr.Add(err)
			continue
		}
		activeShards := make([]uint32, 0)
		for _, s := range n.OwnedShards() {
			activeShards = append(activeShards, s.ID())
		}
		multiErr = multiErr.Add(idx.CleanupDuplicateFileSets(activeShards))
	}
	return multiErr.FinalError()
}

func (m *cleanupManager) cleanupExpiredNamespaceDataFiles(
	earliestToRetain xtime.UnixNano, shards []databaseShard,
) error {
	multiErr := xerrors.NewMultiError()
	for _, shard := range shards {
		if !shard.IsBootstrapped() {
			continue
		}
		if err := shard.CleanupExpiredFileSets(earliestToRetain); err != nil {
			multiErr = multiErr.Add(err)
		}
	}

	return multiErr.FinalError()
}

func (m *cleanupManager) cleanupCompactedNamespaceDataFiles(shards []databaseShard) error {
	multiErr := xerrors.NewMultiError()
	for _, shard := range shards {
		if !shard.IsBootstrapped() {
			continue
		}
		if err := shard.CleanupCompactedFileSets(); err != nil {
			multiErr = multiErr.Add(err)
		}
	}

	return multiErr.FinalError()
}

// The goal of the cleanupSnapshotsAndCommitlogs function is to delete all snapshot files, snapshot metadata
// files, and commitlog files except for those that are currently required for recovery from a node failure.
// According to the snapshotting / commitlog rotation logic, the files that are required for a complete
// recovery are:
//
//     1. The most recent (highest index) snapshot metadata file.
//     2. All snapshot files whose associated snapshot ID matches the snapshot ID of the most recent snapshot
//        metadata file.
//     3. All commitlog files whose index is larger than or equal to the index of the commitlog identifier stored
//        in the most recent snapshot metadata file. This is because the snapshotting and commitlog rotation process
//        guarantees that the most recent snapshot contains all data stored in commitlogs that were created before
//        the rotation / snapshot process began.
//
// cleanupSnapshotsAndCommitlogs accomplishes this goal by performing the following steps:
//
//     1. List all the snapshot metadata files on disk.
//     2. Identify the most recent one (highest index).
//     3. For every namespace/shard/block combination, delete all snapshot files that match one of the
//        following criteria:
//         a. Snapshot files whose associated snapshot ID does not match the snapshot ID of the most recent
//            snapshot metadata file.
//         b. Snapshot files that are corrupt.
//     4. Delete all snapshot metadata files prior to the most recent one.
//     5. Delete corrupt snapshot metadata files.
//     6. List all the commitlog files on disk.
//     7. List all the commitlog files that are being actively written to.
//     8. Delete all commitlog files whose index is lower than the index of the commitlog file referenced in the
//        most recent snapshot metadata file (ignoring any commitlog files being actively written to).
//     9. Delete all corrupt commitlog files (ignoring any commitlog files being actively written to).
//
// This process is also modeled formally in TLA+ in the file `SnapshotsSpec.tla`.
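//
// As a concrete (hypothetical) example: if the metadata files on disk have
// indices 3, 4, and 5, then metadata file 5 is the most recent. Snapshot files
// whose snapshot ID differs from metadata 5's UUID are deleted, metadata files
// 3 and 4 are deleted, and every inactive commitlog file whose index is lower
// than the commitlog index recorded in metadata 5 is deleted.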
func (m *cleanupManager) cleanupSnapshotsAndCommitlogs(namespaces []databaseNamespace) (finalErr error) {
	logger := m.opts.InstrumentOptions().Logger().With(
		zap.String("comment",
			"partial/corrupt files are expected as result of a restart (this is ok)"),
	)

	fsOpts := m.opts.CommitLogOptions().FilesystemOptions()
	snapshotMetadatas, snapshotMetadataErrorsWithPaths, err := m.snapshotMetadataFilesFn(fsOpts)
	if err != nil {
		return err
	}

	if len(snapshotMetadatas) == 0 {
		// No cleanup can be performed until we have at least one complete snapshot.
		return nil
	}

	// They should technically already be sorted, but better to be safe.
	sort.Slice(snapshotMetadatas, func(i, j int) bool {
		return snapshotMetadatas[i].ID.Index < snapshotMetadatas[j].ID.Index
	})
	sortedSnapshotMetadatas := snapshotMetadatas

	// Sanity check.
	lastMetadataIndex := int64(-1)
	for _, snapshotMetadata := range sortedSnapshotMetadatas {
		currIndex := snapshotMetadata.ID.Index
		if currIndex == lastMetadataIndex {
			// Should never happen.
			return fmt.Errorf(
				"found two snapshot metadata files with duplicate index: %d", currIndex)
		}
		lastMetadataIndex = currIndex
	}

	var (
		multiErr           = xerrors.NewMultiError()
		filesToDelete      = []string{}
		mostRecentSnapshot = sortedSnapshotMetadatas[len(sortedSnapshotMetadatas)-1]
	)
	defer func() {
		// Use a defer to perform the final file deletion so that we can attempt to clean up *some* files
		// on a best-effort basis when we encounter partial errors.
		multiErr = multiErr.Add(finalErr)
		multiErr = multiErr.Add(m.deleteFilesFn(filesToDelete))
		finalErr = multiErr.FinalError()
	}()

	for _, ns := range namespaces {
		for _, s := range ns.OwnedShards() {
			if !s.IsBootstrapped() {
				continue
			}
			shardSnapshots, err := m.snapshotFilesFn(fsOpts.FilePathPrefix(), ns.ID(), s.ID())
			if err != nil {
				multiErr = multiErr.Add(fmt.Errorf(
					"err reading snapshot files for ns: %s and shard: %d, err: %w",
					ns.ID(), s.ID(), err,
				))
				continue
			}

			for _, snapshot := range shardSnapshots {
				_, snapshotID, err := snapshot.SnapshotTimeAndID()
				if err != nil {
					// If we can't parse the snapshot ID, assume the snapshot is corrupt and delete it.
					// This could be caused by a variety of situations, like a node crashing while
					// writing out a set of snapshot files, and should have no impact on correctness
					// as the snapshot files from the previous (successful) snapshot will still be
					// retained.
					m.metrics.corruptSnapshotFile.Inc(1)
					logger.With(
						zap.Error(err),
						zap.Strings("files", snapshot.AbsoluteFilePaths),
					).Warn("corrupt snapshot file during cleanup, marking files for deletion")
					filesToDelete = append(filesToDelete, snapshot.AbsoluteFilePaths...)
					continue
				}

				if !uuid.Equal(snapshotID, mostRecentSnapshot.ID.UUID) {
					// If the UUID of the snapshot files doesn't match the most recent snapshot,
					// then it's safe to delete them because a more recent complete set exists.
					m.metrics.deletedSnapshotFile.Inc(1)
					filesToDelete = append(filesToDelete, snapshot.AbsoluteFilePaths...)
				}
			}
		}
	}

	// Delete all snapshot metadatas prior to the most recent one.
	for _, snapshot := range sortedSnapshotMetadatas[:len(sortedSnapshotMetadatas)-1] {
		m.metrics.deletedSnapshotMetadataFile.Inc(1)
		filesToDelete = append(filesToDelete, snapshot.AbsoluteFilePaths()...)
	}

	// Delete corrupt snapshot metadata files.
	for _, errorWithPath := range snapshotMetadataErrorsWithPaths {
		m.metrics.corruptSnapshotMetadataFile.Inc(1)
		logger.With(
			zap.Error(errorWithPath.Error),
			zap.String("metadataFilePath", errorWithPath.MetadataFilePath),
			zap.String("checkpointFilePath", errorWithPath.CheckpointFilePath),
		).Warn("corrupt snapshot metadata file during cleanup, marking files for deletion")
		filesToDelete = append(filesToDelete, errorWithPath.MetadataFilePath)
		filesToDelete = append(filesToDelete, errorWithPath.CheckpointFilePath)
	}

	// Figure out which commitlog files exist on disk.
	files, commitlogErrorsWithPaths, err := m.commitLogFilesFn(m.opts.CommitLogOptions())
	if err != nil {
		// Hard failure here because the remaining cleanup logic relies on this data
		// being available.
		return err
	}

	// Figure out which commitlog files are being actively written to.
	activeCommitlogs, err := m.activeCommitlogs.ActiveLogs()
	if err != nil {
		// Hard failure here because the remaining cleanup logic relies on this data
		// being available.
		return err
	}

	// Delete all commitlog files prior to the one captured by the most recent snapshot.
	for _, file := range files {
		if activeCommitlogs.Contains(file.FilePath) {
			// Skip over any commitlog files that are being actively written to.
			continue
		}

		if file.Index < mostRecentSnapshot.CommitlogIdentifier.Index {
			m.metrics.deletedCommitlogFile.Inc(1)
			filesToDelete = append(filesToDelete, file.FilePath)
		}
	}

	// Delete corrupt commitlog files.
	for _, errorWithPath := range commitlogErrorsWithPaths {
		if activeCommitlogs.Contains(errorWithPath.Path()) {
			// Skip over any commitlog files that are being actively written to. Note that it
			// is common for an active commitlog to appear corrupt because the info header has
			// not been flushed yet.
			continue
		}

		m.metrics.corruptCommitlogFile.Inc(1)
		// If we were unable to read the commitlog file's info header, then we're forced to assume
		// that the file is corrupt and remove it. This can happen in situations where M3DB
		// experiences a sudden shutdown.
		logger.With(
			zap.Error(errorWithPath),
			zap.String("path", errorWithPath.Path()),
		).Warn("corrupt commitlog file during cleanup, marking file for deletion")
		filesToDelete = append(filesToDelete, errorWithPath.Path())
	}

	return finalErr
}
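
// A minimal usage sketch (hypothetical wiring; `db` is some database
// implementation and `cl` its commitlog, which satisfies activeCommitlogs
// via its ActiveLogs method):
//
//	mgr := newCleanupManager(db, cl, scope)
//	if err := mgr.WarmFlushCleanup(xtime.ToUnixNano(time.Now())); err != nil {
//		logger.Warn("warm flush cleanup failed", zap.Error(err))
//	}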