pkg/ruler/storage/cleaner/cleaner.go

// This directory was copied and adapted from https://github.com/grafana/agent/tree/main/pkg/metrics.
// We cannot vendor the agent in since the agent vendors loki in, which would cause a cyclic dependency.
// NOTE: many changes have been made to the original code for our use-case.
package cleaner

import (
	"fmt"
	"os"
	"path/filepath"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	promwal "github.com/prometheus/prometheus/tsdb/wal"

	"github.com/grafana/loki/pkg/ruler/storage/instance"
	"github.com/grafana/loki/pkg/ruler/storage/wal"
)

// Default settings for the WAL cleaner.
const (
	DefaultCleanupAge    = 12 * time.Hour
	DefaultCleanupPeriod = 0 * time.Second // disabled by default
)
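
// As a usage sketch, both defaults can be overridden through Config, which is
// defined elsewhere in this package; MinAge and Period are the fields consumed
// by NewWALCleaner below (the values here are illustrative only):
//
//	cfg := Config{
//		MinAge: 24 * time.Hour,   // delete WALs untouched for a day
//		Period: 30 * time.Minute, // scan every half hour; 0 disables cleanup
//	}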

// lastModifiedFunc gets the last modified time of the most recent segment of a WAL.
type lastModifiedFunc func(path string) (time.Time, error)

func lastModified(path string) (time.Time, error) {
	existing, err := promwal.Open(nil, path)
	if err != nil {
		return time.Time{}, err
	}

	// We don't care if there are errors closing the abandoned WAL
	defer func() { _ = existing.Close() }()

	_, last, err := promwal.Segments(existing.Dir())
	if err != nil {
		return time.Time{}, fmt.Errorf("unable to read WAL segments: %w", err)
	}

	if last == -1 {
		return time.Time{}, fmt.Errorf("unable to determine most recent segment for %s", path)
	}

	// full path to the most recent segment in this WAL
	lastSegment := promwal.SegmentName(path, last)
	segmentFile, err := os.Stat(lastSegment)
	if err != nil {
		return time.Time{}, fmt.Errorf("unable to determine mtime for %s segment: %w", lastSegment, err)
	}

	return segmentFile.ModTime(), nil
}
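
// Because walLastModified is a field on WALCleaner rather than a direct call,
// the real segment probe above can be swapped out, e.g. in tests. A minimal
// sketch, assuming a cleaner constructed as usual (the fixed timestamp is
// illustrative only; Period is left at 0 so no background goroutine races
// with the reassignment):
//
//	c := NewWALCleaner(logger, manager, metrics, dir, Config{})
//	c.walLastModified = func(path string) (time.Time, error) {
//		return time.Unix(0, 0), nil // pretend every WAL is ancient
//	}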

// WALCleaner periodically checks for Write Ahead Logs (WALs) that are not
// associated with any active instance.ManagedInstance and that have not been
// written to within a configured amount of time, and deletes them.
type WALCleaner struct {
	logger          log.Logger
	instanceManager instance.Manager
	walDirectory    string
	walLastModified lastModifiedFunc
	minAge          time.Duration
	period          time.Duration
	done            chan bool

	metrics *Metrics
}

// NewWALCleaner creates a new cleaner that looks for abandoned WALs in the given
// directory and removes them if they haven't been modified in over minAge. It also
// starts a goroutine that periodically runs the cleanup method in a loop.
func NewWALCleaner(logger log.Logger, manager instance.Manager, metrics *Metrics, walDirectory string, cfg Config) *WALCleaner {
	c := &WALCleaner{
		logger:          log.With(logger, "component", "cleaner"),
		instanceManager: manager,
		walDirectory:    filepath.Clean(walDirectory),
		walLastModified: lastModified,
		minAge:          DefaultCleanupAge,
		period:          DefaultCleanupPeriod,
		done:            make(chan bool),

		metrics: metrics,
	}

	if cfg.MinAge > 0 {
		c.minAge = cfg.MinAge
	}

	// We allow a period of 0 here because '0' means "don't run the task". This
	// is handled by not running a ticker at all in the run method.
	if cfg.Period >= 0 {
		c.period = cfg.Period
	}

	go c.run()
	return c
}
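
// A rough lifecycle sketch (the logger, manager, and metrics values are
// assumed to be constructed elsewhere; the path and durations are illustrative):
//
//	cleaner := NewWALCleaner(logger, manager, metrics, "/loki/wal", Config{
//		MinAge: 12 * time.Hour,
//		Period: 1 * time.Hour,
//	})
//	defer cleaner.Stop() // stops the background cleanup goroutine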

// getManagedStorage gets storage directories used for each ManagedInstance
func (c *WALCleaner) getManagedStorage(instances map[string]instance.ManagedInstance) map[string]bool {
	out := make(map[string]bool)

	for _, inst := range instances {
		out[inst.StorageDirectory()] = true
	}

	return out
}

// getAllStorage gets all storage directories under walDirectory
func (c *WALCleaner) getAllStorage() []string {
	var out []string

	_ = filepath.Walk(c.walDirectory, func(p string, info os.FileInfo, err error) error {
		if os.IsNotExist(err) {
			// The root WAL directory doesn't exist. Maybe this Agent isn't responsible for any
			// instances yet. Log at debug since this isn't a big deal. We'll just try to crawl
			// the directory again on the next periodic run.
			level.Debug(c.logger).Log("msg", "WAL storage path does not exist", "path", p, "err", err)
		} else if err != nil {
			// Just log any errors traversing the WAL directory. This will potentially result
			// in a WAL (that has incorrect permissions or some similar problem) not being cleaned
			// up. This is better than preventing *all* other WALs from being cleaned up.
			c.metrics.DiscoveryError.WithLabelValues(p).Inc()
			level.Warn(c.logger).Log("msg", "unable to traverse WAL storage path", "path", p, "err", err)
		} else if info.IsDir() && filepath.Dir(p) == c.walDirectory {
			// Directories exactly one level below the root are instance storage
			// directories (each containing a WAL subdirectory)
			out = append(out, p)
		}

		return nil
	})

	return out
}
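
// For illustration, the on-disk layout this walk expects (instance names are
// examples only):
//
//	<walDirectory>/
//	├── tenant-a/   <- instance storage directory returned by getAllStorage
//	│   └── wal/    <- WAL subdirectory probed via wal.SubDirectory below
//	└── tenant-b/
//	    └── wal/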

// getAbandonedStorage gets the full path of storage directories that aren't associated with
// an active instance and haven't been written to within a configured duration (usually several
// hours or more).
func (c *WALCleaner) getAbandonedStorage(all []string, managed map[string]bool, now time.Time) []string {
	var out []string

	for _, dir := range all {
		if managed[dir] {
			level.Debug(c.logger).Log("msg", "active WAL", "name", dir)
			continue
		}

		walDir := wal.SubDirectory(dir)
		mtime, err := c.walLastModified(walDir)
		if err != nil {
			c.metrics.SegmentError.WithLabelValues(dir).Inc()
			level.Warn(c.logger).Log("msg", "unable to find segment mtime of WAL", "name", dir, "err", err)
			continue
		}

		diff := now.Sub(mtime)
		if diff > c.minAge {
			// The last segment for this WAL was modified more than $minAge (positive number of hours)
			// in the past. This makes it a candidate for deletion since it's also not associated with
			// any Instances this agent knows about.
			out = append(out, dir)
			level.Debug(c.logger).Log("msg", "abandoned WAL", "name", dir, "mtime", mtime, "diff", diff)
		}
	}

	return out
}
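
// For example, with the default minAge of 12 hours: an unmanaged WAL whose
// newest segment was last written 15 hours ago is returned as abandoned, while
// one written 3 hours ago is kept even though no instance currently claims it.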

// run periodically cleans up abandoned WALs in a loop until stopped (skipped
// entirely if period == 0)
func (c *WALCleaner) run() {
	// A period of 0 means don't run a cleanup task
	if c.period == 0 {
		return
	}

	ticker := time.NewTicker(c.period)
	defer ticker.Stop()

	for {
		select {
		case <-c.done:
			level.Debug(c.logger).Log("msg", "stopping cleaner...")
			return
		case <-ticker.C:
			c.cleanup()
		}
	}
}

// cleanup removes any abandoned and unused WAL directories. Note that it shouldn't be
// necessary to call this method explicitly in most cases since it will be run periodically
// in a goroutine (started when WALCleaner is created).
func (c *WALCleaner) cleanup() {
	if !c.instanceManager.Ready() {
		level.Warn(c.logger).Log("msg", "delaying WAL clean until all storage instances are ready")
		return
	}

	start := time.Now()
	all := c.getAllStorage()
	managed := c.getManagedStorage(c.instanceManager.ListInstances())
	abandoned := c.getAbandonedStorage(all, managed, time.Now())

	c.metrics.ManagedStorage.Set(float64(len(managed)))
	c.metrics.AbandonedStorage.Set(float64(len(abandoned)))

	// NOTE: this is a little imperfect right now; the manager cannot currently be notified when an
	// instance (rule group) is removed by the prometheus QueueManager, so the cleaner will only really
	// become aware of "abandoned" instances when the ruler is restarted.
	// TODO(dannyk): contribute a callback mechanism to prometheus to allow for this hook

	for _, a := range abandoned {
		level.Info(c.logger).Log("msg", "deleting abandoned WAL", "name", a)
		err := os.RemoveAll(a)
		if err != nil {
			level.Error(c.logger).Log("msg", "failed to delete abandoned WAL", "name", a, "err", err)
			c.metrics.CleanupRunsErrors.Inc()
		} else {
			c.metrics.CleanupRunsSuccess.Inc()
		}
	}

	c.metrics.CleanupTimes.Observe(time.Since(start).Seconds())
}

// Stop the cleaner and any background tasks running
func (c *WALCleaner) Stop() {
	close(c.done)
}