github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/ruler/storage/cleaner/cleaner.go (about) 1 // This directory was copied and adapted from https://github.com/grafana/agent/tree/main/pkg/metrics. 2 // We cannot vendor the agent in since the agent vendors loki in, which would cause a cyclic dependency. 3 // NOTE: many changes have been made to the original code for our use-case. 4 package cleaner 5 6 import ( 7 "fmt" 8 "os" 9 "path/filepath" 10 "time" 11 12 "github.com/go-kit/log" 13 "github.com/go-kit/log/level" 14 promwal "github.com/prometheus/prometheus/tsdb/wal" 15 16 "github.com/grafana/loki/pkg/ruler/storage/instance" 17 "github.com/grafana/loki/pkg/ruler/storage/wal" 18 ) 19 20 // Default settings for the WAL cleaner. 21 const ( 22 DefaultCleanupAge = 12 * time.Hour 23 DefaultCleanupPeriod = 0 * time.Second // disabled by default 24 ) 25 26 // lastModifiedFunc gets the last modified time of the most recent segment of a WAL 27 type lastModifiedFunc func(path string) (time.Time, error) 28 29 func lastModified(path string) (time.Time, error) { 30 existing, err := promwal.Open(nil, path) 31 if err != nil { 32 return time.Time{}, err 33 } 34 35 // We don't care if there are errors closing the abandoned WAL 36 defer func() { _ = existing.Close() }() 37 38 _, last, err := promwal.Segments(existing.Dir()) 39 if err != nil { 40 return time.Time{}, fmt.Errorf("unable to open WAL: %w", err) 41 } 42 43 if last == -1 { 44 return time.Time{}, fmt.Errorf("unable to determine most recent segment for %s", path) 45 } 46 47 // full path to the most recent segment in this WAL 48 lastSegment := promwal.SegmentName(path, last) 49 segmentFile, err := os.Stat(lastSegment) 50 if err != nil { 51 return time.Time{}, fmt.Errorf("unable to determine mtime for %s segment: %w", lastSegment, err) 52 } 53 54 return segmentFile.ModTime(), nil 55 } 56 57 // WALCleaner periodically checks for Write Ahead Logs (WALs) that are not associated 58 // with any active instance.ManagedInstance and have not been written to in some configured 59 // amount of time and deletes them. 60 type WALCleaner struct { 61 logger log.Logger 62 instanceManager instance.Manager 63 walDirectory string 64 walLastModified lastModifiedFunc 65 minAge time.Duration 66 period time.Duration 67 done chan bool 68 69 metrics *Metrics 70 } 71 72 // NewWALCleaner creates a new cleaner that looks for abandoned WALs in the given 73 // directory and removes them if they haven't been modified in over minAge. Starts 74 // a goroutine to periodically run the cleanup method in a loop 75 func NewWALCleaner(logger log.Logger, manager instance.Manager, metrics *Metrics, walDirectory string, cfg Config) *WALCleaner { 76 c := &WALCleaner{ 77 logger: log.With(logger, "component", "cleaner"), 78 instanceManager: manager, 79 walDirectory: filepath.Clean(walDirectory), 80 walLastModified: lastModified, 81 minAge: DefaultCleanupAge, 82 period: DefaultCleanupPeriod, 83 done: make(chan bool), 84 85 metrics: metrics, 86 } 87 88 if cfg.MinAge > 0 { 89 c.minAge = cfg.MinAge 90 } 91 92 // We allow a period of 0 here because '0' means "don't run the task". This 93 // is handled by not running a ticker at all in the run method. 94 if cfg.Period >= 0 { 95 c.period = cfg.Period 96 } 97 98 go c.run() 99 return c 100 } 101 102 // getManagedStorage gets storage directories used for each ManagedInstance 103 func (c *WALCleaner) getManagedStorage(instances map[string]instance.ManagedInstance) map[string]bool { 104 out := make(map[string]bool) 105 106 for _, inst := range instances { 107 out[inst.StorageDirectory()] = true 108 } 109 110 return out 111 } 112 113 // getAllStorage gets all storage directories under walDirectory 114 func (c *WALCleaner) getAllStorage() []string { 115 var out []string 116 117 _ = filepath.Walk(c.walDirectory, func(p string, info os.FileInfo, err error) error { 118 if os.IsNotExist(err) { 119 // The root WAL directory doesn't exist. Maybe this Agent isn't responsible for any 120 // instances yet. Log at debug since this isn't a big deal. We'll just try to crawl 121 // the direction again on the next periodic run. 122 level.Debug(c.logger).Log("msg", "WAL storage path does not exist", "path", p, "err", err) 123 } else if err != nil { 124 // Just log any errors traversing the WAL directory. This will potentially result 125 // in a WAL (that has incorrect permissions or some similar problem) not being cleaned 126 // up. This is better than preventing *all* other WALs from being cleaned up. 127 c.metrics.DiscoveryError.WithLabelValues(p).Inc() 128 level.Warn(c.logger).Log("msg", "unable to traverse WAL storage path", "path", p, "err", err) 129 } else if info.IsDir() && filepath.Dir(p) == c.walDirectory { 130 // Single level below the root are instance storage directories (including WALs) 131 out = append(out, p) 132 } 133 134 return nil 135 }) 136 137 return out 138 } 139 140 // getAbandonedStorage gets the full path of storage directories that aren't associated with 141 // an active instance and haven't been written to within a configured duration (usually several 142 // hours or more). 143 func (c *WALCleaner) getAbandonedStorage(all []string, managed map[string]bool, now time.Time) []string { 144 var out []string 145 146 for _, dir := range all { 147 if managed[dir] { 148 level.Debug(c.logger).Log("msg", "active WAL", "name", dir) 149 continue 150 } 151 152 walDir := wal.SubDirectory(dir) 153 mtime, err := c.walLastModified(walDir) 154 if err != nil { 155 c.metrics.SegmentError.WithLabelValues(dir).Inc() 156 level.Warn(c.logger).Log("msg", "unable to find segment mtime of WAL", "name", dir, "err", err) 157 continue 158 } 159 160 diff := now.Sub(mtime) 161 if diff > c.minAge { 162 // The last segment for this WAL was modified more then $minAge (positive number of hours) 163 // in the past. This makes it a candidate for deletion since it's also not associated with 164 // any Instances this agent knows about. 165 out = append(out, dir) 166 } 167 168 level.Debug(c.logger).Log("msg", "abandoned WAL", "name", dir, "mtime", mtime, "diff", diff) 169 } 170 171 return out 172 } 173 174 // run cleans up abandoned WALs (if period != 0) in a loop periodically until stopped 175 func (c *WALCleaner) run() { 176 // A period of 0 means don't run a cleanup task 177 if c.period == 0 { 178 return 179 } 180 181 ticker := time.NewTicker(c.period) 182 defer ticker.Stop() 183 184 for { 185 select { 186 case <-c.done: 187 level.Debug(c.logger).Log("msg", "stopping cleaner...") 188 return 189 case <-ticker.C: 190 c.cleanup() 191 } 192 } 193 } 194 195 // cleanup removes any abandoned and unused WAL directories. Note that it shouldn't be 196 // necessary to call this method explicitly in most cases since it will be run periodically 197 // in a goroutine (started when WALCleaner is created). 198 func (c *WALCleaner) cleanup() { 199 if !c.instanceManager.Ready() { 200 level.Warn(c.logger).Log("msg", "delaying WAL clean until all storage instances are ready") 201 return 202 } 203 204 start := time.Now() 205 all := c.getAllStorage() 206 managed := c.getManagedStorage(c.instanceManager.ListInstances()) 207 abandoned := c.getAbandonedStorage(all, managed, time.Now()) 208 209 c.metrics.ManagedStorage.Set(float64(len(managed))) 210 c.metrics.AbandonedStorage.Set(float64(len(abandoned))) 211 212 // NOTE: this is a little imperfect right now; the manager cannot currently be notified when an 213 // instance (rule group) is removed by the prometheus QueueManager, so the cleaner will only really 214 // become aware of "abandoned" instances when the ruler is restarted. 215 // TODO(dannyk): contribute a callback mechanism to prometheus to allow for this hook 216 217 for _, a := range abandoned { 218 level.Info(c.logger).Log("msg", "deleting abandoned WAL", "name", a) 219 err := os.RemoveAll(a) 220 if err != nil { 221 level.Error(c.logger).Log("msg", "failed to delete abandoned WAL", "name", a, "err", err) 222 c.metrics.CleanupRunsErrors.Inc() 223 } else { 224 c.metrics.CleanupRunsSuccess.Inc() 225 } 226 } 227 228 c.metrics.CleanupTimes.Observe(time.Since(start).Seconds()) 229 } 230 231 // Stop the cleaner and any background tasks running 232 func (c *WALCleaner) Stop() { 233 close(c.done) 234 }