github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/tools/blocksconvert/cleaner/cleaner.go

package cleaner

import (
	"context"
	"flag"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/backoff"
	"github.com/grafana/dskit/services"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/common/model"
	"github.com/thanos-io/thanos/pkg/objstore"
	"golang.org/x/sync/errgroup"

	"github.com/cortexproject/cortex/pkg/chunk"
	"github.com/cortexproject/cortex/pkg/chunk/storage"
	"github.com/cortexproject/cortex/tools/blocksconvert"
	"github.com/cortexproject/cortex/tools/blocksconvert/planprocessor"
)

// Config holds the cleaner-specific configuration.
type Config struct {
	PlansDirectory string
	Concurrency    int

	PlanProcessorConfig planprocessor.Config
}

// RegisterFlags registers the cleaner flags with the given FlagSet.
func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
	cfg.PlanProcessorConfig.RegisterFlags("cleaner", f)

	f.StringVar(&cfg.PlansDirectory, "cleaner.plans-dir", "", "Local directory used for storing temporary plan files.")
	f.IntVar(&cfg.Concurrency, "cleaner.concurrency", 128, "Number of concurrent series cleaners.")
}

// NewCleaner creates a service that deletes chunks and index entries for series listed in plan files.
func NewCleaner(cfg Config, scfg blocksconvert.SharedConfig, l log.Logger, reg prometheus.Registerer) (services.Service, error) {
	err := scfg.SchemaConfig.Load()
	if err != nil {
		return nil, errors.Wrap(err, "failed to load schema")
	}

	bucketClient, err := scfg.GetBucket(l, reg)
	if err != nil {
		return nil, err
	}

	c := &Cleaner{
		cfg: cfg,

		bucketClient:  bucketClient,
		schemaConfig:  scfg.SchemaConfig,
		storageConfig: scfg.StorageConfig,

		deletedSeries: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_cleaner_deleted_series_total",
			Help: "Deleted series",
		}),
		deletedSeriesErrors: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_cleaner_delete_series_errors_total",
			Help: "Number of errors while deleting series.",
		}),
		deletedIndexEntries: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_cleaner_deleted_index_entries_total",
			Help: "Deleted index entries",
		}),
		deletedChunks: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_cleaner_deleted_chunks_total",
			Help: "Deleted chunks",
		}),
		deletedChunksMissing: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_cleaner_delete_chunks_missing_total",
			Help: "Chunks that were missing when trying to delete them.",
		}),
		deletedChunksSkipped: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_cleaner_delete_chunks_skipped_total",
			Help: "Number of skipped chunks during deletion.",
		}),
		deletedChunksErrors: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_cleaner_delete_chunks_errors_total",
			Help: "Number of errors while deleting individual chunks.",
		}),
	}

	return planprocessor.NewService(cfg.PlanProcessorConfig, cfg.PlansDirectory, bucketClient, nil, c.planProcessorFactory, l, reg)
}

// Cleaner deletes chunks and index entries for series described by plan files.
type Cleaner struct {
	cfg Config

	bucketClient  objstore.Bucket
	schemaConfig  chunk.SchemaConfig
	storageConfig storage.Config

	deletedChunks        prometheus.Counter
	deletedChunksSkipped prometheus.Counter
	deletedChunksMissing prometheus.Counter
	deletedChunksErrors  prometheus.Counter

	deletedIndexEntries prometheus.Counter
	deletedSeries       prometheus.Counter
	deletedSeriesErrors prometheus.Counter
}

func (c *Cleaner) planProcessorFactory(planLog log.Logger, userID string, start time.Time, end time.Time) planprocessor.PlanProcessor {
	return &cleanerProcessor{
		cleaner:  c,
		log:      planLog,
		userID:   userID,
		dayStart: start,
		dayEnd:   end,
	}
}

type cleanerProcessor struct {
	cleaner *Cleaner

	log      log.Logger
	userID   string
	dayStart time.Time
	dayEnd   time.Time
}

// ProcessPlanEntries deletes index entries and chunks for each series in the plan,
// using the configured number of concurrent workers.
func (cp *cleanerProcessor) ProcessPlanEntries(ctx context.Context, planEntryCh chan blocksconvert.PlanEntry) (string, error) {
	tableName, schema, chunkClient, indexClient, err := cp.cleaner.createClientsForDay(cp.dayStart)
	if err != nil {
		return "", errors.Wrap(err, "failed to create clients")
	}

	defer chunkClient.Stop()
	defer indexClient.Stop()

	seriesSchema, ok := schema.(chunk.SeriesStoreSchema)
	if !ok || seriesSchema == nil {
		return "", errors.Errorf("invalid schema, expected v9 or later")
	}

	g, gctx := errgroup.WithContext(ctx)
	for i := 0; i < cp.cleaner.cfg.Concurrency; i++ {
		g.Go(func() error {
			for {
				select {
				case <-ctx.Done():
					return nil

				case e, ok := <-planEntryCh:
					if !ok {
						// End of input.
						return nil
					}

					err := cp.deleteChunksForSeries(gctx, tableName, seriesSchema, chunkClient, indexClient, e)
					if err != nil {
						return err
					}
				}
			}
		})
	}

	if err := g.Wait(); err != nil {
		return "", errors.Wrap(err, "failed to cleanup series")
	}

	// "cleaned" will be appended as "block ID" to the finished status file.
	return "cleaned", nil
}

func (cp *cleanerProcessor) deleteChunksForSeries(ctx context.Context, tableName string, schema chunk.SeriesStoreSchema, chunkClient chunk.Client, indexClient chunk.IndexClient, e blocksconvert.PlanEntry) error {
	var c *chunk.Chunk
	var err error

	b := backoff.New(ctx, backoff.Config{
		MinBackoff: 1 * time.Second,
		MaxBackoff: 5 * time.Second,
		MaxRetries: 5,
	})
	for ; b.Ongoing(); b.Wait() {
		c, err = fetchSingleChunk(ctx, cp.userID, chunkClient, e.Chunks)
		if err == nil {
			break
		}

		level.Warn(cp.log).Log("msg", "failed to fetch chunk for series", "series", e.SeriesID, "err", err, "retries", b.NumRetries()+1)
	}

	if err == nil {
		err = b.Err()
	}
	if err != nil {
		return errors.Wrapf(err, "error while fetching chunk for series %s", e.SeriesID)
	}

	if c == nil {
		cp.cleaner.deletedSeriesErrors.Inc()
		// This can also happen when the cleaner is restarted: chunks deleted previously cannot be found anymore,
		// but index entries should not exist either.
		// level.Warn(cp.log).Log("msg", "no chunk found for series, unable to delete series", "series", e.SeriesID)
		return nil
	}

	// All chunks belonging to the series use the same metric.
	metric := c.Metric

	metricName := metric.Get(model.MetricNameLabel)
	if metricName == "" {
		return errors.Errorf("cannot find metric name for series %s", metric.String())
	}

	start := model.TimeFromUnixNano(cp.dayStart.UnixNano())
	end := model.TimeFromUnixNano(cp.dayEnd.UnixNano())

	var chunksToDelete []string

	indexEntries := 0

	// With the metric, we find out which index entries to remove.
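	// All delete operations for this series are collected into a single write batch
	// and flushed with one BatchWrite call once every chunk has been examined.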
	batch := indexClient.NewWriteBatch()
	for _, cid := range e.Chunks {
		c, err := chunk.ParseExternalKey(cp.userID, cid)
		if err != nil {
			return errors.Wrap(err, "failed to parse chunk key")
		}

		// GetChunkWriteEntries returns entries not only for this day-period, but for all days that the chunk covers.
		// Since we process plans "backwards", more recent entries should already be cleaned up.
		ents, err := schema.GetChunkWriteEntries(c.From, c.Through, cp.userID, metricName, metric, cid)
		if err != nil {
			return errors.Wrapf(err, "getting index entries to delete for chunkID=%s", cid)
		}
		for i := range ents {
			// To avoid deleting entries from older tables, we check the table name. This can still delete entries
			// from different buckets in the same table, but we just accept that.
			if tableName == ents[i].TableName {
				batch.Delete(ents[i].TableName, ents[i].HashValue, ents[i].RangeValue)
			}
		}
		indexEntries += len(ents)

		// Label entries in v9, v10 and v11 don't use from/through in encoded values, so instead of the chunk's From/Through,
		// we only pass start/end for the current day, to avoid deleting entries in other buckets.
		// As "end" is inclusive, we make it exclusive by subtracting 1.
		_, perKeyEnts, err := schema.GetCacheKeysAndLabelWriteEntries(start, end-1, cp.userID, metricName, metric, cid)
		if err != nil {
			return errors.Wrapf(err, "getting index entries to delete for chunkID=%s", cid)
		}
		for _, ents := range perKeyEnts {
			for i := range ents {
				batch.Delete(ents[i].TableName, ents[i].HashValue, ents[i].RangeValue)
			}
			indexEntries += len(ents)
		}

		// Only delete this chunk if it *starts* in the plan's date-period. In general we process plans from most recent
		// to oldest, so if a chunk starts in the current plan's period, its index entries were already removed by later plans.
		// This assumption breaks when running multiple cleaners, or if the cleaner crashes.
		if c.From >= start {
			chunksToDelete = append(chunksToDelete, cid)
		} else {
			cp.cleaner.deletedChunksSkipped.Inc()
			continue
		}
	}

	// Delete index entries first. If we deleted chunks first and the cleaner was then interrupted,
	// the chunks couldn't be found upon restart, and it wouldn't be possible to clean up the index entries.
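	// Chunk deletion below still runs even if this batch write fails; the failure is logged and counted as a series error.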
	if err := indexClient.BatchWrite(ctx, batch); err != nil {
		level.Warn(cp.log).Log("msg", "failed to delete index entries for series", "series", e.SeriesID, "err", err)
		cp.cleaner.deletedSeriesErrors.Inc()
	} else {
		cp.cleaner.deletedSeries.Inc()
		cp.cleaner.deletedIndexEntries.Add(float64(indexEntries))
	}

	for _, cid := range chunksToDelete {
		if err := chunkClient.DeleteChunk(ctx, cp.userID, cid); err != nil {
			if errors.Is(err, chunk.ErrStorageObjectNotFound) {
				cp.cleaner.deletedChunksMissing.Inc()
			} else {
				level.Warn(cp.log).Log("msg", "failed to delete chunk for series", "series", e.SeriesID, "chunk", cid, "err", err)
				cp.cleaner.deletedChunksErrors.Inc()
			}
		} else {
			cp.cleaner.deletedChunks.Inc()
		}
	}

	return nil
}

// fetchSingleChunk returns the first chunk from chunksIds that can still be fetched from the store,
// or nil if none of the chunks exist anymore.
func fetchSingleChunk(ctx context.Context, userID string, chunkClient chunk.Client, chunksIds []string) (*chunk.Chunk, error) {
	// Fetch a single chunk.
	for _, cid := range chunksIds {
		c, err := chunk.ParseExternalKey(userID, cid)
		if err != nil {
			return nil, errors.Wrap(err, "fetching chunks")
		}

		cs, err := chunkClient.GetChunks(ctx, []chunk.Chunk{c})

		if errors.Is(err, chunk.ErrStorageObjectNotFound) {
			continue
		}
		if err != nil {
			return nil, errors.Wrap(err, "fetching chunks")
		}

		if len(cs) > 0 {
			return &cs[0], nil
		}
	}

	return nil, nil
}

// createClientsForDay finds the schema config covering dayStart and returns the index table name,
// schema, chunk client and index client for that period.
func (c *Cleaner) createClientsForDay(dayStart time.Time) (string, chunk.BaseSchema, chunk.Client, chunk.IndexClient, error) {
	for ix, s := range c.schemaConfig.Configs {
		if dayStart.Unix() < s.From.Unix() {
			continue
		}

		if ix+1 < len(c.schemaConfig.Configs) && dayStart.Unix() > c.schemaConfig.Configs[ix+1].From.Unix() {
			continue
		}

		tableName := s.IndexTables.TableFor(model.TimeFromUnixNano(dayStart.UnixNano()))

		schema, err := s.CreateSchema()
		if err != nil {
			return "", nil, nil, nil, errors.Wrap(err, "failed to create schema")
		}

		// No registerer, to avoid problems with registering the same metrics multiple times.
		index, err := storage.NewIndexClient(s.IndexType, c.storageConfig, c.schemaConfig, nil)
		if err != nil {
			return "", nil, nil, nil, errors.Wrap(err, "error creating index client")
		}

		objectStoreType := s.ObjectType
		if objectStoreType == "" {
			objectStoreType = s.IndexType
		}

		// No registerer, to avoid problems with registering the same metrics multiple times.
		chunks, err := storage.NewChunkClient(objectStoreType, c.storageConfig, c.schemaConfig, nil)
		if err != nil {
			index.Stop()
			return "", nil, nil, nil, errors.Wrap(err, "error creating object client")
		}

		return tableName, schema, chunks, index, nil
	}

	return "", nil, nil, nil, errors.Errorf("no schema for day %v", dayStart.Format("2006-01-02"))
}