github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/tools/blocksconvert/cleaner/cleaner.go

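// Package cleaner implements the blocksconvert cleaner: a service that walks
// plan files and deletes the corresponding series' chunks and index entries
// from the chunk store.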
package cleaner

import (
	"context"
	"flag"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/backoff"
	"github.com/grafana/dskit/services"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/common/model"
	"github.com/thanos-io/thanos/pkg/objstore"
	"golang.org/x/sync/errgroup"

	"github.com/cortexproject/cortex/pkg/chunk"
	"github.com/cortexproject/cortex/pkg/chunk/storage"
	"github.com/cortexproject/cortex/tools/blocksconvert"
	"github.com/cortexproject/cortex/tools/blocksconvert/planprocessor"
)

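// Config holds the cleaner-specific options: the local directory used for
// temporary plan files and the number of series cleaned concurrently.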
type Config struct {
	PlansDirectory string
	Concurrency    int

	PlanProcessorConfig planprocessor.Config
}

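// RegisterFlags registers the cleaner's CLI flags under the "cleaner." prefix,
// plus the shared plan-processor flags. A hypothetical invocation (the binary
// name and target flag are assumed here, not verified against the tool):
//
//	blocksconvert -target=cleaner -cleaner.plans-dir=/tmp/plans -cleaner.concurrency=64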
func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
	cfg.PlanProcessorConfig.RegisterFlags("cleaner", f)

	f.StringVar(&cfg.PlansDirectory, "cleaner.plans-dir", "", "Local directory used for storing temporary plan files.")
	f.IntVar(&cfg.Concurrency, "cleaner.concurrency", 128, "Number of concurrent series cleaners.")
}

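// NewCleaner loads the schema config, creates the bucket client used for plan
// files, and wraps a Cleaner in a plan-processor service that drives it.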
func NewCleaner(cfg Config, scfg blocksconvert.SharedConfig, l log.Logger, reg prometheus.Registerer) (services.Service, error) {
	err := scfg.SchemaConfig.Load()
	if err != nil {
		return nil, errors.Wrap(err, "failed to load schema")
	}

	bucketClient, err := scfg.GetBucket(l, reg)
	if err != nil {
		return nil, err
	}

	c := &Cleaner{
		cfg: cfg,

		bucketClient:  bucketClient,
		schemaConfig:  scfg.SchemaConfig,
		storageConfig: scfg.StorageConfig,

		deletedSeries: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_cleaner_deleted_series_total",
			Help: "Deleted series",
		}),
		deletedSeriesErrors: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_cleaner_delete_series_errors_total",
			Help: "Number of errors while deleting series.",
		}),
		deletedIndexEntries: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_cleaner_deleted_index_entries_total",
			Help: "Deleted index entries",
		}),
		deletedChunks: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_cleaner_deleted_chunks_total",
			Help: "Deleted chunks",
		}),
		deletedChunksMissing: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_cleaner_delete_chunks_missing_total",
			Help: "Chunks that were missing when trying to delete them.",
		}),
		deletedChunksSkipped: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_cleaner_delete_chunks_skipped_total",
			Help: "Number of skipped chunks during deletion.",
		}),
		deletedChunksErrors: promauto.With(reg).NewCounter(prometheus.CounterOpts{
			Name: "cortex_blocksconvert_cleaner_delete_chunks_errors_total",
			Help: "Number of errors while deleting individual chunks.",
		}),
	}

	return planprocessor.NewService(cfg.PlanProcessorConfig, cfg.PlansDirectory, bucketClient, nil, c.planProcessorFactory, l, reg)
}

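// Cleaner deletes chunks and index entries for series listed in plan files.
// Every deletion outcome is tracked by one of the counters below.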
type Cleaner struct {
	cfg Config

	bucketClient  objstore.Bucket
	schemaConfig  chunk.SchemaConfig
	storageConfig storage.Config

	deletedChunks        prometheus.Counter
	deletedChunksSkipped prometheus.Counter
	deletedChunksMissing prometheus.Counter
	deletedChunksErrors  prometheus.Counter

	deletedIndexEntries prometheus.Counter
	deletedSeries       prometheus.Counter
	deletedSeriesErrors prometheus.Counter
}

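// planProcessorFactory returns a PlanProcessor that cleans up a single user's
// data for the plan's day.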
func (c *Cleaner) planProcessorFactory(planLog log.Logger, userID string, start time.Time, end time.Time) planprocessor.PlanProcessor {
	return &cleanerProcessor{
		cleaner:  c,
		log:      planLog,
		userID:   userID,
		dayStart: start,
		dayEnd:   end,
	}
}

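// cleanerProcessor processes one plan: all series of one user for one day.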
type cleanerProcessor struct {
	cleaner *Cleaner

	log      log.Logger
	userID   string
	dayStart time.Time
	dayEnd   time.Time
}

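// ProcessPlanEntries consumes plan entries from the channel and deletes the
// chunks and index entries they reference, using cfg.Concurrency workers.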
func (cp *cleanerProcessor) ProcessPlanEntries(ctx context.Context, planEntryCh chan blocksconvert.PlanEntry) (string, error) {
	tableName, schema, chunkClient, indexClient, err := cp.cleaner.createClientsForDay(cp.dayStart)
	if err != nil {
		return "", errors.Wrap(err, "failed to create clients")
	}

	defer chunkClient.Stop()
	defer indexClient.Stop()

	seriesSchema, ok := schema.(chunk.SeriesStoreSchema)
	if !ok || seriesSchema == nil {
		return "", errors.Errorf("invalid schema, expected v9 or later")
	}

	g, gctx := errgroup.WithContext(ctx)
	for i := 0; i < cp.cleaner.cfg.Concurrency; i++ {
		g.Go(func() error {
			for {
				select {
				// Watch the group's context, so workers stop as soon as any of them fails.
				case <-gctx.Done():
					return nil

				case e, ok := <-planEntryCh:
					if !ok {
						// End of input.
						return nil
					}

					err := cp.deleteChunksForSeries(gctx, tableName, seriesSchema, chunkClient, indexClient, e)
					if err != nil {
						return err
					}
				}
			}
		})
	}

	if err := g.Wait(); err != nil {
		return "", errors.Wrap(err, "failed to cleanup series")
	}

	// "cleaned" will be appended as the "block ID" to the finished status file.
	return "cleaned", nil
}

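// deleteChunksForSeries cleans up one series: it fetches a single chunk to
// recover the series' labels, deletes matching index entries from this day's
// table, and then deletes the chunks that start in this day's period.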
func (cp *cleanerProcessor) deleteChunksForSeries(ctx context.Context, tableName string, schema chunk.SeriesStoreSchema, chunkClient chunk.Client, indexClient chunk.IndexClient, e blocksconvert.PlanEntry) error {
	var c *chunk.Chunk
	var err error

	b := backoff.New(ctx, backoff.Config{
		MinBackoff: 1 * time.Second,
		MaxBackoff: 5 * time.Second,
		MaxRetries: 5,
	})
	for ; b.Ongoing(); b.Wait() {
		c, err = fetchSingleChunk(ctx, cp.userID, chunkClient, e.Chunks)
		if err == nil {
			break
		}

		level.Warn(cp.log).Log("msg", "failed to fetch chunk for series", "series", e.SeriesID, "err", err, "retries", b.NumRetries()+1)
	}

	if err == nil {
		err = b.Err()
	}
	if err != nil {
		return errors.Wrapf(err, "error while fetching chunk for series %s", e.SeriesID)
	}

	if c == nil {
		cp.cleaner.deletedSeriesErrors.Inc()
		// This can also happen when the cleaner is restarted: chunks deleted in a previous run cannot be
		// found anymore, and their index entries should already be gone as well.
		// level.Warn(cp.log).Log("msg", "no chunk found for series, unable to delete series", "series", e.SeriesID)
		return nil
	}

	// All chunks belonging to the series use the same metric.
	metric := c.Metric

	metricName := metric.Get(model.MetricNameLabel)
	if metricName == "" {
		return errors.Errorf("cannot find metric name for series %s", metric.String())
	}

	start := model.TimeFromUnixNano(cp.dayStart.UnixNano())
	end := model.TimeFromUnixNano(cp.dayEnd.UnixNano())

	var chunksToDelete []string

	indexEntries := 0

	// Using the series' metric, we find out which index entries to remove.
	batch := indexClient.NewWriteBatch()
	for _, cid := range e.Chunks {
		c, err := chunk.ParseExternalKey(cp.userID, cid)
		if err != nil {
			return errors.Wrap(err, "failed to parse chunk key")
		}

		// GetChunkWriteEntries returns entries not only for this day's period, but for all days the chunk covers.
		// Since we process plans "backwards", more recent entries should already be cleaned up.
		ents, err := schema.GetChunkWriteEntries(c.From, c.Through, cp.userID, metricName, metric, cid)
		if err != nil {
			return errors.Wrapf(err, "getting index entries to delete for chunkID=%s", cid)
		}
		for i := range ents {
			// To avoid deleting entries from older tables, we check the table name. This can still delete entries
			// from different buckets in the same table, but we just accept that.
			if tableName == ents[i].TableName {
				batch.Delete(ents[i].TableName, ents[i].HashValue, ents[i].RangeValue)
			}
		}
		indexEntries += len(ents)

		// Label entries in v9, v10 and v11 don't use from/through in encoded values, so instead of the chunk's
		// From/Through values, we only pass start/end of the current day, to avoid deleting entries in other buckets.
		// As "end" is exclusive, we make it inclusive by subtracting 1.
		_, perKeyEnts, err := schema.GetCacheKeysAndLabelWriteEntries(start, end-1, cp.userID, metricName, metric, cid)
		if err != nil {
			return errors.Wrapf(err, "getting index entries to delete for chunkID=%s", cid)
		}
		for _, ents := range perKeyEnts {
			for i := range ents {
				batch.Delete(ents[i].TableName, ents[i].HashValue, ents[i].RangeValue)
			}
			indexEntries += len(ents)
		}

		// Only delete this chunk if it *starts* in the plan's date-period. In general we process plans from most
		// recent to oldest, so if a chunk starts in the current plan's period, its index entries for later periods
		// were already removed by later plans. This breaks when running multiple cleaners, or if the cleaner crashes.
		if c.From >= start {
			chunksToDelete = append(chunksToDelete, cid)
		} else {
			cp.cleaner.deletedChunksSkipped.Inc()
		}
	}

	// Delete index entries first. If we deleted chunks first and the cleaner was then interrupted,
	// the chunks wouldn't be found upon restart, and it would be impossible to clean up their index entries.
	if err := indexClient.BatchWrite(ctx, batch); err != nil {
		level.Warn(cp.log).Log("msg", "failed to delete index entries for series", "series", e.SeriesID, "err", err)
		cp.cleaner.deletedSeriesErrors.Inc()
	} else {
		cp.cleaner.deletedSeries.Inc()
		cp.cleaner.deletedIndexEntries.Add(float64(indexEntries))
	}

	for _, cid := range chunksToDelete {
		if err := chunkClient.DeleteChunk(ctx, cp.userID, cid); err != nil {
			if errors.Is(err, chunk.ErrStorageObjectNotFound) {
				cp.cleaner.deletedChunksMissing.Inc()
			} else {
				level.Warn(cp.log).Log("msg", "failed to delete chunk for series", "series", e.SeriesID, "chunk", cid, "err", err)
				cp.cleaner.deletedChunksErrors.Inc()
			}
		} else {
			cp.cleaner.deletedChunks.Inc()
		}
	}

	return nil
}

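// fetchSingleChunk returns the first chunk from the given IDs that still
// exists in the store, or nil if none of them can be found.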
func fetchSingleChunk(ctx context.Context, userID string, chunkClient chunk.Client, chunksIds []string) (*chunk.Chunk, error) {
	// Fetch single chunk.
	for _, cid := range chunksIds {
		c, err := chunk.ParseExternalKey(userID, cid)
		if err != nil {
			return nil, errors.Wrap(err, "fetching chunks")
		}

		cs, err := chunkClient.GetChunks(ctx, []chunk.Chunk{c})

		if errors.Is(err, chunk.ErrStorageObjectNotFound) {
			continue
		}
		if err != nil {
			return nil, errors.Wrap(err, "fetching chunks")
		}

		if len(cs) > 0 {
			return &cs[0], nil
		}
	}

	return nil, nil
}

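// createClientsForDay finds the schema period that covers dayStart and builds
// the index table name, schema, chunk client and index client for it.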
func (c *Cleaner) createClientsForDay(dayStart time.Time) (string, chunk.BaseSchema, chunk.Client, chunk.IndexClient, error) {
	for ix, s := range c.schemaConfig.Configs {
		if dayStart.Unix() < s.From.Unix() {
			continue
		}

		if ix+1 < len(c.schemaConfig.Configs) && dayStart.Unix() > c.schemaConfig.Configs[ix+1].From.Unix() {
			continue
		}

		tableName := s.IndexTables.TableFor(model.TimeFromUnixNano(dayStart.UnixNano()))

		schema, err := s.CreateSchema()
		if err != nil {
			return "", nil, nil, nil, errors.Wrap(err, "failed to create schema")
		}

		// No registerer, to avoid problems with registering the same metrics multiple times.
		index, err := storage.NewIndexClient(s.IndexType, c.storageConfig, c.schemaConfig, nil)
		if err != nil {
			return "", nil, nil, nil, errors.Wrap(err, "error creating index client")
		}

		objectStoreType := s.ObjectType
		if objectStoreType == "" {
			objectStoreType = s.IndexType
		}

		// No registerer, to avoid problems with registering the same metrics multiple times.
		chunks, err := storage.NewChunkClient(objectStoreType, c.storageConfig, c.schemaConfig, nil)
		if err != nil {
			index.Stop()
			return "", nil, nil, nil, errors.Wrap(err, "error creating object client")
		}

		return tableName, schema, chunks, index, nil
	}

	return "", nil, nil, nil, errors.Errorf("no schema for day %v", dayStart.Format("2006-01-02"))
}