github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/chunk/purger/purger.go

package purger

import (
	"bytes"
	"context"
	"flag"
	"fmt"
	"io/ioutil"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/gogo/protobuf/proto"
	"github.com/grafana/dskit/services"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/common/model"
	"github.com/prometheus/prometheus/promql"
	"github.com/prometheus/prometheus/promql/parser"
	"github.com/weaveworks/common/user"

	"github.com/cortexproject/cortex/pkg/chunk"
	"github.com/cortexproject/cortex/pkg/cortexpb"
	util_log "github.com/cortexproject/cortex/pkg/util/log"
)

const (
	millisecondPerDay           = int64(24 * time.Hour / time.Millisecond)
	statusSuccess               = "success"
	statusFail                  = "fail"
	loadRequestsInterval        = time.Hour
	retryFailedRequestsInterval = 15 * time.Minute
)

type purgerMetrics struct {
	deleteRequestsProcessedTotal         *prometheus.CounterVec
	deleteRequestsChunksSelectedTotal    *prometheus.CounterVec
	deleteRequestsProcessingFailures     *prometheus.CounterVec
	loadPendingRequestsAttempsTotal      *prometheus.CounterVec
	oldestPendingDeleteRequestAgeSeconds prometheus.Gauge
	pendingDeleteRequestsCount           prometheus.Gauge
}

func newPurgerMetrics(r prometheus.Registerer) *purgerMetrics {
	m := purgerMetrics{}

	m.deleteRequestsProcessedTotal = promauto.With(r).NewCounterVec(prometheus.CounterOpts{
		Namespace: "cortex",
		Name:      "purger_delete_requests_processed_total",
		Help:      "Number of delete requests processed per user",
	}, []string{"user"})
	m.deleteRequestsChunksSelectedTotal = promauto.With(r).NewCounterVec(prometheus.CounterOpts{
		Namespace: "cortex",
		Name:      "purger_delete_requests_chunks_selected_total",
		Help:      "Number of chunks selected while building delete plans per user",
	}, []string{"user"})
	m.deleteRequestsProcessingFailures = promauto.With(r).NewCounterVec(prometheus.CounterOpts{
		Namespace: "cortex",
		Name:      "purger_delete_requests_processing_failures_total",
		Help:      "Number of delete requests processing failures per user",
	}, []string{"user"})
	m.loadPendingRequestsAttempsTotal = promauto.With(r).NewCounterVec(prometheus.CounterOpts{
		Namespace: "cortex",
		Name:      "purger_load_pending_requests_attempts_total",
		Help:      "Number of attempts that were made to load pending requests with status",
	}, []string{"status"})
	m.oldestPendingDeleteRequestAgeSeconds = promauto.With(r).NewGauge(prometheus.GaugeOpts{
		Namespace: "cortex",
		Name:      "purger_oldest_pending_delete_request_age_seconds",
		Help:      "Age of the oldest pending delete request in seconds, measured from the end of its cancellation period",
	})
	m.pendingDeleteRequestsCount = promauto.With(r).NewGauge(prometheus.GaugeOpts{
		Namespace: "cortex",
		Name:      "purger_pending_delete_requests_count",
		Help:      "Count of delete requests which are over their cancellation period and have not finished processing yet",
	})

	return &m
}

type deleteRequestWithLogger struct {
	DeleteRequest
	logger log.Logger // logger is initialized with the userID and requestID to add context to every log line generated for this request
}

// Config holds config for the chunks Purger
type Config struct {
	Enable                    bool          `yaml:"enable"`
	NumWorkers                int           `yaml:"num_workers"`
	ObjectStoreType           string        `yaml:"object_store_type"`
	DeleteRequestCancelPeriod time.Duration `yaml:"delete_request_cancel_period"`
}

// RegisterFlags registers CLI flags for Config
func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
	f.BoolVar(&cfg.Enable, "purger.enable", false, "Enable purger to allow deletion of series. Be aware that the delete series feature is still experimental")
	f.IntVar(&cfg.NumWorkers, "purger.num-workers", 2, "Number of workers executing delete plans in parallel")
	f.StringVar(&cfg.ObjectStoreType, "purger.object-store-type", "", "Name of the object store to use for storing delete plans")
	f.DurationVar(&cfg.DeleteRequestCancelPeriod, "purger.delete-request-cancel-period", 24*time.Hour, "Allow cancellation of a delete request for this duration after it is created. Data is deleted only once a delete request is older than this duration. Ideally this should be set to at least 24h.")
}

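// workerJob holds everything a worker needs to execute a single plan of a delete request.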
type workerJob struct {
	planNo          int
	userID          string
	deleteRequestID string
	logger          log.Logger
}

// Purger does the purging of data which is requested to be deleted. Purger only works for chunks.
type Purger struct {
	services.Service

	cfg          Config
	deleteStore  *DeleteStore
	chunkStore   chunk.Store
	objectClient chunk.ObjectClient
	metrics      *purgerMetrics

	executePlansChan chan deleteRequestWithLogger
	workerJobChan    chan workerJob

	// we only allow processing of a single delete request per user at a time since delete requests touching the same chunks could change the chunk IDs of partially deleted chunks
	// and break the purge plan for other requests
	inProcessRequests *inProcessRequestsCollection

	// We do not want to limit pulling new delete requests to a fixed interval, which otherwise would limit the number of delete requests we process per user.
	// While loading delete requests, if we find more requests from a user pending to be processed, we set their ID in usersWithPendingRequests, and
	// when a user's delete request gets processed we check this map to see whether we want to load more requests without waiting for the next ticker to load a new batch.
	usersWithPendingRequests    map[string]struct{}
	usersWithPendingRequestsMtx sync.Mutex
	pullNewRequestsChan         chan struct{}

	pendingPlansCount    map[string]int // per request pending plan count
	pendingPlansCountMtx sync.Mutex

	wg sync.WaitGroup
}

// NewPurger creates a new Purger
func NewPurger(cfg Config, deleteStore *DeleteStore, chunkStore chunk.Store, storageClient chunk.ObjectClient, registerer prometheus.Registerer) (*Purger, error) {
	util_log.WarnExperimentalUse("Delete series API")

	purger := Purger{
		cfg:                      cfg,
		deleteStore:              deleteStore,
		chunkStore:               chunkStore,
		objectClient:             storageClient,
		metrics:                  newPurgerMetrics(registerer),
		pullNewRequestsChan:      make(chan struct{}, 1),
		executePlansChan:         make(chan deleteRequestWithLogger, 50),
		workerJobChan:            make(chan workerJob, 50),
		inProcessRequests:        newInProcessRequestsCollection(),
		usersWithPendingRequests: map[string]struct{}{},
		pendingPlansCount:        map[string]int{},
	}

	purger.Service = services.NewBasicService(purger.init, purger.loop, purger.stop)
	return &purger, nil
}

// init starts the workers and the job scheduler, and then loads in-process delete requests
func (p *Purger) init(ctx context.Context) error {
	for i := 0; i < p.cfg.NumWorkers; i++ {
		p.wg.Add(1)
		go p.worker()
	}

	p.wg.Add(1)
	go p.jobScheduler(ctx)

	return p.loadInprocessDeleteRequests()
}

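// loop periodically pulls new delete requests for building plans and retries failed requests until the service is stopped.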
func (p *Purger) loop(ctx context.Context) error {
	loadRequests := func() {
		status := statusSuccess

		err := p.pullDeleteRequestsToPlanDeletes()
		if err != nil {
			status = statusFail
			level.Error(util_log.Logger).Log("msg", "error pulling delete requests for building plans", "err", err)
		}

		p.metrics.loadPendingRequestsAttempsTotal.WithLabelValues(status).Inc()
	}

	// load requests on startup instead of waiting for the first tick
	loadRequests()

	loadRequestsTicker := time.NewTicker(loadRequestsInterval)
	defer loadRequestsTicker.Stop()

	retryFailedRequestsTicker := time.NewTicker(retryFailedRequestsInterval)
	defer retryFailedRequestsTicker.Stop()

	for {
		select {
		case <-loadRequestsTicker.C:
			loadRequests()
		case <-p.pullNewRequestsChan:
			loadRequests()
		case <-retryFailedRequestsTicker.C:
			p.retryFailedRequests()
		case <-ctx.Done():
			return nil
		}
	}
}

// stop waits until all background tasks have stopped.
func (p *Purger) stop(_ error) error {
	p.wg.Wait()
	return nil
}

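// retryFailedRequests resumes delete requests of users whose last attempt failed, clearing the failure marker before retrying.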
func (p *Purger) retryFailedRequests() {
	userIDsWithFailedRequest := p.inProcessRequests.listUsersWithFailedRequest()

	for _, userID := range userIDsWithFailedRequest {
		deleteRequest := p.inProcessRequests.get(userID)
		if deleteRequest == nil {
			level.Error(util_log.Logger).Log("msg", "expected an in-process delete request", "user", userID)
			continue
		}

		p.inProcessRequests.unsetFailedRequestForUser(userID)
		err := p.resumeStalledRequest(*deleteRequest)
		if err != nil {
			reqWithLogger := makeDeleteRequestWithLogger(*deleteRequest, util_log.Logger)
			level.Error(reqWithLogger.logger).Log("msg", "failed to resume failed request", "err", err)
		}
	}
}

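// workerJobCleanup removes the delete plan of a finished job and, once all plans of the request are done, marks the request as processed
// and triggers loading of further pending requests for the same user, if there are any.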
func (p *Purger) workerJobCleanup(job workerJob) {
	err := p.removeDeletePlan(context.Background(), job.userID, job.deleteRequestID, job.planNo)
	if err != nil {
		level.Error(job.logger).Log("msg", "error removing delete plan",
			"plan_no", job.planNo, "err", err)
		return
	}

	p.pendingPlansCountMtx.Lock()
	p.pendingPlansCount[job.deleteRequestID]--

	if p.pendingPlansCount[job.deleteRequestID] == 0 {
		level.Info(job.logger).Log("msg", "finished execution of all plans, cleaning up and updating status of request")

		err := p.deleteStore.UpdateStatus(context.Background(), job.userID, job.deleteRequestID, StatusProcessed)
		if err != nil {
			level.Error(job.logger).Log("msg", "error updating delete request status to processed", "err", err)
		}

		p.metrics.deleteRequestsProcessedTotal.WithLabelValues(job.userID).Inc()
		delete(p.pendingPlansCount, job.deleteRequestID)
		p.pendingPlansCountMtx.Unlock()

		p.inProcessRequests.remove(job.userID)

		// request loading of more delete requests if
		// - the user has more pending requests and
		// - we do not already have a pending request to load more requests
		p.usersWithPendingRequestsMtx.Lock()
		defer p.usersWithPendingRequestsMtx.Unlock()
		if _, ok := p.usersWithPendingRequests[job.userID]; ok {
			delete(p.usersWithPendingRequests, job.userID)
			select {
			case p.pullNewRequestsChan <- struct{}{}:
				// sent
			default:
				// already sent
			}
		} else if len(p.usersWithPendingRequests) == 0 {
			// there are no pending requests from any of the users, set the oldest pending request age and the number of pending requests to 0
			p.metrics.oldestPendingDeleteRequestAgeSeconds.Set(0)
			p.metrics.pendingDeleteRequestsCount.Set(0)
		}
	} else {
		p.pendingPlansCountMtx.Unlock()
	}
}

// we send all the delete plans to workerJobChan
func (p *Purger) jobScheduler(ctx context.Context) {
	defer p.wg.Done()

	for {
		select {
		case req := <-p.executePlansChan:
			numPlans := numPlans(req.StartTime, req.EndTime)
			level.Info(req.logger).Log("msg", "sending jobs to workers for purging data", "num_jobs", numPlans)

			p.pendingPlansCountMtx.Lock()
			p.pendingPlansCount[req.RequestID] = numPlans
			p.pendingPlansCountMtx.Unlock()

			for i := 0; i < numPlans; i++ {
				p.workerJobChan <- workerJob{planNo: i, userID: req.UserID,
					deleteRequestID: req.RequestID, logger: req.logger}
			}
		case <-ctx.Done():
			close(p.workerJobChan)
			return
		}
	}
}

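// worker executes delete plans received on workerJobChan until the channel is closed.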
func (p *Purger) worker() {
	defer p.wg.Done()

	for job := range p.workerJobChan {
		err := p.executePlan(job.userID, job.deleteRequestID, job.planNo, job.logger)
		if err != nil {
			p.metrics.deleteRequestsProcessingFailures.WithLabelValues(job.userID).Inc()
			level.Error(job.logger).Log("msg", "error executing delete plan",
				"plan_no", job.planNo, "err", err)
			continue
		}

		p.workerJobCleanup(job)
	}
}

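// executePlan loads a single delete plan from the object store and deletes all the chunks and series IDs it references.
// On failure the request is marked as failed for the user so that it gets retried later.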
func (p *Purger) executePlan(userID, requestID string, planNo int, logger log.Logger) (err error) {
	logger = log.With(logger, "plan_no", planNo)

	defer func() {
		if err != nil {
			p.inProcessRequests.setFailedRequestForUser(userID)
		}
	}()

	plan, err := p.getDeletePlan(context.Background(), userID, requestID, planNo)
	if err != nil {
		if err == chunk.ErrStorageObjectNotFound {
			level.Info(logger).Log("msg", "plan not found, must have been executed already")
			// this means the plan was already executed and got removed. Do nothing.
			return nil
		}
		return err
	}

	level.Info(logger).Log("msg", "executing plan")

	ctx := user.InjectOrgID(context.Background(), userID)

	for i := range plan.ChunksGroup {
		level.Debug(logger).Log("msg", "deleting chunks", "labels", plan.ChunksGroup[i].Labels)

		for _, chunkDetails := range plan.ChunksGroup[i].Chunks {
			chunkRef, err := chunk.ParseExternalKey(userID, chunkDetails.ID)
			if err != nil {
				return err
			}

			var partiallyDeletedInterval *model.Interval
			if chunkDetails.PartiallyDeletedInterval != nil {
				partiallyDeletedInterval = &model.Interval{
					Start: model.Time(chunkDetails.PartiallyDeletedInterval.StartTimestampMs),
					End:   model.Time(chunkDetails.PartiallyDeletedInterval.EndTimestampMs),
				}
			}

			err = p.chunkStore.DeleteChunk(ctx, chunkRef.From, chunkRef.Through, chunkRef.UserID,
				chunkDetails.ID, cortexpb.FromLabelAdaptersToLabels(plan.ChunksGroup[i].Labels), partiallyDeletedInterval)
			if err != nil {
				if isMissingChunkErr(err) {
					level.Error(logger).Log("msg", "chunk not found for deletion. We may have already deleted it",
						"chunk_id", chunkDetails.ID)
					continue
				}
				return err
			}
		}

		level.Debug(logger).Log("msg", "deleting series", "labels", plan.ChunksGroup[i].Labels)

		// this is mostly required to clean up series IDs from the series store
		err := p.chunkStore.DeleteSeriesIDs(ctx, model.Time(plan.PlanInterval.StartTimestampMs), model.Time(plan.PlanInterval.EndTimestampMs),
			userID, cortexpb.FromLabelAdaptersToLabels(plan.ChunksGroup[i].Labels))
		if err != nil {
			return err
		}
	}

	level.Info(logger).Log("msg", "finished execution of plan")

	return
}

// we need to load all in-process delete requests on startup to finish them first
func (p *Purger) loadInprocessDeleteRequests() error {
	inprocessRequests, err := p.deleteStore.GetDeleteRequestsByStatus(context.Background(), StatusBuildingPlan)
	if err != nil {
		return err
	}

	requestsWithDeletingStatus, err := p.deleteStore.GetDeleteRequestsByStatus(context.Background(), StatusDeleting)
	if err != nil {
		return err
	}

	inprocessRequests = append(inprocessRequests, requestsWithDeletingStatus...)

	for i := range inprocessRequests {
		deleteRequest := inprocessRequests[i]
		p.inProcessRequests.set(deleteRequest.UserID, &deleteRequest)
		req := makeDeleteRequestWithLogger(deleteRequest, util_log.Logger)

		level.Info(req.logger).Log("msg", "resuming in-process delete request", "status", deleteRequest.Status)
		err = p.resumeStalledRequest(deleteRequest)
		if err != nil {
			level.Error(req.logger).Log("msg", "failed to resume stalled request", "err", err)
		}
	}

	return nil
}

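// resumeStalledRequest continues processing a delete request from the status it was left in:
// a request still in StatusBuildingPlan gets its plans rebuilt first, and a request in StatusDeleting is sent for execution.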
func (p *Purger) resumeStalledRequest(deleteRequest DeleteRequest) error {
	req := makeDeleteRequestWithLogger(deleteRequest, util_log.Logger)

	if deleteRequest.Status == StatusBuildingPlan {
		err := p.buildDeletePlan(req)
		if err != nil {
			p.metrics.deleteRequestsProcessingFailures.WithLabelValues(deleteRequest.UserID).Inc()
			return errors.Wrap(err, "failed to build delete plan")
		}

		deleteRequest.Status = StatusDeleting
	}

	if deleteRequest.Status == StatusDeleting {
		level.Info(req.logger).Log("msg", "sending delete request for execution")
		p.executePlansChan <- req
	}

	return nil
}

// pullDeleteRequestsToPlanDeletes pulls delete requests which do not have their delete plans built yet and sends them for building delete plans.
// After picking up a delete request for building plans, it updates the request's status to StatusBuildingPlan to avoid picking it up again next time.
func (p *Purger) pullDeleteRequestsToPlanDeletes() error {
	deleteRequests, err := p.deleteStore.GetDeleteRequestsByStatus(context.Background(), StatusReceived)
	if err != nil {
		return err
	}

	pendingDeleteRequestsCount := p.inProcessRequests.len()
	now := model.Now()
	oldestPendingRequestCreatedAt := model.Time(0)

	// requests which are still being processed are also considered pending
	if pendingDeleteRequestsCount != 0 {
		oldestInProcessRequest := p.inProcessRequests.getOldest()
		if oldestInProcessRequest != nil {
			oldestPendingRequestCreatedAt = oldestInProcessRequest.CreatedAt
		}
	}

	for i := range deleteRequests {
		deleteRequest := deleteRequests[i]

		// adding an extra minute here to avoid a race between cancellation of a request and picking up the request for processing
		if deleteRequest.CreatedAt.Add(p.cfg.DeleteRequestCancelPeriod).Add(time.Minute).After(model.Now()) {
			continue
		}

		pendingDeleteRequestsCount++
		if oldestPendingRequestCreatedAt == 0 || deleteRequest.CreatedAt.Before(oldestPendingRequestCreatedAt) {
			oldestPendingRequestCreatedAt = deleteRequest.CreatedAt
		}

		if inprocessDeleteRequest := p.inProcessRequests.get(deleteRequest.UserID); inprocessDeleteRequest != nil {
			p.usersWithPendingRequestsMtx.Lock()
			p.usersWithPendingRequests[deleteRequest.UserID] = struct{}{}
			p.usersWithPendingRequestsMtx.Unlock()

			level.Debug(util_log.Logger).Log("msg", "skipping delete request processing for now since another request from the same user is already in process",
				"inprocess_request_id", inprocessDeleteRequest.RequestID,
				"skipped_request_id", deleteRequest.RequestID, "user_id", deleteRequest.UserID)
			continue
		}

		err = p.deleteStore.UpdateStatus(context.Background(), deleteRequest.UserID, deleteRequest.RequestID, StatusBuildingPlan)
		if err != nil {
			return err
		}

		deleteRequest.Status = StatusBuildingPlan
		p.inProcessRequests.set(deleteRequest.UserID, &deleteRequest)
		req := makeDeleteRequestWithLogger(deleteRequest, util_log.Logger)

		level.Info(req.logger).Log("msg", "building plan for a new delete request")

		err := p.buildDeletePlan(req)
		if err != nil {
			p.metrics.deleteRequestsProcessingFailures.WithLabelValues(deleteRequest.UserID).Inc()

			// We intentionally do not remove this delete request from inProcessRequests, to make sure
			// we do not move multiple delete requests of the same user into the deletion process at once.
			// None of the other delete requests from the user will be considered for processing until this one succeeds.
			level.Error(req.logger).Log("msg", "error building delete plan", "err", err)
			return err
		}

		level.Info(req.logger).Log("msg", "sending delete request for execution")
		p.executePlansChan <- req
	}

	// track the age of the oldest delete request, measured from the end of its cancellation period
	oldestPendingRequestAge := time.Duration(0)
	if oldestPendingRequestCreatedAt != 0 {
		oldestPendingRequestAge = now.Sub(oldestPendingRequestCreatedAt.Add(p.cfg.DeleteRequestCancelPeriod))
	}
	p.metrics.oldestPendingDeleteRequestAgeSeconds.Set(float64(oldestPendingRequestAge / time.Second))
	p.metrics.pendingDeleteRequestsCount.Set(float64(pendingDeleteRequestsCount))

	return nil
}

// buildDeletePlan builds per-day delete plans for the given delete request.
// A day's plan includes the chunk IDs and labels of all the chunks which are supposed to be deleted.
// Chunks are grouped together by labels to avoid storing labels repetitively.
// After building the delete plans it updates the status of the delete request to StatusDeleting and sends it for execution.
func (p *Purger) buildDeletePlan(req deleteRequestWithLogger) (err error) {
	ctx := context.Background()
	ctx = user.InjectOrgID(ctx, req.UserID)

	defer func() {
		if err != nil {
			p.inProcessRequests.setFailedRequestForUser(req.UserID)
		} else {
			req.Status = StatusDeleting
			p.inProcessRequests.set(req.UserID, &req.DeleteRequest)
		}
	}()

	perDayTimeRange := splitByDay(req.StartTime, req.EndTime)
	level.Info(req.logger).Log("msg", "building delete plan", "num_plans", len(perDayTimeRange))

	plans := make([][]byte, len(perDayTimeRange))
	includedChunkIDs := map[string]struct{}{}

	for i, planRange := range perDayTimeRange {
		chunksGroups := []ChunksGroup{}

		for _, selector := range req.Selectors {
			matchers, err := parser.ParseMetricSelector(selector)
			if err != nil {
				return err
			}

			chunks, err := p.chunkStore.Get(ctx, req.UserID, planRange.Start, planRange.End, matchers...)
			if err != nil {
				return err
			}

			var cg []ChunksGroup
			cg, includedChunkIDs = groupChunks(chunks, req.StartTime, req.EndTime, includedChunkIDs)

			if len(cg) != 0 {
				chunksGroups = append(chunksGroups, cg...)
			}
		}

		plan := DeletePlan{
			PlanInterval: &Interval{
				StartTimestampMs: int64(planRange.Start),
				EndTimestampMs:   int64(planRange.End),
			},
			ChunksGroup: chunksGroups,
		}

		pb, err := proto.Marshal(&plan)
		if err != nil {
			return err
		}

		plans[i] = pb
	}

	err = p.putDeletePlans(ctx, req.UserID, req.RequestID, plans)
	if err != nil {
		return
	}

	err = p.deleteStore.UpdateStatus(ctx, req.UserID, req.RequestID, StatusDeleting)
	if err != nil {
		return
	}

	p.metrics.deleteRequestsChunksSelectedTotal.WithLabelValues(req.UserID).Add(float64(len(includedChunkIDs)))

	level.Info(req.logger).Log("msg", "built delete plans", "num_plans", len(perDayTimeRange))

	return
}

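// putDeletePlans stores the marshalled per-day delete plans in the object store, one object per plan.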
func (p *Purger) putDeletePlans(ctx context.Context, userID, requestID string, plans [][]byte) error {
	for i, plan := range plans {
		objectKey := buildObjectKeyForPlan(userID, requestID, i)

		err := p.objectClient.PutObject(ctx, objectKey, bytes.NewReader(plan))
		if err != nil {
			return err
		}
	}

	return nil
}

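// getDeletePlan fetches and unmarshals a single delete plan from the object store.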
func (p *Purger) getDeletePlan(ctx context.Context, userID, requestID string, planNo int) (*DeletePlan, error) {
	objectKey := buildObjectKeyForPlan(userID, requestID, planNo)

	readCloser, err := p.objectClient.GetObject(ctx, objectKey)
	if err != nil {
		return nil, err
	}

	defer readCloser.Close()

	buf, err := ioutil.ReadAll(readCloser)
	if err != nil {
		return nil, err
	}

	var plan DeletePlan
	err = proto.Unmarshal(buf, &plan)
	if err != nil {
		return nil, err
	}

	return &plan, nil
}

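// removeDeletePlan deletes the stored plan object once the plan has been executed.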
func (p *Purger) removeDeletePlan(ctx context.Context, userID, requestID string, planNo int) error {
	objectKey := buildObjectKeyForPlan(userID, requestID, planNo)
	return p.objectClient.DeleteObject(ctx, objectKey)
}

// splitByDay returns one interval per plan, splitting the requested time range at day boundaries.
func splitByDay(start, end model.Time) []model.Interval {
	numOfDays := numPlans(start, end)

	perDayTimeRange := make([]model.Interval, numOfDays)
	startOfNextDay := model.Time(((int64(start) / millisecondPerDay) + 1) * millisecondPerDay)
	perDayTimeRange[0] = model.Interval{Start: start, End: startOfNextDay - 1}

	for i := 1; i < numOfDays; i++ {
		interval := model.Interval{Start: startOfNextDay}
		startOfNextDay += model.Time(millisecondPerDay)
		interval.End = startOfNextDay - 1
		perDayTimeRange[i] = interval
	}

	perDayTimeRange[numOfDays-1].End = end

	return perDayTimeRange
}

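// numPlans returns the number of per-day plans needed to cover [start, end] after rounding start down and end up to day boundaries.
// For example, a request from 2020-01-01T15:00 to 2020-01-03T06:00 touches three calendar days and therefore needs 3 plans.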
func numPlans(start, end model.Time) int {
	// rounding down start to the start of the day
	if start%model.Time(millisecondPerDay) != 0 {
		start = model.Time((int64(start) / millisecondPerDay) * millisecondPerDay)
	}

	// rounding up end to the end of the day
	if end%model.Time(millisecondPerDay) != 0 {
		end = model.Time((int64(end)/millisecondPerDay)*millisecondPerDay + millisecondPerDay)
	}

	return int(int64(end-start) / millisecondPerDay)
}

// groupChunks groups chunks together by unique label sets, i.e. all the chunks with the same labels are stored in one group.
// Chunk details are stored in a group per unique label set to avoid storing the labels repetitively for each chunk.
func groupChunks(chunks []chunk.Chunk, deleteFrom, deleteThrough model.Time, includedChunkIDs map[string]struct{}) ([]ChunksGroup, map[string]struct{}) {
	metricToChunks := make(map[string]ChunksGroup)

	for _, chk := range chunks {
		chunkID := chk.ExternalKey()

		if _, ok := includedChunkIDs[chunkID]; ok {
			continue
		}
		// chunk.Metric is assumed to be sorted, which should give the same value from String() for the same series.
		// If it stops being sorted then, in the worst case, we would lose the benefit of grouping chunks to avoid storing labels repetitively.
		metricString := chk.Metric.String()
		group, ok := metricToChunks[metricString]
		if !ok {
			group = ChunksGroup{Labels: cortexpb.FromLabelsToLabelAdapters(chk.Metric)}
		}

		chunkDetails := ChunkDetails{ID: chunkID}

		if deleteFrom > chk.From || deleteThrough < chk.Through {
			partiallyDeletedInterval := Interval{StartTimestampMs: int64(chk.From), EndTimestampMs: int64(chk.Through)}

			if deleteFrom > chk.From {
				partiallyDeletedInterval.StartTimestampMs = int64(deleteFrom)
			}

			if deleteThrough < chk.Through {
				partiallyDeletedInterval.EndTimestampMs = int64(deleteThrough)
			}
			chunkDetails.PartiallyDeletedInterval = &partiallyDeletedInterval
		}

		group.Chunks = append(group.Chunks, chunkDetails)
		includedChunkIDs[chunkID] = struct{}{}
		metricToChunks[metricString] = group
	}

	chunksGroups := make([]ChunksGroup, 0, len(metricToChunks))

	for _, group := range metricToChunks {
		chunksGroups = append(chunksGroups, group)
	}

	return chunksGroups, includedChunkIDs
}

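// isMissingChunkErr returns true if the error indicates the chunk is already gone from the store, possibly wrapped in promql.ErrStorage.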
func isMissingChunkErr(err error) bool {
	if err == chunk.ErrStorageObjectNotFound {
		return true
	}
	if promqlStorageErr, ok := err.(promql.ErrStorage); ok && promqlStorageErr.Err == chunk.ErrStorageObjectNotFound {
		return true
	}

	return false
}

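// buildObjectKeyForPlan builds the object store key for a plan in the form <userID>:<requestID>/<planNo>.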
func buildObjectKeyForPlan(userID, requestID string, planNo int) string {
	return fmt.Sprintf("%s:%s/%d", userID, requestID, planNo)
}

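// makeDeleteRequestWithLogger wraps a DeleteRequest with a logger carrying its user ID and request ID.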
func makeDeleteRequestWithLogger(deleteRequest DeleteRequest, l log.Logger) deleteRequestWithLogger {
	logger := log.With(l, "user_id", deleteRequest.UserID, "request_id", deleteRequest.RequestID)
	return deleteRequestWithLogger{deleteRequest, logger}
}

// inProcessRequestsCollection stores the DeleteRequests which are in process for each user.
// Currently we only allow processing of one delete request per user, so it stores a single DeleteRequest per user.
type inProcessRequestsCollection struct {
	requests                map[string]*DeleteRequest
	usersWithFailedRequests map[string]struct{}
	mtx                     sync.RWMutex
}

func newInProcessRequestsCollection() *inProcessRequestsCollection {
	return &inProcessRequestsCollection{
		requests:                map[string]*DeleteRequest{},
		usersWithFailedRequests: map[string]struct{}{},
	}
}

func (i *inProcessRequestsCollection) set(userID string, request *DeleteRequest) {
	i.mtx.Lock()
	defer i.mtx.Unlock()

	i.requests[userID] = request
}

func (i *inProcessRequestsCollection) get(userID string) *DeleteRequest {
	i.mtx.RLock()
	defer i.mtx.RUnlock()

	return i.requests[userID]
}

func (i *inProcessRequestsCollection) remove(userID string) {
	i.mtx.Lock()
	defer i.mtx.Unlock()

	delete(i.requests, userID)
}

func (i *inProcessRequestsCollection) len() int {
	i.mtx.RLock()
	defer i.mtx.RUnlock()

	return len(i.requests)
}

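// getOldest returns the in-process delete request with the earliest creation time, or nil if there are none.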
func (i *inProcessRequestsCollection) getOldest() *DeleteRequest {
	i.mtx.RLock()
	defer i.mtx.RUnlock()

	var oldestRequest *DeleteRequest
	for _, request := range i.requests {
		if oldestRequest == nil || request.CreatedAt.Before(oldestRequest.CreatedAt) {
			oldestRequest = request
		}
	}

	return oldestRequest
}

func (i *inProcessRequestsCollection) setFailedRequestForUser(userID string) {
	i.mtx.Lock()
	defer i.mtx.Unlock()

	i.usersWithFailedRequests[userID] = struct{}{}
}

func (i *inProcessRequestsCollection) unsetFailedRequestForUser(userID string) {
	i.mtx.Lock()
	defer i.mtx.Unlock()

	delete(i.usersWithFailedRequests, userID)
}

func (i *inProcessRequestsCollection) listUsersWithFailedRequest() []string {
	i.mtx.RLock()
	defer i.mtx.RUnlock()

	userIDs := make([]string, 0, len(i.usersWithFailedRequests))
	for userID := range i.usersWithFailedRequests {
		userIDs = append(userIDs, userID)
	}

	return userIDs
}