github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/testexporter/correctness/delete_series.go (about)

     1  package correctness
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"flag"
     7  	"fmt"
     8  	"net/http"
     9  	"net/url"
    10  	"path"
    11  	"sync"
    12  	"time"
    13  
    14  	"github.com/go-kit/log/level"
    15  	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
    16  	"github.com/prometheus/client_golang/prometheus"
    17  	"github.com/prometheus/client_golang/prometheus/promauto"
    18  	"github.com/weaveworks/common/user"
    19  
    20  	util_log "github.com/cortexproject/cortex/pkg/util/log"
    21  	"github.com/cortexproject/cortex/pkg/util/spanlogger"
    22  )
    23  
    24  const deleteRequestPath = "/api/v1/admin/tsdb/delete_series"
    25  
    26  var (
    27  	deleteRequestCreationAttemptsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
    28  		Namespace: namespace,
    29  		Subsystem: subsystem,
    30  		Name:      "delete_requests_creation_attempts_total",
    31  		Help:      "Total number of delete requests creation attempts with status",
    32  	}, []string{"status"})
    33  	deleteRequestVerificationsSkippedTotal = promauto.NewCounterVec(prometheus.CounterOpts{
    34  		Namespace: namespace,
    35  		Subsystem: subsystem,
    36  		Name:      "delete_request_verification_skipped_total",
    37  		Help:      "Total number of queries verifying delete series that were skipped",
    38  	}, []string{"test_name"})
    39  )
    40  
    41  type DeleteSeriesTestConfig struct {
    42  	deleteRequestCreationInterval time.Duration
    43  	deleteDataForRange            time.Duration
    44  	timeQueryStart                TimeValue
    45  	durationQuerySince            time.Duration
    46  	purgerAddr                    string
    47  
    48  	PrometheusAddr string
    49  	ExtraSelectors string
    50  	UserID         string
    51  }
    52  
    53  func (cfg *DeleteSeriesTestConfig) RegisterFlags(f *flag.FlagSet) {
    54  	f.DurationVar(&cfg.deleteRequestCreationInterval, "delete-request-creation-interval", 5*time.Minute, "The interval at which delete request should be sent.")
    55  	f.DurationVar(&cfg.deleteDataForRange, "delete-data-for-range", 2*time.Minute, "Time range for which data is deleted.")
    56  	f.StringVar(&cfg.purgerAddr, "purger-addr", "", "Purger address to send delete requests. Keep empty to use same address as prometheus-address.")
    57  
    58  	// By default, we only query for values from when this process started
    59  	cfg.timeQueryStart = NewTimeValue(time.Now())
    60  	f.Var(&cfg.timeQueryStart, "delete-series-test.test-query-start", "Minimum start date for queries")
    61  	f.DurationVar(&cfg.durationQuerySince, "delete-series-test.test-query-since", 0, "Duration in the past to test.  Overrides -test-query-start")
    62  }
    63  
    64  // DeleteSeriesTest would keep deleting data for configured duration at configured interval.
    65  // Test method would check whether we are getting expected data by eliminating deleted samples while non deleted ones stays untouched.
    66  // For simplification it would not test samples from the start time of last sent delete request and just treat it as passed.
    67  type DeleteSeriesTest struct {
    68  	Case
    69  	cfg                            DeleteSeriesTestConfig
    70  	commonTestConfig               CommonTestConfig
    71  	lastDeleteRequestInterval      interval
    72  	lastDeleteRequestIntervalMutex sync.RWMutex
    73  	quit                           chan struct{}
    74  	wg                             sync.WaitGroup
    75  }
    76  
    77  func NewDeleteSeriesTest(name string, f func(time.Time) float64, cfg DeleteSeriesTestConfig, commonTestConfig CommonTestConfig) Case {
    78  	commonTestConfig.timeQueryStart = cfg.timeQueryStart
    79  	commonTestConfig.durationQuerySince = cfg.durationQuerySince
    80  	test := DeleteSeriesTest{
    81  		Case:             NewSimpleTestCase(name, f, commonTestConfig),
    82  		cfg:              cfg,
    83  		commonTestConfig: commonTestConfig,
    84  		quit:             make(chan struct{}),
    85  	}
    86  
    87  	if cfg.purgerAddr == "" {
    88  		test.cfg.purgerAddr = test.cfg.PrometheusAddr
    89  	}
    90  
    91  	test.wg.Add(1)
    92  	go test.sendDeleteRequestLoop()
    93  	return &test
    94  }
    95  
    96  func (d *DeleteSeriesTest) Stop() {
    97  	close(d.quit)
    98  	d.wg.Wait()
    99  }
   100  
   101  func (d *DeleteSeriesTest) sendDeleteRequestLoop() {
   102  	defer d.wg.Done()
   103  	// send a delete request as soon as we start to avoid missing creation of delete request across restarts.
   104  	err := d.sendDeleteRequest()
   105  	if err != nil {
   106  		level.Error(util_log.Logger).Log("msg", "error sending delete request", "error", err)
   107  	}
   108  
   109  	t := time.NewTicker(d.cfg.deleteRequestCreationInterval)
   110  	defer t.Stop()
   111  
   112  	for {
   113  		select {
   114  		case <-t.C:
   115  			err := d.sendDeleteRequest()
   116  			if err != nil {
   117  				level.Error(util_log.Logger).Log("msg", "error sending delete request", "error", err)
   118  			}
   119  		case <-d.quit:
   120  			return
   121  		}
   122  	}
   123  }
   124  
   125  func (d *DeleteSeriesTest) Test(ctx context.Context, client v1.API, selectors string, start time.Time, duration time.Duration) (bool, error) {
   126  	log := spanlogger.FromContext(ctx)
   127  	queryInterval := interval{start: start.Add(-duration), end: start}
   128  
   129  	d.lastDeleteRequestIntervalMutex.RLock()
   130  	defer d.lastDeleteRequestIntervalMutex.RUnlock()
   131  
   132  	// we do not want to query data after the start time of last delete request sent to simplify things.
   133  	lastDeleteRequestInterval := d.lastDeleteRequestInterval
   134  	if !queryInterval.end.Before(lastDeleteRequestInterval.start) {
   135  		deleteRequestVerificationsSkippedTotal.WithLabelValues(d.Name()).Inc()
   136  		level.Info(log).Log("msg", fmt.Sprintf("skipping test for %d to %d requesting samples after last sent delete request's start time %d",
   137  			start.Add(-duration).Unix(), start.Unix(), lastDeleteRequestInterval.end.Unix()))
   138  		return true, nil
   139  	}
   140  
   141  	pairs, err := d.Query(ctx, client, selectors, start, duration)
   142  	if err != nil {
   143  		level.Error(log).Log("err", err)
   144  		return false, err
   145  	}
   146  
   147  	nonDeletedIntervals := d.getNonDeletedIntervals(queryInterval)
   148  	if len(nonDeletedIntervals) == 0 {
   149  		// we are querying data covered completed by deleted interval so there should not be any sample pairs returned by the query.
   150  		if len(pairs) != 0 {
   151  			return false, errors.New("samples should be 0")
   152  		}
   153  		return true, nil
   154  	}
   155  
   156  	level.Debug(log).Log("start", start.Unix(), "query-start", queryInterval.start.Unix(),
   157  		"query-end", queryInterval.end.Unix(), "non-deleted-intervals")
   158  
   159  	verifyPairsFrom, verifyPairsTo := 0, 0
   160  	for _, nonDeletedInterval := range nonDeletedIntervals {
   161  		for ; verifyPairsTo < len(pairs); verifyPairsTo++ {
   162  			pair := pairs[verifyPairsTo]
   163  			// do not fail the test if difference is just by couple of ms or ns.
   164  			if pair.Timestamp.Time().Before(nonDeletedInterval.start) && pair.Timestamp.Unix() != nonDeletedInterval.start.Unix() {
   165  				level.Error(log).Log("msg", "unexpected sample", "timestamp", pair.Timestamp.Unix(), "non-deleted-interval.start", nonDeletedInterval.start.Unix(),
   166  					"non-deleted-interval.end", nonDeletedInterval.end.Unix())
   167  				return false, nil
   168  			} else if pair.Timestamp.Time().After(nonDeletedInterval.end) {
   169  				break
   170  			}
   171  		}
   172  
   173  		passed := verifySamples(spanlogger.FromContext(ctx), d, pairs[verifyPairsFrom:verifyPairsTo], nonDeletedInterval.end.Sub(nonDeletedInterval.start), d.commonTestConfig)
   174  		if !passed {
   175  			verifyingPairs := pairs[verifyPairsFrom:verifyPairsTo]
   176  			if len(verifyingPairs) == 0 {
   177  				level.Error(log).Log("msg", fmt.Sprintf("expected samples from %d to %d but got 0 samples", nonDeletedInterval.start.Unix(),
   178  					nonDeletedInterval.end.Unix()), "query start", start.Unix(), "query duration", duration)
   179  			} else {
   180  				level.Error(log).Log("msg", "failed to verify samples batch", "query start", start.Unix(), "query duration", duration,
   181  					"batch length", len(verifyingPairs),
   182  					"batch duration", nonDeletedInterval.end.Sub(nonDeletedInterval.start), "batch-start", verifyingPairs[0].Timestamp.Unix(),
   183  					"batch-end", verifyingPairs[len(verifyingPairs)-1].Timestamp.Unix())
   184  			}
   185  			return false, nil
   186  		}
   187  
   188  		verifyPairsFrom = verifyPairsTo
   189  	}
   190  
   191  	return true, nil
   192  }
   193  
   194  func (d *DeleteSeriesTest) sendDeleteRequest() (err error) {
   195  	// data is deleted by slicing the time by deleteRequestCreationInterval from 0 time i.e beginning of epoch
   196  	// and doing deletion for last deleteDataForRange duration at the end of that slice.
   197  	endTime := time.Now().Truncate(d.cfg.deleteRequestCreationInterval)
   198  	startTime := endTime.Add(-d.cfg.deleteDataForRange)
   199  	metricName := prometheus.BuildFQName(namespace, subsystem, d.Name())
   200  	selectors := fmt.Sprintf("%s{%s}", metricName, d.cfg.ExtraSelectors)
   201  
   202  	defer func() {
   203  		status := success
   204  		if err != nil {
   205  			status = fail
   206  		}
   207  		deleteRequestCreationAttemptsTotal.WithLabelValues(status).Inc()
   208  	}()
   209  
   210  	baseURL, err := url.Parse(d.cfg.purgerAddr)
   211  	if err != nil {
   212  		return
   213  	}
   214  
   215  	baseURL.Path = path.Join(baseURL.Path, deleteRequestPath)
   216  
   217  	query := baseURL.Query()
   218  	query.Add("match[]", selectors)
   219  	query.Add("start", fmt.Sprint(startTime.Unix()))
   220  	query.Add("end", fmt.Sprint(endTime.Unix()))
   221  	baseURL.RawQuery = query.Encode()
   222  
   223  	r, err := http.NewRequest("POST", baseURL.String(), nil)
   224  	if err != nil {
   225  		return err
   226  	}
   227  
   228  	if d.cfg.UserID != "" {
   229  		r = r.WithContext(user.InjectOrgID(r.Context(), d.cfg.UserID))
   230  		err = user.InjectOrgIDIntoHTTPRequest(r.Context(), r)
   231  		if err != nil {
   232  			return err
   233  		}
   234  	}
   235  
   236  	level.Error(util_log.Logger).Log("msg", "sending delete request", "selector", selectors, "starttime", startTime, "endtime", endTime)
   237  	resp, err := http.DefaultClient.Do(r)
   238  	if err != nil {
   239  		return
   240  	}
   241  
   242  	if resp.StatusCode != 204 {
   243  		return fmt.Errorf("unexpected status code %d", resp.StatusCode)
   244  	}
   245  
   246  	d.lastDeleteRequestIntervalMutex.Lock()
   247  	defer d.lastDeleteRequestIntervalMutex.Unlock()
   248  
   249  	d.lastDeleteRequestInterval = interval{startTime, endTime}
   250  
   251  	return
   252  }
   253  
   254  func (d *DeleteSeriesTest) getNonDeletedIntervals(queryInterval interval) []interval {
   255  	intervalToProcess := queryInterval
   256  	var nonDeletedIntervals []interval
   257  
   258  	// build first deleted interval
   259  	deletedIntervalEnd := queryInterval.start.Truncate(d.cfg.deleteRequestCreationInterval)
   260  	deletedIntervalStart := deletedIntervalEnd.Add(-d.cfg.deleteDataForRange)
   261  
   262  	// first deleted interval could be out of range so try next intervals to find first relevant interval.
   263  	for !deletedIntervalStart.After(intervalToProcess.start) {
   264  		deletedIntervalStart = deletedIntervalStart.Add(d.cfg.deleteRequestCreationInterval)
   265  		if deletedIntervalEnd.Add(1).After(intervalToProcess.start) {
   266  			intervalToProcess.start = deletedIntervalEnd.Add(1)
   267  		}
   268  		deletedIntervalEnd = deletedIntervalEnd.Add(d.cfg.deleteRequestCreationInterval)
   269  	}
   270  
   271  	// keep building non-deleted intervals with each being from intervalToProcess.start to min(deletedIntervalStart.Start-1, intervalToProcess.end)
   272  	for !deletedIntervalStart.After(queryInterval.end) {
   273  		nonDeletedInterval := interval{intervalToProcess.start, deletedIntervalStart.Add(-1)}
   274  		if nonDeletedInterval.end.After(intervalToProcess.end) {
   275  			nonDeletedInterval.end = intervalToProcess.end
   276  		}
   277  		nonDeletedIntervals = append(nonDeletedIntervals, nonDeletedInterval)
   278  		intervalToProcess.start = deletedIntervalEnd.Add(1)
   279  
   280  		// build next deleted interval
   281  		deletedIntervalStart = deletedIntervalStart.Add(d.cfg.deleteRequestCreationInterval)
   282  		deletedIntervalEnd = deletedIntervalEnd.Add(d.cfg.deleteRequestCreationInterval)
   283  	}
   284  
   285  	// see if we have some interval left in intervalToProcess, add it if so.
   286  	if intervalToProcess.start.Before(intervalToProcess.end) {
   287  		nonDeletedIntervals = append(nonDeletedIntervals, intervalToProcess)
   288  	}
   289  
   290  	return nonDeletedIntervals
   291  }
   292  
   293  func (d *DeleteSeriesTest) MinQueryTime() time.Time {
   294  	return calculateMinQueryTime(d.cfg.durationQuerySince, d.cfg.timeQueryStart)
   295  }
   296  
   297  type interval struct {
   298  	start, end time.Time
   299  }