github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/testexporter/correctness/delete_series.go (about) 1 package correctness 2 3 import ( 4 "context" 5 "errors" 6 "flag" 7 "fmt" 8 "net/http" 9 "net/url" 10 "path" 11 "sync" 12 "time" 13 14 "github.com/go-kit/log/level" 15 v1 "github.com/prometheus/client_golang/api/prometheus/v1" 16 "github.com/prometheus/client_golang/prometheus" 17 "github.com/prometheus/client_golang/prometheus/promauto" 18 "github.com/weaveworks/common/user" 19 20 util_log "github.com/cortexproject/cortex/pkg/util/log" 21 "github.com/cortexproject/cortex/pkg/util/spanlogger" 22 ) 23 24 const deleteRequestPath = "/api/v1/admin/tsdb/delete_series" 25 26 var ( 27 deleteRequestCreationAttemptsTotal = promauto.NewCounterVec(prometheus.CounterOpts{ 28 Namespace: namespace, 29 Subsystem: subsystem, 30 Name: "delete_requests_creation_attempts_total", 31 Help: "Total number of delete requests creation attempts with status", 32 }, []string{"status"}) 33 deleteRequestVerificationsSkippedTotal = promauto.NewCounterVec(prometheus.CounterOpts{ 34 Namespace: namespace, 35 Subsystem: subsystem, 36 Name: "delete_request_verification_skipped_total", 37 Help: "Total number of queries verifying delete series that were skipped", 38 }, []string{"test_name"}) 39 ) 40 41 type DeleteSeriesTestConfig struct { 42 deleteRequestCreationInterval time.Duration 43 deleteDataForRange time.Duration 44 timeQueryStart TimeValue 45 durationQuerySince time.Duration 46 purgerAddr string 47 48 PrometheusAddr string 49 ExtraSelectors string 50 UserID string 51 } 52 53 func (cfg *DeleteSeriesTestConfig) RegisterFlags(f *flag.FlagSet) { 54 f.DurationVar(&cfg.deleteRequestCreationInterval, "delete-request-creation-interval", 5*time.Minute, "The interval at which delete request should be sent.") 55 f.DurationVar(&cfg.deleteDataForRange, "delete-data-for-range", 2*time.Minute, "Time range for which data is deleted.") 56 f.StringVar(&cfg.purgerAddr, "purger-addr", "", "Purger address to send delete requests. Keep empty to use same address as prometheus-address.") 57 58 // By default, we only query for values from when this process started 59 cfg.timeQueryStart = NewTimeValue(time.Now()) 60 f.Var(&cfg.timeQueryStart, "delete-series-test.test-query-start", "Minimum start date for queries") 61 f.DurationVar(&cfg.durationQuerySince, "delete-series-test.test-query-since", 0, "Duration in the past to test. Overrides -test-query-start") 62 } 63 64 // DeleteSeriesTest would keep deleting data for configured duration at configured interval. 65 // Test method would check whether we are getting expected data by eliminating deleted samples while non deleted ones stays untouched. 66 // For simplification it would not test samples from the start time of last sent delete request and just treat it as passed. 67 type DeleteSeriesTest struct { 68 Case 69 cfg DeleteSeriesTestConfig 70 commonTestConfig CommonTestConfig 71 lastDeleteRequestInterval interval 72 lastDeleteRequestIntervalMutex sync.RWMutex 73 quit chan struct{} 74 wg sync.WaitGroup 75 } 76 77 func NewDeleteSeriesTest(name string, f func(time.Time) float64, cfg DeleteSeriesTestConfig, commonTestConfig CommonTestConfig) Case { 78 commonTestConfig.timeQueryStart = cfg.timeQueryStart 79 commonTestConfig.durationQuerySince = cfg.durationQuerySince 80 test := DeleteSeriesTest{ 81 Case: NewSimpleTestCase(name, f, commonTestConfig), 82 cfg: cfg, 83 commonTestConfig: commonTestConfig, 84 quit: make(chan struct{}), 85 } 86 87 if cfg.purgerAddr == "" { 88 test.cfg.purgerAddr = test.cfg.PrometheusAddr 89 } 90 91 test.wg.Add(1) 92 go test.sendDeleteRequestLoop() 93 return &test 94 } 95 96 func (d *DeleteSeriesTest) Stop() { 97 close(d.quit) 98 d.wg.Wait() 99 } 100 101 func (d *DeleteSeriesTest) sendDeleteRequestLoop() { 102 defer d.wg.Done() 103 // send a delete request as soon as we start to avoid missing creation of delete request across restarts. 104 err := d.sendDeleteRequest() 105 if err != nil { 106 level.Error(util_log.Logger).Log("msg", "error sending delete request", "error", err) 107 } 108 109 t := time.NewTicker(d.cfg.deleteRequestCreationInterval) 110 defer t.Stop() 111 112 for { 113 select { 114 case <-t.C: 115 err := d.sendDeleteRequest() 116 if err != nil { 117 level.Error(util_log.Logger).Log("msg", "error sending delete request", "error", err) 118 } 119 case <-d.quit: 120 return 121 } 122 } 123 } 124 125 func (d *DeleteSeriesTest) Test(ctx context.Context, client v1.API, selectors string, start time.Time, duration time.Duration) (bool, error) { 126 log := spanlogger.FromContext(ctx) 127 queryInterval := interval{start: start.Add(-duration), end: start} 128 129 d.lastDeleteRequestIntervalMutex.RLock() 130 defer d.lastDeleteRequestIntervalMutex.RUnlock() 131 132 // we do not want to query data after the start time of last delete request sent to simplify things. 133 lastDeleteRequestInterval := d.lastDeleteRequestInterval 134 if !queryInterval.end.Before(lastDeleteRequestInterval.start) { 135 deleteRequestVerificationsSkippedTotal.WithLabelValues(d.Name()).Inc() 136 level.Info(log).Log("msg", fmt.Sprintf("skipping test for %d to %d requesting samples after last sent delete request's start time %d", 137 start.Add(-duration).Unix(), start.Unix(), lastDeleteRequestInterval.end.Unix())) 138 return true, nil 139 } 140 141 pairs, err := d.Query(ctx, client, selectors, start, duration) 142 if err != nil { 143 level.Error(log).Log("err", err) 144 return false, err 145 } 146 147 nonDeletedIntervals := d.getNonDeletedIntervals(queryInterval) 148 if len(nonDeletedIntervals) == 0 { 149 // we are querying data covered completed by deleted interval so there should not be any sample pairs returned by the query. 150 if len(pairs) != 0 { 151 return false, errors.New("samples should be 0") 152 } 153 return true, nil 154 } 155 156 level.Debug(log).Log("start", start.Unix(), "query-start", queryInterval.start.Unix(), 157 "query-end", queryInterval.end.Unix(), "non-deleted-intervals") 158 159 verifyPairsFrom, verifyPairsTo := 0, 0 160 for _, nonDeletedInterval := range nonDeletedIntervals { 161 for ; verifyPairsTo < len(pairs); verifyPairsTo++ { 162 pair := pairs[verifyPairsTo] 163 // do not fail the test if difference is just by couple of ms or ns. 164 if pair.Timestamp.Time().Before(nonDeletedInterval.start) && pair.Timestamp.Unix() != nonDeletedInterval.start.Unix() { 165 level.Error(log).Log("msg", "unexpected sample", "timestamp", pair.Timestamp.Unix(), "non-deleted-interval.start", nonDeletedInterval.start.Unix(), 166 "non-deleted-interval.end", nonDeletedInterval.end.Unix()) 167 return false, nil 168 } else if pair.Timestamp.Time().After(nonDeletedInterval.end) { 169 break 170 } 171 } 172 173 passed := verifySamples(spanlogger.FromContext(ctx), d, pairs[verifyPairsFrom:verifyPairsTo], nonDeletedInterval.end.Sub(nonDeletedInterval.start), d.commonTestConfig) 174 if !passed { 175 verifyingPairs := pairs[verifyPairsFrom:verifyPairsTo] 176 if len(verifyingPairs) == 0 { 177 level.Error(log).Log("msg", fmt.Sprintf("expected samples from %d to %d but got 0 samples", nonDeletedInterval.start.Unix(), 178 nonDeletedInterval.end.Unix()), "query start", start.Unix(), "query duration", duration) 179 } else { 180 level.Error(log).Log("msg", "failed to verify samples batch", "query start", start.Unix(), "query duration", duration, 181 "batch length", len(verifyingPairs), 182 "batch duration", nonDeletedInterval.end.Sub(nonDeletedInterval.start), "batch-start", verifyingPairs[0].Timestamp.Unix(), 183 "batch-end", verifyingPairs[len(verifyingPairs)-1].Timestamp.Unix()) 184 } 185 return false, nil 186 } 187 188 verifyPairsFrom = verifyPairsTo 189 } 190 191 return true, nil 192 } 193 194 func (d *DeleteSeriesTest) sendDeleteRequest() (err error) { 195 // data is deleted by slicing the time by deleteRequestCreationInterval from 0 time i.e beginning of epoch 196 // and doing deletion for last deleteDataForRange duration at the end of that slice. 197 endTime := time.Now().Truncate(d.cfg.deleteRequestCreationInterval) 198 startTime := endTime.Add(-d.cfg.deleteDataForRange) 199 metricName := prometheus.BuildFQName(namespace, subsystem, d.Name()) 200 selectors := fmt.Sprintf("%s{%s}", metricName, d.cfg.ExtraSelectors) 201 202 defer func() { 203 status := success 204 if err != nil { 205 status = fail 206 } 207 deleteRequestCreationAttemptsTotal.WithLabelValues(status).Inc() 208 }() 209 210 baseURL, err := url.Parse(d.cfg.purgerAddr) 211 if err != nil { 212 return 213 } 214 215 baseURL.Path = path.Join(baseURL.Path, deleteRequestPath) 216 217 query := baseURL.Query() 218 query.Add("match[]", selectors) 219 query.Add("start", fmt.Sprint(startTime.Unix())) 220 query.Add("end", fmt.Sprint(endTime.Unix())) 221 baseURL.RawQuery = query.Encode() 222 223 r, err := http.NewRequest("POST", baseURL.String(), nil) 224 if err != nil { 225 return err 226 } 227 228 if d.cfg.UserID != "" { 229 r = r.WithContext(user.InjectOrgID(r.Context(), d.cfg.UserID)) 230 err = user.InjectOrgIDIntoHTTPRequest(r.Context(), r) 231 if err != nil { 232 return err 233 } 234 } 235 236 level.Error(util_log.Logger).Log("msg", "sending delete request", "selector", selectors, "starttime", startTime, "endtime", endTime) 237 resp, err := http.DefaultClient.Do(r) 238 if err != nil { 239 return 240 } 241 242 if resp.StatusCode != 204 { 243 return fmt.Errorf("unexpected status code %d", resp.StatusCode) 244 } 245 246 d.lastDeleteRequestIntervalMutex.Lock() 247 defer d.lastDeleteRequestIntervalMutex.Unlock() 248 249 d.lastDeleteRequestInterval = interval{startTime, endTime} 250 251 return 252 } 253 254 func (d *DeleteSeriesTest) getNonDeletedIntervals(queryInterval interval) []interval { 255 intervalToProcess := queryInterval 256 var nonDeletedIntervals []interval 257 258 // build first deleted interval 259 deletedIntervalEnd := queryInterval.start.Truncate(d.cfg.deleteRequestCreationInterval) 260 deletedIntervalStart := deletedIntervalEnd.Add(-d.cfg.deleteDataForRange) 261 262 // first deleted interval could be out of range so try next intervals to find first relevant interval. 263 for !deletedIntervalStart.After(intervalToProcess.start) { 264 deletedIntervalStart = deletedIntervalStart.Add(d.cfg.deleteRequestCreationInterval) 265 if deletedIntervalEnd.Add(1).After(intervalToProcess.start) { 266 intervalToProcess.start = deletedIntervalEnd.Add(1) 267 } 268 deletedIntervalEnd = deletedIntervalEnd.Add(d.cfg.deleteRequestCreationInterval) 269 } 270 271 // keep building non-deleted intervals with each being from intervalToProcess.start to min(deletedIntervalStart.Start-1, intervalToProcess.end) 272 for !deletedIntervalStart.After(queryInterval.end) { 273 nonDeletedInterval := interval{intervalToProcess.start, deletedIntervalStart.Add(-1)} 274 if nonDeletedInterval.end.After(intervalToProcess.end) { 275 nonDeletedInterval.end = intervalToProcess.end 276 } 277 nonDeletedIntervals = append(nonDeletedIntervals, nonDeletedInterval) 278 intervalToProcess.start = deletedIntervalEnd.Add(1) 279 280 // build next deleted interval 281 deletedIntervalStart = deletedIntervalStart.Add(d.cfg.deleteRequestCreationInterval) 282 deletedIntervalEnd = deletedIntervalEnd.Add(d.cfg.deleteRequestCreationInterval) 283 } 284 285 // see if we have some interval left in intervalToProcess, add it if so. 286 if intervalToProcess.start.Before(intervalToProcess.end) { 287 nonDeletedIntervals = append(nonDeletedIntervals, intervalToProcess) 288 } 289 290 return nonDeletedIntervals 291 } 292 293 func (d *DeleteSeriesTest) MinQueryTime() time.Time { 294 return calculateMinQueryTime(d.cfg.durationQuerySince, d.cfg.timeQueryStart) 295 } 296 297 type interval struct { 298 start, end time.Time 299 }