github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/querier/queryrange/queryrangebase/results_cache.go

package queryrangebase

import (
	"context"
	"flag"
	"fmt"
	"net/http"
	"sort"
	"strings"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/gogo/protobuf/proto"
	"github.com/gogo/protobuf/types"
	"github.com/grafana/dskit/flagext"
	"github.com/opentracing/opentracing-go"
	otlog "github.com/opentracing/opentracing-go/log"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/common/model"
	"github.com/prometheus/prometheus/model/timestamp"
	"github.com/prometheus/prometheus/promql"
	"github.com/prometheus/prometheus/promql/parser"
	"github.com/uber/jaeger-client-go"
	"github.com/weaveworks/common/httpgrpc"

	"github.com/grafana/dskit/tenant"

	"github.com/grafana/loki/pkg/logproto"
	"github.com/grafana/loki/pkg/storage/chunk/cache"
	util_log "github.com/grafana/loki/pkg/util/log"
	"github.com/grafana/loki/pkg/util/spanlogger"
	"github.com/grafana/loki/pkg/util/validation"
)

var (
	// noStoreValue is the value of cacheControlHeader when the response indicates that the results should not be cached.
	noStoreValue = "no-store"

	// ResultsCacheGenNumberHeaderName holds the name of the header we want to set in the HTTP response.
	ResultsCacheGenNumberHeaderName = "Results-Cache-Gen-Number"
)

type CacheGenNumberLoader interface {
	GetResultsCacheGenNumber(tenantIDs []string) string
}

// ResultsCacheConfig is the config for the results cache.
type ResultsCacheConfig struct {
	CacheConfig cache.Config `yaml:"cache"`
	Compression string       `yaml:"compression"`
}

// RegisterFlags registers flags.
func (cfg *ResultsCacheConfig) RegisterFlags(f *flag.FlagSet) {
	cfg.CacheConfig.RegisterFlagsWithPrefix("frontend.", "", f)

	f.StringVar(&cfg.Compression, "frontend.compression", "", "Use compression in results cache. Supported values are: 'snappy' and '' (disable compression).")
	//lint:ignore faillint Need to pass the global logger like this for warning on deprecated methods
	flagext.DeprecatedFlag(f, "frontend.cache-split-interval", "Deprecated: The maximum interval expected for each request, results will be cached per single interval. This behavior is now determined by querier.split-queries-by-interval.", util_log.Logger)
}

func (cfg *ResultsCacheConfig) Validate() error {
	switch cfg.Compression {
	case "snappy", "":
		// valid
	default:
		return errors.Errorf("unsupported compression type: %s", cfg.Compression)
	}

	return cfg.CacheConfig.Validate()
}

// Extractor is used by the cache to extract a subset of a response from a cache entry.
type Extractor interface {
	// Extract extracts the portion of the `from` response that falls between the
	// `start` and `end` timestamps (in milliseconds).
	Extract(start, end int64, from Response) Response
	ResponseWithoutHeaders(resp Response) Response
}
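// PrometheusResponseExtractor below is the concrete Extractor used for
// Prometheus-style range responses. As a minimal usage sketch (the lowercase
// names are placeholders, not part of this package):
//
//	var e Extractor = PrometheusResponseExtractor{}
//	subset := e.Extract(startMs, endMs, cachedResp) // keeps only samples with startMs <= ts <= endMs
//
// The compile-time assertion makes the interface relationship explicit.
var _ Extractor = PrometheusResponseExtractor{}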
// PrometheusResponseExtractor helps extract specific info from a query Response.
type PrometheusResponseExtractor struct{}

// Extract extracts a response for a specific time range from a response.
func (PrometheusResponseExtractor) Extract(start, end int64, from Response) Response {
	promRes := from.(*PrometheusResponse)
	return &PrometheusResponse{
		Status: StatusSuccess,
		Data: PrometheusData{
			ResultType: promRes.Data.ResultType,
			Result:     extractMatrix(start, end, promRes.Data.Result),
		},
		Headers: promRes.Headers,
	}
}

// ResponseWithoutHeaders is useful for caching data without headers: since we
// do not need the headers when sending back the response anyway, dropping them
// saves space by reducing the size of the cached objects.
func (PrometheusResponseExtractor) ResponseWithoutHeaders(resp Response) Response {
	promRes := resp.(*PrometheusResponse)
	return &PrometheusResponse{
		Status: StatusSuccess,
		Data: PrometheusData{
			ResultType: promRes.Data.ResultType,
			Result:     promRes.Data.Result,
		},
	}
}

// CacheSplitter generates cache keys. This is a useful interface for downstream
// consumers who wish to implement their own strategies.
type CacheSplitter interface {
	GenerateCacheKey(userID string, r Request) string
}

// constSplitter is a utility for using a constant split interval when determining cache keys.
type constSplitter time.Duration

// GenerateCacheKey generates a cache key based on the userID, Request and interval.
func (t constSplitter) GenerateCacheKey(userID string, r Request) string {
	currentInterval := r.GetStart() / int64(time.Duration(t)/time.Millisecond)
	return fmt.Sprintf("%s:%s:%d:%d", userID, r.GetQuery(), r.GetStep(), currentInterval)
}
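// As a worked example of the key scheme above (values are illustrative, not
// from the source): with a 24h split interval (86,400,000ms), userID "fake",
// query "up" and a 60s step, a request starting at 129,600,000ms falls into
// interval 129600000/86400000 = 1, yielding the key "fake:up:60000:1". Every
// request starting anywhere inside that same 24h window maps to the same key.
// The assertion below is a standard compile-time check:
var _ CacheSplitter = constSplitter(0)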
// ShouldCacheFn checks whether the current request should go to the cache or
// not. If not, just send the request to the next handler.
type ShouldCacheFn func(r Request) bool

type resultsCache struct {
	logger   log.Logger
	next     Handler
	cache    cache.Cache
	limits   Limits
	splitter CacheSplitter

	extractor            Extractor
	minCacheExtent       int64 // discard any cache extent smaller than this
	merger               Merger
	cacheGenNumberLoader CacheGenNumberLoader
	shouldCache          ShouldCacheFn
}

// NewResultsCacheMiddleware creates results cache middleware from config.
// The middleware caches results using a unique cache key for a given request (step, query, user) and interval.
// The cache assumes that each request length (end-start) is less than or equal to the interval.
// Each request starting from within the same interval will hit the same cache entry.
// If the cache doesn't have the entire duration of the request cached, it will query the uncached parts and append them to the cache entries.
// See CacheSplitter.GenerateCacheKey.
func NewResultsCacheMiddleware(
	logger log.Logger,
	c cache.Cache,
	splitter CacheSplitter,
	limits Limits,
	merger Merger,
	extractor Extractor,
	cacheGenNumberLoader CacheGenNumberLoader,
	shouldCache ShouldCacheFn,
	reg prometheus.Registerer,
) (Middleware, error) {
	if cacheGenNumberLoader != nil {
		c = cache.NewCacheGenNumMiddleware(c)
	}

	return MiddlewareFunc(func(next Handler) Handler {
		return &resultsCache{
			logger:               logger,
			next:                 next,
			cache:                c,
			limits:               limits,
			merger:               merger,
			extractor:            extractor,
			minCacheExtent:       (5 * time.Minute).Milliseconds(),
			splitter:             splitter,
			cacheGenNumberLoader: cacheGenNumberLoader,
			shouldCache:          shouldCache,
		}
	}), nil
}

func (s resultsCache) Do(ctx context.Context, r Request) (Response, error) {
	tenantIDs, err := tenant.TenantIDs(ctx)
	if err != nil {
		return nil, httpgrpc.Errorf(http.StatusBadRequest, err.Error())
	}

	if s.shouldCache != nil && !s.shouldCache(r) {
		return s.next.Do(ctx, r)
	}

	if s.cacheGenNumberLoader != nil {
		ctx = cache.InjectCacheGenNumber(ctx, s.cacheGenNumberLoader.GetResultsCacheGenNumber(tenantIDs))
	}

	var (
		key      = s.splitter.GenerateCacheKey(tenant.JoinTenantIDs(tenantIDs), r)
		extents  []Extent
		response Response
	)

	maxCacheFreshness := validation.MaxDurationPerTenant(tenantIDs, s.limits.MaxCacheFreshness)
	maxCacheTime := int64(model.Now().Add(-maxCacheFreshness))
	if r.GetStart() > maxCacheTime {
		return s.next.Do(ctx, r)
	}

	cached, ok := s.get(ctx, key)
	if ok {
		response, extents, err = s.handleHit(ctx, r, cached, maxCacheTime)
	} else {
		response, extents, err = s.handleMiss(ctx, r, maxCacheTime)
	}

	if err == nil && len(extents) > 0 {
		extents, err := s.filterRecentExtents(r, maxCacheFreshness, extents)
		if err != nil {
			return nil, err
		}
		s.put(ctx, key, extents)
	}

	return response, err
}
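// A sketch of wiring this middleware into a handler chain, assuming a cache
// client, limits, merger and downstream handler built elsewhere (all the
// lowercase names here are placeholders, not part of this file):
//
//	mw, err := NewResultsCacheMiddleware(
//		logger, cacheClient, constSplitter(24*time.Hour), limits,
//		merger, PrometheusResponseExtractor{},
//		nil, // cacheGenNumberLoader: no cache-gen based invalidation
//		nil, // shouldCache: nil means every request is considered
//		nil, // reg
//	)
//	if err != nil {
//		return err
//	}
//	handler := mw.Wrap(downstream) // the Middleware wraps the next Handler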
// shouldCacheResponse says whether the response should be cached or not.
func (s resultsCache) shouldCacheResponse(ctx context.Context, req Request, r Response, maxCacheTime int64) bool {
	headerValues := getHeaderValuesWithName(r, cacheControlHeader)
	for _, v := range headerValues {
		if v == noStoreValue {
			level.Debug(s.logger).Log("msg", fmt.Sprintf("%s header in response is equal to %s, not caching the response", cacheControlHeader, noStoreValue))
			return false
		}
	}

	if !s.isAtModifierCachable(req, maxCacheTime) {
		return false
	}

	if s.cacheGenNumberLoader == nil {
		return true
	}

	genNumbersFromResp := getHeaderValuesWithName(r, ResultsCacheGenNumberHeaderName)
	genNumberFromCtx := cache.ExtractCacheGenNumber(ctx)

	if len(genNumbersFromResp) == 0 && genNumberFromCtx != "" {
		level.Debug(s.logger).Log("msg", fmt.Sprintf("we found results cache gen number %s set in store but none in headers", genNumberFromCtx))
		return false
	}

	for _, gen := range genNumbersFromResp {
		if gen != genNumberFromCtx {
			level.Debug(s.logger).Log("msg", fmt.Sprintf("inconsistency in results cache gen numbers %s (GEN-FROM-RESPONSE) != %s (GEN-FROM-STORE), not caching the response", gen, genNumberFromCtx))
			return false
		}
	}

	return true
}

var errAtModifierAfterEnd = errors.New("at modifier after end")

// isAtModifierCachable returns true if the @ modifier result
// is safe to cache.
func (s resultsCache) isAtModifierCachable(r Request, maxCacheTime int64) bool {
	// There are 2 cases when the @ modifier is not safe to cache:
	// 1. When the @ modifier points to a time beyond maxCacheTime.
	// 2. When the @ modifier time is after the query range end while still
	//    being below maxCacheTime. If a tenant is intentionally querying old
	//    data this way, looking beyond the query end could cache an empty result.
	query := r.GetQuery()
	if !strings.Contains(query, "@") {
		return true
	}
	expr, err := parser.ParseExpr(query)
	if err != nil {
		// We are being pessimistic in such cases.
		level.Warn(s.logger).Log("msg", "failed to parse query, considering @ modifier as not cachable", "query", query, "err", err)
		return false
	}

	// This resolves the start() and end() used with the @ modifier.
	expr = promql.PreprocessExpr(expr, timestamp.Time(r.GetStart()), timestamp.Time(r.GetEnd()))

	end := r.GetEnd()
	atModCachable := true
	parser.Inspect(expr, func(n parser.Node, _ []parser.Node) error {
		switch e := n.(type) {
		case *parser.VectorSelector:
			if e.Timestamp != nil && (*e.Timestamp > end || *e.Timestamp > maxCacheTime) {
				atModCachable = false
				return errAtModifierAfterEnd
			}
		case *parser.MatrixSelector:
			ts := e.VectorSelector.(*parser.VectorSelector).Timestamp
			if ts != nil && (*ts > end || *ts > maxCacheTime) {
				atModCachable = false
				return errAtModifierAfterEnd
			}
		case *parser.SubqueryExpr:
			if e.Timestamp != nil && (*e.Timestamp > end || *e.Timestamp > maxCacheTime) {
				atModCachable = false
				return errAtModifierAfterEnd
			}
		}
		return nil
	})

	return atModCachable
}
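// For example (illustrative values, not from the source): with a query range
// ending at 1,700,000,000,000ms, the query `sum(rate(up[5m] @ 1700000100))`
// resolves to an @ timestamp of 1,700,000,100,000ms. That is beyond the range
// end, so the response is deemed not cachable. By contrast, `up @ start()` is
// rewritten by promql.PreprocessExpr to the concrete range start and stays
// cachable.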
func getHeaderValuesWithName(r Response, headerName string) (headerValues []string) {
	for _, hv := range r.GetHeaders() {
		if hv.GetName() != headerName {
			continue
		}

		headerValues = append(headerValues, hv.GetValues()...)
	}

	return
}

func (s resultsCache) handleMiss(ctx context.Context, r Request, maxCacheTime int64) (Response, []Extent, error) {
	response, err := s.next.Do(ctx, r)
	if err != nil {
		return nil, nil, err
	}

	if !s.shouldCacheResponse(ctx, r, response, maxCacheTime) {
		return response, []Extent{}, nil
	}

	extent, err := toExtent(ctx, r, s.extractor.ResponseWithoutHeaders(response))
	if err != nil {
		return nil, nil, err
	}

	extents := []Extent{
		extent,
	}
	return response, extents, nil
}

func (s resultsCache) handleHit(ctx context.Context, r Request, extents []Extent, maxCacheTime int64) (Response, []Extent, error) {
	var (
		reqResps []RequestResponse
		err      error
	)
	log, ctx := spanlogger.New(ctx, "handleHit")
	defer log.Finish()

	requests, responses, err := s.partition(r, extents)
	if err != nil {
		return nil, nil, err
	}
	if len(requests) == 0 {
		response, err := s.merger.MergeResponse(responses...)
		// No downstream requests so no need to write back to the cache.
		return response, nil, err
	}

	reqResps, err = DoRequests(ctx, s.next, requests, s.limits)
	if err != nil {
		return nil, nil, err
	}

	for _, reqResp := range reqResps {
		responses = append(responses, reqResp.Response)
		if !s.shouldCacheResponse(ctx, r, reqResp.Response, maxCacheTime) {
			continue
		}
		extent, err := toExtent(ctx, reqResp.Request, s.extractor.ResponseWithoutHeaders(reqResp.Response))
		if err != nil {
			return nil, nil, err
		}
		extents = append(extents, extent)
	}
	sort.Slice(extents, func(i, j int) bool {
		if extents[i].Start == extents[j].Start {
			// As an optimization, when two extents start at the same time, we
			// put the bigger extent at the front of the slice, which helps
			// reduce the amount of merging we have to do later.
			return extents[i].End > extents[j].End
		}

		return extents[i].Start < extents[j].Start
	})

	// Merge any extents - potentially overlapping
	accumulator, err := newAccumulator(extents[0])
	if err != nil {
		return nil, nil, err
	}
	mergedExtents := make([]Extent, 0, len(extents))

	for i := 1; i < len(extents); i++ {
		if accumulator.End+r.GetStep() < extents[i].Start {
			mergedExtents, err = merge(mergedExtents, accumulator)
			if err != nil {
				return nil, nil, err
			}
			accumulator, err = newAccumulator(extents[i])
			if err != nil {
				return nil, nil, err
			}
			continue
		}

		if accumulator.End >= extents[i].End {
			continue
		}

		accumulator.TraceId = jaegerTraceID(ctx)
		accumulator.End = extents[i].End
		currentRes, err := extents[i].toResponse()
		if err != nil {
			return nil, nil, err
		}
		merged, err := s.merger.MergeResponse(accumulator.Response, currentRes)
		if err != nil {
			return nil, nil, err
		}
		accumulator.Response = merged
	}

	mergedExtents, err = merge(mergedExtents, accumulator)
	if err != nil {
		return nil, nil, err
	}

	response, err := s.merger.MergeResponse(responses...)
	return response, mergedExtents, err
}
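// A worked trace of the merge loop above (illustrative numbers): with
// r.GetStep() == 1 and sorted extents [0,10], [8,20], [25,30]:
//   - [0,10] seeds the accumulator; since 10+1 >= 8, [8,20] is folded in and
//     the accumulator grows to [0,20] with the two responses merged.
//   - since 20+1 < 25, [0,20] is flushed to mergedExtents and [25,30] seeds a
//     fresh accumulator, which is flushed after the loop.
// The write-back therefore contains two extents: [0,20] and [25,30].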
type accumulator struct {
	Response
	Extent
}

func merge(extents []Extent, acc *accumulator) ([]Extent, error) {
	any, err := types.MarshalAny(acc.Response)
	if err != nil {
		return nil, err
	}
	return append(extents, Extent{
		Start:    acc.Extent.Start,
		End:      acc.Extent.End,
		Response: any,
		TraceId:  acc.Extent.TraceId,
	}), nil
}

func newAccumulator(base Extent) (*accumulator, error) {
	res, err := base.toResponse()
	if err != nil {
		return nil, err
	}
	return &accumulator{
		Response: res,
		Extent:   base,
	}, nil
}

func toExtent(ctx context.Context, req Request, res Response) (Extent, error) {
	any, err := types.MarshalAny(res)
	if err != nil {
		return Extent{}, err
	}
	return Extent{
		Start:    req.GetStart(),
		End:      req.GetEnd(),
		Response: any,
		TraceId:  jaegerTraceID(ctx),
	}, nil
}

// partition calculates the requests required to satisfy req given the cached data.
// extents must be in order by start time.
func (s resultsCache) partition(req Request, extents []Extent) ([]Request, []Response, error) {
	var requests []Request
	var cachedResponses []Response
	start := req.GetStart()

	for _, extent := range extents {
		// If there is no overlap, ignore this extent.
		if extent.GetEnd() < start || extent.Start > req.GetEnd() {
			continue
		}

		// If this extent is tiny and the request is not tiny, discard it: it is
		// more efficient to do a few larger queries. Hopefully the request
		// covering the gap will turn the tiny extent into a not-so-tiny one.

		// However, if the step is large enough, the split_query_by_interval
		// middleware may generate a query with the same start and end. For
		// example, if the step size is more than 12h and the interval is 24h,
		// the extent's start and end times are the same even though the time
		// range covers several hours.
		if (req.GetStart() != req.GetEnd()) && (req.GetEnd()-req.GetStart() > s.minCacheExtent) && (extent.End-extent.Start < s.minCacheExtent) {
			continue
		}

		// If there is a bit missing at the front, make a request for that.
		if start < extent.Start {
			r := req.WithStartEnd(start, extent.Start)
			requests = append(requests, r)
		}
		res, err := extent.toResponse()
		if err != nil {
			return nil, nil, err
		}
		// extract the overlap from the cached extent.
		cachedResponses = append(cachedResponses, s.extractor.Extract(start, req.GetEnd(), res))
		start = extent.End
	}

	// Lastly, make a request for any data missing at the end.
	if start < req.GetEnd() {
		r := req.WithStartEnd(start, req.GetEnd())
		requests = append(requests, r)
	}

	// If start and end are the same (valid in PromQL), start == req.GetEnd() and we won't do the query.
	// But we should only do the request if we don't have a valid cached response for it.
	if req.GetStart() == req.GetEnd() && len(cachedResponses) == 0 {
		requests = append(requests, req)
	}

	return requests, cachedResponses, nil
}
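// A worked trace of partition (illustrative millisecond values, small enough
// that the tiny-extent filter does not trigger): for a request covering
// [5, 25] with one cached extent [10, 20], partition returns:
//   - requests: [5, 10] and [20, 25], the gaps before and after the extent
//   - cachedResponses: the extent's response narrowed by Extract to the
//     overlap with the request
// handleHit then issues the two requests downstream and merges all responses.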
func (s resultsCache) filterRecentExtents(req Request, maxCacheFreshness time.Duration, extents []Extent) ([]Extent, error) {
	maxCacheTime := (int64(model.Now().Add(-maxCacheFreshness)) / req.GetStep()) * req.GetStep()
	for i := range extents {
		// Never cache data for the latest freshness period.
		if extents[i].End > maxCacheTime {
			extents[i].End = maxCacheTime
			res, err := extents[i].toResponse()
			if err != nil {
				return nil, err
			}
			extracted := s.extractor.Extract(extents[i].Start, maxCacheTime, res)
			any, err := types.MarshalAny(extracted)
			if err != nil {
				return nil, err
			}
			extents[i].Response = any
		}
	}
	return extents, nil
}

func (s resultsCache) get(ctx context.Context, key string) ([]Extent, bool) {
	found, bufs, _, _ := s.cache.Fetch(ctx, []string{cache.HashKey(key)})
	if len(found) != 1 {
		return nil, false
	}

	var resp CachedResponse
	log, ctx := spanlogger.New(ctx, "unmarshal-extent") //nolint:ineffassign,staticcheck
	defer log.Finish()

	log.LogFields(otlog.Int("bytes", len(bufs[0])))

	if err := proto.Unmarshal(bufs[0], &resp); err != nil {
		level.Error(log).Log("msg", "error unmarshalling cached value", "err", err)
		log.Error(err)
		return nil, false
	}

	if resp.Key != key {
		return nil, false
	}

	// Treat the entry as a miss (forcing a refresh) if it was written with an old proto schema.
	for _, e := range resp.Extents {
		if e.Response == nil {
			return nil, false
		}
	}

	return resp.Extents, true
}

func (s resultsCache) put(ctx context.Context, key string, extents []Extent) {
	buf, err := proto.Marshal(&CachedResponse{
		Key:     key,
		Extents: extents,
	})
	if err != nil {
		level.Error(s.logger).Log("msg", "error marshalling cached value", "err", err)
		return
	}

	_ = s.cache.Store(ctx, []string{cache.HashKey(key)}, [][]byte{buf})
}

func jaegerTraceID(ctx context.Context) string {
	span := opentracing.SpanFromContext(ctx)
	if span == nil {
		return ""
	}

	spanContext, ok := span.Context().(jaeger.SpanContext)
	if !ok {
		return ""
	}

	return spanContext.TraceID().String()
}

func extractMatrix(start, end int64, matrix []SampleStream) []SampleStream {
	result := make([]SampleStream, 0, len(matrix))
	for _, stream := range matrix {
		extracted, ok := extractSampleStream(start, end, stream)
		if ok {
			result = append(result, extracted)
		}
	}
	return result
}

func extractSampleStream(start, end int64, stream SampleStream) (SampleStream, bool) {
	result := SampleStream{
		Labels:  stream.Labels,
		Samples: make([]logproto.LegacySample, 0, len(stream.Samples)),
	}
	for _, sample := range stream.Samples {
		if start <= sample.TimestampMs && sample.TimestampMs <= end {
			result.Samples = append(result.Samples, sample)
		}
	}
	if len(result.Samples) == 0 {
		return SampleStream{}, false
	}
	return result, true
}

func (e *Extent) toResponse() (Response, error) {
	msg, err := types.EmptyAny(e.Response)
	if err != nil {
		return nil, err
	}

	if err := types.UnmarshalAny(e.Response, msg); err != nil {
		return nil, err
	}

	resp, ok := msg.(Response)
	if !ok {
		return nil, fmt.Errorf("bad cached type")
	}
	return resp, nil
}
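// Each cached Extent stores its Response as a protobuf Any, so toResponse
// round-trips through types.EmptyAny and types.UnmarshalAny to recover the
// concrete Response. A minimal sketch of the round trip (resp is a
// placeholder):
//
//	any, _ := types.MarshalAny(resp)  // what toExtent and merge store
//	e := Extent{Response: any}
//	back, _ := e.toResponse()         // what handleHit and partition read back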