github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/querier/queryrange/log_result_cache.go (about)

     1  package queryrange
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"net/http"
     7  	"time"
     8  
     9  	"github.com/go-kit/log"
    10  	"github.com/go-kit/log/level"
    11  	"github.com/gogo/protobuf/proto"
    12  	"github.com/prometheus/client_golang/prometheus"
    13  	"github.com/prometheus/client_golang/prometheus/promauto"
    14  	"github.com/prometheus/common/model"
    15  	"github.com/weaveworks/common/httpgrpc"
    16  	"golang.org/x/sync/errgroup"
    17  
    18  	"github.com/grafana/dskit/tenant"
    19  
    20  	"github.com/grafana/loki/pkg/loghttp"
    21  	"github.com/grafana/loki/pkg/logproto"
    22  	"github.com/grafana/loki/pkg/logqlmodel/stats"
    23  	"github.com/grafana/loki/pkg/querier/queryrange/queryrangebase"
    24  	"github.com/grafana/loki/pkg/storage/chunk/cache"
    25  	"github.com/grafana/loki/pkg/util/validation"
    26  )
    27  
    28  // LogResultCacheMetrics is the metrics wrapper used in log result cache.
    29  type LogResultCacheMetrics struct {
    30  	CacheHit  prometheus.Counter
    31  	CacheMiss prometheus.Counter
    32  }
    33  
    34  // NewLogResultCacheMetrics creates metrics to be used in log result cache.
    35  func NewLogResultCacheMetrics(registerer prometheus.Registerer) *LogResultCacheMetrics {
    36  	return &LogResultCacheMetrics{
    37  		CacheHit: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
    38  			Namespace: "loki",
    39  			Name:      "query_frontend_log_result_cache_hit_total",
    40  		}),
    41  		CacheMiss: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
    42  			Namespace: "loki",
    43  			Name:      "query_frontend_log_result_cache_miss_total",
    44  		}),
    45  	}
    46  }
    47  
    48  // NewLogResultCache creates a new log result cache middleware.
    49  // Currently it only caches empty filter queries, this is because those are usually easily and freely cacheable.
    50  // Log hits are difficult to handle because of the limit query parameter and the size of the response.
    51  // In the future it could be extended to cache non-empty query results.
    52  // see https://docs.google.com/document/d/1_mACOpxdWZ5K0cIedaja5gzMbv-m0lUVazqZd2O4mEU/edit
    53  func NewLogResultCache(logger log.Logger, limits Limits, cache cache.Cache, shouldCache queryrangebase.ShouldCacheFn, metrics *LogResultCacheMetrics) queryrangebase.Middleware {
    54  	if metrics == nil {
    55  		metrics = NewLogResultCacheMetrics(nil)
    56  	}
    57  	return queryrangebase.MiddlewareFunc(func(next queryrangebase.Handler) queryrangebase.Handler {
    58  		return &logResultCache{
    59  			next:        next,
    60  			limits:      limits,
    61  			cache:       cache,
    62  			logger:      logger,
    63  			shouldCache: shouldCache,
    64  			metrics:     metrics,
    65  		}
    66  	})
    67  }
    68  
    69  type logResultCache struct {
    70  	next        queryrangebase.Handler
    71  	limits      Limits
    72  	cache       cache.Cache
    73  	shouldCache queryrangebase.ShouldCacheFn
    74  
    75  	metrics *LogResultCacheMetrics
    76  	logger  log.Logger
    77  }
    78  
    79  func (l *logResultCache) Do(ctx context.Context, req queryrangebase.Request) (queryrangebase.Response, error) {
    80  	tenantIDs, err := tenant.TenantIDs(ctx)
    81  	if err != nil {
    82  		return nil, httpgrpc.Errorf(http.StatusBadRequest, err.Error())
    83  	}
    84  
    85  	if l.shouldCache != nil && !l.shouldCache(req) {
    86  		return l.next.Do(ctx, req)
    87  	}
    88  
    89  	maxCacheFreshness := validation.MaxDurationPerTenant(tenantIDs, l.limits.MaxCacheFreshness)
    90  	maxCacheTime := int64(model.Now().Add(-maxCacheFreshness))
    91  	if req.GetEnd() > maxCacheTime {
    92  		return l.next.Do(ctx, req)
    93  	}
    94  
    95  	lokiReq, ok := req.(*LokiRequest)
    96  	if !ok {
    97  		return nil, httpgrpc.Errorf(http.StatusInternalServerError, "invalid request type %T", req)
    98  	}
    99  
   100  	interval := validation.SmallestPositiveNonZeroDurationPerTenant(tenantIDs, l.limits.QuerySplitDuration)
   101  	// skip caching by if interval is unset
   102  	if interval == 0 {
   103  		return l.next.Do(ctx, req)
   104  	}
   105  	// The first subquery might not be aligned.
   106  	alignedStart := time.Unix(0, lokiReq.GetStartTs().UnixNano()-(lokiReq.GetStartTs().UnixNano()%interval.Nanoseconds()))
   107  	// generate the cache key based on query, tenant and start time.
   108  	cacheKey := fmt.Sprintf("log:%s:%s:%d:%d", tenant.JoinTenantIDs(tenantIDs), req.GetQuery(), interval.Nanoseconds(), alignedStart.UnixNano()/(interval.Nanoseconds()))
   109  
   110  	_, buff, _, err := l.cache.Fetch(ctx, []string{cache.HashKey(cacheKey)})
   111  	if err != nil {
   112  		level.Warn(l.logger).Log("msg", "error fetching cache", "err", err, "cacheKey", cacheKey)
   113  		return l.next.Do(ctx, req)
   114  	}
   115  	// we expect only one key to be found or missing.
   116  	if len(buff) > 1 {
   117  		level.Warn(l.logger).Log("msg", "unexpected length of cache return values", "buff", len(buff))
   118  		return l.next.Do(ctx, req)
   119  	}
   120  
   121  	if len(buff) == 0 {
   122  		// cache miss
   123  		return l.handleMiss(ctx, cacheKey, lokiReq)
   124  	}
   125  
   126  	// cache hit
   127  	var cachedRequest LokiRequest
   128  	err = proto.Unmarshal(buff[0], &cachedRequest)
   129  	if err != nil {
   130  		level.Warn(l.logger).Log("msg", "error unmarshalling request from cache", "err", err)
   131  		return l.next.Do(ctx, req)
   132  	}
   133  	return l.handleHit(ctx, cacheKey, &cachedRequest, lokiReq)
   134  }
   135  
   136  func (l *logResultCache) handleMiss(ctx context.Context, cacheKey string, req *LokiRequest) (queryrangebase.Response, error) {
   137  	l.metrics.CacheMiss.Inc()
   138  	level.Debug(l.logger).Log("msg", "cache miss", "key", cacheKey)
   139  	resp, err := l.next.Do(ctx, req)
   140  	if err != nil {
   141  		return nil, err
   142  	}
   143  	lokiRes, ok := resp.(*LokiResponse)
   144  	if !ok {
   145  		return nil, fmt.Errorf("unexpected response type %T", resp)
   146  	}
   147  	// At the moment we only cache empty results
   148  	if !isEmpty(lokiRes) {
   149  		return resp, nil
   150  	}
   151  	data, err := proto.Marshal(req)
   152  	if err != nil {
   153  		level.Warn(l.logger).Log("msg", "error marshalling request", "err", err)
   154  		return resp, nil
   155  	}
   156  	// cache the result
   157  	err = l.cache.Store(ctx, []string{cache.HashKey(cacheKey)}, [][]byte{data})
   158  	if err != nil {
   159  		level.Warn(l.logger).Log("msg", "error storing cache", "err", err)
   160  	}
   161  	return resp, nil
   162  }
   163  
   164  func (l *logResultCache) handleHit(ctx context.Context, cacheKey string, cachedRequest *LokiRequest, lokiReq *LokiRequest) (queryrangebase.Response, error) {
   165  	l.metrics.CacheHit.Inc()
   166  	// we start with an empty response
   167  	result := emptyResponse(cachedRequest)
   168  	// if the request is the same and cover the whole time range,
   169  	// we can just return the cached result.
   170  	if !lokiReq.GetStartTs().After(cachedRequest.GetStartTs()) && lokiReq.GetStartTs().Equal(cachedRequest.GetStartTs()) &&
   171  		!lokiReq.GetEndTs().Before(cachedRequest.GetEndTs()) && lokiReq.GetEndTs().Equal(cachedRequest.GetEndTs()) {
   172  		return result, nil
   173  	}
   174  	// we could be missing data at the start and the end.
   175  	// so we're going to fetch what is missing.
   176  	var (
   177  		startRequest, endRequest *LokiRequest
   178  		startResp, endResp       *LokiResponse
   179  		updateCache              bool
   180  		ok                       bool
   181  	)
   182  	g, ctx := errgroup.WithContext(ctx)
   183  
   184  	// if we're missing data at the start, start fetching from the start to the cached start.
   185  	if lokiReq.GetStartTs().Before(cachedRequest.GetStartTs()) {
   186  		g.Go(func() error {
   187  			startRequest = lokiReq.WithStartEndTime(lokiReq.GetStartTs(), cachedRequest.GetStartTs())
   188  			resp, err := l.next.Do(ctx, startRequest)
   189  			if err != nil {
   190  				return err
   191  			}
   192  			startResp, ok = resp.(*LokiResponse)
   193  			if !ok {
   194  				return fmt.Errorf("unexpected response type %T", resp)
   195  			}
   196  			return nil
   197  		})
   198  	}
   199  
   200  	// if we're missing data at the end, start fetching from the cached end to the end.
   201  	if lokiReq.GetEndTs().After(cachedRequest.GetEndTs()) {
   202  		g.Go(func() error {
   203  			endRequest = lokiReq.WithStartEndTime(cachedRequest.GetEndTs(), lokiReq.GetEndTs())
   204  			resp, err := l.next.Do(ctx, endRequest)
   205  			if err != nil {
   206  				return err
   207  			}
   208  			endResp, ok = resp.(*LokiResponse)
   209  			if !ok {
   210  				return fmt.Errorf("unexpected response type %T", resp)
   211  			}
   212  			return nil
   213  		})
   214  	}
   215  
   216  	if err := g.Wait(); err != nil {
   217  		return nil, err
   218  	}
   219  
   220  	// if we have data at the start, we need to merge it with the cached data if it's empty and update the cache.
   221  	// If it's not empty only merge the response.
   222  	if startResp != nil {
   223  		if isEmpty(startResp) {
   224  			cachedRequest = cachedRequest.WithStartEndTime(startRequest.GetStartTs(), cachedRequest.GetEndTs())
   225  			updateCache = true
   226  		} else {
   227  			if startResp.Status != loghttp.QueryStatusSuccess {
   228  				return startResp, nil
   229  			}
   230  			result = mergeLokiResponse(startResp, result)
   231  		}
   232  	}
   233  
   234  	// if we have data at the end, we need to merge it with the cached data if it's empty and update the cache.
   235  	// If it's not empty only merge the response.
   236  	if endResp != nil {
   237  		if isEmpty(endResp) {
   238  			cachedRequest = cachedRequest.WithStartEndTime(cachedRequest.GetStartTs(), endRequest.GetEndTs())
   239  			updateCache = true
   240  		} else {
   241  			if endResp.Status != loghttp.QueryStatusSuccess {
   242  				return endResp, nil
   243  			}
   244  			result = mergeLokiResponse(endResp, result)
   245  		}
   246  	}
   247  
   248  	// we need to update the cache since we fetched more either at the end or the start and it was empty.
   249  	if updateCache {
   250  		data, err := proto.Marshal(cachedRequest)
   251  		if err != nil {
   252  			level.Warn(l.logger).Log("msg", "error marshalling request", "err", err)
   253  			return result, err
   254  		}
   255  		// cache the result
   256  		err = l.cache.Store(ctx, []string{cache.HashKey(cacheKey)}, [][]byte{data})
   257  		if err != nil {
   258  			level.Warn(l.logger).Log("msg", "error storing cache", "err", err)
   259  		}
   260  	}
   261  	return result, nil
   262  }
   263  
   264  func isEmpty(lokiRes *LokiResponse) bool {
   265  	return lokiRes.Status == loghttp.QueryStatusSuccess && len(lokiRes.Data.Result) == 0
   266  }
   267  
   268  func emptyResponse(lokiReq *LokiRequest) *LokiResponse {
   269  	return &LokiResponse{
   270  		Status:     loghttp.QueryStatusSuccess,
   271  		Statistics: stats.Result{},
   272  		Direction:  lokiReq.Direction,
   273  		Limit:      lokiReq.Limit,
   274  		Version:    uint32(loghttp.GetVersion(lokiReq.Path)),
   275  		Data: LokiData{
   276  			ResultType: loghttp.ResultTypeStream,
   277  			Result:     []logproto.Stream{},
   278  		},
   279  	}
   280  }