github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/querier/queryrange/log_result_cache.go (about) 1 package queryrange 2 3 import ( 4 "context" 5 "fmt" 6 "net/http" 7 "time" 8 9 "github.com/go-kit/log" 10 "github.com/go-kit/log/level" 11 "github.com/gogo/protobuf/proto" 12 "github.com/prometheus/client_golang/prometheus" 13 "github.com/prometheus/client_golang/prometheus/promauto" 14 "github.com/prometheus/common/model" 15 "github.com/weaveworks/common/httpgrpc" 16 "golang.org/x/sync/errgroup" 17 18 "github.com/grafana/dskit/tenant" 19 20 "github.com/grafana/loki/pkg/loghttp" 21 "github.com/grafana/loki/pkg/logproto" 22 "github.com/grafana/loki/pkg/logqlmodel/stats" 23 "github.com/grafana/loki/pkg/querier/queryrange/queryrangebase" 24 "github.com/grafana/loki/pkg/storage/chunk/cache" 25 "github.com/grafana/loki/pkg/util/validation" 26 ) 27 28 // LogResultCacheMetrics is the metrics wrapper used in log result cache. 29 type LogResultCacheMetrics struct { 30 CacheHit prometheus.Counter 31 CacheMiss prometheus.Counter 32 } 33 34 // NewLogResultCacheMetrics creates metrics to be used in log result cache. 35 func NewLogResultCacheMetrics(registerer prometheus.Registerer) *LogResultCacheMetrics { 36 return &LogResultCacheMetrics{ 37 CacheHit: promauto.With(registerer).NewCounter(prometheus.CounterOpts{ 38 Namespace: "loki", 39 Name: "query_frontend_log_result_cache_hit_total", 40 }), 41 CacheMiss: promauto.With(registerer).NewCounter(prometheus.CounterOpts{ 42 Namespace: "loki", 43 Name: "query_frontend_log_result_cache_miss_total", 44 }), 45 } 46 } 47 48 // NewLogResultCache creates a new log result cache middleware. 49 // Currently it only caches empty filter queries, this is because those are usually easily and freely cacheable. 50 // Log hits are difficult to handle because of the limit query parameter and the size of the response. 51 // In the future it could be extended to cache non-empty query results. 52 // see https://docs.google.com/document/d/1_mACOpxdWZ5K0cIedaja5gzMbv-m0lUVazqZd2O4mEU/edit 53 func NewLogResultCache(logger log.Logger, limits Limits, cache cache.Cache, shouldCache queryrangebase.ShouldCacheFn, metrics *LogResultCacheMetrics) queryrangebase.Middleware { 54 if metrics == nil { 55 metrics = NewLogResultCacheMetrics(nil) 56 } 57 return queryrangebase.MiddlewareFunc(func(next queryrangebase.Handler) queryrangebase.Handler { 58 return &logResultCache{ 59 next: next, 60 limits: limits, 61 cache: cache, 62 logger: logger, 63 shouldCache: shouldCache, 64 metrics: metrics, 65 } 66 }) 67 } 68 69 type logResultCache struct { 70 next queryrangebase.Handler 71 limits Limits 72 cache cache.Cache 73 shouldCache queryrangebase.ShouldCacheFn 74 75 metrics *LogResultCacheMetrics 76 logger log.Logger 77 } 78 79 func (l *logResultCache) Do(ctx context.Context, req queryrangebase.Request) (queryrangebase.Response, error) { 80 tenantIDs, err := tenant.TenantIDs(ctx) 81 if err != nil { 82 return nil, httpgrpc.Errorf(http.StatusBadRequest, err.Error()) 83 } 84 85 if l.shouldCache != nil && !l.shouldCache(req) { 86 return l.next.Do(ctx, req) 87 } 88 89 maxCacheFreshness := validation.MaxDurationPerTenant(tenantIDs, l.limits.MaxCacheFreshness) 90 maxCacheTime := int64(model.Now().Add(-maxCacheFreshness)) 91 if req.GetEnd() > maxCacheTime { 92 return l.next.Do(ctx, req) 93 } 94 95 lokiReq, ok := req.(*LokiRequest) 96 if !ok { 97 return nil, httpgrpc.Errorf(http.StatusInternalServerError, "invalid request type %T", req) 98 } 99 100 interval := validation.SmallestPositiveNonZeroDurationPerTenant(tenantIDs, l.limits.QuerySplitDuration) 101 // skip caching by if interval is unset 102 if interval == 0 { 103 return l.next.Do(ctx, req) 104 } 105 // The first subquery might not be aligned. 106 alignedStart := time.Unix(0, lokiReq.GetStartTs().UnixNano()-(lokiReq.GetStartTs().UnixNano()%interval.Nanoseconds())) 107 // generate the cache key based on query, tenant and start time. 108 cacheKey := fmt.Sprintf("log:%s:%s:%d:%d", tenant.JoinTenantIDs(tenantIDs), req.GetQuery(), interval.Nanoseconds(), alignedStart.UnixNano()/(interval.Nanoseconds())) 109 110 _, buff, _, err := l.cache.Fetch(ctx, []string{cache.HashKey(cacheKey)}) 111 if err != nil { 112 level.Warn(l.logger).Log("msg", "error fetching cache", "err", err, "cacheKey", cacheKey) 113 return l.next.Do(ctx, req) 114 } 115 // we expect only one key to be found or missing. 116 if len(buff) > 1 { 117 level.Warn(l.logger).Log("msg", "unexpected length of cache return values", "buff", len(buff)) 118 return l.next.Do(ctx, req) 119 } 120 121 if len(buff) == 0 { 122 // cache miss 123 return l.handleMiss(ctx, cacheKey, lokiReq) 124 } 125 126 // cache hit 127 var cachedRequest LokiRequest 128 err = proto.Unmarshal(buff[0], &cachedRequest) 129 if err != nil { 130 level.Warn(l.logger).Log("msg", "error unmarshalling request from cache", "err", err) 131 return l.next.Do(ctx, req) 132 } 133 return l.handleHit(ctx, cacheKey, &cachedRequest, lokiReq) 134 } 135 136 func (l *logResultCache) handleMiss(ctx context.Context, cacheKey string, req *LokiRequest) (queryrangebase.Response, error) { 137 l.metrics.CacheMiss.Inc() 138 level.Debug(l.logger).Log("msg", "cache miss", "key", cacheKey) 139 resp, err := l.next.Do(ctx, req) 140 if err != nil { 141 return nil, err 142 } 143 lokiRes, ok := resp.(*LokiResponse) 144 if !ok { 145 return nil, fmt.Errorf("unexpected response type %T", resp) 146 } 147 // At the moment we only cache empty results 148 if !isEmpty(lokiRes) { 149 return resp, nil 150 } 151 data, err := proto.Marshal(req) 152 if err != nil { 153 level.Warn(l.logger).Log("msg", "error marshalling request", "err", err) 154 return resp, nil 155 } 156 // cache the result 157 err = l.cache.Store(ctx, []string{cache.HashKey(cacheKey)}, [][]byte{data}) 158 if err != nil { 159 level.Warn(l.logger).Log("msg", "error storing cache", "err", err) 160 } 161 return resp, nil 162 } 163 164 func (l *logResultCache) handleHit(ctx context.Context, cacheKey string, cachedRequest *LokiRequest, lokiReq *LokiRequest) (queryrangebase.Response, error) { 165 l.metrics.CacheHit.Inc() 166 // we start with an empty response 167 result := emptyResponse(cachedRequest) 168 // if the request is the same and cover the whole time range, 169 // we can just return the cached result. 170 if !lokiReq.GetStartTs().After(cachedRequest.GetStartTs()) && lokiReq.GetStartTs().Equal(cachedRequest.GetStartTs()) && 171 !lokiReq.GetEndTs().Before(cachedRequest.GetEndTs()) && lokiReq.GetEndTs().Equal(cachedRequest.GetEndTs()) { 172 return result, nil 173 } 174 // we could be missing data at the start and the end. 175 // so we're going to fetch what is missing. 176 var ( 177 startRequest, endRequest *LokiRequest 178 startResp, endResp *LokiResponse 179 updateCache bool 180 ok bool 181 ) 182 g, ctx := errgroup.WithContext(ctx) 183 184 // if we're missing data at the start, start fetching from the start to the cached start. 185 if lokiReq.GetStartTs().Before(cachedRequest.GetStartTs()) { 186 g.Go(func() error { 187 startRequest = lokiReq.WithStartEndTime(lokiReq.GetStartTs(), cachedRequest.GetStartTs()) 188 resp, err := l.next.Do(ctx, startRequest) 189 if err != nil { 190 return err 191 } 192 startResp, ok = resp.(*LokiResponse) 193 if !ok { 194 return fmt.Errorf("unexpected response type %T", resp) 195 } 196 return nil 197 }) 198 } 199 200 // if we're missing data at the end, start fetching from the cached end to the end. 201 if lokiReq.GetEndTs().After(cachedRequest.GetEndTs()) { 202 g.Go(func() error { 203 endRequest = lokiReq.WithStartEndTime(cachedRequest.GetEndTs(), lokiReq.GetEndTs()) 204 resp, err := l.next.Do(ctx, endRequest) 205 if err != nil { 206 return err 207 } 208 endResp, ok = resp.(*LokiResponse) 209 if !ok { 210 return fmt.Errorf("unexpected response type %T", resp) 211 } 212 return nil 213 }) 214 } 215 216 if err := g.Wait(); err != nil { 217 return nil, err 218 } 219 220 // if we have data at the start, we need to merge it with the cached data if it's empty and update the cache. 221 // If it's not empty only merge the response. 222 if startResp != nil { 223 if isEmpty(startResp) { 224 cachedRequest = cachedRequest.WithStartEndTime(startRequest.GetStartTs(), cachedRequest.GetEndTs()) 225 updateCache = true 226 } else { 227 if startResp.Status != loghttp.QueryStatusSuccess { 228 return startResp, nil 229 } 230 result = mergeLokiResponse(startResp, result) 231 } 232 } 233 234 // if we have data at the end, we need to merge it with the cached data if it's empty and update the cache. 235 // If it's not empty only merge the response. 236 if endResp != nil { 237 if isEmpty(endResp) { 238 cachedRequest = cachedRequest.WithStartEndTime(cachedRequest.GetStartTs(), endRequest.GetEndTs()) 239 updateCache = true 240 } else { 241 if endResp.Status != loghttp.QueryStatusSuccess { 242 return endResp, nil 243 } 244 result = mergeLokiResponse(endResp, result) 245 } 246 } 247 248 // we need to update the cache since we fetched more either at the end or the start and it was empty. 249 if updateCache { 250 data, err := proto.Marshal(cachedRequest) 251 if err != nil { 252 level.Warn(l.logger).Log("msg", "error marshalling request", "err", err) 253 return result, err 254 } 255 // cache the result 256 err = l.cache.Store(ctx, []string{cache.HashKey(cacheKey)}, [][]byte{data}) 257 if err != nil { 258 level.Warn(l.logger).Log("msg", "error storing cache", "err", err) 259 } 260 } 261 return result, nil 262 } 263 264 func isEmpty(lokiRes *LokiResponse) bool { 265 return lokiRes.Status == loghttp.QueryStatusSuccess && len(lokiRes.Data.Result) == 0 266 } 267 268 func emptyResponse(lokiReq *LokiRequest) *LokiResponse { 269 return &LokiResponse{ 270 Status: loghttp.QueryStatusSuccess, 271 Statistics: stats.Result{}, 272 Direction: lokiReq.Direction, 273 Limit: lokiReq.Limit, 274 Version: uint32(loghttp.GetVersion(lokiReq.Path)), 275 Data: LokiData{ 276 ResultType: loghttp.ResultTypeStream, 277 Result: []logproto.Stream{}, 278 }, 279 } 280 }