github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/ruler/base/compat.go

package base

import (
	"context"
	"errors"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/prometheus/model/exemplar"
	"github.com/prometheus/prometheus/model/labels"
	"github.com/prometheus/prometheus/model/value"
	"github.com/prometheus/prometheus/notifier"
	"github.com/prometheus/prometheus/promql"
	"github.com/prometheus/prometheus/rules"
	"github.com/prometheus/prometheus/storage"
	"github.com/weaveworks/common/httpgrpc"
	"github.com/weaveworks/common/user"

	"github.com/grafana/loki/pkg/logproto"
	util_log "github.com/grafana/loki/pkg/util/log"
)

// Pusher is an ingester server that accepts pushes.
type Pusher interface {
	Push(context.Context, *logproto.WriteRequest) (*logproto.WriteResponse, error)
}

type PusherAppender struct {
	failedWrites prometheus.Counter
	totalWrites  prometheus.Counter

	ctx             context.Context
	pusher          Pusher
	labels          []labels.Labels
	samples         []logproto.LegacySample
	userID          string
	evaluationDelay time.Duration
}

func (a *PusherAppender) Append(_ storage.SeriesRef, l labels.Labels, t int64, v float64) (storage.SeriesRef, error) {
	a.labels = append(a.labels, l)

	// Adapt staleness markers for the ruler evaluation delay. The upstream code
	// uses the actual evaluation time when a series is no longer available, which
	// causes 'out of order' append failures once the series becomes available again.
	// See https://github.com/prometheus/prometheus/blob/6c56a1faaaad07317ff585bda75b99bdba0517ad/rules/manager.go#L647-L660
	// Similar to staleness markers, the rule manager also appends the actual time to the ALERTS and ALERTS_FOR_STATE series.
	// See: https://github.com/prometheus/prometheus/blob/ae086c73cb4d6db9e8b67d5038d3704fea6aec4a/rules/alerting.go#L414-L417
	metricName := l.Get(labels.MetricName)
	if a.evaluationDelay > 0 && (value.IsStaleNaN(v) || metricName == "ALERTS" || metricName == "ALERTS_FOR_STATE") {
		t -= a.evaluationDelay.Milliseconds()
	}

	a.samples = append(a.samples, logproto.LegacySample{
		TimestampMs: t,
		Value:       v,
	})
	return 0, nil
}
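
// exampleDelayedAppend is an illustrative sketch (it is not called by the
// ruler): it shows how Append shifts staleness markers and ALERTS samples back
// by the per-tenant evaluation delay before they are buffered for the next
// Commit. The one-minute delay and the metric names used below are hypothetical.
func exampleDelayedAppend(ctx context.Context) []logproto.LegacySample {
	app := &PusherAppender{
		ctx:             ctx,
		evaluationDelay: time.Minute, // hypothetical per-tenant delay
	}
	evalTime := time.Now()

	// ALERTS carries the rule manager's wall-clock evaluation time, so its
	// timestamp is rewritten to evalTime minus the delay, matching the shifted
	// query time used by EngineQueryFunc below.
	_, _ = app.Append(0, labels.FromStrings(labels.MetricName, "ALERTS"), evalTime.UnixMilli(), 1)

	// An ordinary recording-rule sample keeps its original timestamp.
	_, _ = app.Append(0, labels.FromStrings(labels.MetricName, "my_rule:rate5m"), evalTime.UnixMilli(), 42)

	return app.samples
}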

func (a *PusherAppender) AppendExemplar(_ storage.SeriesRef, _ labels.Labels, _ exemplar.Exemplar) (storage.SeriesRef, error) {
	return 0, errors.New("exemplars are unsupported")
}

func (a *PusherAppender) Commit() error {
	a.totalWrites.Inc()

	// Since a.pusher is the distributor, client.ReuseSlice will be called in a.pusher.Push.
	// We shouldn't call client.ReuseSlice here.
	_, err := a.pusher.Push(user.InjectOrgID(a.ctx, a.userID), logproto.ToWriteRequest(a.labels, a.samples, nil, logproto.RULE))
	if err != nil {
		// Don't report errors that ended with a 4xx HTTP status code (series limits, duplicate samples, out of order, etc.).
		if resp, ok := httpgrpc.HTTPResponseFromError(err); !ok || resp.Code/100 != 4 {
			a.failedWrites.Inc()
		}
	}

	a.labels = nil
	a.samples = nil
	return err
}

func (a *PusherAppender) Rollback() error {
	a.labels = nil
	a.samples = nil
	return nil
}

// PusherAppendable fulfills the storage.Appendable interface for the Prometheus rules manager.
type PusherAppendable struct {
	pusher      Pusher
	userID      string
	rulesLimits RulesLimits

	totalWrites  prometheus.Counter
	failedWrites prometheus.Counter
}

func NewPusherAppendable(pusher Pusher, userID string, limits RulesLimits, totalWrites, failedWrites prometheus.Counter) *PusherAppendable {
	return &PusherAppendable{
		pusher:       pusher,
		userID:       userID,
		rulesLimits:  limits,
		totalWrites:  totalWrites,
		failedWrites: failedWrites,
	}
}

// Appender returns a storage.Appender.
func (t *PusherAppendable) Appender(ctx context.Context) storage.Appender {
	return &PusherAppender{
		failedWrites: t.failedWrites,
		totalWrites:  t.totalWrites,

		ctx:             ctx,
		pusher:          t.pusher,
		userID:          t.userID,
		evaluationDelay: t.rulesLimits.EvaluationDelay(t.userID),
	}
}

// RulesLimits defines limits used by the Ruler.
type RulesLimits interface {
	EvaluationDelay(userID string) time.Duration
	RulerTenantShardSize(userID string) int
	RulerMaxRuleGroupsPerTenant(userID string) int
	RulerMaxRulesPerRuleGroup(userID string) int
}
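
// staticRulesLimits is an illustrative sketch of a RulesLimits implementation
// with fixed, tenant-independent values, the kind of stub one might use in a
// test. It is hypothetical: the ruler's real limits come from the per-tenant
// overrides configuration, not from this type.
type staticRulesLimits struct {
	delay time.Duration
}

func (l staticRulesLimits) EvaluationDelay(string) time.Duration   { return l.delay }
func (l staticRulesLimits) RulerTenantShardSize(string) int        { return 0 }
func (l staticRulesLimits) RulerMaxRuleGroupsPerTenant(string) int { return 100 }
func (l staticRulesLimits) RulerMaxRulesPerRuleGroup(string) int   { return 20 }

// Compile-time check that the sketch satisfies the interface.
var _ RulesLimits = staticRulesLimits{}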

// EngineQueryFunc returns a new query function using the rules.EngineQueryFunc function
// and passing an altered timestamp.
func EngineQueryFunc(engine *promql.Engine, q storage.Queryable, overrides RulesLimits, userID string) rules.QueryFunc {
	return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) {
		orig := rules.EngineQueryFunc(engine, q)
		// Delay the evaluation of all rules by a set interval to give a buffer
		// to metrics that haven't been forwarded to Cortex yet.
		evaluationDelay := overrides.EvaluationDelay(userID)
		return orig(ctx, qs, t.Add(-evaluationDelay))
	}
}

func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Counter) rules.QueryFunc {
	return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) {
		queries.Inc()
		result, err := qf(ctx, qs, t)

		// We only care about errors returned by the underlying Queryable. Errors returned by the PromQL
		// engine are "user errors" and not interesting here.
		qerr := QueryableError{}
		if err != nil && errors.As(err, &qerr) {
			origErr := qerr.Unwrap()

			// Not all errors returned by the Queryable are interesting, only those that would result in a 500 status code.
			//
			// We rely on TranslateToPromqlAPIError to do its job here... it returns nil if err is nil.
			// It returns promql.ErrStorage if the error should be reported back as a 500.
			// Other errors it returns are either for canceled or timed-out queries (we're not reporting those as failures),
			// or various user errors (limits, duplicate samples, etc. ... also not failures).
			//
			// All errors will still be counted towards "evaluation failures" metrics and logged by the Prometheus ruler,
			// but we only want internal errors here.
			if _, ok := TranslateToPromqlAPIError(origErr).(promql.ErrStorage); ok {
				failedQueries.Inc()
			}

			// Return the unwrapped error.
			return result, origErr
		}

		return result, err
	}
}

func RecordAndReportRuleQueryMetrics(qf rules.QueryFunc, queryTime prometheus.Counter, logger log.Logger) rules.QueryFunc {
	if queryTime == nil {
		return qf
	}

	return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) {
		// If we've been passed a counter, we want to record the wall time spent executing this request.
		timer := prometheus.NewTimer(nil)
		defer func() {
			querySeconds := timer.ObserveDuration().Seconds()
			queryTime.Add(querySeconds)

			// Log ruler query stats.
			logMessage := []interface{}{
				"msg", "query stats",
				"component", "ruler",
				"cortex_ruler_query_seconds_total", querySeconds,
				"query", qs,
			}
			level.Info(util_log.WithContext(ctx, logger)).Log(logMessage...)
		}()

		result, err := qf(ctx, qs, t)
		return result, err
	}
}

// RulesManager mimics the rules.Manager API. The interface is used to simplify tests.
type RulesManager interface {
	// Starts the rules manager. Blocks until Stop is called.
	Run()

	// Stops the rules manager. (Unblocks Run.)
	Stop()

	// Updates the rules manager state.
	Update(interval time.Duration, files []string, externalLabels labels.Labels, externalURL string, ruleGroupPostProcessFunc rules.RuleGroupPostProcessFunc) error

	// Returns the current rule groups.
	RuleGroups() []*rules.Group
}
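
// noopRulesManager is an illustrative sketch of the kind of stub the
// RulesManager interface enables in tests: it satisfies the interface without
// evaluating any rules. It is hypothetical and not used by the ruler itself.
type noopRulesManager struct {
	done chan struct{}
}

func newNoopRulesManager() *noopRulesManager {
	return &noopRulesManager{done: make(chan struct{})}
}

// Run blocks until Stop is called, mirroring the documented contract above.
func (m *noopRulesManager) Run() { <-m.done }

// Stop unblocks Run.
func (m *noopRulesManager) Stop() { close(m.done) }

func (m *noopRulesManager) Update(time.Duration, []string, labels.Labels, string, rules.RuleGroupPostProcessFunc) error {
	return nil
}

func (m *noopRulesManager) RuleGroups() []*rules.Group { return nil }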

// ManagerFactory is a function that creates a new RulesManager for the given user and notifier.Manager.
type ManagerFactory func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager

func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engine *promql.Engine, overrides RulesLimits, reg prometheus.Registerer) ManagerFactory {
	totalWrites := promauto.With(reg).NewCounter(prometheus.CounterOpts{
		Name: "cortex_ruler_write_requests_total",
		Help: "Number of write requests to ingesters.",
	})
	failedWrites := promauto.With(reg).NewCounter(prometheus.CounterOpts{
		Name: "cortex_ruler_write_requests_failed_total",
		Help: "Number of failed write requests to ingesters.",
	})

	totalQueries := promauto.With(reg).NewCounter(prometheus.CounterOpts{
		Name: "cortex_ruler_queries_total",
		Help: "Number of queries executed by ruler.",
	})
	failedQueries := promauto.With(reg).NewCounter(prometheus.CounterOpts{
		Name: "cortex_ruler_queries_failed_total",
		Help: "Number of failed queries by ruler.",
	})
	var rulerQuerySeconds *prometheus.CounterVec
	if cfg.EnableQueryStats {
		rulerQuerySeconds = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
			Name: "cortex_ruler_query_seconds_total",
			Help: "Total amount of wall clock time spent processing queries by the ruler.",
		}, []string{"user"})
	}

	// Wrap errors returned by the Queryable in our wrapper, so that we can distinguish between those errors
	// and errors returned by the PromQL engine. Errors from the Queryable can be caused either by the user
	// (limits) or by internal problems. Errors from PromQL are always "user" errors.
	q = NewErrorTranslateQueryableWithFn(q, WrapQueryableErrors)

	return func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager {
		var queryTime prometheus.Counter
		if rulerQuerySeconds != nil {
			queryTime = rulerQuerySeconds.WithLabelValues(userID)
		}

		return rules.NewManager(&rules.ManagerOptions{
			Appendable:      NewPusherAppendable(p, userID, overrides, totalWrites, failedWrites),
			Queryable:       q,
			QueryFunc:       RecordAndReportRuleQueryMetrics(MetricsQueryFunc(EngineQueryFunc(engine, q, overrides, userID), totalQueries, failedQueries), queryTime, logger),
			Context:         user.InjectOrgID(ctx, userID),
			ExternalURL:     cfg.ExternalURL.URL,
			NotifyFunc:      SendAlerts(notifier, cfg.ExternalURL.URL.String()),
			Logger:          log.With(logger, "user", userID),
			Registerer:      reg,
			OutageTolerance: cfg.OutageTolerance,
			ForGracePeriod:  cfg.ForGracePeriod,
			ResendDelay:     cfg.ResendDelay,
		})
	}
}

type QueryableError struct {
	err error
}

func (q QueryableError) Unwrap() error {
	return q.err
}

func (q QueryableError) Error() string {
	return q.err.Error()
}

func WrapQueryableErrors(err error) error {
	if err == nil {
		return err
	}

	return QueryableError{err: err}
}
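
// exampleManagerWiring is an illustrative sketch of how the pieces above fit
// together: build the factory once with shared dependencies, then create one
// rules manager per tenant. All inputs are assumed to be supplied by the
// caller (the real wiring lives in the ruler, not here), and "tenant-1" is a
// hypothetical tenant ID.
func exampleManagerWiring(ctx context.Context, cfg Config, p Pusher, q storage.Queryable, engine *promql.Engine, overrides RulesLimits, n *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager {
	factory := DefaultTenantManagerFactory(cfg, p, q, engine, overrides, reg)

	// Each tenant gets its own manager; per-tenant metrics are registered on
	// the registerer passed here, not on the one used to build the factory.
	return factory(ctx, "tenant-1", n, logger, reg)
}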