github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/ruler/compat.go (about) 1 package ruler 2 3 import ( 4 "context" 5 "errors" 6 "time" 7 8 "github.com/go-kit/log" 9 "github.com/go-kit/log/level" 10 "github.com/prometheus/client_golang/prometheus" 11 "github.com/prometheus/client_golang/prometheus/promauto" 12 "github.com/prometheus/prometheus/notifier" 13 "github.com/prometheus/prometheus/pkg/exemplar" 14 "github.com/prometheus/prometheus/pkg/labels" 15 "github.com/prometheus/prometheus/pkg/value" 16 "github.com/prometheus/prometheus/promql" 17 "github.com/prometheus/prometheus/rules" 18 "github.com/prometheus/prometheus/storage" 19 "github.com/weaveworks/common/httpgrpc" 20 "github.com/weaveworks/common/user" 21 22 "github.com/cortexproject/cortex/pkg/cortexpb" 23 "github.com/cortexproject/cortex/pkg/querier" 24 util_log "github.com/cortexproject/cortex/pkg/util/log" 25 ) 26 27 // Pusher is an ingester server that accepts pushes. 28 type Pusher interface { 29 Push(context.Context, *cortexpb.WriteRequest) (*cortexpb.WriteResponse, error) 30 } 31 32 type PusherAppender struct { 33 failedWrites prometheus.Counter 34 totalWrites prometheus.Counter 35 36 ctx context.Context 37 pusher Pusher 38 labels []labels.Labels 39 samples []cortexpb.Sample 40 userID string 41 evaluationDelay time.Duration 42 } 43 44 func (a *PusherAppender) Append(_ uint64, l labels.Labels, t int64, v float64) (uint64, error) { 45 a.labels = append(a.labels, l) 46 47 // Adapt staleness markers for ruler evaluation delay. As the upstream code 48 // is using the actual time, when there is a no longer available series. 49 // This then causes 'out of order' append failures once the series is 50 // becoming available again. 51 // see https://github.com/prometheus/prometheus/blob/6c56a1faaaad07317ff585bda75b99bdba0517ad/rules/manager.go#L647-L660 52 // Similar to staleness markers, the rule manager also appends actual time to the ALERTS and ALERTS_FOR_STATE series. 53 // See: https://github.com/prometheus/prometheus/blob/ae086c73cb4d6db9e8b67d5038d3704fea6aec4a/rules/alerting.go#L414-L417 54 metricName := l.Get(labels.MetricName) 55 if a.evaluationDelay > 0 && (value.IsStaleNaN(v) || metricName == "ALERTS" || metricName == "ALERTS_FOR_STATE") { 56 t -= a.evaluationDelay.Milliseconds() 57 } 58 59 a.samples = append(a.samples, cortexpb.Sample{ 60 TimestampMs: t, 61 Value: v, 62 }) 63 return 0, nil 64 } 65 66 func (a *PusherAppender) AppendExemplar(_ uint64, _ labels.Labels, _ exemplar.Exemplar) (uint64, error) { 67 return 0, errors.New("exemplars are unsupported") 68 } 69 70 func (a *PusherAppender) Commit() error { 71 a.totalWrites.Inc() 72 73 // Since a.pusher is distributor, client.ReuseSlice will be called in a.pusher.Push. 74 // We shouldn't call client.ReuseSlice here. 75 _, err := a.pusher.Push(user.InjectOrgID(a.ctx, a.userID), cortexpb.ToWriteRequest(a.labels, a.samples, nil, cortexpb.RULE)) 76 77 if err != nil { 78 // Don't report errors that ended with 4xx HTTP status code (series limits, duplicate samples, out of order, etc.) 79 if resp, ok := httpgrpc.HTTPResponseFromError(err); !ok || resp.Code/100 != 4 { 80 a.failedWrites.Inc() 81 } 82 } 83 84 a.labels = nil 85 a.samples = nil 86 return err 87 } 88 89 func (a *PusherAppender) Rollback() error { 90 a.labels = nil 91 a.samples = nil 92 return nil 93 } 94 95 // PusherAppendable fulfills the storage.Appendable interface for prometheus manager 96 type PusherAppendable struct { 97 pusher Pusher 98 userID string 99 rulesLimits RulesLimits 100 101 totalWrites prometheus.Counter 102 failedWrites prometheus.Counter 103 } 104 105 func NewPusherAppendable(pusher Pusher, userID string, limits RulesLimits, totalWrites, failedWrites prometheus.Counter) *PusherAppendable { 106 return &PusherAppendable{ 107 pusher: pusher, 108 userID: userID, 109 rulesLimits: limits, 110 totalWrites: totalWrites, 111 failedWrites: failedWrites, 112 } 113 } 114 115 // Appender returns a storage.Appender 116 func (t *PusherAppendable) Appender(ctx context.Context) storage.Appender { 117 return &PusherAppender{ 118 failedWrites: t.failedWrites, 119 totalWrites: t.totalWrites, 120 121 ctx: ctx, 122 pusher: t.pusher, 123 userID: t.userID, 124 evaluationDelay: t.rulesLimits.EvaluationDelay(t.userID), 125 } 126 } 127 128 // RulesLimits defines limits used by Ruler. 129 type RulesLimits interface { 130 EvaluationDelay(userID string) time.Duration 131 RulerTenantShardSize(userID string) int 132 RulerMaxRuleGroupsPerTenant(userID string) int 133 RulerMaxRulesPerRuleGroup(userID string) int 134 } 135 136 // EngineQueryFunc returns a new query function using the rules.EngineQueryFunc function 137 // and passing an altered timestamp. 138 func EngineQueryFunc(engine *promql.Engine, q storage.Queryable, overrides RulesLimits, userID string) rules.QueryFunc { 139 return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) { 140 orig := rules.EngineQueryFunc(engine, q) 141 // Delay the evaluation of all rules by a set interval to give a buffer 142 // to metric that haven't been forwarded to cortex yet. 143 evaluationDelay := overrides.EvaluationDelay(userID) 144 return orig(ctx, qs, t.Add(-evaluationDelay)) 145 } 146 } 147 148 func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Counter) rules.QueryFunc { 149 return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) { 150 queries.Inc() 151 result, err := qf(ctx, qs, t) 152 153 // We only care about errors returned by underlying Queryable. Errors returned by PromQL engine are "user-errors", 154 // and not interesting here. 155 qerr := QueryableError{} 156 if err != nil && errors.As(err, &qerr) { 157 origErr := qerr.Unwrap() 158 159 // Not all errors returned by Queryable are interesting, only those that would result in 500 status code. 160 // 161 // We rely on TranslateToPromqlApiError to do its job here... it returns nil, if err is nil. 162 // It returns promql.ErrStorage, if error should be reported back as 500. 163 // Other errors it returns are either for canceled or timed-out queriers (we're not reporting those as failures), 164 // or various user-errors (limits, duplicate samples, etc. ... also not failures). 165 // 166 // All errors will still be counted towards "evaluation failures" metrics and logged by Prometheus Ruler, 167 // but we only want internal errors here. 168 if _, ok := querier.TranslateToPromqlAPIError(origErr).(promql.ErrStorage); ok { 169 failedQueries.Inc() 170 } 171 172 // Return unwrapped error. 173 return result, origErr 174 } 175 176 return result, err 177 } 178 } 179 180 func RecordAndReportRuleQueryMetrics(qf rules.QueryFunc, queryTime prometheus.Counter, logger log.Logger) rules.QueryFunc { 181 if queryTime == nil { 182 return qf 183 } 184 185 return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) { 186 // If we've been passed a counter we want to record the wall time spent executing this request. 187 timer := prometheus.NewTimer(nil) 188 defer func() { 189 querySeconds := timer.ObserveDuration().Seconds() 190 queryTime.Add(querySeconds) 191 192 // Log ruler query stats. 193 logMessage := []interface{}{ 194 "msg", "query stats", 195 "component", "ruler", 196 "cortex_ruler_query_seconds_total", querySeconds, 197 "query", qs, 198 } 199 level.Info(util_log.WithContext(ctx, logger)).Log(logMessage...) 200 }() 201 202 result, err := qf(ctx, qs, t) 203 return result, err 204 } 205 } 206 207 // This interface mimicks rules.Manager API. Interface is used to simplify tests. 208 type RulesManager interface { 209 // Starts rules manager. Blocks until Stop is called. 210 Run() 211 212 // Stops rules manager. (Unblocks Run.) 213 Stop() 214 215 // Updates rules manager state. 216 Update(interval time.Duration, files []string, externalLabels labels.Labels, externalURL string) error 217 218 // Returns current rules groups. 219 RuleGroups() []*rules.Group 220 } 221 222 // ManagerFactory is a function that creates new RulesManager for given user and notifier.Manager. 223 type ManagerFactory func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager 224 225 func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engine *promql.Engine, overrides RulesLimits, reg prometheus.Registerer) ManagerFactory { 226 totalWrites := promauto.With(reg).NewCounter(prometheus.CounterOpts{ 227 Name: "cortex_ruler_write_requests_total", 228 Help: "Number of write requests to ingesters.", 229 }) 230 failedWrites := promauto.With(reg).NewCounter(prometheus.CounterOpts{ 231 Name: "cortex_ruler_write_requests_failed_total", 232 Help: "Number of failed write requests to ingesters.", 233 }) 234 235 totalQueries := promauto.With(reg).NewCounter(prometheus.CounterOpts{ 236 Name: "cortex_ruler_queries_total", 237 Help: "Number of queries executed by ruler.", 238 }) 239 failedQueries := promauto.With(reg).NewCounter(prometheus.CounterOpts{ 240 Name: "cortex_ruler_queries_failed_total", 241 Help: "Number of failed queries by ruler.", 242 }) 243 var rulerQuerySeconds *prometheus.CounterVec 244 if cfg.EnableQueryStats { 245 rulerQuerySeconds = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ 246 Name: "cortex_ruler_query_seconds_total", 247 Help: "Total amount of wall clock time spent processing queries by the ruler.", 248 }, []string{"user"}) 249 } 250 251 // Wrap errors returned by Queryable to our wrapper, so that we can distinguish between those errors 252 // and errors returned by PromQL engine. Errors from Queryable can be either caused by user (limits) or internal errors. 253 // Errors from PromQL are always "user" errors. 254 q = querier.NewErrorTranslateQueryableWithFn(q, WrapQueryableErrors) 255 256 return func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager { 257 var queryTime prometheus.Counter = nil 258 if rulerQuerySeconds != nil { 259 queryTime = rulerQuerySeconds.WithLabelValues(userID) 260 } 261 262 return rules.NewManager(&rules.ManagerOptions{ 263 Appendable: NewPusherAppendable(p, userID, overrides, totalWrites, failedWrites), 264 Queryable: q, 265 QueryFunc: RecordAndReportRuleQueryMetrics(MetricsQueryFunc(EngineQueryFunc(engine, q, overrides, userID), totalQueries, failedQueries), queryTime, logger), 266 Context: user.InjectOrgID(ctx, userID), 267 ExternalURL: cfg.ExternalURL.URL, 268 NotifyFunc: SendAlerts(notifier, cfg.ExternalURL.URL.String()), 269 Logger: log.With(logger, "user", userID), 270 Registerer: reg, 271 OutageTolerance: cfg.OutageTolerance, 272 ForGracePeriod: cfg.ForGracePeriod, 273 ResendDelay: cfg.ResendDelay, 274 }) 275 } 276 } 277 278 type QueryableError struct { 279 err error 280 } 281 282 func (q QueryableError) Unwrap() error { 283 return q.err 284 } 285 286 func (q QueryableError) Error() string { 287 return q.err.Error() 288 } 289 290 func WrapQueryableErrors(err error) error { 291 if err == nil { 292 return err 293 } 294 295 return QueryableError{err: err} 296 }