github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/ruler/compat.go (about)

     1  package ruler
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"time"
     7  
     8  	"github.com/go-kit/log"
     9  	"github.com/go-kit/log/level"
    10  	"github.com/prometheus/client_golang/prometheus"
    11  	"github.com/prometheus/client_golang/prometheus/promauto"
    12  	"github.com/prometheus/prometheus/notifier"
    13  	"github.com/prometheus/prometheus/pkg/exemplar"
    14  	"github.com/prometheus/prometheus/pkg/labels"
    15  	"github.com/prometheus/prometheus/pkg/value"
    16  	"github.com/prometheus/prometheus/promql"
    17  	"github.com/prometheus/prometheus/rules"
    18  	"github.com/prometheus/prometheus/storage"
    19  	"github.com/weaveworks/common/httpgrpc"
    20  	"github.com/weaveworks/common/user"
    21  
    22  	"github.com/cortexproject/cortex/pkg/cortexpb"
    23  	"github.com/cortexproject/cortex/pkg/querier"
    24  	util_log "github.com/cortexproject/cortex/pkg/util/log"
    25  )
    26  
// Pusher is an ingester server that accepts pushes.
// In the ruler it receives the samples produced by rule evaluations.
type Pusher interface {
	Push(context.Context, *cortexpb.WriteRequest) (*cortexpb.WriteResponse, error)
}
    31  
// PusherAppender is a storage.Appender that buffers appended samples in
// memory and forwards them to a Pusher when Commit is called.
type PusherAppender struct {
	failedWrites prometheus.Counter // incremented on non-4xx push errors
	totalWrites  prometheus.Counter // incremented on every Commit

	ctx             context.Context // base context for the push request; tenant is injected on Commit
	pusher          Pusher          // push destination (the distributor; see Commit)
	labels          []labels.Labels // buffered series labels, index-aligned with samples
	samples         []cortexpb.Sample
	userID          string          // tenant the buffered samples belong to
	evaluationDelay time.Duration   // shift applied to staleness/ALERTS timestamps (see Append)
}
    43  
    44  func (a *PusherAppender) Append(_ uint64, l labels.Labels, t int64, v float64) (uint64, error) {
    45  	a.labels = append(a.labels, l)
    46  
    47  	// Adapt staleness markers for ruler evaluation delay. As the upstream code
    48  	// is using the actual time, when there is a no longer available series.
    49  	// This then causes 'out of order' append failures once the series is
    50  	// becoming available again.
    51  	// see https://github.com/prometheus/prometheus/blob/6c56a1faaaad07317ff585bda75b99bdba0517ad/rules/manager.go#L647-L660
    52  	// Similar to staleness markers, the rule manager also appends actual time to the ALERTS and ALERTS_FOR_STATE series.
    53  	// See: https://github.com/prometheus/prometheus/blob/ae086c73cb4d6db9e8b67d5038d3704fea6aec4a/rules/alerting.go#L414-L417
    54  	metricName := l.Get(labels.MetricName)
    55  	if a.evaluationDelay > 0 && (value.IsStaleNaN(v) || metricName == "ALERTS" || metricName == "ALERTS_FOR_STATE") {
    56  		t -= a.evaluationDelay.Milliseconds()
    57  	}
    58  
    59  	a.samples = append(a.samples, cortexpb.Sample{
    60  		TimestampMs: t,
    61  		Value:       v,
    62  	})
    63  	return 0, nil
    64  }
    65  
    66  func (a *PusherAppender) AppendExemplar(_ uint64, _ labels.Labels, _ exemplar.Exemplar) (uint64, error) {
    67  	return 0, errors.New("exemplars are unsupported")
    68  }
    69  
    70  func (a *PusherAppender) Commit() error {
    71  	a.totalWrites.Inc()
    72  
    73  	// Since a.pusher is distributor, client.ReuseSlice will be called in a.pusher.Push.
    74  	// We shouldn't call client.ReuseSlice here.
    75  	_, err := a.pusher.Push(user.InjectOrgID(a.ctx, a.userID), cortexpb.ToWriteRequest(a.labels, a.samples, nil, cortexpb.RULE))
    76  
    77  	if err != nil {
    78  		// Don't report errors that ended with 4xx HTTP status code (series limits, duplicate samples, out of order, etc.)
    79  		if resp, ok := httpgrpc.HTTPResponseFromError(err); !ok || resp.Code/100 != 4 {
    80  			a.failedWrites.Inc()
    81  		}
    82  	}
    83  
    84  	a.labels = nil
    85  	a.samples = nil
    86  	return err
    87  }
    88  
    89  func (a *PusherAppender) Rollback() error {
    90  	a.labels = nil
    91  	a.samples = nil
    92  	return nil
    93  }
    94  
// PusherAppendable fulfills the storage.Appendable interface for prometheus manager
type PusherAppendable struct {
	pusher      Pusher
	userID      string
	rulesLimits RulesLimits // source of the per-tenant evaluation delay used by Appender

	// totalWrites/failedWrites are shared by every appender created via Appender.
	totalWrites  prometheus.Counter
	failedWrites prometheus.Counter
}
   104  
   105  func NewPusherAppendable(pusher Pusher, userID string, limits RulesLimits, totalWrites, failedWrites prometheus.Counter) *PusherAppendable {
   106  	return &PusherAppendable{
   107  		pusher:       pusher,
   108  		userID:       userID,
   109  		rulesLimits:  limits,
   110  		totalWrites:  totalWrites,
   111  		failedWrites: failedWrites,
   112  	}
   113  }
   114  
   115  // Appender returns a storage.Appender
   116  func (t *PusherAppendable) Appender(ctx context.Context) storage.Appender {
   117  	return &PusherAppender{
   118  		failedWrites: t.failedWrites,
   119  		totalWrites:  t.totalWrites,
   120  
   121  		ctx:             ctx,
   122  		pusher:          t.pusher,
   123  		userID:          t.userID,
   124  		evaluationDelay: t.rulesLimits.EvaluationDelay(t.userID),
   125  	}
   126  }
   127  
// RulesLimits defines limits used by Ruler.
type RulesLimits interface {
	// EvaluationDelay returns how far into the past the tenant's rule
	// evaluation timestamps are shifted.
	EvaluationDelay(userID string) time.Duration
	// RulerTenantShardSize returns the tenant's ruler shard size.
	RulerTenantShardSize(userID string) int
	// RulerMaxRuleGroupsPerTenant returns the maximum number of rule groups for the tenant.
	RulerMaxRuleGroupsPerTenant(userID string) int
	// RulerMaxRulesPerRuleGroup returns the maximum number of rules per rule group for the tenant.
	RulerMaxRulesPerRuleGroup(userID string) int
}
   135  
   136  // EngineQueryFunc returns a new query function using the rules.EngineQueryFunc function
   137  // and passing an altered timestamp.
   138  func EngineQueryFunc(engine *promql.Engine, q storage.Queryable, overrides RulesLimits, userID string) rules.QueryFunc {
   139  	return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) {
   140  		orig := rules.EngineQueryFunc(engine, q)
   141  		// Delay the evaluation of all rules by a set interval to give a buffer
   142  		// to metric that haven't been forwarded to cortex yet.
   143  		evaluationDelay := overrides.EvaluationDelay(userID)
   144  		return orig(ctx, qs, t.Add(-evaluationDelay))
   145  	}
   146  }
   147  
   148  func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Counter) rules.QueryFunc {
   149  	return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) {
   150  		queries.Inc()
   151  		result, err := qf(ctx, qs, t)
   152  
   153  		// We only care about errors returned by underlying Queryable. Errors returned by PromQL engine are "user-errors",
   154  		// and not interesting here.
   155  		qerr := QueryableError{}
   156  		if err != nil && errors.As(err, &qerr) {
   157  			origErr := qerr.Unwrap()
   158  
   159  			// Not all errors returned by Queryable are interesting, only those that would result in 500 status code.
   160  			//
   161  			// We rely on TranslateToPromqlApiError to do its job here... it returns nil, if err is nil.
   162  			// It returns promql.ErrStorage, if error should be reported back as 500.
   163  			// Other errors it returns are either for canceled or timed-out queriers (we're not reporting those as failures),
   164  			// or various user-errors (limits, duplicate samples, etc. ... also not failures).
   165  			//
   166  			// All errors will still be counted towards "evaluation failures" metrics and logged by Prometheus Ruler,
   167  			// but we only want internal errors here.
   168  			if _, ok := querier.TranslateToPromqlAPIError(origErr).(promql.ErrStorage); ok {
   169  				failedQueries.Inc()
   170  			}
   171  
   172  			// Return unwrapped error.
   173  			return result, origErr
   174  		}
   175  
   176  		return result, err
   177  	}
   178  }
   179  
   180  func RecordAndReportRuleQueryMetrics(qf rules.QueryFunc, queryTime prometheus.Counter, logger log.Logger) rules.QueryFunc {
   181  	if queryTime == nil {
   182  		return qf
   183  	}
   184  
   185  	return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) {
   186  		// If we've been passed a counter we want to record the wall time spent executing this request.
   187  		timer := prometheus.NewTimer(nil)
   188  		defer func() {
   189  			querySeconds := timer.ObserveDuration().Seconds()
   190  			queryTime.Add(querySeconds)
   191  
   192  			// Log ruler query stats.
   193  			logMessage := []interface{}{
   194  				"msg", "query stats",
   195  				"component", "ruler",
   196  				"cortex_ruler_query_seconds_total", querySeconds,
   197  				"query", qs,
   198  			}
   199  			level.Info(util_log.WithContext(ctx, logger)).Log(logMessage...)
   200  		}()
   201  
   202  		result, err := qf(ctx, qs, t)
   203  		return result, err
   204  	}
   205  }
   206  
// RulesManager mimics the rules.Manager API. The interface is used to simplify tests.
type RulesManager interface {
	// Run starts the rules manager. Blocks until Stop is called.
	Run()

	// Stop stops the rules manager. (Unblocks Run.)
	Stop()

	// Update updates the rules manager state.
	Update(interval time.Duration, files []string, externalLabels labels.Labels, externalURL string) error

	// RuleGroups returns the current rule groups.
	RuleGroups() []*rules.Group
}
   221  
// ManagerFactory is a function that creates new RulesManager for given user and notifier.Manager.
// The registerer receives the metrics of the created manager.
type ManagerFactory func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager
   224  
   225  func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engine *promql.Engine, overrides RulesLimits, reg prometheus.Registerer) ManagerFactory {
   226  	totalWrites := promauto.With(reg).NewCounter(prometheus.CounterOpts{
   227  		Name: "cortex_ruler_write_requests_total",
   228  		Help: "Number of write requests to ingesters.",
   229  	})
   230  	failedWrites := promauto.With(reg).NewCounter(prometheus.CounterOpts{
   231  		Name: "cortex_ruler_write_requests_failed_total",
   232  		Help: "Number of failed write requests to ingesters.",
   233  	})
   234  
   235  	totalQueries := promauto.With(reg).NewCounter(prometheus.CounterOpts{
   236  		Name: "cortex_ruler_queries_total",
   237  		Help: "Number of queries executed by ruler.",
   238  	})
   239  	failedQueries := promauto.With(reg).NewCounter(prometheus.CounterOpts{
   240  		Name: "cortex_ruler_queries_failed_total",
   241  		Help: "Number of failed queries by ruler.",
   242  	})
   243  	var rulerQuerySeconds *prometheus.CounterVec
   244  	if cfg.EnableQueryStats {
   245  		rulerQuerySeconds = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
   246  			Name: "cortex_ruler_query_seconds_total",
   247  			Help: "Total amount of wall clock time spent processing queries by the ruler.",
   248  		}, []string{"user"})
   249  	}
   250  
   251  	// Wrap errors returned by Queryable to our wrapper, so that we can distinguish between those errors
   252  	// and errors returned by PromQL engine. Errors from Queryable can be either caused by user (limits) or internal errors.
   253  	// Errors from PromQL are always "user" errors.
   254  	q = querier.NewErrorTranslateQueryableWithFn(q, WrapQueryableErrors)
   255  
   256  	return func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager {
   257  		var queryTime prometheus.Counter = nil
   258  		if rulerQuerySeconds != nil {
   259  			queryTime = rulerQuerySeconds.WithLabelValues(userID)
   260  		}
   261  
   262  		return rules.NewManager(&rules.ManagerOptions{
   263  			Appendable:      NewPusherAppendable(p, userID, overrides, totalWrites, failedWrites),
   264  			Queryable:       q,
   265  			QueryFunc:       RecordAndReportRuleQueryMetrics(MetricsQueryFunc(EngineQueryFunc(engine, q, overrides, userID), totalQueries, failedQueries), queryTime, logger),
   266  			Context:         user.InjectOrgID(ctx, userID),
   267  			ExternalURL:     cfg.ExternalURL.URL,
   268  			NotifyFunc:      SendAlerts(notifier, cfg.ExternalURL.URL.String()),
   269  			Logger:          log.With(logger, "user", userID),
   270  			Registerer:      reg,
   271  			OutageTolerance: cfg.OutageTolerance,
   272  			ForGracePeriod:  cfg.ForGracePeriod,
   273  			ResendDelay:     cfg.ResendDelay,
   274  		})
   275  	}
   276  }
   277  
   278  type QueryableError struct {
   279  	err error
   280  }
   281  
   282  func (q QueryableError) Unwrap() error {
   283  	return q.err
   284  }
   285  
   286  func (q QueryableError) Error() string {
   287  	return q.err.Error()
   288  }
   289  
   290  func WrapQueryableErrors(err error) error {
   291  	if err == nil {
   292  		return err
   293  	}
   294  
   295  	return QueryableError{err: err}
   296  }