github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/ruler/base/compat.go (about)

     1  package base
     2  
     3  import (
     4  	"context"
     5  	"errors"
     6  	"time"
     7  
     8  	"github.com/go-kit/log"
     9  	"github.com/go-kit/log/level"
    10  	"github.com/prometheus/client_golang/prometheus"
    11  	"github.com/prometheus/client_golang/prometheus/promauto"
    12  	"github.com/prometheus/prometheus/model/exemplar"
    13  	"github.com/prometheus/prometheus/model/labels"
    14  	"github.com/prometheus/prometheus/model/value"
    15  	"github.com/prometheus/prometheus/notifier"
    16  	"github.com/prometheus/prometheus/promql"
    17  	"github.com/prometheus/prometheus/rules"
    18  	"github.com/prometheus/prometheus/storage"
    19  	"github.com/weaveworks/common/httpgrpc"
    20  	"github.com/weaveworks/common/user"
    21  
    22  	"github.com/grafana/loki/pkg/logproto"
    23  	util_log "github.com/grafana/loki/pkg/util/log"
    24  )
    25  
// Pusher is an ingester server that accepts pushes.
//
// Push takes a context and a write request and returns the write response or
// an error.
type Pusher interface {
	Push(context.Context, *logproto.WriteRequest) (*logproto.WriteResponse, error)
}
    30  
// PusherAppender collects appended samples and their label sets in memory and
// hands them to a Pusher as a single write request when Commit is called.
type PusherAppender struct {
	// Write counters: totalWrites is incremented on every Commit, failedWrites
	// only when the push fails with something other than a 4xx response.
	failedWrites prometheus.Counter
	totalWrites  prometheus.Counter

	// ctx and userID are combined via user.InjectOrgID when pushing. labels and
	// samples grow in lockstep (one entry each per Append call) and are reset
	// by Commit and Rollback. evaluationDelay is subtracted from the timestamp
	// of staleness markers and of ALERTS / ALERTS_FOR_STATE samples.
	ctx             context.Context
	pusher          Pusher
	labels          []labels.Labels
	samples         []logproto.LegacySample
	userID          string
	evaluationDelay time.Duration
}
    42  
    43  func (a *PusherAppender) Append(_ storage.SeriesRef, l labels.Labels, t int64, v float64) (storage.SeriesRef, error) {
    44  	a.labels = append(a.labels, l)
    45  
    46  	// Adapt staleness markers for ruler evaluation delay. As the upstream code
    47  	// is using the actual time, when there is a no longer available series.
    48  	// This then causes 'out of order' append failures once the series is
    49  	// becoming available again.
    50  	// see https://github.com/prometheus/prometheus/blob/6c56a1faaaad07317ff585bda75b99bdba0517ad/rules/manager.go#L647-L660
    51  	// Similar to staleness markers, the rule manager also appends actual time to the ALERTS and ALERTS_FOR_STATE series.
    52  	// See: https://github.com/prometheus/prometheus/blob/ae086c73cb4d6db9e8b67d5038d3704fea6aec4a/rules/alerting.go#L414-L417
    53  	metricName := l.Get(labels.MetricName)
    54  	if a.evaluationDelay > 0 && (value.IsStaleNaN(v) || metricName == "ALERTS" || metricName == "ALERTS_FOR_STATE") {
    55  		t -= a.evaluationDelay.Milliseconds()
    56  	}
    57  
    58  	a.samples = append(a.samples, logproto.LegacySample{
    59  		TimestampMs: t,
    60  		Value:       v,
    61  	})
    62  	return 0, nil
    63  }
    64  
    65  func (a *PusherAppender) AppendExemplar(_ storage.SeriesRef, _ labels.Labels, _ exemplar.Exemplar) (storage.SeriesRef, error) {
    66  	return 0, errors.New("exemplars are unsupported")
    67  }
    68  
    69  func (a *PusherAppender) Commit() error {
    70  	a.totalWrites.Inc()
    71  
    72  	// Since a.pusher is distributor, client.ReuseSlice will be called in a.pusher.Push.
    73  	// We shouldn't call client.ReuseSlice here.
    74  	_, err := a.pusher.Push(user.InjectOrgID(a.ctx, a.userID), logproto.ToWriteRequest(a.labels, a.samples, nil, logproto.RULE))
    75  	if err != nil {
    76  		// Don't report errors that ended with 4xx HTTP status code (series limits, duplicate samples, out of order, etc.)
    77  		if resp, ok := httpgrpc.HTTPResponseFromError(err); !ok || resp.Code/100 != 4 {
    78  			a.failedWrites.Inc()
    79  		}
    80  	}
    81  
    82  	a.labels = nil
    83  	a.samples = nil
    84  	return err
    85  }
    86  
    87  func (a *PusherAppender) Rollback() error {
    88  	a.labels = nil
    89  	a.samples = nil
    90  	return nil
    91  }
    92  
// PusherAppendable fulfills the storage.Appendable interface for prometheus manager.
// It holds what is needed to create a PusherAppender for a single tenant.
type PusherAppendable struct {
	// pusher and userID are handed to every appender; rulesLimits is consulted
	// for the tenant's evaluation delay each time an appender is created.
	pusher      Pusher
	userID      string
	rulesLimits RulesLimits

	// Counters shared with every appender created by Appender.
	totalWrites  prometheus.Counter
	failedWrites prometheus.Counter
}
   102  
   103  func NewPusherAppendable(pusher Pusher, userID string, limits RulesLimits, totalWrites, failedWrites prometheus.Counter) *PusherAppendable {
   104  	return &PusherAppendable{
   105  		pusher:       pusher,
   106  		userID:       userID,
   107  		rulesLimits:  limits,
   108  		totalWrites:  totalWrites,
   109  		failedWrites: failedWrites,
   110  	}
   111  }
   112  
   113  // Appender returns a storage.Appender
   114  func (t *PusherAppendable) Appender(ctx context.Context) storage.Appender {
   115  	return &PusherAppender{
   116  		failedWrites: t.failedWrites,
   117  		totalWrites:  t.totalWrites,
   118  
   119  		ctx:             ctx,
   120  		pusher:          t.pusher,
   121  		userID:          t.userID,
   122  		evaluationDelay: t.rulesLimits.EvaluationDelay(t.userID),
   123  	}
   124  }
   125  
// RulesLimits defines limits used by Ruler. Every method takes the tenant
// (user) ID the limit is looked up for.
type RulesLimits interface {
	// EvaluationDelay returns how far rule evaluation timestamps are shifted
	// into the past for the tenant.
	EvaluationDelay(userID string) time.Duration
	// The remaining limits are not consumed in this file.
	// NOTE(review): judging by their names they cap the tenant's shard size,
	// rule groups per tenant and rules per rule group — confirm against callers.
	RulerTenantShardSize(userID string) int
	RulerMaxRuleGroupsPerTenant(userID string) int
	RulerMaxRulesPerRuleGroup(userID string) int
}
   133  
   134  // EngineQueryFunc returns a new query function using the rules.EngineQueryFunc function
   135  // and passing an altered timestamp.
   136  func EngineQueryFunc(engine *promql.Engine, q storage.Queryable, overrides RulesLimits, userID string) rules.QueryFunc {
   137  	return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) {
   138  		orig := rules.EngineQueryFunc(engine, q)
   139  		// Delay the evaluation of all rules by a set interval to give a buffer
   140  		// to metric that haven't been forwarded to cortex yet.
   141  		evaluationDelay := overrides.EvaluationDelay(userID)
   142  		return orig(ctx, qs, t.Add(-evaluationDelay))
   143  	}
   144  }
   145  
   146  func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Counter) rules.QueryFunc {
   147  	return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) {
   148  		queries.Inc()
   149  		result, err := qf(ctx, qs, t)
   150  
   151  		// We only care about errors returned by underlying Queryable. Errors returned by PromQL engine are "user-errors",
   152  		// and not interesting here.
   153  		qerr := QueryableError{}
   154  		if err != nil && errors.As(err, &qerr) {
   155  			origErr := qerr.Unwrap()
   156  
   157  			// Not all errors returned by Queryable are interesting, only those that would result in 500 status code.
   158  			//
   159  			// We rely on TranslateToPromqlApiError to do its job here... it returns nil, if err is nil.
   160  			// It returns promql.ErrStorage, if error should be reported back as 500.
   161  			// Other errors it returns are either for canceled or timed-out queriers (we're not reporting those as failures),
   162  			// or various user-errors (limits, duplicate samples, etc. ... also not failures).
   163  			//
   164  			// All errors will still be counted towards "evaluation failures" metrics and logged by Prometheus Ruler,
   165  			// but we only want internal errors here.
   166  			if _, ok := TranslateToPromqlAPIError(origErr).(promql.ErrStorage); ok {
   167  				failedQueries.Inc()
   168  			}
   169  
   170  			// Return unwrapped error.
   171  			return result, origErr
   172  		}
   173  
   174  		return result, err
   175  	}
   176  }
   177  
   178  func RecordAndReportRuleQueryMetrics(qf rules.QueryFunc, queryTime prometheus.Counter, logger log.Logger) rules.QueryFunc {
   179  	if queryTime == nil {
   180  		return qf
   181  	}
   182  
   183  	return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) {
   184  		// If we've been passed a counter we want to record the wall time spent executing this request.
   185  		timer := prometheus.NewTimer(nil)
   186  		defer func() {
   187  			querySeconds := timer.ObserveDuration().Seconds()
   188  			queryTime.Add(querySeconds)
   189  
   190  			// Log ruler query stats.
   191  			logMessage := []interface{}{
   192  				"msg", "query stats",
   193  				"component", "ruler",
   194  				"cortex_ruler_query_seconds_total", querySeconds,
   195  				"query", qs,
   196  			}
   197  			level.Info(util_log.WithContext(ctx, logger)).Log(logMessage...)
   198  		}()
   199  
   200  		result, err := qf(ctx, qs, t)
   201  		return result, err
   202  	}
   203  }
   204  
// RulesManager mimics the rules.Manager API. The interface is used to simplify tests.
type RulesManager interface {
	// Run starts the rules manager. Blocks until Stop is called.
	Run()

	// Stop stops the rules manager. (Unblocks Run.)
	Stop()

	// Update updates the rules manager state.
	Update(interval time.Duration, files []string, externalLabels labels.Labels, externalURL string, ruleGroupPostProcessFunc rules.RuleGroupPostProcessFunc) error

	// RuleGroups returns the current rule groups.
	RuleGroups() []*rules.Group
}
   219  
// ManagerFactory is a function that creates new RulesManager for given user and notifier.Manager.
// It also receives the context, logger and metrics registerer for that manager.
type ManagerFactory func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager
   222  
   223  func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engine *promql.Engine, overrides RulesLimits, reg prometheus.Registerer) ManagerFactory {
   224  	totalWrites := promauto.With(reg).NewCounter(prometheus.CounterOpts{
   225  		Name: "cortex_ruler_write_requests_total",
   226  		Help: "Number of write requests to ingesters.",
   227  	})
   228  	failedWrites := promauto.With(reg).NewCounter(prometheus.CounterOpts{
   229  		Name: "cortex_ruler_write_requests_failed_total",
   230  		Help: "Number of failed write requests to ingesters.",
   231  	})
   232  
   233  	totalQueries := promauto.With(reg).NewCounter(prometheus.CounterOpts{
   234  		Name: "cortex_ruler_queries_total",
   235  		Help: "Number of queries executed by ruler.",
   236  	})
   237  	failedQueries := promauto.With(reg).NewCounter(prometheus.CounterOpts{
   238  		Name: "cortex_ruler_queries_failed_total",
   239  		Help: "Number of failed queries by ruler.",
   240  	})
   241  	var rulerQuerySeconds *prometheus.CounterVec
   242  	if cfg.EnableQueryStats {
   243  		rulerQuerySeconds = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
   244  			Name: "cortex_ruler_query_seconds_total",
   245  			Help: "Total amount of wall clock time spent processing queries by the ruler.",
   246  		}, []string{"user"})
   247  	}
   248  
   249  	// Wrap errors returned by Queryable to our wrapper, so that we can distinguish between those errors
   250  	// and errors returned by PromQL engine. Errors from Queryable can be either caused by user (limits) or internal errors.
   251  	// Errors from PromQL are always "user" errors.
   252  	q = NewErrorTranslateQueryableWithFn(q, WrapQueryableErrors)
   253  
   254  	return func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager {
   255  		var queryTime prometheus.Counter
   256  		if rulerQuerySeconds != nil {
   257  			queryTime = rulerQuerySeconds.WithLabelValues(userID)
   258  		}
   259  
   260  		return rules.NewManager(&rules.ManagerOptions{
   261  			Appendable:      NewPusherAppendable(p, userID, overrides, totalWrites, failedWrites),
   262  			Queryable:       q,
   263  			QueryFunc:       RecordAndReportRuleQueryMetrics(MetricsQueryFunc(EngineQueryFunc(engine, q, overrides, userID), totalQueries, failedQueries), queryTime, logger),
   264  			Context:         user.InjectOrgID(ctx, userID),
   265  			ExternalURL:     cfg.ExternalURL.URL,
   266  			NotifyFunc:      SendAlerts(notifier, cfg.ExternalURL.URL.String()),
   267  			Logger:          log.With(logger, "user", userID),
   268  			Registerer:      reg,
   269  			OutageTolerance: cfg.OutageTolerance,
   270  			ForGracePeriod:  cfg.ForGracePeriod,
   271  			ResendDelay:     cfg.ResendDelay,
   272  		})
   273  	}
   274  }
   275  
   276  type QueryableError struct {
   277  	err error
   278  }
   279  
   280  func (q QueryableError) Unwrap() error {
   281  	return q.err
   282  }
   283  
   284  func (q QueryableError) Error() string {
   285  	return q.err.Error()
   286  }
   287  
   288  func WrapQueryableErrors(err error) error {
   289  	if err == nil {
   290  		return err
   291  	}
   292  
   293  	return QueryableError{err: err}
   294  }