github.com/thanos-io/thanos@v0.32.5/pkg/rules/manager.go (about)

     1  // Copyright (c) The Thanos Authors.
     2  // Licensed under the Apache License 2.0.
     3  
     4  package rules
     5  
     6  import (
     7  	"context"
     8  	"fmt"
     9  	"io"
    10  	"os"
    11  	"path/filepath"
    12  	"strconv"
    13  	"strings"
    14  	"sync"
    15  	"time"
    16  
    17  	"github.com/gogo/protobuf/proto"
    18  	"github.com/pkg/errors"
    19  	"github.com/prometheus/client_golang/prometheus"
    20  	"github.com/prometheus/prometheus/model/labels"
    21  	"github.com/prometheus/prometheus/model/rulefmt"
    22  	"github.com/prometheus/prometheus/rules"
    23  	"gopkg.in/yaml.v3"
    24  
    25  	"github.com/thanos-io/thanos/pkg/errutil"
    26  	"github.com/thanos-io/thanos/pkg/extprom"
    27  	"github.com/thanos-io/thanos/pkg/rules/rulespb"
    28  	"github.com/thanos-io/thanos/pkg/store/labelpb"
    29  	"github.com/thanos-io/thanos/pkg/store/storepb"
    30  	"github.com/thanos-io/thanos/pkg/tracing"
    31  )
    32  
// tmpRuleDir is the name of the directory, joined onto the Manager's data dir,
// into which Update writes the rewritten per-strategy rule files.
const tmpRuleDir = ".tmp-rules"
    34  
// Group is a Prometheus rule group enriched with the metadata needed to build
// its proto representation.
//
// OriginalFile is reported as the group's file in the proto form, and
// PartialResponseStrategy is attached both to the group and to every active
// alert produced by its alerting rules.
type Group struct {
	*rules.Group
	OriginalFile            string
	PartialResponseStrategy storepb.PartialResponseStrategy
}
    40  
    41  func (g Group) toProto() *rulespb.RuleGroup {
    42  	ret := &rulespb.RuleGroup{
    43  		Name:                    g.Name(),
    44  		File:                    g.OriginalFile,
    45  		Interval:                g.Interval().Seconds(),
    46  		Limit:                   int64(g.Limit()),
    47  		PartialResponseStrategy: g.PartialResponseStrategy,
    48  		// UTC needed due to https://github.com/gogo/protobuf/issues/519.
    49  		LastEvaluation:            g.GetLastEvaluation().UTC(),
    50  		EvaluationDurationSeconds: g.GetEvaluationTime().Seconds(),
    51  	}
    52  
    53  	for _, r := range g.Rules() {
    54  		lastError := ""
    55  		if r.LastError() != nil {
    56  			lastError = r.LastError().Error()
    57  		}
    58  
    59  		switch rule := r.(type) {
    60  		case *rules.AlertingRule:
    61  			ret.Rules = append(ret.Rules, &rulespb.Rule{
    62  				Result: &rulespb.Rule_Alert{Alert: &rulespb.Alert{
    63  					State:                     rulespb.AlertState(rule.State()),
    64  					Name:                      rule.Name(),
    65  					Query:                     rule.Query().String(),
    66  					DurationSeconds:           rule.HoldDuration().Seconds(),
    67  					Labels:                    labelpb.ZLabelSet{Labels: labelpb.ZLabelsFromPromLabels(rule.Labels())},
    68  					Annotations:               labelpb.ZLabelSet{Labels: labelpb.ZLabelsFromPromLabels(rule.Annotations())},
    69  					Alerts:                    ActiveAlertsToProto(g.PartialResponseStrategy, rule),
    70  					Health:                    string(rule.Health()),
    71  					LastError:                 lastError,
    72  					EvaluationDurationSeconds: rule.GetEvaluationDuration().Seconds(),
    73  					// UTC needed due to https://github.com/gogo/protobuf/issues/519.
    74  					LastEvaluation: rule.GetEvaluationTimestamp().UTC(),
    75  				}}})
    76  		case *rules.RecordingRule:
    77  			ret.Rules = append(ret.Rules, &rulespb.Rule{
    78  				Result: &rulespb.Rule_Recording{Recording: &rulespb.RecordingRule{
    79  					Name:                      rule.Name(),
    80  					Query:                     rule.Query().String(),
    81  					Labels:                    labelpb.ZLabelSet{Labels: labelpb.ZLabelsFromPromLabels(rule.Labels())},
    82  					Health:                    string(rule.Health()),
    83  					LastError:                 lastError,
    84  					EvaluationDurationSeconds: rule.GetEvaluationDuration().Seconds(),
    85  					// UTC needed due to https://github.com/gogo/protobuf/issues/519.
    86  					LastEvaluation: rule.GetEvaluationTimestamp().UTC(),
    87  				}}})
    88  		default:
    89  			// We cannot do much, let's panic, API will recover.
    90  			panic(fmt.Sprintf("rule %q: unsupported type %T", r.Name(), rule))
    91  		}
    92  	}
    93  	return ret
    94  }
    95  
    96  func ActiveAlertsToProto(s storepb.PartialResponseStrategy, a *rules.AlertingRule) []*rulespb.AlertInstance {
    97  	active := a.ActiveAlerts()
    98  	ret := make([]*rulespb.AlertInstance, len(active))
    99  	for i, ruleAlert := range active {
   100  		// UTC needed due to https://github.com/gogo/protobuf/issues/519.
   101  		activeAt := ruleAlert.ActiveAt.UTC()
   102  		ret[i] = &rulespb.AlertInstance{
   103  			PartialResponseStrategy: s,
   104  			Labels:                  labelpb.ZLabelSet{Labels: labelpb.ZLabelsFromPromLabels(ruleAlert.Labels)},
   105  			Annotations:             labelpb.ZLabelSet{Labels: labelpb.ZLabelsFromPromLabels(ruleAlert.Annotations)},
   106  			State:                   rulespb.AlertState(ruleAlert.State),
   107  			ActiveAt:                &activeAt,
   108  			Value:                   strconv.FormatFloat(ruleAlert.Value, 'e', -1, 64),
   109  		}
   110  	}
   111  	return ret
   112  }
   113  
// Manager is a partial response strategy and proto compatible Manager.
// Manager also implements rulespb.Rules gRPC service.
//
// It holds one Prometheus rules.Manager per partial response strategy (mgrs).
// workDir is the directory Update wipes and refills with the rewritten
// per-strategy rule files; ruleFiles maps each rewritten file path back to the
// file name it was generated from. extLset and externalURL are handed to every
// underlying manager on Update, and extLset is also used to enrich the groups
// returned by Rules.
//
// mtx is write-locked by Update while the underlying managers are updated and
// ruleFiles is swapped, and read-locked by RuleGroups.
type Manager struct {
	workDir string
	mgrs    map[storepb.PartialResponseStrategy]*rules.Manager
	extLset labels.Labels

	mtx         sync.RWMutex
	ruleFiles   map[string]string
	externalURL string
}
   125  
   126  // NewManager creates new Manager.
   127  // QueryFunc from baseOpts will be rewritten.
   128  func NewManager(
   129  	ctx context.Context,
   130  	reg prometheus.Registerer,
   131  	dataDir string,
   132  	baseOpts rules.ManagerOptions,
   133  	queryFuncCreator func(partialResponseStrategy storepb.PartialResponseStrategy) rules.QueryFunc,
   134  	extLset labels.Labels,
   135  	externalURL string,
   136  ) *Manager {
   137  	m := &Manager{
   138  		workDir:     filepath.Join(dataDir, tmpRuleDir),
   139  		mgrs:        make(map[storepb.PartialResponseStrategy]*rules.Manager),
   140  		extLset:     extLset,
   141  		ruleFiles:   make(map[string]string),
   142  		externalURL: externalURL,
   143  	}
   144  	for _, strategy := range storepb.PartialResponseStrategy_value {
   145  		s := storepb.PartialResponseStrategy(strategy)
   146  
   147  		opts := baseOpts
   148  		opts.Registerer = extprom.WrapRegistererWith(prometheus.Labels{"strategy": strings.ToLower(s.String())}, reg)
   149  		opts.Context = ctx
   150  		opts.QueryFunc = queryFuncCreator(s)
   151  
   152  		m.mgrs[s] = rules.NewManager(&opts)
   153  	}
   154  
   155  	return m
   156  }
   157  
   158  // Run is non blocking, in opposite to TSDB manager, which is blocking.
   159  func (m *Manager) Run() {
   160  	for _, mgr := range m.mgrs {
   161  		go mgr.Run()
   162  	}
   163  }
   164  
   165  func (m *Manager) Stop() {
   166  	for _, mgr := range m.mgrs {
   167  		mgr.Stop()
   168  	}
   169  }
   170  func (m *Manager) protoRuleGroups() []*rulespb.RuleGroup {
   171  
   172  	rg := m.RuleGroups()
   173  	res := make([]*rulespb.RuleGroup, 0, len(rg))
   174  	for _, g := range rg {
   175  		res = append(res, g.toProto())
   176  	}
   177  	return res
   178  }
   179  
   180  func (m *Manager) RuleGroups() []Group {
   181  	m.mtx.RLock()
   182  	defer m.mtx.RUnlock()
   183  	var res []Group
   184  	for s, r := range m.mgrs {
   185  		for _, group := range r.RuleGroups() {
   186  			res = append(res, Group{
   187  				Group:                   group,
   188  				OriginalFile:            m.ruleFiles[group.File()],
   189  				PartialResponseStrategy: s,
   190  			})
   191  		}
   192  	}
   193  	return res
   194  }
   195  
   196  func (m *Manager) Active() []*rulespb.AlertInstance {
   197  	var res []*rulespb.AlertInstance
   198  	for s, r := range m.mgrs {
   199  		for _, r := range r.AlertingRules() {
   200  			res = append(res, ActiveAlertsToProto(s, r)...)
   201  		}
   202  	}
   203  	return res
   204  }
   205  
// configRuleAdapter is the YAML representation of a single rule group that may
// carry the extra `partial_response_strategy` field.
//
// UnmarshalYAML fills PartialResponseStrategy from that field, group with the
// typed rulefmt.RuleGroup, and nativeRuleGroup with the raw key/value form of
// the group with `partial_response_strategy` removed. MarshalYAML writes back
// nativeRuleGroup only.
type configRuleAdapter struct {
	PartialResponseStrategy *storepb.PartialResponseStrategy

	group           rulefmt.RuleGroup
	nativeRuleGroup map[string]interface{}
}
   212  
   213  func (g *configRuleAdapter) UnmarshalYAML(unmarshal func(interface{}) error) error {
   214  	rs := struct {
   215  		RuleGroup rulefmt.RuleGroup `yaml:",inline"`
   216  		Strategy  string            `yaml:"partial_response_strategy"`
   217  	}{}
   218  
   219  	if err := unmarshal(&rs); err != nil {
   220  		return err
   221  	}
   222  
   223  	g.PartialResponseStrategy = new(storepb.PartialResponseStrategy)
   224  	// Same as YAMl. Quote as JSON unmarshal expects raw JSON field.
   225  	if err := g.PartialResponseStrategy.UnmarshalJSON([]byte("\"" + rs.Strategy + "\"")); err != nil {
   226  		return err
   227  	}
   228  	g.group = rs.RuleGroup
   229  
   230  	var native map[string]interface{}
   231  	if err := unmarshal(&native); err != nil {
   232  		return errors.Wrap(err, "failed to unmarshal rulefmt.configRuleAdapter")
   233  	}
   234  	delete(native, "partial_response_strategy")
   235  
   236  	g.nativeRuleGroup = native
   237  	return nil
   238  }
   239  
   240  func (g configRuleAdapter) MarshalYAML() (interface{}, error) {
   241  	return struct {
   242  		RuleGroup map[string]interface{} `yaml:",inline"`
   243  	}{
   244  		RuleGroup: g.nativeRuleGroup,
   245  	}, nil
   246  }
   247  
   248  // TODO(bwplotka): Replace this with upstream implementation after https://github.com/prometheus/prometheus/issues/7128 is fixed.
   249  func (g configRuleAdapter) validate() (errs []error) {
   250  	set := map[string]struct{}{}
   251  	if g.group.Name == "" {
   252  		errs = append(errs, errors.New("Groupname should not be empty"))
   253  	}
   254  
   255  	if _, ok := set[g.group.Name]; ok {
   256  		errs = append(
   257  			errs,
   258  			fmt.Errorf("groupname: %q is repeated in the same file", g.group.Name),
   259  		)
   260  	}
   261  
   262  	set[g.group.Name] = struct{}{}
   263  
   264  	for i, r := range g.group.Rules {
   265  		for _, node := range r.Validate() {
   266  			var ruleName string
   267  			if r.Alert.Value != "" {
   268  				ruleName = r.Alert.Value
   269  			} else {
   270  				ruleName = r.Record.Value
   271  			}
   272  			errs = append(errs, &rulefmt.Error{
   273  				Group:    g.group.Name,
   274  				Rule:     i,
   275  				RuleName: ruleName,
   276  				Err:      node,
   277  			})
   278  		}
   279  	}
   280  
   281  	return errs
   282  }
   283  
   284  // ValidateAndCount validates all rules in the rule groups and return overal number of rules in all groups.
   285  // TODO(bwplotka): Replace this with upstream implementation after https://github.com/prometheus/prometheus/issues/7128 is fixed.
   286  func ValidateAndCount(group io.Reader) (numRules int, errs errutil.MultiError) {
   287  	var rgs configGroups
   288  	d := yaml.NewDecoder(group)
   289  	d.KnownFields(true)
   290  	if err := d.Decode(&rgs); err != nil {
   291  		errs.Add(err)
   292  		return 0, errs
   293  	}
   294  
   295  	for _, g := range rgs.Groups {
   296  		if err := g.validate(); err != nil {
   297  			for _, e := range err {
   298  				errs.Add(e)
   299  			}
   300  			return 0, errs
   301  		}
   302  	}
   303  
   304  	for _, rg := range rgs.Groups {
   305  		numRules += len(rg.group.Rules)
   306  	}
   307  	return numRules, errs
   308  }
   309  
// configGroups is the top-level YAML document of a rule file: a list of rule
// groups under the `groups` key, each decoded through configRuleAdapter.
type configGroups struct {
	Groups []configRuleAdapter `yaml:"groups"`
}
   313  
   314  // Update updates rules from given files to all managers we hold. We decide which groups should go where, based on
   315  // special field in configGroups.configRuleAdapter struct.
   316  func (m *Manager) Update(evalInterval time.Duration, files []string) error {
   317  	var (
   318  		errs            errutil.MultiError
   319  		filesByStrategy = map[storepb.PartialResponseStrategy][]string{}
   320  		ruleFiles       = map[string]string{}
   321  	)
   322  
   323  	// Initialize filesByStrategy for existing managers' strategies to make
   324  	// sure that managers are updated when they have no rules configured.
   325  	for strategy := range m.mgrs {
   326  		filesByStrategy[strategy] = make([]string, 0)
   327  	}
   328  
   329  	if err := os.RemoveAll(m.workDir); err != nil {
   330  		return errors.Wrapf(err, "remove %s", m.workDir)
   331  	}
   332  	if err := os.MkdirAll(m.workDir, os.ModePerm); err != nil {
   333  		return errors.Wrapf(err, "create %s", m.workDir)
   334  	}
   335  
   336  	for _, fn := range files {
   337  		b, err := os.ReadFile(filepath.Clean(fn))
   338  		if err != nil {
   339  			errs.Add(err)
   340  			continue
   341  		}
   342  
   343  		var rg configGroups
   344  		if err := yaml.Unmarshal(b, &rg); err != nil {
   345  			errs.Add(errors.Wrap(err, fn))
   346  			continue
   347  		}
   348  
   349  		// NOTE: This is very ugly, but we need to write those yaml into tmp dir without the partial partial response field
   350  		// which is not supported, to be able to reuse rules.Manager. The problem is that it uses yaml.UnmarshalStrict.
   351  		groupsByStrategy := map[storepb.PartialResponseStrategy][]configRuleAdapter{}
   352  		for _, rg := range rg.Groups {
   353  			groupsByStrategy[*rg.PartialResponseStrategy] = append(groupsByStrategy[*rg.PartialResponseStrategy], rg)
   354  		}
   355  		for s, rg := range groupsByStrategy {
   356  			b, err := yaml.Marshal(configGroups{Groups: rg})
   357  			if err != nil {
   358  				errs = append(errs, errors.Wrapf(err, "%s: failed to marshal rule groups", fn))
   359  				continue
   360  			}
   361  
   362  			// Use full file name appending to work dir, so we can differentiate between different dirs and same filenames(!).
   363  			// This will be also used as key for file group name.
   364  			newFn := filepath.Join(m.workDir, s.String(), fn)
   365  			if err := os.MkdirAll(filepath.Dir(newFn), os.ModePerm); err != nil {
   366  				errs.Add(errors.Wrapf(err, "create %s", filepath.Dir(newFn)))
   367  				continue
   368  			}
   369  			if err := os.WriteFile(newFn, b, os.ModePerm); err != nil {
   370  				errs.Add(errors.Wrapf(err, "write file %v", newFn))
   371  				continue
   372  			}
   373  			filesByStrategy[s] = append(filesByStrategy[s], newFn)
   374  			ruleFiles[newFn] = fn
   375  		}
   376  	}
   377  
   378  	m.mtx.Lock()
   379  	for s, fs := range filesByStrategy {
   380  		mgr, ok := m.mgrs[s]
   381  		if !ok {
   382  			errs.Add(errors.Errorf("no manager found for %v", s))
   383  			continue
   384  		}
   385  		// We add external labels in `pkg/alert.Queue`.
   386  		if err := mgr.Update(evalInterval, fs, m.extLset, m.externalURL, nil); err != nil {
   387  			// TODO(bwplotka): Prometheus logs all error details. Fix it upstream to have consistent error handling.
   388  			errs.Add(errors.Wrapf(err, "strategy %s, update rules", s))
   389  			continue
   390  		}
   391  	}
   392  	m.ruleFiles = ruleFiles
   393  	m.mtx.Unlock()
   394  
   395  	return errs.Err()
   396  }
   397  
   398  // Rules returns specified rules from manager. This is used by gRPC and locally for HTTP and UI purposes.
   399  func (m *Manager) Rules(r *rulespb.RulesRequest, s rulespb.Rules_RulesServer) (err error) {
   400  	groups := m.protoRuleGroups()
   401  
   402  	pgs := make([]*rulespb.RuleGroup, 0, len(groups))
   403  	for _, g := range groups {
   404  		// UTC needed due to https://github.com/gogo/protobuf/issues/519.
   405  		g.LastEvaluation = g.LastEvaluation.UTC()
   406  		if r.Type == rulespb.RulesRequest_ALL {
   407  			pgs = append(pgs, g)
   408  			continue
   409  		}
   410  
   411  		filtered := proto.Clone(g).(*rulespb.RuleGroup)
   412  		filtered.Rules = nil
   413  		for _, rule := range g.Rules {
   414  			if rule.GetAlert() != nil && r.Type == rulespb.RulesRequest_ALERT {
   415  				filtered.Rules = append(filtered.Rules, rule)
   416  				continue
   417  			}
   418  			if rule.GetRecording() != nil && r.Type == rulespb.RulesRequest_RECORD {
   419  				filtered.Rules = append(filtered.Rules, rule)
   420  			}
   421  		}
   422  		pgs = append(pgs, filtered)
   423  	}
   424  
   425  	enrichRulesWithExtLabels(pgs, m.extLset)
   426  
   427  	for _, pg := range pgs {
   428  		tracing.DoInSpan(s.Context(), "send_rule_group_response", func(_ context.Context) {
   429  			err = s.Send(&rulespb.RulesResponse{Result: &rulespb.RulesResponse_Group{Group: pg}})
   430  		})
   431  		if err != nil {
   432  			return err
   433  		}
   434  	}
   435  	return nil
   436  }