github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/ruler/manager.go (about)

     1  package ruler
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"net/http"
     7  	"sync"
     8  
     9  	"github.com/go-kit/log"
    10  	"github.com/go-kit/log/level"
    11  	ot "github.com/opentracing/opentracing-go"
    12  	"github.com/pkg/errors"
    13  	"github.com/prometheus/client_golang/prometheus"
    14  	"github.com/prometheus/client_golang/prometheus/promauto"
    15  	"github.com/prometheus/prometheus/config"
    16  	"github.com/prometheus/prometheus/notifier"
    17  	"github.com/prometheus/prometheus/pkg/rulefmt"
    18  	promRules "github.com/prometheus/prometheus/rules"
    19  	"github.com/weaveworks/common/user"
    20  	"golang.org/x/net/context/ctxhttp"
    21  
    22  	"github.com/cortexproject/cortex/pkg/ruler/rulespb"
    23  )
    24  
// DefaultMultiTenantManager keeps one RulesManager and one rulerNotifier per
// user, plus the metrics that describe them. The two maps are guarded by
// separate mutexes.
type DefaultMultiTenantManager struct {
	cfg            Config
	notifierCfg    *config.Config // built once from cfg by buildNotifierConfig and applied to every new notifier
	managerFactory ManagerFactory // invoked by newManager to build a user's RulesManager

	// mapper writes each user's rule groups to disk and reports whether they changed.
	mapper *mapper

	// Structs for holding per-user Prometheus rules Managers
	// and a corresponding metrics struct
	userManagerMtx     sync.Mutex // held by SyncRuleGroups, GetRules and Stop while touching userManagers
	userManagers       map[string]RulesManager
	userManagerMetrics *ManagerMetrics

	// Per-user notifiers with separate queues.
	notifiersMtx sync.Mutex // held by getOrCreateNotifier and Stop while touching notifiers
	notifiers    map[string]*rulerNotifier

	managersTotal                 prometheus.Gauge       // set to len(userManagers) at the end of every sync
	lastReloadSuccessful          *prometheus.GaugeVec   // per user: 1 after a successful reload, 0 after a failed one
	lastReloadSuccessfulTimestamp *prometheus.GaugeVec   // per user: time of the last successful reload
	configUpdatesTotal            *prometheus.CounterVec // per user: incremented whenever an update is attempted
	registry                      prometheus.Registerer  // wrapped with a user label and "cortex_" prefix for notifier metrics
	logger                        log.Logger
}
    49  
    50  func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg prometheus.Registerer, logger log.Logger) (*DefaultMultiTenantManager, error) {
    51  	ncfg, err := buildNotifierConfig(&cfg)
    52  	if err != nil {
    53  		return nil, err
    54  	}
    55  
    56  	userManagerMetrics := NewManagerMetrics()
    57  	if reg != nil {
    58  		reg.MustRegister(userManagerMetrics)
    59  	}
    60  
    61  	return &DefaultMultiTenantManager{
    62  		cfg:                cfg,
    63  		notifierCfg:        ncfg,
    64  		managerFactory:     managerFactory,
    65  		notifiers:          map[string]*rulerNotifier{},
    66  		mapper:             newMapper(cfg.RulePath, logger),
    67  		userManagers:       map[string]RulesManager{},
    68  		userManagerMetrics: userManagerMetrics,
    69  		managersTotal: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
    70  			Namespace: "cortex",
    71  			Name:      "ruler_managers_total",
    72  			Help:      "Total number of managers registered and running in the ruler",
    73  		}),
    74  		lastReloadSuccessful: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
    75  			Namespace: "cortex",
    76  			Name:      "ruler_config_last_reload_successful",
    77  			Help:      "Boolean set to 1 whenever the last configuration reload attempt was successful.",
    78  		}, []string{"user"}),
    79  		lastReloadSuccessfulTimestamp: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
    80  			Namespace: "cortex",
    81  			Name:      "ruler_config_last_reload_successful_seconds",
    82  			Help:      "Timestamp of the last successful configuration reload.",
    83  		}, []string{"user"}),
    84  		configUpdatesTotal: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
    85  			Namespace: "cortex",
    86  			Name:      "ruler_config_updates_total",
    87  			Help:      "Total number of config updates triggered by a user",
    88  		}, []string{"user"}),
    89  		registry: reg,
    90  		logger:   logger,
    91  	}, nil
    92  }
    93  
    94  func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGroups map[string]rulespb.RuleGroupList) {
    95  	// A lock is taken to ensure if this function is called concurrently, then each call
    96  	// returns after the call map files and check for updates
    97  	r.userManagerMtx.Lock()
    98  	defer r.userManagerMtx.Unlock()
    99  
   100  	for userID, ruleGroup := range ruleGroups {
   101  		r.syncRulesToManager(ctx, userID, ruleGroup)
   102  	}
   103  
   104  	// Check for deleted users and remove them
   105  	for userID, mngr := range r.userManagers {
   106  		if _, exists := ruleGroups[userID]; !exists {
   107  			go mngr.Stop()
   108  			delete(r.userManagers, userID)
   109  
   110  			r.mapper.cleanupUser(userID)
   111  			r.lastReloadSuccessful.DeleteLabelValues(userID)
   112  			r.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID)
   113  			r.configUpdatesTotal.DeleteLabelValues(userID)
   114  			r.userManagerMetrics.RemoveUserRegistry(userID)
   115  			level.Info(r.logger).Log("msg", "deleted rule manager and local rule files", "user", userID)
   116  		}
   117  	}
   118  
   119  	r.managersTotal.Set(float64(len(r.userManagers)))
   120  }
   121  
   122  // syncRulesToManager maps the rule files to disk, detects any changes and will create/update the
   123  // the users Prometheus Rules Manager.
   124  func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user string, groups rulespb.RuleGroupList) {
   125  	// Map the files to disk and return the file names to be passed to the users manager if they
   126  	// have been updated
   127  	update, files, err := r.mapper.MapRules(user, groups.Formatted())
   128  	if err != nil {
   129  		r.lastReloadSuccessful.WithLabelValues(user).Set(0)
   130  		level.Error(r.logger).Log("msg", "unable to map rule files", "user", user, "err", err)
   131  		return
   132  	}
   133  
   134  	manager, exists := r.userManagers[user]
   135  	if !exists || update {
   136  		level.Debug(r.logger).Log("msg", "updating rules", "user", user)
   137  		r.configUpdatesTotal.WithLabelValues(user).Inc()
   138  		if !exists {
   139  			level.Debug(r.logger).Log("msg", "creating rule manager for user", "user", user)
   140  			manager, err = r.newManager(ctx, user)
   141  			if err != nil {
   142  				r.lastReloadSuccessful.WithLabelValues(user).Set(0)
   143  				level.Error(r.logger).Log("msg", "unable to create rule manager", "user", user, "err", err)
   144  				return
   145  			}
   146  			// manager.Run() starts running the manager and blocks until Stop() is called.
   147  			// Hence run it as another goroutine.
   148  			go manager.Run()
   149  			r.userManagers[user] = manager
   150  		}
   151  		err = manager.Update(r.cfg.EvaluationInterval, files, nil, r.cfg.ExternalURL.String())
   152  		if err != nil {
   153  			r.lastReloadSuccessful.WithLabelValues(user).Set(0)
   154  			level.Error(r.logger).Log("msg", "unable to update rule manager", "user", user, "err", err)
   155  			return
   156  		}
   157  
   158  		r.lastReloadSuccessful.WithLabelValues(user).Set(1)
   159  		r.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime()
   160  	}
   161  }
   162  
   163  // newManager creates a prometheus rule manager wrapped with a user id
   164  // configured storage, appendable, notifier, and instrumentation
   165  func (r *DefaultMultiTenantManager) newManager(ctx context.Context, userID string) (RulesManager, error) {
   166  	notifier, err := r.getOrCreateNotifier(userID)
   167  	if err != nil {
   168  		return nil, err
   169  	}
   170  
   171  	// Create a new Prometheus registry and register it within
   172  	// our metrics struct for the provided user.
   173  	reg := prometheus.NewRegistry()
   174  	r.userManagerMetrics.AddUserRegistry(userID, reg)
   175  
   176  	return r.managerFactory(ctx, userID, notifier, r.logger, reg), nil
   177  }
   178  
   179  func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string) (*notifier.Manager, error) {
   180  	r.notifiersMtx.Lock()
   181  	defer r.notifiersMtx.Unlock()
   182  
   183  	n, ok := r.notifiers[userID]
   184  	if ok {
   185  		return n.notifier, nil
   186  	}
   187  
   188  	reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, r.registry)
   189  	reg = prometheus.WrapRegistererWithPrefix("cortex_", reg)
   190  	n = newRulerNotifier(&notifier.Options{
   191  		QueueCapacity: r.cfg.NotificationQueueCapacity,
   192  		Registerer:    reg,
   193  		Do: func(ctx context.Context, client *http.Client, req *http.Request) (*http.Response, error) {
   194  			// Note: The passed-in context comes from the Prometheus notifier
   195  			// and does *not* contain the userID. So it needs to be added to the context
   196  			// here before using the context to inject the userID into the HTTP request.
   197  			ctx = user.InjectOrgID(ctx, userID)
   198  			if err := user.InjectOrgIDIntoHTTPRequest(ctx, req); err != nil {
   199  				return nil, err
   200  			}
   201  			// Jaeger complains the passed-in context has an invalid span ID, so start a new root span
   202  			sp := ot.GlobalTracer().StartSpan("notify", ot.Tag{Key: "organization", Value: userID})
   203  			defer sp.Finish()
   204  			ctx = ot.ContextWithSpan(ctx, sp)
   205  			_ = ot.GlobalTracer().Inject(sp.Context(), ot.HTTPHeaders, ot.HTTPHeadersCarrier(req.Header))
   206  			return ctxhttp.Do(ctx, client, req)
   207  		},
   208  	}, log.With(r.logger, "user", userID))
   209  
   210  	n.run()
   211  
   212  	// This should never fail, unless there's a programming mistake.
   213  	if err := n.applyConfig(r.notifierCfg); err != nil {
   214  		return nil, err
   215  	}
   216  
   217  	r.notifiers[userID] = n
   218  	return n.notifier, nil
   219  }
   220  
   221  func (r *DefaultMultiTenantManager) GetRules(userID string) []*promRules.Group {
   222  	var groups []*promRules.Group
   223  	r.userManagerMtx.Lock()
   224  	if mngr, exists := r.userManagers[userID]; exists {
   225  		groups = mngr.RuleGroups()
   226  	}
   227  	r.userManagerMtx.Unlock()
   228  	return groups
   229  }
   230  
   231  func (r *DefaultMultiTenantManager) Stop() {
   232  	r.notifiersMtx.Lock()
   233  	for _, n := range r.notifiers {
   234  		n.stop()
   235  	}
   236  	r.notifiersMtx.Unlock()
   237  
   238  	level.Info(r.logger).Log("msg", "stopping user managers")
   239  	wg := sync.WaitGroup{}
   240  	r.userManagerMtx.Lock()
   241  	for user, manager := range r.userManagers {
   242  		level.Debug(r.logger).Log("msg", "shutting down user  manager", "user", user)
   243  		wg.Add(1)
   244  		go func(manager RulesManager, user string) {
   245  			manager.Stop()
   246  			wg.Done()
   247  			level.Debug(r.logger).Log("msg", "user manager shut down", "user", user)
   248  		}(manager, user)
   249  	}
   250  	wg.Wait()
   251  	r.userManagerMtx.Unlock()
   252  	level.Info(r.logger).Log("msg", "all user managers stopped")
   253  
   254  	// cleanup user rules directories
   255  	r.mapper.cleanup()
   256  }
   257  
   258  func (*DefaultMultiTenantManager) ValidateRuleGroup(g rulefmt.RuleGroup) []error {
   259  	var errs []error
   260  
   261  	if g.Name == "" {
   262  		errs = append(errs, errors.New("invalid rules config: rule group name must not be empty"))
   263  		return errs
   264  	}
   265  
   266  	if len(g.Rules) == 0 {
   267  		errs = append(errs, fmt.Errorf("invalid rules config: rule group '%s' has no rules", g.Name))
   268  		return errs
   269  	}
   270  
   271  	for i, r := range g.Rules {
   272  		for _, err := range r.Validate() {
   273  			var ruleName string
   274  			if r.Alert.Value != "" {
   275  				ruleName = r.Alert.Value
   276  			} else {
   277  				ruleName = r.Record.Value
   278  			}
   279  			errs = append(errs, &rulefmt.Error{
   280  				Group:    g.Name,
   281  				Rule:     i,
   282  				RuleName: ruleName,
   283  				Err:      err,
   284  			})
   285  		}
   286  	}
   287  
   288  	return errs
   289  }