github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/ruler/manager.go (about) 1 package ruler 2 3 import ( 4 "context" 5 "fmt" 6 "net/http" 7 "sync" 8 9 "github.com/go-kit/log" 10 "github.com/go-kit/log/level" 11 ot "github.com/opentracing/opentracing-go" 12 "github.com/pkg/errors" 13 "github.com/prometheus/client_golang/prometheus" 14 "github.com/prometheus/client_golang/prometheus/promauto" 15 "github.com/prometheus/prometheus/config" 16 "github.com/prometheus/prometheus/notifier" 17 "github.com/prometheus/prometheus/pkg/rulefmt" 18 promRules "github.com/prometheus/prometheus/rules" 19 "github.com/weaveworks/common/user" 20 "golang.org/x/net/context/ctxhttp" 21 22 "github.com/cortexproject/cortex/pkg/ruler/rulespb" 23 ) 24 25 type DefaultMultiTenantManager struct { 26 cfg Config 27 notifierCfg *config.Config 28 managerFactory ManagerFactory 29 30 mapper *mapper 31 32 // Structs for holding per-user Prometheus rules Managers 33 // and a corresponding metrics struct 34 userManagerMtx sync.Mutex 35 userManagers map[string]RulesManager 36 userManagerMetrics *ManagerMetrics 37 38 // Per-user notifiers with separate queues. 39 notifiersMtx sync.Mutex 40 notifiers map[string]*rulerNotifier 41 42 managersTotal prometheus.Gauge 43 lastReloadSuccessful *prometheus.GaugeVec 44 lastReloadSuccessfulTimestamp *prometheus.GaugeVec 45 configUpdatesTotal *prometheus.CounterVec 46 registry prometheus.Registerer 47 logger log.Logger 48 } 49 50 func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg prometheus.Registerer, logger log.Logger) (*DefaultMultiTenantManager, error) { 51 ncfg, err := buildNotifierConfig(&cfg) 52 if err != nil { 53 return nil, err 54 } 55 56 userManagerMetrics := NewManagerMetrics() 57 if reg != nil { 58 reg.MustRegister(userManagerMetrics) 59 } 60 61 return &DefaultMultiTenantManager{ 62 cfg: cfg, 63 notifierCfg: ncfg, 64 managerFactory: managerFactory, 65 notifiers: map[string]*rulerNotifier{}, 66 mapper: newMapper(cfg.RulePath, logger), 67 userManagers: map[string]RulesManager{}, 68 userManagerMetrics: userManagerMetrics, 69 managersTotal: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ 70 Namespace: "cortex", 71 Name: "ruler_managers_total", 72 Help: "Total number of managers registered and running in the ruler", 73 }), 74 lastReloadSuccessful: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ 75 Namespace: "cortex", 76 Name: "ruler_config_last_reload_successful", 77 Help: "Boolean set to 1 whenever the last configuration reload attempt was successful.", 78 }, []string{"user"}), 79 lastReloadSuccessfulTimestamp: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ 80 Namespace: "cortex", 81 Name: "ruler_config_last_reload_successful_seconds", 82 Help: "Timestamp of the last successful configuration reload.", 83 }, []string{"user"}), 84 configUpdatesTotal: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ 85 Namespace: "cortex", 86 Name: "ruler_config_updates_total", 87 Help: "Total number of config updates triggered by a user", 88 }, []string{"user"}), 89 registry: reg, 90 logger: logger, 91 }, nil 92 } 93 94 func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGroups map[string]rulespb.RuleGroupList) { 95 // A lock is taken to ensure if this function is called concurrently, then each call 96 // returns after the call map files and check for updates 97 r.userManagerMtx.Lock() 98 defer r.userManagerMtx.Unlock() 99 100 for userID, ruleGroup := range ruleGroups { 101 r.syncRulesToManager(ctx, userID, ruleGroup) 102 } 103 104 // Check for deleted users and remove them 105 for userID, mngr := range r.userManagers { 106 if _, exists := ruleGroups[userID]; !exists { 107 go mngr.Stop() 108 delete(r.userManagers, userID) 109 110 r.mapper.cleanupUser(userID) 111 r.lastReloadSuccessful.DeleteLabelValues(userID) 112 r.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID) 113 r.configUpdatesTotal.DeleteLabelValues(userID) 114 r.userManagerMetrics.RemoveUserRegistry(userID) 115 level.Info(r.logger).Log("msg", "deleted rule manager and local rule files", "user", userID) 116 } 117 } 118 119 r.managersTotal.Set(float64(len(r.userManagers))) 120 } 121 122 // syncRulesToManager maps the rule files to disk, detects any changes and will create/update the 123 // the users Prometheus Rules Manager. 124 func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user string, groups rulespb.RuleGroupList) { 125 // Map the files to disk and return the file names to be passed to the users manager if they 126 // have been updated 127 update, files, err := r.mapper.MapRules(user, groups.Formatted()) 128 if err != nil { 129 r.lastReloadSuccessful.WithLabelValues(user).Set(0) 130 level.Error(r.logger).Log("msg", "unable to map rule files", "user", user, "err", err) 131 return 132 } 133 134 manager, exists := r.userManagers[user] 135 if !exists || update { 136 level.Debug(r.logger).Log("msg", "updating rules", "user", user) 137 r.configUpdatesTotal.WithLabelValues(user).Inc() 138 if !exists { 139 level.Debug(r.logger).Log("msg", "creating rule manager for user", "user", user) 140 manager, err = r.newManager(ctx, user) 141 if err != nil { 142 r.lastReloadSuccessful.WithLabelValues(user).Set(0) 143 level.Error(r.logger).Log("msg", "unable to create rule manager", "user", user, "err", err) 144 return 145 } 146 // manager.Run() starts running the manager and blocks until Stop() is called. 147 // Hence run it as another goroutine. 148 go manager.Run() 149 r.userManagers[user] = manager 150 } 151 err = manager.Update(r.cfg.EvaluationInterval, files, nil, r.cfg.ExternalURL.String()) 152 if err != nil { 153 r.lastReloadSuccessful.WithLabelValues(user).Set(0) 154 level.Error(r.logger).Log("msg", "unable to update rule manager", "user", user, "err", err) 155 return 156 } 157 158 r.lastReloadSuccessful.WithLabelValues(user).Set(1) 159 r.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime() 160 } 161 } 162 163 // newManager creates a prometheus rule manager wrapped with a user id 164 // configured storage, appendable, notifier, and instrumentation 165 func (r *DefaultMultiTenantManager) newManager(ctx context.Context, userID string) (RulesManager, error) { 166 notifier, err := r.getOrCreateNotifier(userID) 167 if err != nil { 168 return nil, err 169 } 170 171 // Create a new Prometheus registry and register it within 172 // our metrics struct for the provided user. 173 reg := prometheus.NewRegistry() 174 r.userManagerMetrics.AddUserRegistry(userID, reg) 175 176 return r.managerFactory(ctx, userID, notifier, r.logger, reg), nil 177 } 178 179 func (r *DefaultMultiTenantManager) getOrCreateNotifier(userID string) (*notifier.Manager, error) { 180 r.notifiersMtx.Lock() 181 defer r.notifiersMtx.Unlock() 182 183 n, ok := r.notifiers[userID] 184 if ok { 185 return n.notifier, nil 186 } 187 188 reg := prometheus.WrapRegistererWith(prometheus.Labels{"user": userID}, r.registry) 189 reg = prometheus.WrapRegistererWithPrefix("cortex_", reg) 190 n = newRulerNotifier(¬ifier.Options{ 191 QueueCapacity: r.cfg.NotificationQueueCapacity, 192 Registerer: reg, 193 Do: func(ctx context.Context, client *http.Client, req *http.Request) (*http.Response, error) { 194 // Note: The passed-in context comes from the Prometheus notifier 195 // and does *not* contain the userID. So it needs to be added to the context 196 // here before using the context to inject the userID into the HTTP request. 197 ctx = user.InjectOrgID(ctx, userID) 198 if err := user.InjectOrgIDIntoHTTPRequest(ctx, req); err != nil { 199 return nil, err 200 } 201 // Jaeger complains the passed-in context has an invalid span ID, so start a new root span 202 sp := ot.GlobalTracer().StartSpan("notify", ot.Tag{Key: "organization", Value: userID}) 203 defer sp.Finish() 204 ctx = ot.ContextWithSpan(ctx, sp) 205 _ = ot.GlobalTracer().Inject(sp.Context(), ot.HTTPHeaders, ot.HTTPHeadersCarrier(req.Header)) 206 return ctxhttp.Do(ctx, client, req) 207 }, 208 }, log.With(r.logger, "user", userID)) 209 210 n.run() 211 212 // This should never fail, unless there's a programming mistake. 213 if err := n.applyConfig(r.notifierCfg); err != nil { 214 return nil, err 215 } 216 217 r.notifiers[userID] = n 218 return n.notifier, nil 219 } 220 221 func (r *DefaultMultiTenantManager) GetRules(userID string) []*promRules.Group { 222 var groups []*promRules.Group 223 r.userManagerMtx.Lock() 224 if mngr, exists := r.userManagers[userID]; exists { 225 groups = mngr.RuleGroups() 226 } 227 r.userManagerMtx.Unlock() 228 return groups 229 } 230 231 func (r *DefaultMultiTenantManager) Stop() { 232 r.notifiersMtx.Lock() 233 for _, n := range r.notifiers { 234 n.stop() 235 } 236 r.notifiersMtx.Unlock() 237 238 level.Info(r.logger).Log("msg", "stopping user managers") 239 wg := sync.WaitGroup{} 240 r.userManagerMtx.Lock() 241 for user, manager := range r.userManagers { 242 level.Debug(r.logger).Log("msg", "shutting down user manager", "user", user) 243 wg.Add(1) 244 go func(manager RulesManager, user string) { 245 manager.Stop() 246 wg.Done() 247 level.Debug(r.logger).Log("msg", "user manager shut down", "user", user) 248 }(manager, user) 249 } 250 wg.Wait() 251 r.userManagerMtx.Unlock() 252 level.Info(r.logger).Log("msg", "all user managers stopped") 253 254 // cleanup user rules directories 255 r.mapper.cleanup() 256 } 257 258 func (*DefaultMultiTenantManager) ValidateRuleGroup(g rulefmt.RuleGroup) []error { 259 var errs []error 260 261 if g.Name == "" { 262 errs = append(errs, errors.New("invalid rules config: rule group name must not be empty")) 263 return errs 264 } 265 266 if len(g.Rules) == 0 { 267 errs = append(errs, fmt.Errorf("invalid rules config: rule group '%s' has no rules", g.Name)) 268 return errs 269 } 270 271 for i, r := range g.Rules { 272 for _, err := range r.Validate() { 273 var ruleName string 274 if r.Alert.Value != "" { 275 ruleName = r.Alert.Value 276 } else { 277 ruleName = r.Record.Value 278 } 279 errs = append(errs, &rulefmt.Error{ 280 Group: g.Name, 281 Rule: i, 282 RuleName: ruleName, 283 Err: err, 284 }) 285 } 286 } 287 288 return errs 289 }