github.com/thanos-io/thanos@v0.32.5/pkg/rules/manager.go (about) 1 // Copyright (c) The Thanos Authors. 2 // Licensed under the Apache License 2.0. 3 4 package rules 5 6 import ( 7 "context" 8 "fmt" 9 "io" 10 "os" 11 "path/filepath" 12 "strconv" 13 "strings" 14 "sync" 15 "time" 16 17 "github.com/gogo/protobuf/proto" 18 "github.com/pkg/errors" 19 "github.com/prometheus/client_golang/prometheus" 20 "github.com/prometheus/prometheus/model/labels" 21 "github.com/prometheus/prometheus/model/rulefmt" 22 "github.com/prometheus/prometheus/rules" 23 "gopkg.in/yaml.v3" 24 25 "github.com/thanos-io/thanos/pkg/errutil" 26 "github.com/thanos-io/thanos/pkg/extprom" 27 "github.com/thanos-io/thanos/pkg/rules/rulespb" 28 "github.com/thanos-io/thanos/pkg/store/labelpb" 29 "github.com/thanos-io/thanos/pkg/store/storepb" 30 "github.com/thanos-io/thanos/pkg/tracing" 31 ) 32 33 const tmpRuleDir = ".tmp-rules" 34 35 type Group struct { 36 *rules.Group 37 OriginalFile string 38 PartialResponseStrategy storepb.PartialResponseStrategy 39 } 40 41 func (g Group) toProto() *rulespb.RuleGroup { 42 ret := &rulespb.RuleGroup{ 43 Name: g.Name(), 44 File: g.OriginalFile, 45 Interval: g.Interval().Seconds(), 46 Limit: int64(g.Limit()), 47 PartialResponseStrategy: g.PartialResponseStrategy, 48 // UTC needed due to https://github.com/gogo/protobuf/issues/519. 49 LastEvaluation: g.GetLastEvaluation().UTC(), 50 EvaluationDurationSeconds: g.GetEvaluationTime().Seconds(), 51 } 52 53 for _, r := range g.Rules() { 54 lastError := "" 55 if r.LastError() != nil { 56 lastError = r.LastError().Error() 57 } 58 59 switch rule := r.(type) { 60 case *rules.AlertingRule: 61 ret.Rules = append(ret.Rules, &rulespb.Rule{ 62 Result: &rulespb.Rule_Alert{Alert: &rulespb.Alert{ 63 State: rulespb.AlertState(rule.State()), 64 Name: rule.Name(), 65 Query: rule.Query().String(), 66 DurationSeconds: rule.HoldDuration().Seconds(), 67 Labels: labelpb.ZLabelSet{Labels: labelpb.ZLabelsFromPromLabels(rule.Labels())}, 68 Annotations: labelpb.ZLabelSet{Labels: labelpb.ZLabelsFromPromLabels(rule.Annotations())}, 69 Alerts: ActiveAlertsToProto(g.PartialResponseStrategy, rule), 70 Health: string(rule.Health()), 71 LastError: lastError, 72 EvaluationDurationSeconds: rule.GetEvaluationDuration().Seconds(), 73 // UTC needed due to https://github.com/gogo/protobuf/issues/519. 74 LastEvaluation: rule.GetEvaluationTimestamp().UTC(), 75 }}}) 76 case *rules.RecordingRule: 77 ret.Rules = append(ret.Rules, &rulespb.Rule{ 78 Result: &rulespb.Rule_Recording{Recording: &rulespb.RecordingRule{ 79 Name: rule.Name(), 80 Query: rule.Query().String(), 81 Labels: labelpb.ZLabelSet{Labels: labelpb.ZLabelsFromPromLabels(rule.Labels())}, 82 Health: string(rule.Health()), 83 LastError: lastError, 84 EvaluationDurationSeconds: rule.GetEvaluationDuration().Seconds(), 85 // UTC needed due to https://github.com/gogo/protobuf/issues/519. 86 LastEvaluation: rule.GetEvaluationTimestamp().UTC(), 87 }}}) 88 default: 89 // We cannot do much, let's panic, API will recover. 90 panic(fmt.Sprintf("rule %q: unsupported type %T", r.Name(), rule)) 91 } 92 } 93 return ret 94 } 95 96 func ActiveAlertsToProto(s storepb.PartialResponseStrategy, a *rules.AlertingRule) []*rulespb.AlertInstance { 97 active := a.ActiveAlerts() 98 ret := make([]*rulespb.AlertInstance, len(active)) 99 for i, ruleAlert := range active { 100 // UTC needed due to https://github.com/gogo/protobuf/issues/519. 101 activeAt := ruleAlert.ActiveAt.UTC() 102 ret[i] = &rulespb.AlertInstance{ 103 PartialResponseStrategy: s, 104 Labels: labelpb.ZLabelSet{Labels: labelpb.ZLabelsFromPromLabels(ruleAlert.Labels)}, 105 Annotations: labelpb.ZLabelSet{Labels: labelpb.ZLabelsFromPromLabels(ruleAlert.Annotations)}, 106 State: rulespb.AlertState(ruleAlert.State), 107 ActiveAt: &activeAt, 108 Value: strconv.FormatFloat(ruleAlert.Value, 'e', -1, 64), 109 } 110 } 111 return ret 112 } 113 114 // Manager is a partial response strategy and proto compatible Manager. 115 // Manager also implements rulespb.Rules gRPC service. 116 type Manager struct { 117 workDir string 118 mgrs map[storepb.PartialResponseStrategy]*rules.Manager 119 extLset labels.Labels 120 121 mtx sync.RWMutex 122 ruleFiles map[string]string 123 externalURL string 124 } 125 126 // NewManager creates new Manager. 127 // QueryFunc from baseOpts will be rewritten. 128 func NewManager( 129 ctx context.Context, 130 reg prometheus.Registerer, 131 dataDir string, 132 baseOpts rules.ManagerOptions, 133 queryFuncCreator func(partialResponseStrategy storepb.PartialResponseStrategy) rules.QueryFunc, 134 extLset labels.Labels, 135 externalURL string, 136 ) *Manager { 137 m := &Manager{ 138 workDir: filepath.Join(dataDir, tmpRuleDir), 139 mgrs: make(map[storepb.PartialResponseStrategy]*rules.Manager), 140 extLset: extLset, 141 ruleFiles: make(map[string]string), 142 externalURL: externalURL, 143 } 144 for _, strategy := range storepb.PartialResponseStrategy_value { 145 s := storepb.PartialResponseStrategy(strategy) 146 147 opts := baseOpts 148 opts.Registerer = extprom.WrapRegistererWith(prometheus.Labels{"strategy": strings.ToLower(s.String())}, reg) 149 opts.Context = ctx 150 opts.QueryFunc = queryFuncCreator(s) 151 152 m.mgrs[s] = rules.NewManager(&opts) 153 } 154 155 return m 156 } 157 158 // Run is non blocking, in opposite to TSDB manager, which is blocking. 159 func (m *Manager) Run() { 160 for _, mgr := range m.mgrs { 161 go mgr.Run() 162 } 163 } 164 165 func (m *Manager) Stop() { 166 for _, mgr := range m.mgrs { 167 mgr.Stop() 168 } 169 } 170 func (m *Manager) protoRuleGroups() []*rulespb.RuleGroup { 171 172 rg := m.RuleGroups() 173 res := make([]*rulespb.RuleGroup, 0, len(rg)) 174 for _, g := range rg { 175 res = append(res, g.toProto()) 176 } 177 return res 178 } 179 180 func (m *Manager) RuleGroups() []Group { 181 m.mtx.RLock() 182 defer m.mtx.RUnlock() 183 var res []Group 184 for s, r := range m.mgrs { 185 for _, group := range r.RuleGroups() { 186 res = append(res, Group{ 187 Group: group, 188 OriginalFile: m.ruleFiles[group.File()], 189 PartialResponseStrategy: s, 190 }) 191 } 192 } 193 return res 194 } 195 196 func (m *Manager) Active() []*rulespb.AlertInstance { 197 var res []*rulespb.AlertInstance 198 for s, r := range m.mgrs { 199 for _, r := range r.AlertingRules() { 200 res = append(res, ActiveAlertsToProto(s, r)...) 201 } 202 } 203 return res 204 } 205 206 type configRuleAdapter struct { 207 PartialResponseStrategy *storepb.PartialResponseStrategy 208 209 group rulefmt.RuleGroup 210 nativeRuleGroup map[string]interface{} 211 } 212 213 func (g *configRuleAdapter) UnmarshalYAML(unmarshal func(interface{}) error) error { 214 rs := struct { 215 RuleGroup rulefmt.RuleGroup `yaml:",inline"` 216 Strategy string `yaml:"partial_response_strategy"` 217 }{} 218 219 if err := unmarshal(&rs); err != nil { 220 return err 221 } 222 223 g.PartialResponseStrategy = new(storepb.PartialResponseStrategy) 224 // Same as YAMl. Quote as JSON unmarshal expects raw JSON field. 225 if err := g.PartialResponseStrategy.UnmarshalJSON([]byte("\"" + rs.Strategy + "\"")); err != nil { 226 return err 227 } 228 g.group = rs.RuleGroup 229 230 var native map[string]interface{} 231 if err := unmarshal(&native); err != nil { 232 return errors.Wrap(err, "failed to unmarshal rulefmt.configRuleAdapter") 233 } 234 delete(native, "partial_response_strategy") 235 236 g.nativeRuleGroup = native 237 return nil 238 } 239 240 func (g configRuleAdapter) MarshalYAML() (interface{}, error) { 241 return struct { 242 RuleGroup map[string]interface{} `yaml:",inline"` 243 }{ 244 RuleGroup: g.nativeRuleGroup, 245 }, nil 246 } 247 248 // TODO(bwplotka): Replace this with upstream implementation after https://github.com/prometheus/prometheus/issues/7128 is fixed. 249 func (g configRuleAdapter) validate() (errs []error) { 250 set := map[string]struct{}{} 251 if g.group.Name == "" { 252 errs = append(errs, errors.New("Groupname should not be empty")) 253 } 254 255 if _, ok := set[g.group.Name]; ok { 256 errs = append( 257 errs, 258 fmt.Errorf("groupname: %q is repeated in the same file", g.group.Name), 259 ) 260 } 261 262 set[g.group.Name] = struct{}{} 263 264 for i, r := range g.group.Rules { 265 for _, node := range r.Validate() { 266 var ruleName string 267 if r.Alert.Value != "" { 268 ruleName = r.Alert.Value 269 } else { 270 ruleName = r.Record.Value 271 } 272 errs = append(errs, &rulefmt.Error{ 273 Group: g.group.Name, 274 Rule: i, 275 RuleName: ruleName, 276 Err: node, 277 }) 278 } 279 } 280 281 return errs 282 } 283 284 // ValidateAndCount validates all rules in the rule groups and return overal number of rules in all groups. 285 // TODO(bwplotka): Replace this with upstream implementation after https://github.com/prometheus/prometheus/issues/7128 is fixed. 286 func ValidateAndCount(group io.Reader) (numRules int, errs errutil.MultiError) { 287 var rgs configGroups 288 d := yaml.NewDecoder(group) 289 d.KnownFields(true) 290 if err := d.Decode(&rgs); err != nil { 291 errs.Add(err) 292 return 0, errs 293 } 294 295 for _, g := range rgs.Groups { 296 if err := g.validate(); err != nil { 297 for _, e := range err { 298 errs.Add(e) 299 } 300 return 0, errs 301 } 302 } 303 304 for _, rg := range rgs.Groups { 305 numRules += len(rg.group.Rules) 306 } 307 return numRules, errs 308 } 309 310 type configGroups struct { 311 Groups []configRuleAdapter `yaml:"groups"` 312 } 313 314 // Update updates rules from given files to all managers we hold. We decide which groups should go where, based on 315 // special field in configGroups.configRuleAdapter struct. 316 func (m *Manager) Update(evalInterval time.Duration, files []string) error { 317 var ( 318 errs errutil.MultiError 319 filesByStrategy = map[storepb.PartialResponseStrategy][]string{} 320 ruleFiles = map[string]string{} 321 ) 322 323 // Initialize filesByStrategy for existing managers' strategies to make 324 // sure that managers are updated when they have no rules configured. 325 for strategy := range m.mgrs { 326 filesByStrategy[strategy] = make([]string, 0) 327 } 328 329 if err := os.RemoveAll(m.workDir); err != nil { 330 return errors.Wrapf(err, "remove %s", m.workDir) 331 } 332 if err := os.MkdirAll(m.workDir, os.ModePerm); err != nil { 333 return errors.Wrapf(err, "create %s", m.workDir) 334 } 335 336 for _, fn := range files { 337 b, err := os.ReadFile(filepath.Clean(fn)) 338 if err != nil { 339 errs.Add(err) 340 continue 341 } 342 343 var rg configGroups 344 if err := yaml.Unmarshal(b, &rg); err != nil { 345 errs.Add(errors.Wrap(err, fn)) 346 continue 347 } 348 349 // NOTE: This is very ugly, but we need to write those yaml into tmp dir without the partial partial response field 350 // which is not supported, to be able to reuse rules.Manager. The problem is that it uses yaml.UnmarshalStrict. 351 groupsByStrategy := map[storepb.PartialResponseStrategy][]configRuleAdapter{} 352 for _, rg := range rg.Groups { 353 groupsByStrategy[*rg.PartialResponseStrategy] = append(groupsByStrategy[*rg.PartialResponseStrategy], rg) 354 } 355 for s, rg := range groupsByStrategy { 356 b, err := yaml.Marshal(configGroups{Groups: rg}) 357 if err != nil { 358 errs = append(errs, errors.Wrapf(err, "%s: failed to marshal rule groups", fn)) 359 continue 360 } 361 362 // Use full file name appending to work dir, so we can differentiate between different dirs and same filenames(!). 363 // This will be also used as key for file group name. 364 newFn := filepath.Join(m.workDir, s.String(), fn) 365 if err := os.MkdirAll(filepath.Dir(newFn), os.ModePerm); err != nil { 366 errs.Add(errors.Wrapf(err, "create %s", filepath.Dir(newFn))) 367 continue 368 } 369 if err := os.WriteFile(newFn, b, os.ModePerm); err != nil { 370 errs.Add(errors.Wrapf(err, "write file %v", newFn)) 371 continue 372 } 373 filesByStrategy[s] = append(filesByStrategy[s], newFn) 374 ruleFiles[newFn] = fn 375 } 376 } 377 378 m.mtx.Lock() 379 for s, fs := range filesByStrategy { 380 mgr, ok := m.mgrs[s] 381 if !ok { 382 errs.Add(errors.Errorf("no manager found for %v", s)) 383 continue 384 } 385 // We add external labels in `pkg/alert.Queue`. 386 if err := mgr.Update(evalInterval, fs, m.extLset, m.externalURL, nil); err != nil { 387 // TODO(bwplotka): Prometheus logs all error details. Fix it upstream to have consistent error handling. 388 errs.Add(errors.Wrapf(err, "strategy %s, update rules", s)) 389 continue 390 } 391 } 392 m.ruleFiles = ruleFiles 393 m.mtx.Unlock() 394 395 return errs.Err() 396 } 397 398 // Rules returns specified rules from manager. This is used by gRPC and locally for HTTP and UI purposes. 399 func (m *Manager) Rules(r *rulespb.RulesRequest, s rulespb.Rules_RulesServer) (err error) { 400 groups := m.protoRuleGroups() 401 402 pgs := make([]*rulespb.RuleGroup, 0, len(groups)) 403 for _, g := range groups { 404 // UTC needed due to https://github.com/gogo/protobuf/issues/519. 405 g.LastEvaluation = g.LastEvaluation.UTC() 406 if r.Type == rulespb.RulesRequest_ALL { 407 pgs = append(pgs, g) 408 continue 409 } 410 411 filtered := proto.Clone(g).(*rulespb.RuleGroup) 412 filtered.Rules = nil 413 for _, rule := range g.Rules { 414 if rule.GetAlert() != nil && r.Type == rulespb.RulesRequest_ALERT { 415 filtered.Rules = append(filtered.Rules, rule) 416 continue 417 } 418 if rule.GetRecording() != nil && r.Type == rulespb.RulesRequest_RECORD { 419 filtered.Rules = append(filtered.Rules, rule) 420 } 421 } 422 pgs = append(pgs, filtered) 423 } 424 425 enrichRulesWithExtLabels(pgs, m.extLset) 426 427 for _, pg := range pgs { 428 tracing.DoInSpan(s.Context(), "send_rule_group_response", func(_ context.Context) { 429 err = s.Send(&rulespb.RulesResponse{Result: &rulespb.RulesResponse_Group{Group: pg}}) 430 }) 431 if err != nil { 432 return err 433 } 434 } 435 return nil 436 }