bosun.org@v0.0.0-20210513094433-e25bc3e69a1f/cmd/bosun/sched/check.go

package sched

import (
	"fmt"
	"math"
	"sort"
	"time"

	"bosun.org/cmd/bosun/cache"
	"bosun.org/cmd/bosun/conf"
	"bosun.org/cmd/bosun/expr"
	"bosun.org/collect"
	"bosun.org/metadata"
	"bosun.org/models"
	"bosun.org/opentsdb"
	"bosun.org/slog"
	"github.com/MiniProfiler/go/miniprofiler"
)

func init() {
	metadata.AddMetricMeta(
		"bosun.alerts.current_severity", metadata.Gauge, metadata.Alert,
		"The number of open alerts by current severity.")
	metadata.AddMetricMeta(
		"bosun.alerts.last_abnormal_severity", metadata.Gauge, metadata.Alert,
		"The number of open alerts by last abnormal severity.")
	metadata.AddMetricMeta(
		"bosun.alerts.acknowledgement_status", metadata.Gauge, metadata.Alert,
		"The number of open alerts by acknowledgement status.")
	metadata.AddMetricMeta(
		"bosun.alerts.active_status", metadata.Gauge, metadata.Alert,
		"The number of open alerts by active status.")
	metadata.AddMetricMeta("alerts.acknowledgement_status_by_notification", metadata.Gauge, metadata.Alert,
		"The number of alerts by acknowledgement status and notification. Does not reflect escalation chains.")
	metadata.AddMetricMeta("alerts.oldest_unacked_by_notification", metadata.Gauge, metadata.Second,
		"How old the oldest unacknowledged notification is by notification. Does not reflect escalation chains.")
	collect.AggregateMeta("bosun.template.render", metadata.MilliSecond, "The amount of time it takes to render the specified alert template.")
}

// NewIncident creates a new IncidentState for the given alert key, with the
// start time set to the current UTC time.
func NewIncident(ak models.AlertKey) *models.IncidentState {
	s := &models.IncidentState{}
	s.Start = utcNow()
	s.AlertKey = ak
	s.Alert = ak.Name()
	s.Tags = ak.Group().Tags()
	s.Result = &models.Result{}
	return s
}

// RunHistory holds the events produced by a single check run along with the
// cache and query backends used to evaluate it.
type RunHistory struct {
	Cache    *cache.Cache
	Start    time.Time
	Backends *expr.Backends
	Events   map[models.AlertKey]*models.Event
	schedule *Schedule
}

// AtTime creates a new RunHistory starting at t with the same context and
// events as rh.
func (rh *RunHistory) AtTime(t time.Time) *RunHistory {
	n := *rh
	n.Start = t
	return &n
}

// NewRunHistory creates a RunHistory for the given start time, wired to the
// schedule's configured query backends.
func (s *Schedule) NewRunHistory(start time.Time, cache *cache.Cache) *RunHistory {
	r := &RunHistory{
		Cache:    cache,
		Start:    start,
		Events:   make(map[models.AlertKey]*models.Event),
		schedule: s,
		Backends: &expr.Backends{
			TSDBContext:       s.SystemConf.GetTSDBContext(),
			GraphiteContext:   s.SystemConf.GetGraphiteContext(),
			InfluxConfig:      s.SystemConf.GetInfluxContext(),
			ElasticHosts:      s.SystemConf.GetElasticContext(),
			AzureMonitor:      s.SystemConf.GetAzureMonitorContext(),
			PromConfig:        s.SystemConf.GetPromContext(),
			CloudWatchContext: s.SystemConf.GetCloudWatchContext(),
		},
	}
	return r
}

// RunHistory processes an event history and triggers notifications if needed.
func (s *Schedule) RunHistory(r *RunHistory) {
	checkNotify := false
	silenced := s.Silenced()
	for ak, event := range r.Events {
		shouldNotify, err := s.runHistory(r, ak, event, silenced)
		checkNotify = checkNotify || shouldNotify
		if err != nil {
			slog.Errorf("Error in runHistory for %s. %s.", ak, err)
		}
	}
	if checkNotify && s.nc != nil {
		select {
		case s.nc <- true:
		default:
		}
	}
}

// runHistory processes an event for a single alert key. It returns true if
// notifications were altered.
func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.Event, silenced SilenceTester) (checkNotify bool, err error) {
	event.Time = r.Start
	a := s.RuleConf.GetAlert(ak.Name())
	if a.UnknownsNormal && event.Status == models.StUnknown {
		event.Status = models.StNormal
	}

	data := s.DataAccess.State()
	err = data.TouchAlertKey(ak, utcNow())
	if err != nil {
		return
	}

	si := silenced(ak)

	// get existing open incident if exists
	var incident *models.IncidentState
	rt := &models.RenderedTemplates{}

	incident, err = data.GetOpenIncident(ak)
	if err != nil {
		return
	}

	defer func() {
		// save unless incident is new and closed (log alert)
		if incident != nil && (incident.Id != 0 || incident.Open) {
			_, err = data.UpdateIncidentState(incident)
			if err != nil {
				return
			}
			err = data.SetRenderedTemplates(incident.Id, rt)
		} else {
			err = data.SetUnevaluated(ak, event.Unevaluated) // if nothing to save, at least store the unevaluated state
			if err != nil {
				return
			}
		}
	}()
	if incident != nil {
		rt, err = data.GetRenderedTemplates(incident.Id)
		if err != nil {
			return
		}
		for i, action := range incident.Actions {
			if action.Type == models.ActionDelayedClose && !(action.Fullfilled || action.Cancelled) {
				if event.Status > incident.WorstStatus {
					// If the lifetime severity of the incident has increased, cancel the delayed close
					err = s.ActionByAlertKey("bosun", "cancelled delayed close due to severity increase", models.ActionCancelClose, nil, ak)
					if err != nil {
						return
					}
					incident, err = data.GetIncidentState(incident.Id)
					if err != nil {
						return
					}
					// Continue processing alert after cancelling the delayed close
					break
				}
				if action.Deadline == nil {
					err = fmt.Errorf("should not be here - cancelled close without deadline")
					return
				}
				if r.Start.Before(*action.Deadline) {
					if event.Status == models.StNormal {
						slog.Infof("closing alert %v on delayed close because the alert has returned to normal before deadline", incident.AlertKey)
						if event.Status != incident.CurrentStatus {
							incident.Events = append(incident.Events, *event)
						}
						incident.CurrentStatus = event.Status
						// Action needs to know it is normal, so update the incident that action will read
						_, err = data.UpdateIncidentState(incident)
						if err != nil {
							return
						}
						err = s.ActionByAlertKey("bosun", fmt.Sprintf("close on behalf of delayed close by %v", action.User), models.ActionClose, nil, ak)
						if err != nil {
							return
						}
						incident, err = data.GetIncidentState(incident.Id)
						if err != nil {
							return
						}
						incident.Actions[i].Fullfilled = true
						return
					}
				} else {
					// We are after Deadline
					slog.Infof("force closing alert %v on delayed close because the alert is after the deadline", incident.AlertKey)
					incident.Actions[i].Fullfilled = true
					err = s.ActionByAlertKey("bosun", fmt.Sprintf("forceclose on behalf of delayed close by %v", action.User), models.ActionForceClose, nil, ak)
					if err != nil {
						return
					}
					incident, err = data.GetIncidentState(incident.Id)
					if err != nil {
						return
					}
					return
				}
			}
		}
	}
	// If nothing is out of the ordinary we are done
	if event.Status <= models.StNormal && incident == nil {
		return
	}

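	// Past this point there is either an abnormal event or an existing open
	// incident, so update the incident state, render templates as needed, and
	// decide whether notifications must be (re)sent.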
	// if event is unevaluated, we are done also.
	if incident != nil {
		incident.Unevaluated = event.Unevaluated
	}
	if event.Unevaluated {
		return
	}

	shouldNotify := false
	newIncident := false
	if incident == nil {
		incident = NewIncident(ak)
		newIncident = true
		shouldNotify = true
	}
	// set state.Result according to event result
	if event.Status == models.StCritical {
		incident.Result = event.Crit
	} else if event.Status == models.StWarning {
		incident.Result = event.Warn
	}

	if event.Status > models.StNormal {
		incident.LastAbnormalStatus = event.Status
		incident.LastAbnormalTime = models.Epoch{Time: event.Time.UTC()}
	}
	if event.Status > incident.WorstStatus {
		incident.WorstStatus = event.Status
		shouldNotify = true
	}
	if event.Status != incident.CurrentStatus {
		incident.Events = append(incident.Events, *event)
	}
	incident.CurrentStatus = event.Status

	//run a preliminary save on new incidents to get an id
	if newIncident {
		if a.Log || silencedOrIgnored(a, event, si) {
			//a log or silenced/ignored alert will not need to be saved
		} else {
			daState := s.DataAccess.State()
			incident.Id, err = daState.UpdateIncidentState(incident)
			if err != nil {
				return
			}
			previousIds := []int64{}
			previousIds, err = daState.GetAllIncidentIdsByAlertKey(ak)
			if err != nil {
				return
			}
			for _, id := range previousIds {
				if incident.Id > id {
					incident.PreviousIds = append(incident.PreviousIds, id)
				}
			}
			sort.Slice(incident.PreviousIds, func(i, j int) bool {
				return incident.PreviousIds[i] > incident.PreviousIds[j]
			})
			_, err = daState.UpdateIncidentState(incident)
			if err != nil {
				return
			}
			if len(incident.PreviousIds) > 0 {
				err = daState.SetIncidentNext(incident.PreviousIds[0], incident.Id)
				if err != nil {
					return
				}
			}
		}
	}

	//render templates and open alert key if abnormal
	if event.Status > models.StNormal {
		rt = s.executeTemplates(incident, event, a, r)
		incident.Open = true
		if a.Log {
			incident.Open = false
		}
	}

	// On state increase, clear old notifications and notify current.
	// Do nothing if state did not change.
	notify := func(ns *conf.Notifications) {
		if a.Log {
			lastLogTime := s.lastLogTimes[ak]
			now := utcNow()
			if now.Before(lastLogTime.Add(a.MaxLogFrequency)) {
				return
			}
			s.lastLogTimes[ak] = now
		}
		nots := ns.Get(s.RuleConf, incident.AlertKey.Group())
		for _, n := range nots {
			s.Notify(incident, rt, n)
			checkNotify = true
		}
	}

	notifyCurrent := func() {
		//Auto close ignoreUnknowns for new incident.
		if silencedOrIgnored(a, event, si) || si != nil && si.Forget {
			incident.Open = false
			//auto forget
			if si != nil && si.Forget {
				slog.Infof("Auto forget enabled for %s", ak)
				err := s.ActionByAlertKey("bosun", "Auto forget was enabled", models.ActionForget, nil, ak)
				if err != nil {
					slog.Errorln(err)
				}
			}
			return
		}
		incident.NeedAck = true
		switch event.Status {
		case models.StCritical, models.StUnknown:
			notify(a.CritNotification)
		case models.StWarning:
			notify(a.WarnNotification)
		}
	}

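	// shouldNotify is true for brand new incidents and whenever the worst
	// observed severity increases; in that case the alert key's existing
	// notification state is cleared and notifications for the current
	// severity are scheduled.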
	// lock while we change notifications.
	s.Lock("RunHistory")
	if shouldNotify {
		incident.NeedAck = false
		if err = s.DataAccess.Notifications().ClearNotifications(ak); err != nil {
			s.Unlock()
			return
		}
		notifyCurrent()
	}

	// finally close an open alert with silence once it goes back to normal.
	if si := silenced(ak); si != nil && event.Status == models.StNormal {
		go func(ak models.AlertKey) {
			slog.Infof("auto close %s because was silenced", ak)
			err := s.ActionByAlertKey("bosun", "Auto close because was silenced.", models.ActionClose, nil, ak)
			if err != nil {
				slog.Errorln(err)
			}
		}(ak)
	}
	s.Unlock()
	return checkNotify, nil
}

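// silencedOrIgnored reports whether an event should be dropped because the
// alert sets ignoreUnknown and the event status is unknown. The silence
// argument is currently unused.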
func silencedOrIgnored(a *conf.Alert, event *models.Event, si *models.Silence) bool {
	if a.IgnoreUnknown && event.Status == models.StUnknown {
		return true
	}
	return false
}

func (s *Schedule) executeTemplates(st *models.IncidentState, event *models.Event, a *conf.Alert, r *RunHistory) *models.RenderedTemplates {
	if event.Status == models.StUnknown {
		return nil
	}
	rt, errs := s.ExecuteAll(r, a, st, true)
	if len(errs) > 0 {
		for _, err := range errs {
			slog.Errorf("rendering templates for %s: %s", a.Name, err)
		}
		subject, body, err := s.ExecuteBadTemplate(errs, r, a, st)
		if err != nil {
			subject = fmt.Sprintf("unable to create template error notification: %v", err)
		}
		rt.Subject = subject
		if body != "" {
			rt.Body = body
		}
	}
	st.Subject = rt.Subject
	return rt
}

// CollectStates sends various state information to bosun with collect.
func (s *Schedule) CollectStates() {
	// [AlertName][Severity]Count
	severityCounts := make(map[string]map[string]int64)
	abnormalCounts := make(map[string]map[string]int64)
	ackStatusCounts := make(map[string]map[bool]int64)
	ackByNotificationCounts := make(map[string]map[bool]int64)
	unAckOldestByNotification := make(map[string]time.Time)
	activeStatusCounts := make(map[string]map[bool]int64)
	// Initialize the counts
	for _, alert := range s.RuleConf.GetAlerts() {
		severityCounts[alert.Name] = make(map[string]int64)
		abnormalCounts[alert.Name] = make(map[string]int64)
		var i models.Status
		for i = 1; i.String() != "none"; i++ {
			severityCounts[alert.Name][i.String()] = 0
			abnormalCounts[alert.Name][i.String()] = 0
		}
		ackStatusCounts[alert.Name] = make(map[bool]int64)
		activeStatusCounts[alert.Name] = make(map[bool]int64)
		ackStatusCounts[alert.Name][false] = 0
		activeStatusCounts[alert.Name][false] = 0
		ackStatusCounts[alert.Name][true] = 0
		activeStatusCounts[alert.Name][true] = 0
	}
	for notificationName := range s.RuleConf.GetNotifications() {
		unAckOldestByNotification[notificationName] = time.Unix(1<<63-62135596801, 999999999)
		ackByNotificationCounts[notificationName] = make(map[bool]int64)
		ackByNotificationCounts[notificationName][false] = 0
		ackByNotificationCounts[notificationName][true] = 0
	}
	//TODO:
	//	for _, state := range s.status {
	//		if !state.Open {
	//			continue
	//		}
	//		name := state.AlertKey.Name()
	//		alertDef := s.Conf.Alerts[name]
	//		nots := make(map[string]bool)
	//		for name := range alertDef.WarnNotification.Get(s.Conf, state.Group) {
	//			nots[name] = true
	//		}
	//		for name := range alertDef.CritNotification.Get(s.Conf, state.Group) {
	//			nots[name] = true
	//		}
	//		incident, err := s.GetIncident(state.Last().IncidentId)
	//		if err != nil {
	//			slog.Errorln(err)
	//		}
	//		for notificationName := range nots {
	//			ackByNotificationCounts[notificationName][state.NeedAck]++
	//			if incident != nil && incident.Start.Before(unAckOldestByNotification[notificationName]) && state.NeedAck {
	//				unAckOldestByNotification[notificationName] = incident.Start
	//			}
	//		}
	//		severity := state.CurrentStatus.String()
	//		lastAbnormal := state.LastAbnormalStatus.String()
	//		severityCounts[state.Alert][severity]++
	//		abnormalCounts[state.Alert][lastAbnormal]++
	//		ackStatusCounts[state.Alert][state.NeedAck]++
	//		activeStatusCounts[state.Alert][state.IsActive()]++
	//	}
	for notification := range ackByNotificationCounts {
		ts := opentsdb.TagSet{"notification": notification}
		err := collect.Put("alerts.acknowledgement_status_by_notification",
			ts.Copy().Merge(opentsdb.TagSet{"status": "unacknowledged"}),
			ackByNotificationCounts[notification][true])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.acknowledgement_status_by_notification",
			ts.Copy().Merge(opentsdb.TagSet{"status": "acknowledged"}),
			ackByNotificationCounts[notification][false])
		if err != nil {
			slog.Errorln(err)
		}
	}
	for notification, timeStamp := range unAckOldestByNotification {
		ts := opentsdb.TagSet{"notification": notification}
		var ago time.Duration
		if !timeStamp.Equal(time.Unix(1<<63-62135596801, 999999999)) {
			ago = utcNow().Sub(timeStamp)
		}
		err := collect.Put("alerts.oldest_unacked_by_notification",
			ts,
			ago.Seconds())
		if err != nil {
			slog.Errorln(err)
		}
	}
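	// Emit the per-alert gauges: current severity, last abnormal severity,
	// acknowledgement status, and active status.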
	for alertName := range severityCounts {
		ts := opentsdb.TagSet{"alert": alertName}
		// The tagset of the alert is not included because there is no way to
		// store the string of a group in OpenTSDB in a parsable way. This is
		// because any delimiter we chose could also be part of a tag key or tag
		// value.
		for severity := range severityCounts[alertName] {
			err := collect.Put("alerts.current_severity",
				ts.Copy().Merge(opentsdb.TagSet{"severity": severity}),
				severityCounts[alertName][severity])
			if err != nil {
				slog.Errorln(err)
			}
			err = collect.Put("alerts.last_abnormal_severity",
				ts.Copy().Merge(opentsdb.TagSet{"severity": severity}),
				abnormalCounts[alertName][severity])
			if err != nil {
				slog.Errorln(err)
			}
		}
		err := collect.Put("alerts.acknowledgement_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "unacknowledged"}),
			ackStatusCounts[alertName][true])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.acknowledgement_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "acknowledged"}),
			ackStatusCounts[alertName][false])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.active_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "active"}),
			activeStatusCounts[alertName][true])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.active_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "inactive"}),
			activeStatusCounts[alertName][false])
		if err != nil {
			slog.Errorln(err)
		}
	}
}

// GetUnknownAndUnevaluatedAlertKeys returns the currently unknown and
// unevaluated alert keys for the named alert.
func (s *Schedule) GetUnknownAndUnevaluatedAlertKeys(alert string) (unknown, uneval []models.AlertKey) {
	unknown, uneval, err := s.DataAccess.State().GetUnknownAndUnevalAlertKeys(alert)
	if err != nil {
		slog.Errorf("Error getting unknown/unevaluated alert keys: %s", err)
		return nil, nil
	}
	return unknown, uneval
}

var bosunStartupTime = utcNow()

// findUnknownAlerts returns the alert keys for the named alert that have not
// been touched within the alert's unknown duration (or a default derived from
// the check frequency and run interval), excluding squelched keys.
func (s *Schedule) findUnknownAlerts(now time.Time, alert string) []models.AlertKey {
	keys := []models.AlertKey{}
	if utcNow().Sub(bosunStartupTime) < s.SystemConf.GetCheckFrequency() {
		return keys
	}
	if !s.AlertSuccessful(alert) {
		return keys
	}
	a := s.RuleConf.GetAlert(alert)
	t := a.Unknown
	if t == 0 {
		runEvery := s.SystemConf.GetDefaultRunEvery()
		if a.RunEvery != 0 {
			runEvery = a.RunEvery
		}
		t = s.SystemConf.GetCheckFrequency() * 2 * time.Duration(runEvery)
	}
	maxTouched := now.UTC().Unix() - int64(t.Seconds())
	untouched, err := s.DataAccess.State().GetUntouchedSince(alert, maxTouched)
	if err != nil {
		slog.Errorf("Error finding unknown alerts for alert %s: %s.", alert, err)
		return keys
	}
	for _, ak := range untouched {
		if a.Squelch.Squelched(ak.Group()) {
			continue
		}
		keys = append(keys, ak)
	}
	return keys
}

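// CheckAlert evaluates the dependency, critical, and warning expressions for a
// single alert against r and records the resulting events in r.Events. It
// returns true if the check was cancelled because the schedule's runner
// context was closed before evaluation finished.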
func (s *Schedule) CheckAlert(T miniprofiler.Timer, r *RunHistory, a *conf.Alert) (cancelled bool) {
	slog.Infof("check alert %v start with now set to %v", a.Name, r.Start.Format("2006-01-02 15:04:05.999999999"))
	start := utcNow()
	for _, ak := range s.findUnknownAlerts(r.Start, a.Name) {
		r.Events[ak] = &models.Event{Status: models.StUnknown}
	}
	var warns, crits models.AlertKeys
	type res struct {
		results *expr.Results
		error   error
	}
	// buffered channel so go func that runs executeExpr won't leak if the Check is cancelled
	// by the closing of the schedule
	rc := make(chan res, 1)
	var d *expr.Results
	var err error
	go func() {
		d, err := s.executeExpr(T, r, a, a.Depends)
		rc <- res{d, err} // this will hang forever if the channel isn't buffered since nothing will ever receive from rc
	}()
	select {
	case res := <-rc:
		d = res.results
		err = res.error
	// If the schedule closes before the expression has finished executing, we abandon the
	// execution of the expression
	case <-s.runnerContext.Done():
		return true
	}
	var deps expr.ResultSlice
	if err == nil {
		deps = filterDependencyResults(d)
		crits, err, cancelled = s.CheckExpr(T, r, a, a.Crit, models.StCritical, nil)
		if err == nil && !cancelled {
			warns, err, cancelled = s.CheckExpr(T, r, a, a.Warn, models.StWarning, crits)
		}
	}
	if cancelled {
		return true
	}
	unevalCount, unknownCount := markDependenciesUnevaluated(r.Events, deps, a.Name)
	if err != nil {
		slog.Errorf("Error checking alert %s: %s", a.Name, err.Error())
		removeUnknownEvents(r.Events, a.Name)
		s.markAlertError(a.Name, err)
	} else {
		s.markAlertSuccessful(a.Name)
	}
	collect.Put("check.duration", opentsdb.TagSet{"name": a.Name}, time.Since(start).Seconds())
	slog.Infof("check alert %v done (%s): %v crits, %v warns, %v unevaluated, %v unknown", a.Name, time.Since(start), len(crits), len(warns), unevalCount, unknownCount)
	return false
}

func removeUnknownEvents(evs map[models.AlertKey]*models.Event, alert string) {
	for k, v := range evs {
		if v.Status == models.StUnknown && k.Name() == alert {
			delete(evs, k)
		}
	}
}

func filterDependencyResults(results *expr.Results) expr.ResultSlice {
	// take the results of the dependency expression and filter them to
	// non-zero tag sets.
	filtered := expr.ResultSlice{}
	if results == nil {
		return filtered
	}
	for _, r := range results.Results {
		var n float64
		switch v := r.Value.(type) {
		case expr.Number:
			n = float64(v)
		case expr.Scalar:
			n = float64(v)
		}
		if !math.IsNaN(n) && n != 0 {
			filtered = append(filtered, r)
		}
	}
	return filtered
}

func markDependenciesUnevaluated(events map[models.AlertKey]*models.Event, deps expr.ResultSlice, alert string) (unevalCount, unknownCount int) {
	for ak, ev := range events {
		if ak.Name() != alert {
			continue
		}
		for _, dep := range deps {
			if len(dep.Group) == 0 || dep.Group.Overlaps(ak.Group()) {
				ev.Unevaluated = true
				unevalCount++
			}
			if ev.Status == models.StUnknown {
				unknownCount++
			}
		}
	}
	return unevalCount, unknownCount
}

func (s *Schedule) executeExpr(T miniprofiler.Timer, rh *RunHistory, a *conf.Alert, e *expr.Expr) (*expr.Results, error) {
	if e == nil {
		return nil, nil
	}
	providers := &expr.BosunProviders{
		Cache:     rh.Cache,
		Search:    s.Search,
		Squelched: s.RuleConf.AlertSquelched(a),
		History:   s,
		Annotate:  s.annotate,
	}
	origin := fmt.Sprintf("Schedule: Alert Name: %s", a.Name)
	results, _, err := e.Execute(rh.Backends, providers, T, rh.Start, 0, a.UnjoinedOK, origin)
	return results, err
}

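// CheckExpr evaluates a single warning or critical expression for an alert and
// creates or updates an event for every non-squelched result group, skipping
// groups listed in ignore. It returns the alert keys whose result was abnormal
// (non-zero or NaN), any evaluation error, and whether the run was cancelled
// by the schedule shutting down.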
func (s *Schedule) CheckExpr(T miniprofiler.Timer, rh *RunHistory, a *conf.Alert, e *expr.Expr, checkStatus models.Status, ignore models.AlertKeys) (alerts models.AlertKeys, err error, cancelled bool) {
	if e == nil {
		return
	}
	defer func() {
		if err == nil {
			return
		}
		collect.Add("check.errs", opentsdb.TagSet{"metric": a.Name}, 1)
		slog.Errorln(err)
	}()
	type res struct {
		results *expr.Results
		error   error
	}
	// See s.CheckAlert for an explanation of execution and cancellation with this channel
	rc := make(chan res, 1)
	var results *expr.Results
	go func() {
		results, err := s.executeExpr(T, rh, a, e)
		rc <- res{results, err}
	}()
	select {
	case res := <-rc:
		results = res.results
		err = res.error
	case <-s.runnerContext.Done():
		return nil, nil, true
	}
	if err != nil {
		return
	}
Loop:
	for _, r := range results.Results {
		if s.RuleConf.Squelched(a, r.Group) {
			continue
		}
		ak := models.NewAlertKey(a.Name, r.Group)
		for _, v := range ignore {
			if ak == v {
				continue Loop
			}
		}
		var n float64
		n, err = valueToFloat(r.Value)
		if err != nil {
			return
		}
		event := rh.Events[ak]
		if event == nil {
			event = new(models.Event)
			rh.Events[ak] = event
		}
		result := &models.Result{
			Computations: r.Computations,
			Value:        models.Float(n),
			Expr:         e.String(),
		}
		switch checkStatus {
		case models.StWarning:
			event.Warn = result
		case models.StCritical:
			event.Crit = result
		}
		status := checkStatus
		if math.IsNaN(n) {
			status = checkStatus
		} else if n == 0 {
			status = models.StNormal
		}
		if status != models.StNormal {
			alerts = append(alerts, ak)
		}
		if status > rh.Events[ak].Status {
			event.Status = status
		}
	}
	return
}

func valueToFloat(val expr.Value) (float64, error) {
	var n float64
	switch v := val.(type) {
	case expr.Number:
		n = float64(v)
	case expr.Scalar:
		n = float64(v)
	default:
		return 0, fmt.Errorf("expected number or scalar")
	}
	return n, nil
}