bosun.org@v0.0.0-20210513094433-e25bc3e69a1f/cmd/bosun/sched/check.go

package sched

import (
	"fmt"
	"math"
	"sort"
	"time"

	"bosun.org/cmd/bosun/cache"
	"bosun.org/cmd/bosun/conf"
	"bosun.org/cmd/bosun/expr"
	"bosun.org/collect"
	"bosun.org/metadata"
	"bosun.org/models"
	"bosun.org/opentsdb"
	"bosun.org/slog"
	"github.com/MiniProfiler/go/miniprofiler"
)

func init() {
	metadata.AddMetricMeta(
		"bosun.alerts.current_severity", metadata.Gauge, metadata.Alert,
		"The number of open alerts by current severity.")
	metadata.AddMetricMeta(
		"bosun.alerts.last_abnormal_severity", metadata.Gauge, metadata.Alert,
		"The number of open alerts by last abnormal severity.")
	metadata.AddMetricMeta(
		"bosun.alerts.acknowledgement_status", metadata.Gauge, metadata.Alert,
		"The number of open alerts by acknowledgement status.")
	metadata.AddMetricMeta(
		"bosun.alerts.active_status", metadata.Gauge, metadata.Alert,
		"The number of open alerts by active status.")
	metadata.AddMetricMeta("alerts.acknowledgement_status_by_notification", metadata.Gauge, metadata.Alert,
		"The number of alerts by acknowledgement status and notification. Does not reflect escalation chains.")
	metadata.AddMetricMeta("alerts.oldest_unacked_by_notification", metadata.Gauge, metadata.Second,
		"The age of the oldest unacknowledged alert, by notification. Does not reflect escalation chains.")
	collect.AggregateMeta("bosun.template.render", metadata.MilliSecond, "The amount of time it takes to render the specified alert template.")
}

// NewIncident creates a new IncidentState for the given alert key, with
// Start set to the current UTC time.
func NewIncident(ak models.AlertKey) *models.IncidentState {
	s := &models.IncidentState{}
	s.Start = utcNow()
	s.AlertKey = ak
	s.Alert = ak.Name()
	s.Tags = ak.Group().Tags()
	s.Result = &models.Result{}
	return s
}
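
// For illustration only (added commentary, not part of the original file), a
// hypothetical use of NewIncident; models.ParseAlertKey is assumed here to
// parse "name{tags}" keys, though in this file keys arrive already parsed:
//
//	ak, err := models.ParseAlertKey("high.cpu{host=web01}")
//	if err != nil {
//		// handle the malformed key
//	}
//	inc := NewIncident(ak)
//	// inc.Alert == "high.cpu", inc.Tags holds the tag string ("host=web01"),
//	// and inc.Start is the current UTC time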

type RunHistory struct {
	Cache    *cache.Cache
	Start    time.Time
	Backends *expr.Backends
	Events   map[models.AlertKey]*models.Event
	schedule *Schedule
}

// AtTime creates a new RunHistory starting at t with the same context and
// events as rh. The copy is shallow: the returned history shares the
// underlying Events map (and backend pointers) with rh.
func (rh *RunHistory) AtTime(t time.Time) *RunHistory {
	n := *rh
	n.Start = t
	return &n
}
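
// A minimal sketch (added for clarity, not in the original file) of why the
// shallow copy matters: a history produced by AtTime observes events recorded
// on the original, and vice versa:
//
//	rh2 := rh.AtTime(rh.Start.Add(5 * time.Minute))
//	rh.Events[ak] = &models.Event{Status: models.StWarning}
//	_ = rh2.Events[ak] // same *models.Event: the map header was copied, not the map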

// NewRunHistory creates a RunHistory for the given start time, wired to the
// schedule's configured expression backends.
func (s *Schedule) NewRunHistory(start time.Time, cache *cache.Cache) *RunHistory {
	r := &RunHistory{
		Cache:    cache,
		Start:    start,
		Events:   make(map[models.AlertKey]*models.Event),
		schedule: s,
		Backends: &expr.Backends{
			TSDBContext:       s.SystemConf.GetTSDBContext(),
			GraphiteContext:   s.SystemConf.GetGraphiteContext(),
			InfluxConfig:      s.SystemConf.GetInfluxContext(),
			ElasticHosts:      s.SystemConf.GetElasticContext(),
			AzureMonitor:      s.SystemConf.GetAzureMonitorContext(),
			PromConfig:        s.SystemConf.GetPromContext(),
			CloudWatchContext: s.SystemConf.GetCloudWatchContext(),
		},
	}
	return r
}

// RunHistory processes an event history and triggers notifications if needed.
func (s *Schedule) RunHistory(r *RunHistory) {
	checkNotify := false
	silenced := s.Silenced()
	for ak, event := range r.Events {
		shouldNotify, err := s.runHistory(r, ak, event, silenced)
		checkNotify = checkNotify || shouldNotify
		if err != nil {
			slog.Errorf("Error in runHistory for %s. %s.", ak, err)
		}
	}
	if checkNotify && s.nc != nil {
		// non-blocking send: if a wake-up is already pending on the
		// notification channel, another one is unnecessary
		select {
		case s.nc <- true:
		default:
		}
	}
}

// runHistory processes an event for a single alert key. It returns true if
// notifications were altered.
func (s *Schedule) runHistory(r *RunHistory, ak models.AlertKey, event *models.Event, silenced SilenceTester) (checkNotify bool, err error) {
	event.Time = r.Start
	a := s.RuleConf.GetAlert(ak.Name())
	if a.UnknownsNormal && event.Status == models.StUnknown {
		event.Status = models.StNormal
	}

	data := s.DataAccess.State()
	err = data.TouchAlertKey(ak, utcNow())
	if err != nil {
		return
	}

	si := silenced(ak)

	// get the existing open incident, if one exists
	var incident *models.IncidentState
	rt := &models.RenderedTemplates{}

	incident, err = data.GetOpenIncident(ak)
	if err != nil {
		return
	}

	defer func() {
		// save unless the incident is new and closed (log alert)
		if incident != nil && (incident.Id != 0 || incident.Open) {
			_, err = data.UpdateIncidentState(incident)
			if err != nil {
				return
			}
			err = data.SetRenderedTemplates(incident.Id, rt)
		} else {
			err = data.SetUnevaluated(ak, event.Unevaluated) // if nothing to save, at least store the unevaluated state
			if err != nil {
				return
			}
		}
	}()
	if incident != nil {
		rt, err = data.GetRenderedTemplates(incident.Id)
		if err != nil {
			return
		}
		for i, action := range incident.Actions {
			if action.Type == models.ActionDelayedClose && !(action.Fullfilled || action.Cancelled) {
				if event.Status > incident.WorstStatus {
					// If the lifetime severity of the incident has increased, cancel the delayed close
					err = s.ActionByAlertKey("bosun", "cancelled delayed close due to severity increase", models.ActionCancelClose, nil, ak)
					if err != nil {
						return
					}
					incident, err = data.GetIncidentState(incident.Id)
					if err != nil {
						return
					}
					// Continue processing alert after cancelling the delayed close
					break
				}
				if action.Deadline == nil {
					err = fmt.Errorf("should not be here - delayed close action without a deadline")
					return
				}
				if r.Start.Before(*action.Deadline) {
					if event.Status == models.StNormal {
						slog.Infof("closing alert %v on delayed close because the alert has returned to normal before deadline", incident.AlertKey)
						if event.Status != incident.CurrentStatus {
							incident.Events = append(incident.Events, *event)
						}
						incident.CurrentStatus = event.Status
						// Action needs to know it is normal, so update the incident that action will read
						_, err = data.UpdateIncidentState(incident)
						if err != nil {
							return
						}
						err = s.ActionByAlertKey("bosun", fmt.Sprintf("close on behalf of delayed close by %v", action.User), models.ActionClose, nil, ak)
						if err != nil {
							return
						}
						incident, err = data.GetIncidentState(incident.Id)
						if err != nil {
							return
						}
						incident.Actions[i].Fullfilled = true
						return
					}
				} else {
					// We are after Deadline
					slog.Infof("force closing alert %v on delayed close because the alert is after the deadline", incident.AlertKey)
					incident.Actions[i].Fullfilled = true
					err = s.ActionByAlertKey("bosun", fmt.Sprintf("forceclose on behalf of delayed close by %v", action.User), models.ActionForceClose, nil, ak)
					if err != nil {
						return
					}
					incident, err = data.GetIncidentState(incident.Id)
					if err != nil {
						return
					}
					return
				}
			}
		}
	}
	// If nothing is out of the ordinary, we are done.
	if event.Status <= models.StNormal && incident == nil {
		return
	}

	// if the event is unevaluated, we are also done.
	if incident != nil {
		incident.Unevaluated = event.Unevaluated
	}
	if event.Unevaluated {
		return
	}

	shouldNotify := false
	newIncident := false
	if incident == nil {
		incident = NewIncident(ak)
		newIncident = true
		shouldNotify = true
	}
	// set state.Result according to event result
	if event.Status == models.StCritical {
		incident.Result = event.Crit
	} else if event.Status == models.StWarning {
		incident.Result = event.Warn
	}

	if event.Status > models.StNormal {
		incident.LastAbnormalStatus = event.Status
		incident.LastAbnormalTime = models.Epoch{Time: event.Time.UTC()}
	}
	if event.Status > incident.WorstStatus {
		incident.WorstStatus = event.Status
		shouldNotify = true
	}
	if event.Status != incident.CurrentStatus {
		incident.Events = append(incident.Events, *event)
	}
	incident.CurrentStatus = event.Status

	// run a preliminary save on new incidents to get an id
	if newIncident {
		if a.Log || silencedOrIgnored(a, event, si) {
			// a log or silenced/ignored alert does not need to be saved
		} else {
			daState := s.DataAccess.State()
			incident.Id, err = daState.UpdateIncidentState(incident)
			if err != nil {
				return
			}
			// link this incident to earlier incidents for the same alert key
			previousIds := []int64{}
			previousIds, err = daState.GetAllIncidentIdsByAlertKey(ak)
			if err != nil {
				return
			}
			for _, id := range previousIds {
				if incident.Id > id {
					incident.PreviousIds = append(incident.PreviousIds, id)
				}
			}
			sort.Slice(incident.PreviousIds, func(i, j int) bool {
				return incident.PreviousIds[i] > incident.PreviousIds[j]
			})
			_, err = daState.UpdateIncidentState(incident)
			if err != nil {
				return
			}
			if len(incident.PreviousIds) > 0 {
				err = daState.SetIncidentNext(incident.PreviousIds[0], incident.Id)
				if err != nil {
					return
				}
			}
		}
	}

	// render templates and open the alert key if abnormal; log alerts never
	// stay open
	if event.Status > models.StNormal {
		rt = s.executeTemplates(incident, event, a, r)
		incident.Open = true
		if a.Log {
			incident.Open = false
		}
	}

	// On state increase, clear old notifications and notify current.
	// Do nothing if state did not change.
	notify := func(ns *conf.Notifications) {
		if a.Log {
			// log alerts are rate limited to at most one notification per
			// MaxLogFrequency
			lastLogTime := s.lastLogTimes[ak]
			now := utcNow()
			if now.Before(lastLogTime.Add(a.MaxLogFrequency)) {
				return
			}
			s.lastLogTimes[ak] = now
		}
		nots := ns.Get(s.RuleConf, incident.AlertKey.Group())
		for _, n := range nots {
			s.Notify(incident, rt, n)
			checkNotify = true
		}
	}

	notifyCurrent := func() {
		// Auto close ignoreUnknowns for a new incident.
		if silencedOrIgnored(a, event, si) || (si != nil && si.Forget) {
			incident.Open = false
			// auto forget
			if si != nil && si.Forget {
				slog.Infof("Auto forget enabled for %s", ak)
				err := s.ActionByAlertKey("bosun", "Auto forget was enabled", models.ActionForget, nil, ak)
				if err != nil {
					slog.Errorln(err)
				}
			}
			return
		}
		incident.NeedAck = true
		switch event.Status {
		case models.StCritical, models.StUnknown:
			notify(a.CritNotification)
		case models.StWarning:
			notify(a.WarnNotification)
		}
	}

	// lock while we change notifications.
	s.Lock("RunHistory")
	if shouldNotify {
		incident.NeedAck = false
		if err = s.DataAccess.Notifications().ClearNotifications(ak); err != nil {
			return
		}
		notifyCurrent()
	}

	// finally, auto-close a silenced open alert once it goes back to normal.
	if si := silenced(ak); si != nil && event.Status == models.StNormal {
		go func(ak models.AlertKey) {
			slog.Infof("auto close %s because it was silenced", ak)
			err := s.ActionByAlertKey("bosun", "Auto close because was silenced.", models.ActionClose, nil, ak)
			if err != nil {
				slog.Errorln(err)
			}
		}(ak)
	}
	s.Unlock()
	return checkNotify, nil
}
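
// To make runHistory's severity bookkeeping concrete, an illustrative
// sequence of events for a single alert key (added commentary, not taken
// from the original source):
//
//	event status   incident state afterwards
//	warning        new incident; Worst=warning, LastAbnormal=warning; notifies
//	critical       Worst=critical, LastAbnormal=critical; notifies (severity rose)
//	warning        Worst stays critical, LastAbnormal=warning; no new notification
//	normal         Current=normal, Worst stays critical; stays open until closed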

func silencedOrIgnored(a *conf.Alert, event *models.Event, si *models.Silence) bool {
	return a.IgnoreUnknown && event.Status == models.StUnknown
}

func (s *Schedule) executeTemplates(st *models.IncidentState, event *models.Event, a *conf.Alert, r *RunHistory) *models.RenderedTemplates {
	if event.Status == models.StUnknown {
		return nil
	}
	rt, errs := s.ExecuteAll(r, a, st, true)
	if len(errs) > 0 {
		for _, err := range errs {
			slog.Errorf("rendering templates for %s: %s", a.Name, err)
		}
		subject, body, err := s.ExecuteBadTemplate(errs, r, a, st)
		if err != nil {
			subject = fmt.Sprintf("unable to create template error notification: %v", err)
		}
		rt.Subject = subject
		if body != "" {
			rt.Body = body
		}
	}
	st.Subject = rt.Subject
	return rt
}

// CollectStates sends various state information to bosun with collect.
func (s *Schedule) CollectStates() {
	// [AlertName][Severity]Count
	severityCounts := make(map[string]map[string]int64)
	abnormalCounts := make(map[string]map[string]int64)
	ackStatusCounts := make(map[string]map[bool]int64)
	ackByNotificationCounts := make(map[string]map[bool]int64)
	unAckOldestByNotification := make(map[string]time.Time)
	activeStatusCounts := make(map[string]map[bool]int64)
	// Initialize the counts
	for _, alert := range s.RuleConf.GetAlerts() {
		severityCounts[alert.Name] = make(map[string]int64)
		abnormalCounts[alert.Name] = make(map[string]int64)
		var i models.Status
		for i = 1; i.String() != "none"; i++ {
			severityCounts[alert.Name][i.String()] = 0
			abnormalCounts[alert.Name][i.String()] = 0
		}
		ackStatusCounts[alert.Name] = make(map[bool]int64)
		activeStatusCounts[alert.Name] = make(map[bool]int64)
		ackStatusCounts[alert.Name][false] = 0
		activeStatusCounts[alert.Name][false] = 0
		ackStatusCounts[alert.Name][true] = 0
		activeStatusCounts[alert.Name][true] = 0
	}
	for notificationName := range s.RuleConf.GetNotifications() {
		// time.Unix(1<<63-62135596801, 999999999) is the maximum usable
		// time.Time value; it acts as a "no unacked alerts seen" sentinel.
		unAckOldestByNotification[notificationName] = time.Unix(1<<63-62135596801, 999999999)
		ackByNotificationCounts[notificationName] = make(map[bool]int64)
		ackByNotificationCounts[notificationName][false] = 0
		ackByNotificationCounts[notificationName][true] = 0
	}
	//TODO:
	//	for _, state := range s.status {
	//		if !state.Open {
	//			continue
	//		}
	//		name := state.AlertKey.Name()
	//		alertDef := s.Conf.Alerts[name]
	//		nots := make(map[string]bool)
	//		for name := range alertDef.WarnNotification.Get(s.Conf, state.Group) {
	//			nots[name] = true
	//		}
	//		for name := range alertDef.CritNotification.Get(s.Conf, state.Group) {
	//			nots[name] = true
	//		}
	//		incident, err := s.GetIncident(state.Last().IncidentId)
	//		if err != nil {
	//			slog.Errorln(err)
	//		}
	//		for notificationName := range nots {
	//			ackByNotificationCounts[notificationName][state.NeedAck]++
	//			if incident != nil && incident.Start.Before(unAckOldestByNotification[notificationName]) && state.NeedAck {
	//				unAckOldestByNotification[notificationName] = incident.Start
	//			}
	//		}
	//		severity := state.CurrentStatus.String()
	//		lastAbnormal := state.LastAbnormalStatus.String()
	//		severityCounts[state.Alert][severity]++
	//		abnormalCounts[state.Alert][lastAbnormal]++
	//		ackStatusCounts[state.Alert][state.NeedAck]++
	//		activeStatusCounts[state.Alert][state.IsActive()]++
	//	}
	for notification := range ackByNotificationCounts {
		ts := opentsdb.TagSet{"notification": notification}
		err := collect.Put("alerts.acknowledgement_status_by_notification",
			ts.Copy().Merge(opentsdb.TagSet{"status": "unacknowledged"}),
			ackByNotificationCounts[notification][true])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.acknowledgement_status_by_notification",
			ts.Copy().Merge(opentsdb.TagSet{"status": "acknowledged"}),
			ackByNotificationCounts[notification][false])
		if err != nil {
			slog.Errorln(err)
		}
	}
	for notification, timeStamp := range unAckOldestByNotification {
		ts := opentsdb.TagSet{"notification": notification}
		var ago time.Duration
		// report a zero age if the sentinel was never replaced by a real
		// incident start time
		if !timeStamp.Equal(time.Unix(1<<63-62135596801, 999999999)) {
			ago = utcNow().Sub(timeStamp)
		}
		err := collect.Put("alerts.oldest_unacked_by_notification",
			ts,
			ago.Seconds())
		if err != nil {
			slog.Errorln(err)
		}
	}
	for alertName := range severityCounts {
		ts := opentsdb.TagSet{"alert": alertName}
		// The tagset of the alert is not included because there is no way to
		// store the string of a group in OpenTSDB in a parsable way. This is
		// because any delimiter we chose could also be part of a tag key or tag
		// value.
		for severity := range severityCounts[alertName] {
			err := collect.Put("alerts.current_severity",
				ts.Copy().Merge(opentsdb.TagSet{"severity": severity}),
				severityCounts[alertName][severity])
			if err != nil {
				slog.Errorln(err)
			}
			err = collect.Put("alerts.last_abnormal_severity",
				ts.Copy().Merge(opentsdb.TagSet{"severity": severity}),
				abnormalCounts[alertName][severity])
			if err != nil {
				slog.Errorln(err)
			}
		}
		err := collect.Put("alerts.acknowledgement_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "unacknowledged"}),
			ackStatusCounts[alertName][true])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.acknowledgement_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "acknowledged"}),
			ackStatusCounts[alertName][false])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.active_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "active"}),
			activeStatusCounts[alertName][true])
		if err != nil {
			slog.Errorln(err)
		}
		err = collect.Put("alerts.active_status",
			ts.Copy().Merge(opentsdb.TagSet{"status": "inactive"}),
			activeStatusCounts[alertName][false])
		if err != nil {
			slog.Errorln(err)
		}
	}
}

// GetUnknownAndUnevaluatedAlertKeys returns the currently unknown and
// unevaluated alert keys for the named alert.
func (s *Schedule) GetUnknownAndUnevaluatedAlertKeys(alert string) (unknown, uneval []models.AlertKey) {
	unknown, uneval, err := s.DataAccess.State().GetUnknownAndUnevalAlertKeys(alert)
	if err != nil {
		slog.Errorf("Error getting unknown/unevaluated alert keys: %s", err)
		return nil, nil
	}
	return unknown, uneval
}

var bosunStartupTime = utcNow()

func (s *Schedule) findUnknownAlerts(now time.Time, alert string) []models.AlertKey {
	keys := []models.AlertKey{}
	// give checks a chance to run after startup before flagging anything unknown
	if utcNow().Sub(bosunStartupTime) < s.SystemConf.GetCheckFrequency() {
		return keys
	}
	if !s.AlertSuccessful(alert) {
		return keys
	}
	a := s.RuleConf.GetAlert(alert)
	t := a.Unknown
	if t == 0 {
		// default the unknown threshold to twice the alert's effective check
		// interval (see the worked example after this function)
		runEvery := s.SystemConf.GetDefaultRunEvery()
		if a.RunEvery != 0 {
			runEvery = a.RunEvery
		}
		t = s.SystemConf.GetCheckFrequency() * 2 * time.Duration(runEvery)
	}
	maxTouched := now.UTC().Unix() - int64(t.Seconds())
	untouched, err := s.DataAccess.State().GetUntouchedSince(alert, maxTouched)
	if err != nil {
		slog.Errorf("Error finding unknown alerts for alert %s: %s.", alert, err)
		return keys
	}
	for _, ak := range untouched {
		if a.Squelch.Squelched(ak.Group()) {
			continue
		}
		keys = append(keys, ak)
	}
	return keys
}
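
// A worked example of the default above (illustrative numbers, not from any
// real config): with a check frequency of 5m and an alert that sets
// runEvery = 3, the unknown threshold is 5m * 2 * 3 = 30m, so an alert key
// only goes unknown after 30 minutes without being touched by a check.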

// CheckAlert runs an alert's dependency, crit, and warn expressions and
// records the resulting events in r. It reports whether the check was
// cancelled by the schedule shutting down.
func (s *Schedule) CheckAlert(T miniprofiler.Timer, r *RunHistory, a *conf.Alert) (cancelled bool) {
	slog.Infof("check alert %v start with now set to %v", a.Name, r.Start.Format("2006-01-02 15:04:05.999999999"))
	start := utcNow()
	for _, ak := range s.findUnknownAlerts(r.Start, a.Name) {
		r.Events[ak] = &models.Event{Status: models.StUnknown}
	}
	var warns, crits models.AlertKeys
	type res struct {
		results *expr.Results
		error   error
	}
	// buffered channel so the goroutine that runs executeExpr won't leak if the check is
	// cancelled by the closing of the schedule
	rc := make(chan res, 1)
	var d *expr.Results
	var err error
	go func() {
		d, err := s.executeExpr(T, r, a, a.Depends) // intentionally shadows the outer d and err; results travel over rc
		rc <- res{d, err} // this would block forever if the channel were unbuffered, since nothing receives from rc after cancellation
	}()
	select {
	case res := <-rc:
		d = res.results
		err = res.error
	// If the schedule closes before the expression has finished executing, we abandon the
	// execution of the expression
	case <-s.runnerContext.Done():
		return true
	}
	var deps expr.ResultSlice
	if err == nil {
		deps = filterDependencyResults(d)
		crits, err, cancelled = s.CheckExpr(T, r, a, a.Crit, models.StCritical, nil)
		if err == nil && !cancelled {
			warns, err, cancelled = s.CheckExpr(T, r, a, a.Warn, models.StWarning, crits)
		}
	}
	if cancelled {
		return true
	}
	unevalCount, unknownCount := markDependenciesUnevaluated(r.Events, deps, a.Name)
	if err != nil {
		slog.Errorf("Error checking alert %s: %s", a.Name, err.Error())
		removeUnknownEvents(r.Events, a.Name)
		s.markAlertError(a.Name, err)
	} else {
		s.markAlertSuccessful(a.Name)
	}
	collect.Put("check.duration", opentsdb.TagSet{"name": a.Name}, time.Since(start).Seconds())
	slog.Infof("check alert %v done (%s): %v crits, %v warns, %v unevaluated, %v unknown", a.Name, time.Since(start), len(crits), len(warns), unevalCount, unknownCount)
	return false
}
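
// The execute-with-cancellation pattern in CheckAlert (and again in CheckExpr
// below) generalizes beyond this file. A minimal sketch, assuming a
// context.Context stands in for the schedule's runnerContext:
//
//	func runCancellable(ctx context.Context, work func() (*expr.Results, error)) (*expr.Results, error, bool) {
//		type res struct {
//			results *expr.Results
//			err     error
//		}
//		rc := make(chan res, 1) // buffered: the worker can always send its result and exit
//		go func() {
//			results, err := work()
//			rc <- res{results, err}
//		}()
//		select {
//		case r := <-rc:
//			return r.results, r.err, false
//		case <-ctx.Done():
//			return nil, nil, true // abandoned; the worker still finishes, then is collected
//		}
//	}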

func removeUnknownEvents(evs map[models.AlertKey]*models.Event, alert string) {
	for k, v := range evs {
		if v.Status == models.StUnknown && k.Name() == alert {
			delete(evs, k)
		}
	}
}

func filterDependencyResults(results *expr.Results) expr.ResultSlice {
	// take the results of the dependency expression and filter them to
	// non-zero tag sets.
	filtered := expr.ResultSlice{}
	if results == nil {
		return filtered
	}
	for _, r := range results.Results {
		var n float64
		switch v := r.Value.(type) {
		case expr.Number:
			n = float64(v)
		case expr.Scalar:
			n = float64(v)
		}
		if !math.IsNaN(n) && n != 0 {
			filtered = append(filtered, r)
		}
	}
	return filtered
}

func markDependenciesUnevaluated(events map[models.AlertKey]*models.Event, deps expr.ResultSlice, alert string) (unevalCount, unknownCount int) {
	for ak, ev := range events {
		if ak.Name() != alert {
			continue
		}
		if ev.Status == models.StUnknown {
			unknownCount++
		}
		for _, dep := range deps {
			if len(dep.Group) == 0 || dep.Group.Overlaps(ak.Group()) {
				ev.Unevaluated = true
				unevalCount++
				// count each event once, even if several dependencies overlap it
				break
			}
		}
	}
	return unevalCount, unknownCount
}

func (s *Schedule) executeExpr(T miniprofiler.Timer, rh *RunHistory, a *conf.Alert, e *expr.Expr) (*expr.Results, error) {
	if e == nil {
		return nil, nil
	}
	providers := &expr.BosunProviders{
		Cache:     rh.Cache,
		Search:    s.Search,
		Squelched: s.RuleConf.AlertSquelched(a),
		History:   s,
		Annotate:  s.annotate,
	}
	origin := fmt.Sprintf("Schedule: Alert Name: %s", a.Name)
	results, _, err := e.Execute(rh.Backends, providers, T, rh.Start, 0, a.UnjoinedOK, origin)
	return results, err
}

// CheckExpr evaluates a single warn or crit expression for an alert, records
// the result on each alert key's event, and returns the keys whose status is
// abnormal. Keys listed in ignore (e.g. already-critical keys during the warn
// pass) are skipped.
func (s *Schedule) CheckExpr(T miniprofiler.Timer, rh *RunHistory, a *conf.Alert, e *expr.Expr, checkStatus models.Status, ignore models.AlertKeys) (alerts models.AlertKeys, err error, cancelled bool) {
	if e == nil {
		return
	}
	defer func() {
		if err == nil {
			return
		}
		collect.Add("check.errs", opentsdb.TagSet{"metric": a.Name}, 1)
		slog.Errorln(err)
	}()
	type res struct {
		results *expr.Results
		error   error
	}
	// See s.CheckAlert for an explanation of execution and cancellation with this channel
	rc := make(chan res, 1)
	var results *expr.Results
	go func() {
		results, err := s.executeExpr(T, rh, a, e)
		rc <- res{results, err}
	}()
	select {
	case res := <-rc:
		results = res.results
		err = res.error
	case <-s.runnerContext.Done():
		return nil, nil, true
	}
	if err != nil {
		return
	}
Loop:
	for _, r := range results.Results {
		if s.RuleConf.Squelched(a, r.Group) {
			continue
		}
		ak := models.NewAlertKey(a.Name, r.Group)
		for _, v := range ignore {
			if ak == v {
				continue Loop
			}
		}
		var n float64
		n, err = valueToFloat(r.Value)
		if err != nil {
			return
		}
		event := rh.Events[ak]
		if event == nil {
			event = new(models.Event)
			rh.Events[ak] = event
		}
		result := &models.Result{
			Computations: r.Computations,
			Value:        models.Float(n),
			Expr:         e.String(),
		}
		switch checkStatus {
		case models.StWarning:
			event.Warn = result
		case models.StCritical:
			event.Crit = result
		}
		// NaN keeps the check status (the alert fires); only a real zero maps
		// back to normal.
		status := checkStatus
		if !math.IsNaN(n) && n == 0 {
			status = models.StNormal
		}
		if status != models.StNormal {
			alerts = append(alerts, ak)
		}
		if status > rh.Events[ak].Status {
			event.Status = status
		}
	}
	return
}
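
// For reference (added commentary, not in the original source), CheckExpr
// maps each result value to a status as follows, e.g. for a crit expression:
//
//	value      status
//	0          StNormal
//	non-zero   StCritical
//	NaN        StCritical
//
// An alert key's event then keeps the most severe status seen across the
// crit and warn passes, since a lower status never overwrites a higher one.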

func valueToFloat(val expr.Value) (float64, error) {
	var n float64
	switch v := val.(type) {
	case expr.Number:
		n = float64(v)
	case expr.Scalar:
		n = float64(v)
	default:
		return 0, fmt.Errorf("expected number or scalar, got %T", val)
	}
	return n, nil
}