bosun.org@v0.0.0-20210513094433-e25bc3e69a1f/cmd/bosun/sched/notify.go (about)

     1  package sched
     2  
     3  import (
     4  	"time"
     5  
     6  	"bosun.org/cmd/bosun/conf"
     7  	"bosun.org/models"
     8  	"bosun.org/slog"
     9  )
    10  
    11  // dispatchNotifications triggers notification checks at 2x the the system configuration's
    12  // check frequency, when something has signaled the schedule via the nc channels, or when
    13  // a notification that was scheduled in the future due to a notification chain
    14  func (s *Schedule) dispatchNotifications() {
    15  	ticker := time.NewTicker(s.SystemConf.GetCheckFrequency() * 2)
    16  	var next <-chan time.Time
    17  	nextAt := func(t time.Time) {
    18  		diff := t.Sub(utcNow())
    19  		if diff <= 0 {
    20  			diff = time.Millisecond
    21  		}
    22  		next = time.After(diff)
    23  	}
    24  	nextAt(utcNow())
    25  	for {
    26  		select {
    27  		case <-s.runnerContext.Done():
    28  			slog.Infoln("Stopping notification dispatcher")
    29  			return
    30  		case <-next:
    31  			nextAt(s.CheckNotifications())
    32  		case <-s.nc:
    33  			nextAt(s.CheckNotifications())
    34  		case <-ticker.C:
    35  			s.sendUnknownNotifications()
    36  		}
    37  	}
    38  
    39  }
    40  
    41  type IncidentWithTemplates struct {
    42  	*models.IncidentState
    43  	*models.RenderedTemplates
    44  }
    45  
    46  // Notify puts a rendered notification in the schedule's pendingNotifications queue
    47  func (s *Schedule) Notify(st *models.IncidentState, rt *models.RenderedTemplates, n *conf.Notification) bool {
    48  	it := &IncidentWithTemplates{
    49  		IncidentState:     st,
    50  		RenderedTemplates: rt,
    51  	}
    52  	if s.pendingNotifications == nil {
    53  		s.pendingNotifications = make(map[*conf.Notification][]*IncidentWithTemplates)
    54  	}
    55  	s.pendingNotifications[n] = append(s.pendingNotifications[n], it)
    56  	return st.SetNotified(n.Name)
    57  }
    58  
    59  // CheckNotifications processes past notification events. It returns the next time a notification is needed.
    60  func (s *Schedule) CheckNotifications() time.Time {
    61  	silenced := s.Silenced()
    62  	s.Lock("CheckNotifications")
    63  	defer s.Unlock()
    64  	latestTime := utcNow()
    65  	notifications, err := s.DataAccess.Notifications().GetDueNotifications()
    66  	if err != nil {
    67  		slog.Error("Error getting notifications", err)
    68  		return utcNow().Add(time.Minute)
    69  	}
    70  	for ak, ns := range notifications {
    71  		if si := silenced(ak); si != nil {
    72  			slog.Infoln("silencing", ak)
    73  			continue
    74  		}
    75  		for name, t := range ns {
    76  			n := s.RuleConf.GetNotification(name)
    77  			if n == nil {
    78  				continue
    79  			}
    80  			//If alert is currently unevaluated because of a dependency,
    81  			//simply requeue it until the dependency resolves itself.
    82  			_, uneval := s.GetUnknownAndUnevaluatedAlertKeys(ak.Name())
    83  			unevaluated := false
    84  			for _, un := range uneval {
    85  				if un == ak {
    86  					unevaluated = true
    87  					break
    88  				}
    89  			}
    90  			if unevaluated {
    91  				// look at it again in a minute
    92  				s.QueueNotification(ak, n, t.Add(time.Minute))
    93  				continue
    94  			}
    95  			st, err := s.DataAccess.State().GetLatestIncident(ak)
    96  			if err != nil {
    97  				slog.Error(err)
    98  				continue
    99  			}
   100  			if st == nil {
   101  				continue
   102  			}
   103  			rt, err := s.DataAccess.State().GetRenderedTemplates(st.Id)
   104  			if err != nil {
   105  				slog.Error(err)
   106  				continue
   107  			}
   108  			if s.Notify(st, rt, n) {
   109  				_, err = s.DataAccess.State().UpdateIncidentState(st)
   110  				if err != nil {
   111  					slog.Error(err)
   112  					continue
   113  				}
   114  			}
   115  		}
   116  	}
   117  	s.sendNotifications(silenced)
   118  	s.pendingNotifications = nil
   119  	err = s.DataAccess.Notifications().ClearNotificationsBefore(latestTime)
   120  	if err != nil {
   121  		slog.Error("Error clearing notifications", err)
   122  		return utcNow().Add(time.Minute)
   123  	}
   124  	timeout, err := s.DataAccess.Notifications().GetNextNotificationTime()
   125  	if err != nil {
   126  		slog.Error("Error getting next notification time", err)
   127  		return utcNow().Add(time.Minute)
   128  	}
   129  	return timeout
   130  }
   131  
   132  // sendNotifications processes the schedule's pendingNotifications queue. It silences notifications,
   133  // moves unknown notifications to the unknownNotifications queue so they can be grouped, calls the notification
   134  // Notify method to trigger notification actions, and queues notifications that are in the future because they
   135  // are part of a notification chain
   136  func (s *Schedule) sendNotifications(silenced SilenceTester) {
   137  	if s.quiet {
   138  		slog.Infoln("quiet mode prevented", len(s.pendingNotifications), "notifications")
   139  		return
   140  	}
   141  	for n, states := range s.pendingNotifications {
   142  		for _, st := range states {
   143  			ak := st.AlertKey
   144  			alert := s.RuleConf.GetAlert(ak.Name())
   145  			if alert == nil {
   146  				continue
   147  			}
   148  			silenced := silenced(ak) != nil
   149  			if st.CurrentStatus == models.StUnknown {
   150  				if silenced {
   151  					slog.Infoln("silencing unknown", ak)
   152  					continue
   153  				}
   154  				gk := notificationGroupKey{notification: n, template: alert.Template}
   155  				s.pendingUnknowns[gk] = append(s.pendingUnknowns[gk], st.IncidentState)
   156  			} else if silenced {
   157  				slog.Infof("silencing %s", ak)
   158  				continue
   159  			} else if !alert.Log && (!st.Open || !st.NeedAck) {
   160  				slog.Errorf("Cannot notify acked or closed alert %s. Clearing.", ak)
   161  				if err := s.DataAccess.Notifications().ClearNotifications(ak); err != nil {
   162  					slog.Error(err)
   163  				}
   164  				continue
   165  			} else {
   166  				s.notify(st.IncidentState, st.RenderedTemplates, n)
   167  			}
   168  			if n.Next != nil {
   169  				s.QueueNotification(ak, n.Next, utcNow().Add(n.Timeout))
   170  			}
   171  		}
   172  	}
   173  
   174  }
   175  
   176  // sendUnknownNotifications processes the schedule's pendingUnknowns queue. It puts unknowns into groups
   177  // to be processed by the notification. When it is done processing the pendingUnknowns queue,
   178  // it reinitializes the queue. Will send a maximum of $Unknown_Threshold notifications. If more are needed,
   179  // the last one will be a multi-group.
   180  func (s *Schedule) sendUnknownNotifications() {
   181  	if len(s.pendingUnknowns) > 0 {
   182  		slog.Info("Batching and sending unknown notifications")
   183  		defer slog.Info("Done sending unknown notifications")
   184  	}
   185  	for gk, states := range s.pendingUnknowns {
   186  		n := gk.notification
   187  		ustates := make(States)
   188  		for _, st := range states {
   189  			ustates[st.AlertKey] = st
   190  		}
   191  		var c int
   192  		var multiUstates []*models.IncidentState
   193  
   194  		hitThreshold := false
   195  		overThresholdSets := make(map[string]models.AlertKeys)
   196  		minGroupSize := s.SystemConf.GetMinGroupSize()
   197  		if n.UnknownMinGroupSize != nil {
   198  			minGroupSize = *n.UnknownMinGroupSize
   199  		}
   200  		groupSets := ustates.GroupSets(minGroupSize)
   201  		threshold := s.SystemConf.GetUnknownThreshold()
   202  		if n.UnknownThreshold != nil {
   203  			threshold = *n.UnknownThreshold
   204  		}
   205  		for name, group := range groupSets {
   206  			c++
   207  			for _, ak := range group {
   208  				if c >= threshold && threshold > 0 {
   209  					if !hitThreshold && len(groupSets) == c {
   210  						// If the threshold is hit but only 1 email remains, just send the normal unknown
   211  						n.NotifyUnknown(gk.template, s.SystemConf, name, group, ustates[ak])
   212  						break
   213  					}
   214  					hitThreshold = true
   215  					overThresholdSets[name] = group
   216  					multiUstates = append(multiUstates, ustates[ak])
   217  				} else {
   218  					n.NotifyUnknown(gk.template, s.SystemConf, name, group, ustates[ak])
   219  				}
   220  			}
   221  		}
   222  		if len(overThresholdSets) > 0 {
   223  			n.NotifyMultipleUnknowns(gk.template, s.SystemConf, overThresholdSets, multiUstates)
   224  		}
   225  	}
   226  	s.pendingUnknowns = make(map[notificationGroupKey][]*models.IncidentState)
   227  }
   228  
   229  // notify is a wrapper for the notifications Notify method that sets the EmailSubject and EmailBody for the rendered
   230  // template. It passes properties from the schedule that the Notification's Notify method requires.
   231  func (s *Schedule) notify(st *models.IncidentState, rt *models.RenderedTemplates, n *conf.Notification) {
   232  	n.NotifyAlert(rt, s.SystemConf, string(st.AlertKey), rt.Attachments...)
   233  }
   234  
   235  // QueueNotification persists a notification to the datastore to be sent in the future. This happens when
   236  // there are notification chains or an alert is unevaluated due to a dependency.
   237  func (s *Schedule) QueueNotification(ak models.AlertKey, n *conf.Notification, time time.Time) error {
   238  	return s.DataAccess.Notifications().InsertNotification(ak, n.Name, time)
   239  }
   240  
   241  func (s *Schedule) ActionNotify(at models.ActionType, user, message string, aks []models.AlertKey) error {
   242  	groupings, err := s.groupActionNotifications(at, aks)
   243  	if err != nil {
   244  		return err
   245  	}
   246  	for groupKey, states := range groupings {
   247  		not := groupKey.notification
   248  		if not.GroupActions == false {
   249  			for _, state := range states {
   250  				not.NotifyAction(at, groupKey.template, s.SystemConf, []*models.IncidentState{state}, user, message, s.RuleConf)
   251  			}
   252  		} else {
   253  			incidents := []*models.IncidentState{}
   254  			for _, state := range states {
   255  				incidents = append(incidents, state)
   256  			}
   257  			not.NotifyAction(at, groupKey.template, s.SystemConf, incidents, user, message, s.RuleConf)
   258  		}
   259  	}
   260  	return nil
   261  }
   262  
   263  // used to group notifications together. Notification alone is not sufficient, since different alerts
   264  // can reference different templates.
   265  // TODO: This may be overly aggressive at splitting things up. We really only need to seperate them if the
   266  // specific keys referenced in the notification for action/unknown things are different between templates.
   267  type notificationGroupKey struct {
   268  	notification *conf.Notification
   269  	template     *conf.Template
   270  }
   271  
   272  // group by notification and template
   273  func (s *Schedule) groupActionNotifications(at models.ActionType, aks []models.AlertKey) (map[notificationGroupKey][]*models.IncidentState, error) {
   274  	groupings := make(map[notificationGroupKey][]*models.IncidentState)
   275  	for _, ak := range aks {
   276  		alert := s.RuleConf.GetAlert(ak.Name())
   277  		tmpl := alert.Template
   278  		status, err := s.DataAccess.State().GetLatestIncident(ak)
   279  		if err != nil {
   280  			return nil, err
   281  		}
   282  		if alert == nil || status == nil {
   283  			continue
   284  		}
   285  		// new way: incident keeps track of which notifications it has alerted.
   286  		nots := map[string]*conf.Notification{}
   287  		for _, name := range status.Notifications {
   288  			not := s.RuleConf.GetNotification(name)
   289  			if not != nil {
   290  				nots[name] = not
   291  			}
   292  		}
   293  		if len(nots) == 0 {
   294  			// legacy behavior. Infer notifications from conf:
   295  			var n *conf.Notifications
   296  			if status.WorstStatus == models.StWarning || alert.CritNotification == nil {
   297  				n = alert.WarnNotification
   298  			} else {
   299  				n = alert.CritNotification
   300  			}
   301  			if n == nil {
   302  				continue
   303  			}
   304  			nots = n.Get(s.RuleConf, ak.Group())
   305  		}
   306  		for _, not := range nots {
   307  			if !not.RunOnActionType(at) {
   308  				continue
   309  			}
   310  			key := notificationGroupKey{not, tmpl}
   311  			groupings[key] = append(groupings[key], status)
   312  		}
   313  	}
   314  	return groupings, nil
   315  }