bosun.org@v0.0.0-20210513094433-e25bc3e69a1f/cmd/bosun/sched/notify.go (about) 1 package sched 2 3 import ( 4 "time" 5 6 "bosun.org/cmd/bosun/conf" 7 "bosun.org/models" 8 "bosun.org/slog" 9 ) 10 11 // dispatchNotifications triggers notification checks at 2x the the system configuration's 12 // check frequency, when something has signaled the schedule via the nc channels, or when 13 // a notification that was scheduled in the future due to a notification chain 14 func (s *Schedule) dispatchNotifications() { 15 ticker := time.NewTicker(s.SystemConf.GetCheckFrequency() * 2) 16 var next <-chan time.Time 17 nextAt := func(t time.Time) { 18 diff := t.Sub(utcNow()) 19 if diff <= 0 { 20 diff = time.Millisecond 21 } 22 next = time.After(diff) 23 } 24 nextAt(utcNow()) 25 for { 26 select { 27 case <-s.runnerContext.Done(): 28 slog.Infoln("Stopping notification dispatcher") 29 return 30 case <-next: 31 nextAt(s.CheckNotifications()) 32 case <-s.nc: 33 nextAt(s.CheckNotifications()) 34 case <-ticker.C: 35 s.sendUnknownNotifications() 36 } 37 } 38 39 } 40 41 type IncidentWithTemplates struct { 42 *models.IncidentState 43 *models.RenderedTemplates 44 } 45 46 // Notify puts a rendered notification in the schedule's pendingNotifications queue 47 func (s *Schedule) Notify(st *models.IncidentState, rt *models.RenderedTemplates, n *conf.Notification) bool { 48 it := &IncidentWithTemplates{ 49 IncidentState: st, 50 RenderedTemplates: rt, 51 } 52 if s.pendingNotifications == nil { 53 s.pendingNotifications = make(map[*conf.Notification][]*IncidentWithTemplates) 54 } 55 s.pendingNotifications[n] = append(s.pendingNotifications[n], it) 56 return st.SetNotified(n.Name) 57 } 58 59 // CheckNotifications processes past notification events. It returns the next time a notification is needed. 60 func (s *Schedule) CheckNotifications() time.Time { 61 silenced := s.Silenced() 62 s.Lock("CheckNotifications") 63 defer s.Unlock() 64 latestTime := utcNow() 65 notifications, err := s.DataAccess.Notifications().GetDueNotifications() 66 if err != nil { 67 slog.Error("Error getting notifications", err) 68 return utcNow().Add(time.Minute) 69 } 70 for ak, ns := range notifications { 71 if si := silenced(ak); si != nil { 72 slog.Infoln("silencing", ak) 73 continue 74 } 75 for name, t := range ns { 76 n := s.RuleConf.GetNotification(name) 77 if n == nil { 78 continue 79 } 80 //If alert is currently unevaluated because of a dependency, 81 //simply requeue it until the dependency resolves itself. 82 _, uneval := s.GetUnknownAndUnevaluatedAlertKeys(ak.Name()) 83 unevaluated := false 84 for _, un := range uneval { 85 if un == ak { 86 unevaluated = true 87 break 88 } 89 } 90 if unevaluated { 91 // look at it again in a minute 92 s.QueueNotification(ak, n, t.Add(time.Minute)) 93 continue 94 } 95 st, err := s.DataAccess.State().GetLatestIncident(ak) 96 if err != nil { 97 slog.Error(err) 98 continue 99 } 100 if st == nil { 101 continue 102 } 103 rt, err := s.DataAccess.State().GetRenderedTemplates(st.Id) 104 if err != nil { 105 slog.Error(err) 106 continue 107 } 108 if s.Notify(st, rt, n) { 109 _, err = s.DataAccess.State().UpdateIncidentState(st) 110 if err != nil { 111 slog.Error(err) 112 continue 113 } 114 } 115 } 116 } 117 s.sendNotifications(silenced) 118 s.pendingNotifications = nil 119 err = s.DataAccess.Notifications().ClearNotificationsBefore(latestTime) 120 if err != nil { 121 slog.Error("Error clearing notifications", err) 122 return utcNow().Add(time.Minute) 123 } 124 timeout, err := s.DataAccess.Notifications().GetNextNotificationTime() 125 if err != nil { 126 slog.Error("Error getting next notification time", err) 127 return utcNow().Add(time.Minute) 128 } 129 return timeout 130 } 131 132 // sendNotifications processes the schedule's pendingNotifications queue. It silences notifications, 133 // moves unknown notifications to the unknownNotifications queue so they can be grouped, calls the notification 134 // Notify method to trigger notification actions, and queues notifications that are in the future because they 135 // are part of a notification chain 136 func (s *Schedule) sendNotifications(silenced SilenceTester) { 137 if s.quiet { 138 slog.Infoln("quiet mode prevented", len(s.pendingNotifications), "notifications") 139 return 140 } 141 for n, states := range s.pendingNotifications { 142 for _, st := range states { 143 ak := st.AlertKey 144 alert := s.RuleConf.GetAlert(ak.Name()) 145 if alert == nil { 146 continue 147 } 148 silenced := silenced(ak) != nil 149 if st.CurrentStatus == models.StUnknown { 150 if silenced { 151 slog.Infoln("silencing unknown", ak) 152 continue 153 } 154 gk := notificationGroupKey{notification: n, template: alert.Template} 155 s.pendingUnknowns[gk] = append(s.pendingUnknowns[gk], st.IncidentState) 156 } else if silenced { 157 slog.Infof("silencing %s", ak) 158 continue 159 } else if !alert.Log && (!st.Open || !st.NeedAck) { 160 slog.Errorf("Cannot notify acked or closed alert %s. Clearing.", ak) 161 if err := s.DataAccess.Notifications().ClearNotifications(ak); err != nil { 162 slog.Error(err) 163 } 164 continue 165 } else { 166 s.notify(st.IncidentState, st.RenderedTemplates, n) 167 } 168 if n.Next != nil { 169 s.QueueNotification(ak, n.Next, utcNow().Add(n.Timeout)) 170 } 171 } 172 } 173 174 } 175 176 // sendUnknownNotifications processes the schedule's pendingUnknowns queue. It puts unknowns into groups 177 // to be processed by the notification. When it is done processing the pendingUnknowns queue, 178 // it reinitializes the queue. Will send a maximum of $Unknown_Threshold notifications. If more are needed, 179 // the last one will be a multi-group. 180 func (s *Schedule) sendUnknownNotifications() { 181 if len(s.pendingUnknowns) > 0 { 182 slog.Info("Batching and sending unknown notifications") 183 defer slog.Info("Done sending unknown notifications") 184 } 185 for gk, states := range s.pendingUnknowns { 186 n := gk.notification 187 ustates := make(States) 188 for _, st := range states { 189 ustates[st.AlertKey] = st 190 } 191 var c int 192 var multiUstates []*models.IncidentState 193 194 hitThreshold := false 195 overThresholdSets := make(map[string]models.AlertKeys) 196 minGroupSize := s.SystemConf.GetMinGroupSize() 197 if n.UnknownMinGroupSize != nil { 198 minGroupSize = *n.UnknownMinGroupSize 199 } 200 groupSets := ustates.GroupSets(minGroupSize) 201 threshold := s.SystemConf.GetUnknownThreshold() 202 if n.UnknownThreshold != nil { 203 threshold = *n.UnknownThreshold 204 } 205 for name, group := range groupSets { 206 c++ 207 for _, ak := range group { 208 if c >= threshold && threshold > 0 { 209 if !hitThreshold && len(groupSets) == c { 210 // If the threshold is hit but only 1 email remains, just send the normal unknown 211 n.NotifyUnknown(gk.template, s.SystemConf, name, group, ustates[ak]) 212 break 213 } 214 hitThreshold = true 215 overThresholdSets[name] = group 216 multiUstates = append(multiUstates, ustates[ak]) 217 } else { 218 n.NotifyUnknown(gk.template, s.SystemConf, name, group, ustates[ak]) 219 } 220 } 221 } 222 if len(overThresholdSets) > 0 { 223 n.NotifyMultipleUnknowns(gk.template, s.SystemConf, overThresholdSets, multiUstates) 224 } 225 } 226 s.pendingUnknowns = make(map[notificationGroupKey][]*models.IncidentState) 227 } 228 229 // notify is a wrapper for the notifications Notify method that sets the EmailSubject and EmailBody for the rendered 230 // template. It passes properties from the schedule that the Notification's Notify method requires. 231 func (s *Schedule) notify(st *models.IncidentState, rt *models.RenderedTemplates, n *conf.Notification) { 232 n.NotifyAlert(rt, s.SystemConf, string(st.AlertKey), rt.Attachments...) 233 } 234 235 // QueueNotification persists a notification to the datastore to be sent in the future. This happens when 236 // there are notification chains or an alert is unevaluated due to a dependency. 237 func (s *Schedule) QueueNotification(ak models.AlertKey, n *conf.Notification, time time.Time) error { 238 return s.DataAccess.Notifications().InsertNotification(ak, n.Name, time) 239 } 240 241 func (s *Schedule) ActionNotify(at models.ActionType, user, message string, aks []models.AlertKey) error { 242 groupings, err := s.groupActionNotifications(at, aks) 243 if err != nil { 244 return err 245 } 246 for groupKey, states := range groupings { 247 not := groupKey.notification 248 if not.GroupActions == false { 249 for _, state := range states { 250 not.NotifyAction(at, groupKey.template, s.SystemConf, []*models.IncidentState{state}, user, message, s.RuleConf) 251 } 252 } else { 253 incidents := []*models.IncidentState{} 254 for _, state := range states { 255 incidents = append(incidents, state) 256 } 257 not.NotifyAction(at, groupKey.template, s.SystemConf, incidents, user, message, s.RuleConf) 258 } 259 } 260 return nil 261 } 262 263 // used to group notifications together. Notification alone is not sufficient, since different alerts 264 // can reference different templates. 265 // TODO: This may be overly aggressive at splitting things up. We really only need to seperate them if the 266 // specific keys referenced in the notification for action/unknown things are different between templates. 267 type notificationGroupKey struct { 268 notification *conf.Notification 269 template *conf.Template 270 } 271 272 // group by notification and template 273 func (s *Schedule) groupActionNotifications(at models.ActionType, aks []models.AlertKey) (map[notificationGroupKey][]*models.IncidentState, error) { 274 groupings := make(map[notificationGroupKey][]*models.IncidentState) 275 for _, ak := range aks { 276 alert := s.RuleConf.GetAlert(ak.Name()) 277 tmpl := alert.Template 278 status, err := s.DataAccess.State().GetLatestIncident(ak) 279 if err != nil { 280 return nil, err 281 } 282 if alert == nil || status == nil { 283 continue 284 } 285 // new way: incident keeps track of which notifications it has alerted. 286 nots := map[string]*conf.Notification{} 287 for _, name := range status.Notifications { 288 not := s.RuleConf.GetNotification(name) 289 if not != nil { 290 nots[name] = not 291 } 292 } 293 if len(nots) == 0 { 294 // legacy behavior. Infer notifications from conf: 295 var n *conf.Notifications 296 if status.WorstStatus == models.StWarning || alert.CritNotification == nil { 297 n = alert.WarnNotification 298 } else { 299 n = alert.CritNotification 300 } 301 if n == nil { 302 continue 303 } 304 nots = n.Get(s.RuleConf, ak.Group()) 305 } 306 for _, not := range nots { 307 if !not.RunOnActionType(at) { 308 continue 309 } 310 key := notificationGroupKey{not, tmpl} 311 groupings[key] = append(groupings[key], status) 312 } 313 } 314 return groupings, nil 315 }