bosun.org@v0.0.0-20210513094433-e25bc3e69a1f/cmd/bosun/sched/alertRunner.go (about) 1 package sched 2 3 import ( 4 "fmt" 5 "time" 6 7 "bosun.org/cmd/bosun/cache" 8 "bosun.org/cmd/bosun/conf" 9 "bosun.org/slog" 10 ) 11 12 // Run should be called once (and only once) to start all schedule activity. 13 func (s *Schedule) Run() error { 14 if s.RuleConf == nil || s.SystemConf == nil { 15 return fmt.Errorf("sched: nil configuration") 16 } 17 s.nc = make(chan interface{}, 1) 18 go s.dispatchNotifications() 19 type alertCh struct { 20 ch chan<- *checkContext 21 modulo int 22 shift int // used to distribute alert runs 23 } 24 chs := []alertCh{} 25 26 // Every alert gets a small shift in time. 27 // This way the alerts with the same period are not fired 28 // simultaneously, but are distributed. 29 circular_shifts := make(map[int]int) // the map is *run period* -> *time shift to add* 30 for _, a := range s.RuleConf.GetAlerts() { 31 ch := make(chan *checkContext, 1) 32 re := a.RunEvery 33 if re == 0 { 34 re = s.SystemConf.GetDefaultRunEvery() 35 } 36 go s.runAlert(a, ch) 37 38 if s.SystemConf.GetAlertCheckDistribution() == "simple" { // only apply shifts if the respective option is set 39 chs = append(chs, alertCh{ch: ch, modulo: re, shift: circular_shifts[re]}) 40 } else { 41 // there are no shifts if option is off 42 chs = append(chs, alertCh{ch: ch, modulo: re, shift: 0}) 43 } 44 45 // the shifts for a given period range 0..(period - 1) 46 circular_shifts[re] = (circular_shifts[re] + 1) % re 47 } 48 i := 0 49 for { 50 select { 51 case <-s.runnerContext.Done(): 52 slog.Infoln("Stopping main scheduler routine") 53 return nil 54 default: 55 } 56 ctx := &checkContext{utcNow(), cache.New("alerts", 0)} 57 s.LastCheck = utcNow() 58 for _, a := range chs { 59 if (i+a.shift)%a.modulo != 0 { 60 continue 61 } 62 // Put on channel. If that fails, the alert is backed up pretty bad. 63 // Because channel is buffered size 1, it will continue as soon as it finishes. 64 // Master scheduler will never block here. 65 select { 66 case a.ch <- ctx: 67 default: 68 } 69 } 70 i++ 71 time.Sleep(s.SystemConf.GetCheckFrequency()) 72 s.Lock("CollectStates") 73 s.CollectStates() 74 s.Unlock() 75 } 76 } 77 78 func (s *Schedule) runAlert(a *conf.Alert, ch <-chan *checkContext) { 79 // Add to waitgroup for running alert 80 s.checksRunning.Add(1) 81 // ensure when an alert is done it is removed from the wait group 82 defer s.checksRunning.Done() 83 for { 84 select { 85 case <-s.runnerContext.Done(): 86 slog.Infof("Stopping alert routine for %v\n", a.Name) 87 return 88 case ctx := <-ch: 89 s.checkAlert(a, ctx) 90 91 } 92 } 93 } 94 95 func (s *Schedule) checkAlert(a *conf.Alert, ctx *checkContext) { 96 rh := s.NewRunHistory(ctx.runTime, ctx.checkCache) 97 // s.CheckAlert will return early if the schedule has been closed 98 cancelled := s.CheckAlert(nil, rh, a) 99 if cancelled { 100 // Don't runHistory for the alert if expression evaluation has been cancelled 101 return 102 } 103 start := utcNow() 104 s.RunHistory(rh) 105 slog.Infof("runHistory on %s took %v\n", a.Name, time.Since(start)) 106 }