bosun.org@v0.0.0-20210513094433-e25bc3e69a1f/cmd/bosun/sched/alertRunner.go (about)

     1  package sched
     2  
     3  import (
     4  	"fmt"
     5  	"time"
     6  
     7  	"bosun.org/cmd/bosun/cache"
     8  	"bosun.org/cmd/bosun/conf"
     9  	"bosun.org/slog"
    10  )
    11  
    12  // Run should be called once (and only once) to start all schedule activity.
    13  func (s *Schedule) Run() error {
    14  	if s.RuleConf == nil || s.SystemConf == nil {
    15  		return fmt.Errorf("sched: nil configuration")
    16  	}
    17  	s.nc = make(chan interface{}, 1)
    18  	go s.dispatchNotifications()
    19  	type alertCh struct {
    20  		ch     chan<- *checkContext
    21  		modulo int
    22  		shift  int // used to distribute alert runs
    23  	}
    24  	chs := []alertCh{}
    25  
    26  	// Every alert gets a small shift in time.
    27  	// This way the alerts with the same period are not fired
    28  	// simultaneously, but are distributed.
    29  	circular_shifts := make(map[int]int) // the map is *run period* -> *time shift to add*
    30  	for _, a := range s.RuleConf.GetAlerts() {
    31  		ch := make(chan *checkContext, 1)
    32  		re := a.RunEvery
    33  		if re == 0 {
    34  			re = s.SystemConf.GetDefaultRunEvery()
    35  		}
    36  		go s.runAlert(a, ch)
    37  
    38  		if s.SystemConf.GetAlertCheckDistribution() == "simple" { // only apply shifts if the respective option is set
    39  			chs = append(chs, alertCh{ch: ch, modulo: re, shift: circular_shifts[re]})
    40  		} else {
    41  			// there are no shifts if option is off
    42  			chs = append(chs, alertCh{ch: ch, modulo: re, shift: 0})
    43  		}
    44  
    45  		// the shifts for a given period range 0..(period - 1)
    46  		circular_shifts[re] = (circular_shifts[re] + 1) % re
    47  	}
    48  	i := 0
    49  	for {
    50  		select {
    51  		case <-s.runnerContext.Done():
    52  			slog.Infoln("Stopping main scheduler routine")
    53  			return nil
    54  		default:
    55  		}
    56  		ctx := &checkContext{utcNow(), cache.New("alerts", 0)}
    57  		s.LastCheck = utcNow()
    58  		for _, a := range chs {
    59  			if (i+a.shift)%a.modulo != 0 {
    60  				continue
    61  			}
    62  			// Put on channel. If that fails, the alert is backed up pretty bad.
    63  			// Because channel is buffered size 1, it will continue as soon as it finishes.
    64  			// Master scheduler will never block here.
    65  			select {
    66  			case a.ch <- ctx:
    67  			default:
    68  			}
    69  		}
    70  		i++
    71  		time.Sleep(s.SystemConf.GetCheckFrequency())
    72  		s.Lock("CollectStates")
    73  		s.CollectStates()
    74  		s.Unlock()
    75  	}
    76  }
    77  
    78  func (s *Schedule) runAlert(a *conf.Alert, ch <-chan *checkContext) {
    79  	// Add to waitgroup for running alert
    80  	s.checksRunning.Add(1)
    81  	// ensure when an alert is done it is removed from the wait group
    82  	defer s.checksRunning.Done()
    83  	for {
    84  		select {
    85  		case <-s.runnerContext.Done():
    86  			slog.Infof("Stopping alert routine for %v\n", a.Name)
    87  			return
    88  		case ctx := <-ch:
    89  			s.checkAlert(a, ctx)
    90  
    91  		}
    92  	}
    93  }
    94  
    95  func (s *Schedule) checkAlert(a *conf.Alert, ctx *checkContext) {
    96  	rh := s.NewRunHistory(ctx.runTime, ctx.checkCache)
    97  	// s.CheckAlert will return early if the schedule has been closed
    98  	cancelled := s.CheckAlert(nil, rh, a)
    99  	if cancelled {
   100  		// Don't runHistory for the alert if expression evaluation has been cancelled
   101  		return
   102  	}
   103  	start := utcNow()
   104  	s.RunHistory(rh)
   105  	slog.Infof("runHistory on %s took %v\n", a.Name, time.Since(start))
   106  }