github.com/justinjmoses/evergreen@v0.0.0-20170530173719-1d50e381ff0d/monitor/monitor.go (about)

     1  package monitor
     2  
     3  import (
     4  	"time"
     5  
     6  	"github.com/evergreen-ci/evergreen"
     7  	"github.com/evergreen-ci/evergreen/alerts"
     8  	"github.com/evergreen-ci/evergreen/db"
     9  	"github.com/evergreen-ci/evergreen/model"
    10  	"github.com/evergreen-ci/evergreen/model/distro"
    11  	"github.com/evergreen-ci/evergreen/model/host"
    12  	"github.com/mongodb/grip"
    13  	"github.com/pkg/errors"
    14  )
    15  
    16  var (
    17  	// the functions the task monitor will run through to find tasks needing
    18  	// to be cleaned up
    19  	defaultTaskFlaggingFuncs = []taskFlaggingFunc{
    20  		flagTimedOutHeartbeats,
    21  	}
    22  
    23  	// the functions the host monitor will run through to find hosts needing
    24  	// to be terminated
    25  	defaultHostFlaggingFuncs = []hostFlagger{
    26  		{flagDecommissionedHosts, "decommissioned"},
    27  		{flagUnreachableHosts, "unreachable"},
    28  		{flagIdleHosts, "idle"},
    29  		{flagExcessHosts, "excess"},
    30  		{flagUnprovisionedHosts, "provision_timeout"},
    31  		{flagProvisioningFailedHosts, "provision_failed"},
    32  		{flagExpiredHosts, "expired"},
    33  	}
    34  
    35  	// the functions the host monitor will run through to do simpler checks
    36  	defaultHostMonitoringFuncs = []hostMonitoringFunc{
    37  		monitorReachability,
    38  	}
    39  
    40  	// the functions the notifier will use to build notifications that need
    41  	// to be sent
    42  	defaultNotificationBuilders = []notificationBuilder{
    43  		spawnHostExpirationWarnings,
    44  		slowProvisioningWarnings,
    45  	}
    46  )
    47  
    48  // run all monitoring functions
    49  func RunAllMonitoring(settings *evergreen.Settings) error {
    50  
    51  	// load in all of the distros
    52  	distros, err := distro.Find(db.Q{})
    53  	if err != nil {
    54  		return errors.Wrap(err, "error finding distros")
    55  	}
    56  
    57  	// fetch the project refs, which we will use to get all of the projects
    58  	projectRefs, err := model.FindAllProjectRefs()
    59  	if err != nil {
    60  		return errors.Wrap(err, "error loading in project refs")
    61  	}
    62  
    63  	// turn the project refs into a map of the project id -> project
    64  	projects := map[string]model.Project{}
    65  	var project *model.Project
    66  
    67  	for _, ref := range projectRefs {
    68  		// only monitor projects that are enabled
    69  		if !ref.Enabled {
    70  			continue
    71  		}
    72  		project, err = model.FindProject("", &ref)
    73  
    74  		// continue on error to stop the whole monitoring process from
    75  		// being held up
    76  		if err != nil {
    77  			grip.Errorf("error finding project %s: %+v", ref.Identifier, err)
    78  			continue
    79  		}
    80  
    81  		if project == nil {
    82  			grip.Errorf("no project entry found for ref %s", ref.Identifier)
    83  			continue
    84  		}
    85  
    86  		projects[project.Identifier] = *project
    87  	}
    88  
    89  	// initialize the task monitor
    90  	taskMonitor := &TaskMonitor{
    91  		flaggingFuncs: defaultTaskFlaggingFuncs,
    92  	}
    93  
    94  	// clean up any necessary tasks
    95  	errs := withGlobalLock("task cleanup",
    96  		func() []error { return taskMonitor.CleanupTasks(projects) })
    97  	for _, err := range errs {
    98  		grip.Error(errors.Wrap(err, "Error cleaning up tasks"))
    99  	}
   100  
   101  	// initialize the host monitor
   102  	hostMonitor := &HostMonitor{
   103  		flaggingFuncs:   defaultHostFlaggingFuncs,
   104  		monitoringFuncs: defaultHostMonitoringFuncs,
   105  	}
   106  
   107  	// clean up any necessary hosts
   108  	errs = withGlobalLock("host cleanup",
   109  		func() []error { return hostMonitor.CleanupHosts(distros, settings) })
   110  
   111  	for _, err := range errs {
   112  		grip.Error(errors.Wrap(err, "Error cleaning up hosts"))
   113  	}
   114  
   115  	// run monitoring checks
   116  	errs = withGlobalLock("host monitoring",
   117  		func() []error { return hostMonitor.RunMonitoringChecks(settings) })
   118  	for _, err := range errs {
   119  		grip.Error(errors.Wrap(err, "Error running host monitoring checks"))
   120  	}
   121  
   122  	// initialize the notifier
   123  	notifier := &Notifier{
   124  		notificationBuilders: defaultNotificationBuilders,
   125  	}
   126  
   127  	// send notifications
   128  	errs = notifier.Notify(settings)
   129  	for _, err := range errs {
   130  		grip.Error(errors.Wrap(err, "Error sending notifications"))
   131  	}
   132  
   133  	// Do alerts for spawnhosts - collect all hosts expiring in the next 12 hours.
   134  	// The trigger logic will filter out any hosts that aren't in a notification window, or have
   135  	// already have alerts sent.
   136  	now := time.Now()
   137  	thresholdTime := now.Add(12 * time.Hour)
   138  	expiringSoonHosts, err := host.Find(host.ByExpiringBetween(now, thresholdTime))
   139  	if err != nil {
   140  		return errors.WithStack(err)
   141  	}
   142  
   143  	for _, h := range expiringSoonHosts {
   144  		err := alerts.RunSpawnWarningTriggers(&h)
   145  
   146  		grip.Error(errors.Wrap(err, "Error queuing alert"))
   147  	}
   148  
   149  	return nil
   150  }
   151  
   152  // withGlobalLock is a wrapper for grabbing the global lock for each segment of the monitor.
   153  func withGlobalLock(name string, f func() []error) (errs []error) {
   154  	grip.Debugln("Attempting to acquire global lock for monitor:", name)
   155  	// sleep for 1 second to give other spinning locks a chance to preempt this one
   156  	time.Sleep(time.Second)
   157  	acquired, err := db.WaitTillAcquireGlobalLock(name, db.LockTimeout)
   158  	if err != nil {
   159  		grip.Errorf("problem acquiring global lock for monitor %s: %+v", name, err)
   160  		return []error{errors.Errorf("error acquiring global lock for %s: %v", name, err)}
   161  	}
   162  
   163  	if !acquired {
   164  		grip.Errorln("Timed out attempting to acquire global lock for monitor:", name)
   165  		return []error{errors.Errorf("timed out acquiring global lock for monitor %s", name)}
   166  	}
   167  	defer func() {
   168  		grip.Debugln("Releasing global lock for monitor", name)
   169  		if err := db.ReleaseGlobalLock(name); err != nil {
   170  			grip.Errorf("Error releasing global lock for monitor %s: %+v", name, err)
   171  			errs = append(errs, errors.Errorf("error releasing global lock for monitor %v: %v", name, err))
   172  		} else {
   173  			grip.Debugln("Released global lock for monitor:", name)
   174  		}
   175  	}()
   176  	grip.Debugln("Acquired global lock for monitor", name)
   177  	errs = f()
   178  	return errs
   179  }