github.com/justinjmoses/evergreen@v0.0.0-20170530173719-1d50e381ff0d/monitor/host_monitoring.go (about)

     1  package monitor
     2  
     3  import (
     4  	"sync"
     5  	"time"
     6  
     7  	"github.com/evergreen-ci/evergreen"
     8  	"github.com/evergreen-ci/evergreen/cloud"
     9  	"github.com/evergreen-ci/evergreen/cloud/providers"
    10  	"github.com/evergreen-ci/evergreen/model/event"
    11  	"github.com/evergreen-ci/evergreen/model/host"
    12  	"github.com/mongodb/grip"
    13  	"github.com/pkg/errors"
    14  )
    15  
    16  const (
    17  	// how long to wait in between reachability checks
    18  	ReachabilityCheckInterval = 10 * time.Minute
    19  	NumReachabilityWorkers    = 100
    20  )
    21  
    22  // responsible for monitoring and checking in on hosts
    23  type hostMonitoringFunc func(*evergreen.Settings) []error
    24  
    25  // monitorReachability is a hostMonitoringFunc responsible for seeing if
    26  // hosts are reachable or not. returns a slice of any errors that occur
    27  func monitorReachability(settings *evergreen.Settings) []error {
    28  	grip.Info("Running reachability checks...")
    29  
    30  	// used to store any errors that occur
    31  	var errs []error
    32  
    33  	// fetch all hosts that have not been checked recently
    34  	// (> 10 minutes ago)
    35  	threshold := time.Now().Add(-ReachabilityCheckInterval)
    36  	hosts, err := host.Find(host.ByNotMonitoredSince(threshold))
    37  	if err != nil {
    38  		errs = append(errs, errors.Wrap(err, "error finding hosts not monitored recently"))
    39  		return errs
    40  	}
    41  
    42  	workers := NumReachabilityWorkers
    43  	if len(hosts) < workers {
    44  		workers = len(hosts)
    45  	}
    46  
    47  	wg := sync.WaitGroup{}
    48  
    49  	wg.Add(workers)
    50  
    51  	hostsChan := make(chan host.Host, workers)
    52  	errChan := make(chan error, workers)
    53  
    54  	for i := 0; i < workers; i++ {
    55  		go func() {
    56  			defer wg.Done()
    57  			for host := range hostsChan {
    58  				if err := checkHostReachability(host, settings); err != nil {
    59  					errChan <- errors.WithStack(err)
    60  				}
    61  			}
    62  		}()
    63  	}
    64  
    65  	errDone := make(chan struct{})
    66  	go func() {
    67  		defer close(errDone)
    68  		for err := range errChan {
    69  			errs = append(errs, errors.Wrap(err, "error checking reachability"))
    70  		}
    71  	}()
    72  
    73  	// check all of the hosts. continue on error so that other hosts can be
    74  	// checked successfully
    75  	for _, host := range hosts {
    76  		hostsChan <- host
    77  	}
    78  	close(hostsChan)
    79  	wg.Wait()
    80  	close(errChan)
    81  
    82  	<-errDone
    83  	return errs
    84  }
    85  
    86  // check reachability for a single host, and take any necessary action
    87  func checkHostReachability(host host.Host, settings *evergreen.Settings) error {
    88  	grip.Infoln("Running reachability check for host:", host.Id)
    89  
    90  	// get a cloud version of the host
    91  	cloudHost, err := providers.GetCloudHost(&host, settings)
    92  	if err != nil {
    93  		return errors.Wrapf(err, "error getting cloud host for host %v: %v", host.Id)
    94  	}
    95  
    96  	// get the cloud status for the host
    97  	cloudStatus, err := cloudHost.GetInstanceStatus()
    98  	if err != nil {
    99  		return errors.Wrapf(err, "error getting cloud status for host %s", host.Id)
   100  	}
   101  
   102  	// take different action, depending on how the cloud provider reports the host's status
   103  	switch cloudStatus {
   104  	case cloud.StatusRunning:
   105  		// check if the host is reachable via SSH
   106  		reachable, err := cloudHost.IsSSHReachable()
   107  		if err != nil {
   108  			return errors.Wrapf(err, "error checking ssh reachability for host %s", host.Id)
   109  		}
   110  
   111  		// log the status update if the reachability of the host is changing
   112  		if host.Status == evergreen.HostUnreachable && reachable {
   113  			grip.Infof("Setting host %s as reachable", host.Id)
   114  		} else if host.Status != evergreen.HostUnreachable && !reachable {
   115  			grip.Infof("Setting host %s as unreachable", host.Id)
   116  		}
   117  
   118  		// mark the host appropriately
   119  		if err := host.UpdateReachability(reachable); err != nil {
   120  			return errors.Wrapf(err, "error updating reachability for host %s", host.Id)
   121  		}
   122  	case cloud.StatusTerminated:
   123  		grip.Infof("Host %s terminated externally; updating db status to terminated", host.Id)
   124  		event.LogHostTerminatedExternally(host.Id)
   125  
   126  		// the instance was terminated from outside our control
   127  		if err := host.SetTerminated(); err != nil {
   128  			return errors.Wrapf(err, "error setting host %s terminated", host.Id)
   129  		}
   130  	}
   131  
   132  	// success
   133  	return nil
   134  
   135  }