github.com/justinjmoses/evergreen@v0.0.0-20170530173719-1d50e381ff0d/monitor/host_monitoring.go (about) 1 package monitor 2 3 import ( 4 "sync" 5 "time" 6 7 "github.com/evergreen-ci/evergreen" 8 "github.com/evergreen-ci/evergreen/cloud" 9 "github.com/evergreen-ci/evergreen/cloud/providers" 10 "github.com/evergreen-ci/evergreen/model/event" 11 "github.com/evergreen-ci/evergreen/model/host" 12 "github.com/mongodb/grip" 13 "github.com/pkg/errors" 14 ) 15 16 const ( 17 // how long to wait in between reachability checks 18 ReachabilityCheckInterval = 10 * time.Minute 19 NumReachabilityWorkers = 100 20 ) 21 22 // responsible for monitoring and checking in on hosts 23 type hostMonitoringFunc func(*evergreen.Settings) []error 24 25 // monitorReachability is a hostMonitoringFunc responsible for seeing if 26 // hosts are reachable or not. returns a slice of any errors that occur 27 func monitorReachability(settings *evergreen.Settings) []error { 28 grip.Info("Running reachability checks...") 29 30 // used to store any errors that occur 31 var errs []error 32 33 // fetch all hosts that have not been checked recently 34 // (> 10 minutes ago) 35 threshold := time.Now().Add(-ReachabilityCheckInterval) 36 hosts, err := host.Find(host.ByNotMonitoredSince(threshold)) 37 if err != nil { 38 errs = append(errs, errors.Wrap(err, "error finding hosts not monitored recently")) 39 return errs 40 } 41 42 workers := NumReachabilityWorkers 43 if len(hosts) < workers { 44 workers = len(hosts) 45 } 46 47 wg := sync.WaitGroup{} 48 49 wg.Add(workers) 50 51 hostsChan := make(chan host.Host, workers) 52 errChan := make(chan error, workers) 53 54 for i := 0; i < workers; i++ { 55 go func() { 56 defer wg.Done() 57 for host := range hostsChan { 58 if err := checkHostReachability(host, settings); err != nil { 59 errChan <- errors.WithStack(err) 60 } 61 } 62 }() 63 } 64 65 errDone := make(chan struct{}) 66 go func() { 67 defer close(errDone) 68 for err := range errChan { 69 errs = append(errs, errors.Wrap(err, "error checking reachability")) 70 } 71 }() 72 73 // check all of the hosts. continue on error so that other hosts can be 74 // checked successfully 75 for _, host := range hosts { 76 hostsChan <- host 77 } 78 close(hostsChan) 79 wg.Wait() 80 close(errChan) 81 82 <-errDone 83 return errs 84 } 85 86 // check reachability for a single host, and take any necessary action 87 func checkHostReachability(host host.Host, settings *evergreen.Settings) error { 88 grip.Infoln("Running reachability check for host:", host.Id) 89 90 // get a cloud version of the host 91 cloudHost, err := providers.GetCloudHost(&host, settings) 92 if err != nil { 93 return errors.Wrapf(err, "error getting cloud host for host %v: %v", host.Id) 94 } 95 96 // get the cloud status for the host 97 cloudStatus, err := cloudHost.GetInstanceStatus() 98 if err != nil { 99 return errors.Wrapf(err, "error getting cloud status for host %s", host.Id) 100 } 101 102 // take different action, depending on how the cloud provider reports the host's status 103 switch cloudStatus { 104 case cloud.StatusRunning: 105 // check if the host is reachable via SSH 106 reachable, err := cloudHost.IsSSHReachable() 107 if err != nil { 108 return errors.Wrapf(err, "error checking ssh reachability for host %s", host.Id) 109 } 110 111 // log the status update if the reachability of the host is changing 112 if host.Status == evergreen.HostUnreachable && reachable { 113 grip.Infof("Setting host %s as reachable", host.Id) 114 } else if host.Status != evergreen.HostUnreachable && !reachable { 115 grip.Infof("Setting host %s as unreachable", host.Id) 116 } 117 118 // mark the host appropriately 119 if err := host.UpdateReachability(reachable); err != nil { 120 return errors.Wrapf(err, "error updating reachability for host %s", host.Id) 121 } 122 case cloud.StatusTerminated: 123 grip.Infof("Host %s terminated externally; updating db status to terminated", host.Id) 124 event.LogHostTerminatedExternally(host.Id) 125 126 // the instance was terminated from outside our control 127 if err := host.SetTerminated(); err != nil { 128 return errors.Wrapf(err, "error setting host %s terminated", host.Id) 129 } 130 } 131 132 // success 133 return nil 134 135 }