github.com/justinjmoses/evergreen@v0.0.0-20170530173719-1d50e381ff0d/monitor/hosts.go (about) 1 package monitor 2 3 import ( 4 "fmt" 5 "time" 6 7 "github.com/evergreen-ci/evergreen" 8 "github.com/evergreen-ci/evergreen/cloud" 9 "github.com/evergreen-ci/evergreen/cloud/providers" 10 "github.com/evergreen-ci/evergreen/hostutil" 11 "github.com/evergreen-ci/evergreen/model/distro" 12 "github.com/evergreen-ci/evergreen/model/event" 13 "github.com/evergreen-ci/evergreen/model/host" 14 "github.com/evergreen-ci/evergreen/notify" 15 "github.com/evergreen-ci/evergreen/util" 16 "github.com/mongodb/grip" 17 "github.com/pkg/errors" 18 ) 19 20 // responsible for running regular monitoring of hosts 21 type HostMonitor struct { 22 // will be used to determine what hosts need to be terminated 23 flaggingFuncs []hostFlagger 24 25 // will be used to perform regular checks on hosts 26 monitoringFuncs []hostMonitoringFunc 27 } 28 29 // run through the list of host monitoring functions. returns any errors that 30 // occur while running the monitoring functions 31 func (hm *HostMonitor) RunMonitoringChecks(settings *evergreen.Settings) []error { 32 grip.Info("Running host monitoring checks...") 33 34 // used to store any errors that occur 35 var errors []error 36 37 for _, f := range hm.monitoringFuncs { 38 39 // continue on error to allow the other monitoring functions to run 40 if errs := f(settings); errs != nil { 41 errors = append(errors, errs...) 42 } 43 } 44 45 grip.Info("Finished running host monitoring checks") 46 47 return errors 48 49 } 50 51 // run through the list of host flagging functions, finding all hosts that 52 // need to be terminated and terminating them 53 func (hm *HostMonitor) CleanupHosts(distros []distro.Distro, settings *evergreen.Settings) []error { 54 55 grip.Info("Running host cleanup...") 56 57 // used to store any errors that occur 58 var errs []error 59 60 for idx, flagger := range hm.flaggingFuncs { 61 grip.Infoln("Searching for flagged hosts under criteria:", flagger.Reason) 62 // find the next batch of hosts to terminate 63 hostsToTerminate, err := flagger.hostFlaggingFunc(distros, settings) 64 grip.Infof("Found %v hosts flagged for '%v'", len(hostsToTerminate), flagger.Reason) 65 66 // continuing on error so that one wonky flagging function doesn't 67 // stop others from running 68 if err != nil { 69 errs = append(errs, errors.Wrap(err, "error flagging hosts to be terminated")) 70 continue 71 } 72 73 grip.Infof("Check %v: found %v hosts to be terminated", idx, len(hostsToTerminate)) 74 75 // terminate all of the dead hosts. continue on error to allow further 76 // termination to work 77 if errs = terminateHosts(hostsToTerminate, settings, flagger.Reason); errs != nil { 78 for _, err := range errs { 79 errs = append(errs, errors.Wrap(err, "error terminating host")) 80 } 81 continue 82 } 83 } 84 85 return errs 86 } 87 88 // terminate the passed-in slice of hosts. returns any errors that occur 89 // terminating the hosts 90 func terminateHosts(hosts []host.Host, settings *evergreen.Settings, reason string) []error { 91 errChan := make(chan error) 92 for _, h := range hosts { 93 grip.Infof("Terminating host %v", h.Id) 94 // terminate the host in a goroutine, passing the host in as a parameter 95 // so that the variable isn't reused for subsequent iterations 96 go func(hostToTerminate host.Host) { 97 errChan <- func() error { 98 event.LogMonitorOperation(hostToTerminate.Id, reason) 99 err := util.RunFunctionWithTimeout(func() error { 100 return terminateHost(&hostToTerminate, settings) 101 }, 12*time.Minute) 102 if err != nil { 103 if err == util.ErrTimedOut { 104 return errors.Errorf("timeout terminating host %s", hostToTerminate.Id) 105 } 106 return errors.Wrapf(err, "error terminating host %s", hostToTerminate.Id) 107 } 108 grip.Infoln("Successfully terminated host", hostToTerminate.Id) 109 return nil 110 }() 111 }(h) 112 } 113 var errors []error 114 for range hosts { 115 if err := <-errChan; err != nil { 116 errors = append(errors, err) 117 } 118 } 119 return errors 120 } 121 122 // helper to terminate a single host 123 func terminateHost(h *host.Host, settings *evergreen.Settings) error { 124 // clear the running task of the host in case one has been assigned. 125 if h.RunningTask != "" { 126 grip.Warningf("Host has running task: %s. Clearing running task field for host"+ 127 "before terminating.", h.RunningTask) 128 err := h.ClearRunningTask(h.RunningTask, time.Now()) 129 if err != nil { 130 grip.Errorf("Error clearing running task for host: %s", h.Id) 131 } 132 } 133 // convert the host to a cloud host 134 cloudHost, err := providers.GetCloudHost(h, settings) 135 if err != nil { 136 return errors.Wrapf(err, "error getting cloud host for %v", h.Id) 137 } 138 139 // run teardown script if we have one, sending notifications if things go awry 140 if h.Distro.Teardown != "" && h.Provisioned { 141 grip.Errorln("Running teardown script for host:", h.Id) 142 if err := runHostTeardown(h, cloudHost); err != nil { 143 grip.Error(errors.Wrapf(err, "Error running teardown script for %s", h.Id)) 144 145 subj := fmt.Sprintf("%v Error running teardown for host %v", 146 notify.TeardownFailurePreface, h.Id) 147 148 grip.Error(errors.Wrap(notify.NotifyAdmins(subj, err.Error(), settings), 149 "Error sending email")) 150 151 } 152 } 153 154 // terminate the instance 155 if err := cloudHost.TerminateInstance(); err != nil { 156 return errors.Wrapf(err, "error terminating host %s", h.Id) 157 } 158 159 return nil 160 } 161 162 func runHostTeardown(h *host.Host, cloudHost *cloud.CloudHost) error { 163 sshOptions, err := cloudHost.GetSSHOptions() 164 if err != nil { 165 return errors.Wrapf(err, "error getting ssh options for host %s", h.Id) 166 } 167 startTime := time.Now() 168 logs, err := hostutil.RunRemoteScript(h, "teardown.sh", sshOptions) 169 event.LogHostTeardown(h.Id, logs, err == nil, time.Since(startTime)) 170 if err != nil { 171 return errors.Wrapf(err, "error running teardown.sh over ssh: %v", logs) 172 } 173 return nil 174 }