github.com/justinjmoses/evergreen@v0.0.0-20170530173719-1d50e381ff0d/monitor/hosts.go (about)

     1  package monitor
     2  
     3  import (
     4  	"fmt"
     5  	"time"
     6  
     7  	"github.com/evergreen-ci/evergreen"
     8  	"github.com/evergreen-ci/evergreen/cloud"
     9  	"github.com/evergreen-ci/evergreen/cloud/providers"
    10  	"github.com/evergreen-ci/evergreen/hostutil"
    11  	"github.com/evergreen-ci/evergreen/model/distro"
    12  	"github.com/evergreen-ci/evergreen/model/event"
    13  	"github.com/evergreen-ci/evergreen/model/host"
    14  	"github.com/evergreen-ci/evergreen/notify"
    15  	"github.com/evergreen-ci/evergreen/util"
    16  	"github.com/mongodb/grip"
    17  	"github.com/pkg/errors"
    18  )
    19  
// HostMonitor is responsible for running regular monitoring of hosts. It
// holds the two sets of functions that drive that work: the flaggers that
// pick hosts for termination and the periodic monitoring checks.
type HostMonitor struct {
	// flaggingFuncs are run by CleanupHosts; each one reports the hosts
	// that need to be terminated along with the reason for flagging them.
	flaggingFuncs []hostFlagger

	// monitoringFuncs are run by RunMonitoringChecks to perform regular
	// checks on hosts.
	monitoringFuncs []hostMonitoringFunc
}
    28  
    29  // run through the list of host monitoring functions. returns any errors that
    30  // occur while running the monitoring functions
    31  func (hm *HostMonitor) RunMonitoringChecks(settings *evergreen.Settings) []error {
    32  	grip.Info("Running host monitoring checks...")
    33  
    34  	// used to store any errors that occur
    35  	var errors []error
    36  
    37  	for _, f := range hm.monitoringFuncs {
    38  
    39  		// continue on error to allow the other monitoring functions to run
    40  		if errs := f(settings); errs != nil {
    41  			errors = append(errors, errs...)
    42  		}
    43  	}
    44  
    45  	grip.Info("Finished running host monitoring checks")
    46  
    47  	return errors
    48  
    49  }
    50  
    51  // run through the list of host flagging functions, finding all hosts that
    52  // need to be terminated and terminating them
    53  func (hm *HostMonitor) CleanupHosts(distros []distro.Distro, settings *evergreen.Settings) []error {
    54  
    55  	grip.Info("Running host cleanup...")
    56  
    57  	// used to store any errors that occur
    58  	var errs []error
    59  
    60  	for idx, flagger := range hm.flaggingFuncs {
    61  		grip.Infoln("Searching for flagged hosts under criteria:", flagger.Reason)
    62  		// find the next batch of hosts to terminate
    63  		hostsToTerminate, err := flagger.hostFlaggingFunc(distros, settings)
    64  		grip.Infof("Found %v hosts flagged for '%v'", len(hostsToTerminate), flagger.Reason)
    65  
    66  		// continuing on error so that one wonky flagging function doesn't
    67  		// stop others from running
    68  		if err != nil {
    69  			errs = append(errs, errors.Wrap(err, "error flagging hosts to be terminated"))
    70  			continue
    71  		}
    72  
    73  		grip.Infof("Check %v: found %v hosts to be terminated", idx, len(hostsToTerminate))
    74  
    75  		// terminate all of the dead hosts. continue on error to allow further
    76  		// termination to work
    77  		if errs = terminateHosts(hostsToTerminate, settings, flagger.Reason); errs != nil {
    78  			for _, err := range errs {
    79  				errs = append(errs, errors.Wrap(err, "error terminating host"))
    80  			}
    81  			continue
    82  		}
    83  	}
    84  
    85  	return errs
    86  }
    87  
    88  // terminate the passed-in slice of hosts. returns any errors that occur
    89  // terminating the hosts
    90  func terminateHosts(hosts []host.Host, settings *evergreen.Settings, reason string) []error {
    91  	errChan := make(chan error)
    92  	for _, h := range hosts {
    93  		grip.Infof("Terminating host %v", h.Id)
    94  		// terminate the host in a goroutine, passing the host in as a parameter
    95  		// so that the variable isn't reused for subsequent iterations
    96  		go func(hostToTerminate host.Host) {
    97  			errChan <- func() error {
    98  				event.LogMonitorOperation(hostToTerminate.Id, reason)
    99  				err := util.RunFunctionWithTimeout(func() error {
   100  					return terminateHost(&hostToTerminate, settings)
   101  				}, 12*time.Minute)
   102  				if err != nil {
   103  					if err == util.ErrTimedOut {
   104  						return errors.Errorf("timeout terminating host %s", hostToTerminate.Id)
   105  					}
   106  					return errors.Wrapf(err, "error terminating host %s", hostToTerminate.Id)
   107  				}
   108  				grip.Infoln("Successfully terminated host", hostToTerminate.Id)
   109  				return nil
   110  			}()
   111  		}(h)
   112  	}
   113  	var errors []error
   114  	for range hosts {
   115  		if err := <-errChan; err != nil {
   116  			errors = append(errors, err)
   117  		}
   118  	}
   119  	return errors
   120  }
   121  
   122  // helper to terminate a single host
   123  func terminateHost(h *host.Host, settings *evergreen.Settings) error {
   124  	// clear the running task of the host in case one has been assigned.
   125  	if h.RunningTask != "" {
   126  		grip.Warningf("Host has running task: %s. Clearing running task field for host"+
   127  			"before terminating.", h.RunningTask)
   128  		err := h.ClearRunningTask(h.RunningTask, time.Now())
   129  		if err != nil {
   130  			grip.Errorf("Error clearing running task for host: %s", h.Id)
   131  		}
   132  	}
   133  	// convert the host to a cloud host
   134  	cloudHost, err := providers.GetCloudHost(h, settings)
   135  	if err != nil {
   136  		return errors.Wrapf(err, "error getting cloud host for %v", h.Id)
   137  	}
   138  
   139  	// run teardown script if we have one, sending notifications if things go awry
   140  	if h.Distro.Teardown != "" && h.Provisioned {
   141  		grip.Errorln("Running teardown script for host:", h.Id)
   142  		if err := runHostTeardown(h, cloudHost); err != nil {
   143  			grip.Error(errors.Wrapf(err, "Error running teardown script for %s", h.Id))
   144  
   145  			subj := fmt.Sprintf("%v Error running teardown for host %v",
   146  				notify.TeardownFailurePreface, h.Id)
   147  
   148  			grip.Error(errors.Wrap(notify.NotifyAdmins(subj, err.Error(), settings),
   149  				"Error sending email"))
   150  
   151  		}
   152  	}
   153  
   154  	// terminate the instance
   155  	if err := cloudHost.TerminateInstance(); err != nil {
   156  		return errors.Wrapf(err, "error terminating host %s", h.Id)
   157  	}
   158  
   159  	return nil
   160  }
   161  
   162  func runHostTeardown(h *host.Host, cloudHost *cloud.CloudHost) error {
   163  	sshOptions, err := cloudHost.GetSSHOptions()
   164  	if err != nil {
   165  		return errors.Wrapf(err, "error getting ssh options for host %s", h.Id)
   166  	}
   167  	startTime := time.Now()
   168  	logs, err := hostutil.RunRemoteScript(h, "teardown.sh", sshOptions)
   169  	event.LogHostTeardown(h.Id, logs, err == nil, time.Since(startTime))
   170  	if err != nil {
   171  		return errors.Wrapf(err, "error running teardown.sh over ssh: %v", logs)
   172  	}
   173  	return nil
   174  }