github.com/justinjmoses/evergreen@v0.0.0-20170530173719-1d50e381ff0d/monitor/host_flagging.go (about)

     1  package monitor
     2  
     3  import (
     4  	"time"
     5  
     6  	"github.com/evergreen-ci/evergreen"
     7  	"github.com/evergreen-ci/evergreen/cloud/providers"
     8  	"github.com/evergreen-ci/evergreen/model/distro"
     9  	"github.com/evergreen-ci/evergreen/model/host"
    10  	"github.com/mongodb/grip"
    11  	"github.com/pkg/errors"
    12  )
    13  
    14  const (
    15  	// ProvisioningCutoff is the threshold to consider as too long for a host to take provisioning
    16  	ProvisioningCutoff = 35 * time.Minute
    17  
    18  	// UnreachableCutoff is the threshold to wait for an unreachable host to become marked
    19  	// as reachable again before giving up and terminating it.
    20  	UnreachableCutoff = 10 * time.Minute
    21  
    22  	// IdleTimeCutoff is the amount of time we wait for an idle host to be marked as idle.
    23  	IdleTimeCutoff = 15 * time.Minute
    24  
    25  	// MaxTimeNextPayment is the amount of time we wait to have left before marking a host as idle
    26  	MaxTimeTilNextPayment = 5 * time.Minute
    27  
    28  	// CommunicationTimeCutoff is the limit to how much time has passed before the host is marked as idle
    29  	// due to the agent not being able to communicate with the API server.
    30  	CommunicationTimeCutoff = 10 * time.Minute
    31  )
    32  
    33  type hostFlagger struct {
    34  	hostFlaggingFunc
    35  	Reason string
    36  }
    37  
    38  // function that takes in all distros - specified as a map of
    39  // distro name -> distro info - as well as the mci settings,
    40  // and spits out a list of hosts to be terminated
    41  type hostFlaggingFunc func([]distro.Distro, *evergreen.Settings) ([]host.Host, error)
    42  
    43  // flagDecommissionedHosts is a hostFlaggingFunc to get all hosts which should
    44  // be terminated because they are decommissioned
    45  func flagDecommissionedHosts(d []distro.Distro, s *evergreen.Settings) ([]host.Host, error) {
    46  	hosts, err := host.Find(host.IsDecommissioned)
    47  	if err != nil {
    48  		return nil, errors.Wrap(err, "error finding decommissioned hosts")
    49  	}
    50  	return hosts, nil
    51  }
    52  
    53  // flagUnreachableHosts is a hostFlaggingFunc to get all hosts which should
    54  // be terminated because they are unreachable
    55  func flagUnreachableHosts(d []distro.Distro, s *evergreen.Settings) ([]host.Host, error) {
    56  	threshold := time.Now().Add(-1 * UnreachableCutoff)
    57  	hosts, err := host.Find(host.ByUnreachableBefore(threshold))
    58  	if err != nil {
    59  		return nil, errors.Wrapf(err, "error finding hosts unreachable since before %v", threshold)
    60  	}
    61  	return hosts, nil
    62  }
    63  
    64  // flagIdleHosts is a hostFlaggingFunc to get all hosts which have spent too
    65  // long without running a task
    66  func flagIdleHosts(d []distro.Distro, s *evergreen.Settings) ([]host.Host, error) {
    67  	// will ultimately contain all of the hosts determined to be idle
    68  	idleHosts := []host.Host{}
    69  
    70  	// fetch all hosts not currently running a task
    71  	freeHosts, err := host.Find(host.IsFree)
    72  	if err != nil {
    73  		return nil, errors.Wrap(err, "error finding free hosts")
    74  	}
    75  
    76  	// go through the hosts, and see if they have idled long enough to
    77  	// be terminated
    78  	for _, freeHost := range freeHosts {
    79  
    80  		// ask the host how long it has been idle
    81  		idleTime := freeHost.IdleTime()
    82  
    83  		// if the communication time is > 10 mins then there may not be an agent on the host.
    84  		communicationTime := time.Since(freeHost.LastCommunicationTime)
    85  
    86  		// get a cloud manager for the host
    87  		cloudManager, err := providers.GetCloudManager(freeHost.Provider, s)
    88  		if err != nil {
    89  			return nil, errors.Wrapf(err, "error getting cloud manager for host %v", freeHost.Id)
    90  		}
    91  
    92  		// if the host is not dynamically spun up (and can thus be terminated),
    93  		// skip it
    94  		canTerminate, err := hostCanBeTerminated(freeHost, s)
    95  		if err != nil {
    96  			return nil, errors.Wrapf(err, "error checking if host %v can be terminated", freeHost.Id)
    97  		}
    98  		if !canTerminate {
    99  			continue
   100  		}
   101  
   102  		// ask how long until the next payment for the host
   103  		tilNextPayment := cloudManager.TimeTilNextPayment(&freeHost)
   104  
   105  		// current determinants for idle:
   106  		//  idle for at least 15 minutes or last communication time has been more than 10 mins and
   107  		//  less than 5 minutes til next payment
   108  		if (communicationTime >= CommunicationTimeCutoff || idleTime >= IdleTimeCutoff) &&
   109  			tilNextPayment <= MaxTimeTilNextPayment {
   110  			idleHosts = append(idleHosts, freeHost)
   111  		}
   112  
   113  	}
   114  
   115  	return idleHosts, nil
   116  }
   117  
   118  // flagExcessHosts is a hostFlaggingFunc to get all hosts that push their
   119  // distros over the specified max hosts
   120  func flagExcessHosts(distros []distro.Distro, s *evergreen.Settings) ([]host.Host, error) {
   121  	// will ultimately contain all the hosts that can be terminated
   122  	excessHosts := []host.Host{}
   123  
   124  	// figure out the excess hosts for each distro
   125  	for _, d := range distros {
   126  
   127  		// fetch any hosts for the distro that count towards max hosts
   128  		allHostsForDistro, err := host.Find(host.ByDistroId(d.Id))
   129  		if err != nil {
   130  			return nil, errors.Wrapf(err, "error fetching hosts for distro %v", d.Id)
   131  		}
   132  
   133  		// if there are more than the specified max hosts, then terminate
   134  		// some, if they are not running tasks
   135  		numExcessHosts := len(allHostsForDistro) - d.PoolSize
   136  		if numExcessHosts > 0 {
   137  
   138  			// track how many hosts for the distro are terminated
   139  			counter := 0
   140  
   141  			for _, host := range allHostsForDistro {
   142  
   143  				// if the host is not dynamically spun up (and can
   144  				// thus be terminated), skip it
   145  				canTerminate, err := hostCanBeTerminated(host, s)
   146  				if err != nil {
   147  					return nil, errors.Wrapf(err, "error checking if host %s can be terminated", host.Id)
   148  				}
   149  				if !canTerminate {
   150  					continue
   151  				}
   152  
   153  				// if the host is not running a task, it can be
   154  				// safely terminated
   155  				if host.RunningTask == "" {
   156  					excessHosts = append(excessHosts, host)
   157  					counter++
   158  				}
   159  
   160  				// break if we've marked enough to be terminated
   161  				if counter == numExcessHosts {
   162  					break
   163  				}
   164  			}
   165  
   166  			grip.Infof("Found %d excess hosts for distro %s", counter, d.Id)
   167  
   168  		}
   169  
   170  	}
   171  	return excessHosts, nil
   172  }
   173  
   174  // flagUnprovisionedHosts is a hostFlaggingFunc to get all hosts that are
   175  // taking too long to provision
   176  func flagUnprovisionedHosts(d []distro.Distro, s *evergreen.Settings) ([]host.Host, error) {
   177  	// fetch all hosts that are taking too long to provision
   178  	threshold := time.Now().Add(-ProvisioningCutoff)
   179  	hosts, err := host.Find(host.ByUnprovisionedSince(threshold))
   180  	if err != nil {
   181  		return nil, errors.Wrap(err, "error finding unprovisioned hosts")
   182  	}
   183  	return hosts, err
   184  }
   185  
   186  // flagProvisioningFailedHosts is a hostFlaggingFunc to get all hosts
   187  // whose provisioning failed
   188  func flagProvisioningFailedHosts(d []distro.Distro, s *evergreen.Settings) ([]host.Host, error) {
   189  	// fetch all hosts whose provisioning failed
   190  	hosts, err := host.Find(host.IsProvisioningFailure)
   191  	if err != nil {
   192  		return nil, errors.Wrap(err, "error finding hosts whose provisioning failed")
   193  	}
   194  	return hosts, nil
   195  
   196  }
   197  
   198  // flagExpiredHosts is a hostFlaggingFunc to get all user-spawned hosts
   199  // that have expired
   200  func flagExpiredHosts(d []distro.Distro, s *evergreen.Settings) ([]host.Host, error) {
   201  	// fetch the expired hosts
   202  	hosts, err := host.Find(host.ByExpiredSince(time.Now()))
   203  	if err != nil {
   204  		return nil, errors.Wrap(err, "error finding expired spawned hosts")
   205  	}
   206  	return hosts, nil
   207  
   208  }
   209  
   210  // helper to check if a host can be terminated
   211  func hostCanBeTerminated(h host.Host, s *evergreen.Settings) (bool, error) {
   212  	// get a cloud manager for the host
   213  	cloudManager, err := providers.GetCloudManager(h.Provider, s)
   214  	if err != nil {
   215  		return false, errors.Wrapf(err, "error getting cloud manager for host %s", h.Id)
   216  	}
   217  
   218  	// if the host is not part of a spawnable distro, then it was not
   219  	// dynamically spun up and as such cannot be terminated
   220  	canSpawn, err := cloudManager.CanSpawn()
   221  	if err != nil {
   222  		return false, errors.Wrapf(err, "error checking if cloud manager for host %s supports spawning")
   223  	}
   224  
   225  	return canSpawn, nil
   226  
   227  }