github.com/justinjmoses/evergreen@v0.0.0-20170530173719-1d50e381ff0d/monitor/host_flagging.go (about) 1 package monitor 2 3 import ( 4 "time" 5 6 "github.com/evergreen-ci/evergreen" 7 "github.com/evergreen-ci/evergreen/cloud/providers" 8 "github.com/evergreen-ci/evergreen/model/distro" 9 "github.com/evergreen-ci/evergreen/model/host" 10 "github.com/mongodb/grip" 11 "github.com/pkg/errors" 12 ) 13 14 const ( 15 // ProvisioningCutoff is the threshold to consider as too long for a host to take provisioning 16 ProvisioningCutoff = 35 * time.Minute 17 18 // UnreachableCutoff is the threshold to wait for an unreachable host to become marked 19 // as reachable again before giving up and terminating it. 20 UnreachableCutoff = 10 * time.Minute 21 22 // IdleTimeCutoff is the amount of time we wait for an idle host to be marked as idle. 23 IdleTimeCutoff = 15 * time.Minute 24 25 // MaxTimeNextPayment is the amount of time we wait to have left before marking a host as idle 26 MaxTimeTilNextPayment = 5 * time.Minute 27 28 // CommunicationTimeCutoff is the limit to how much time has passed before the host is marked as idle 29 // due to the agent not being able to communicate with the API server. 30 CommunicationTimeCutoff = 10 * time.Minute 31 ) 32 33 type hostFlagger struct { 34 hostFlaggingFunc 35 Reason string 36 } 37 38 // function that takes in all distros - specified as a map of 39 // distro name -> distro info - as well as the mci settings, 40 // and spits out a list of hosts to be terminated 41 type hostFlaggingFunc func([]distro.Distro, *evergreen.Settings) ([]host.Host, error) 42 43 // flagDecommissionedHosts is a hostFlaggingFunc to get all hosts which should 44 // be terminated because they are decommissioned 45 func flagDecommissionedHosts(d []distro.Distro, s *evergreen.Settings) ([]host.Host, error) { 46 hosts, err := host.Find(host.IsDecommissioned) 47 if err != nil { 48 return nil, errors.Wrap(err, "error finding decommissioned hosts") 49 } 50 return hosts, nil 51 } 52 53 // flagUnreachableHosts is a hostFlaggingFunc to get all hosts which should 54 // be terminated because they are unreachable 55 func flagUnreachableHosts(d []distro.Distro, s *evergreen.Settings) ([]host.Host, error) { 56 threshold := time.Now().Add(-1 * UnreachableCutoff) 57 hosts, err := host.Find(host.ByUnreachableBefore(threshold)) 58 if err != nil { 59 return nil, errors.Wrapf(err, "error finding hosts unreachable since before %v", threshold) 60 } 61 return hosts, nil 62 } 63 64 // flagIdleHosts is a hostFlaggingFunc to get all hosts which have spent too 65 // long without running a task 66 func flagIdleHosts(d []distro.Distro, s *evergreen.Settings) ([]host.Host, error) { 67 // will ultimately contain all of the hosts determined to be idle 68 idleHosts := []host.Host{} 69 70 // fetch all hosts not currently running a task 71 freeHosts, err := host.Find(host.IsFree) 72 if err != nil { 73 return nil, errors.Wrap(err, "error finding free hosts") 74 } 75 76 // go through the hosts, and see if they have idled long enough to 77 // be terminated 78 for _, freeHost := range freeHosts { 79 80 // ask the host how long it has been idle 81 idleTime := freeHost.IdleTime() 82 83 // if the communication time is > 10 mins then there may not be an agent on the host. 84 communicationTime := time.Since(freeHost.LastCommunicationTime) 85 86 // get a cloud manager for the host 87 cloudManager, err := providers.GetCloudManager(freeHost.Provider, s) 88 if err != nil { 89 return nil, errors.Wrapf(err, "error getting cloud manager for host %v", freeHost.Id) 90 } 91 92 // if the host is not dynamically spun up (and can thus be terminated), 93 // skip it 94 canTerminate, err := hostCanBeTerminated(freeHost, s) 95 if err != nil { 96 return nil, errors.Wrapf(err, "error checking if host %v can be terminated", freeHost.Id) 97 } 98 if !canTerminate { 99 continue 100 } 101 102 // ask how long until the next payment for the host 103 tilNextPayment := cloudManager.TimeTilNextPayment(&freeHost) 104 105 // current determinants for idle: 106 // idle for at least 15 minutes or last communication time has been more than 10 mins and 107 // less than 5 minutes til next payment 108 if (communicationTime >= CommunicationTimeCutoff || idleTime >= IdleTimeCutoff) && 109 tilNextPayment <= MaxTimeTilNextPayment { 110 idleHosts = append(idleHosts, freeHost) 111 } 112 113 } 114 115 return idleHosts, nil 116 } 117 118 // flagExcessHosts is a hostFlaggingFunc to get all hosts that push their 119 // distros over the specified max hosts 120 func flagExcessHosts(distros []distro.Distro, s *evergreen.Settings) ([]host.Host, error) { 121 // will ultimately contain all the hosts that can be terminated 122 excessHosts := []host.Host{} 123 124 // figure out the excess hosts for each distro 125 for _, d := range distros { 126 127 // fetch any hosts for the distro that count towards max hosts 128 allHostsForDistro, err := host.Find(host.ByDistroId(d.Id)) 129 if err != nil { 130 return nil, errors.Wrapf(err, "error fetching hosts for distro %v", d.Id) 131 } 132 133 // if there are more than the specified max hosts, then terminate 134 // some, if they are not running tasks 135 numExcessHosts := len(allHostsForDistro) - d.PoolSize 136 if numExcessHosts > 0 { 137 138 // track how many hosts for the distro are terminated 139 counter := 0 140 141 for _, host := range allHostsForDistro { 142 143 // if the host is not dynamically spun up (and can 144 // thus be terminated), skip it 145 canTerminate, err := hostCanBeTerminated(host, s) 146 if err != nil { 147 return nil, errors.Wrapf(err, "error checking if host %s can be terminated", host.Id) 148 } 149 if !canTerminate { 150 continue 151 } 152 153 // if the host is not running a task, it can be 154 // safely terminated 155 if host.RunningTask == "" { 156 excessHosts = append(excessHosts, host) 157 counter++ 158 } 159 160 // break if we've marked enough to be terminated 161 if counter == numExcessHosts { 162 break 163 } 164 } 165 166 grip.Infof("Found %d excess hosts for distro %s", counter, d.Id) 167 168 } 169 170 } 171 return excessHosts, nil 172 } 173 174 // flagUnprovisionedHosts is a hostFlaggingFunc to get all hosts that are 175 // taking too long to provision 176 func flagUnprovisionedHosts(d []distro.Distro, s *evergreen.Settings) ([]host.Host, error) { 177 // fetch all hosts that are taking too long to provision 178 threshold := time.Now().Add(-ProvisioningCutoff) 179 hosts, err := host.Find(host.ByUnprovisionedSince(threshold)) 180 if err != nil { 181 return nil, errors.Wrap(err, "error finding unprovisioned hosts") 182 } 183 return hosts, err 184 } 185 186 // flagProvisioningFailedHosts is a hostFlaggingFunc to get all hosts 187 // whose provisioning failed 188 func flagProvisioningFailedHosts(d []distro.Distro, s *evergreen.Settings) ([]host.Host, error) { 189 // fetch all hosts whose provisioning failed 190 hosts, err := host.Find(host.IsProvisioningFailure) 191 if err != nil { 192 return nil, errors.Wrap(err, "error finding hosts whose provisioning failed") 193 } 194 return hosts, nil 195 196 } 197 198 // flagExpiredHosts is a hostFlaggingFunc to get all user-spawned hosts 199 // that have expired 200 func flagExpiredHosts(d []distro.Distro, s *evergreen.Settings) ([]host.Host, error) { 201 // fetch the expired hosts 202 hosts, err := host.Find(host.ByExpiredSince(time.Now())) 203 if err != nil { 204 return nil, errors.Wrap(err, "error finding expired spawned hosts") 205 } 206 return hosts, nil 207 208 } 209 210 // helper to check if a host can be terminated 211 func hostCanBeTerminated(h host.Host, s *evergreen.Settings) (bool, error) { 212 // get a cloud manager for the host 213 cloudManager, err := providers.GetCloudManager(h.Provider, s) 214 if err != nil { 215 return false, errors.Wrapf(err, "error getting cloud manager for host %s", h.Id) 216 } 217 218 // if the host is not part of a spawnable distro, then it was not 219 // dynamically spun up and as such cannot be terminated 220 canSpawn, err := cloudManager.CanSpawn() 221 if err != nil { 222 return false, errors.Wrapf(err, "error checking if cloud manager for host %s supports spawning") 223 } 224 225 return canSpawn, nil 226 227 }