github.com/justinjmoses/evergreen@v0.0.0-20170530173719-1d50e381ff0d/monitor/monitor.go (about) 1 package monitor 2 3 import ( 4 "time" 5 6 "github.com/evergreen-ci/evergreen" 7 "github.com/evergreen-ci/evergreen/alerts" 8 "github.com/evergreen-ci/evergreen/db" 9 "github.com/evergreen-ci/evergreen/model" 10 "github.com/evergreen-ci/evergreen/model/distro" 11 "github.com/evergreen-ci/evergreen/model/host" 12 "github.com/mongodb/grip" 13 "github.com/pkg/errors" 14 ) 15 16 var ( 17 // the functions the task monitor will run through to find tasks needing 18 // to be cleaned up 19 defaultTaskFlaggingFuncs = []taskFlaggingFunc{ 20 flagTimedOutHeartbeats, 21 } 22 23 // the functions the host monitor will run through to find hosts needing 24 // to be terminated 25 defaultHostFlaggingFuncs = []hostFlagger{ 26 {flagDecommissionedHosts, "decommissioned"}, 27 {flagUnreachableHosts, "unreachable"}, 28 {flagIdleHosts, "idle"}, 29 {flagExcessHosts, "excess"}, 30 {flagUnprovisionedHosts, "provision_timeout"}, 31 {flagProvisioningFailedHosts, "provision_failed"}, 32 {flagExpiredHosts, "expired"}, 33 } 34 35 // the functions the host monitor will run through to do simpler checks 36 defaultHostMonitoringFuncs = []hostMonitoringFunc{ 37 monitorReachability, 38 } 39 40 // the functions the notifier will use to build notifications that need 41 // to be sent 42 defaultNotificationBuilders = []notificationBuilder{ 43 spawnHostExpirationWarnings, 44 slowProvisioningWarnings, 45 } 46 ) 47 48 // run all monitoring functions 49 func RunAllMonitoring(settings *evergreen.Settings) error { 50 51 // load in all of the distros 52 distros, err := distro.Find(db.Q{}) 53 if err != nil { 54 return errors.Wrap(err, "error finding distros") 55 } 56 57 // fetch the project refs, which we will use to get all of the projects 58 projectRefs, err := model.FindAllProjectRefs() 59 if err != nil { 60 return errors.Wrap(err, "error loading in project refs") 61 } 62 63 // turn the project refs into a map of the project id -> project 64 projects := map[string]model.Project{} 65 var project *model.Project 66 67 for _, ref := range projectRefs { 68 // only monitor projects that are enabled 69 if !ref.Enabled { 70 continue 71 } 72 project, err = model.FindProject("", &ref) 73 74 // continue on error to stop the whole monitoring process from 75 // being held up 76 if err != nil { 77 grip.Errorf("error finding project %s: %+v", ref.Identifier, err) 78 continue 79 } 80 81 if project == nil { 82 grip.Errorf("no project entry found for ref %s", ref.Identifier) 83 continue 84 } 85 86 projects[project.Identifier] = *project 87 } 88 89 // initialize the task monitor 90 taskMonitor := &TaskMonitor{ 91 flaggingFuncs: defaultTaskFlaggingFuncs, 92 } 93 94 // clean up any necessary tasks 95 errs := withGlobalLock("task cleanup", 96 func() []error { return taskMonitor.CleanupTasks(projects) }) 97 for _, err := range errs { 98 grip.Error(errors.Wrap(err, "Error cleaning up tasks")) 99 } 100 101 // initialize the host monitor 102 hostMonitor := &HostMonitor{ 103 flaggingFuncs: defaultHostFlaggingFuncs, 104 monitoringFuncs: defaultHostMonitoringFuncs, 105 } 106 107 // clean up any necessary hosts 108 errs = withGlobalLock("host cleanup", 109 func() []error { return hostMonitor.CleanupHosts(distros, settings) }) 110 111 for _, err := range errs { 112 grip.Error(errors.Wrap(err, "Error cleaning up hosts")) 113 } 114 115 // run monitoring checks 116 errs = withGlobalLock("host monitoring", 117 func() []error { return hostMonitor.RunMonitoringChecks(settings) }) 118 for _, err := range errs { 119 grip.Error(errors.Wrap(err, "Error running host monitoring checks")) 120 } 121 122 // initialize the notifier 123 notifier := &Notifier{ 124 notificationBuilders: defaultNotificationBuilders, 125 } 126 127 // send notifications 128 errs = notifier.Notify(settings) 129 for _, err := range errs { 130 grip.Error(errors.Wrap(err, "Error sending notifications")) 131 } 132 133 // Do alerts for spawnhosts - collect all hosts expiring in the next 12 hours. 134 // The trigger logic will filter out any hosts that aren't in a notification window, or have 135 // already have alerts sent. 136 now := time.Now() 137 thresholdTime := now.Add(12 * time.Hour) 138 expiringSoonHosts, err := host.Find(host.ByExpiringBetween(now, thresholdTime)) 139 if err != nil { 140 return errors.WithStack(err) 141 } 142 143 for _, h := range expiringSoonHosts { 144 err := alerts.RunSpawnWarningTriggers(&h) 145 146 grip.Error(errors.Wrap(err, "Error queuing alert")) 147 } 148 149 return nil 150 } 151 152 // withGlobalLock is a wrapper for grabbing the global lock for each segment of the monitor. 153 func withGlobalLock(name string, f func() []error) (errs []error) { 154 grip.Debugln("Attempting to acquire global lock for monitor:", name) 155 // sleep for 1 second to give other spinning locks a chance to preempt this one 156 time.Sleep(time.Second) 157 acquired, err := db.WaitTillAcquireGlobalLock(name, db.LockTimeout) 158 if err != nil { 159 grip.Errorf("problem acquiring global lock for monitor %s: %+v", name, err) 160 return []error{errors.Errorf("error acquiring global lock for %s: %v", name, err)} 161 } 162 163 if !acquired { 164 grip.Errorln("Timed out attempting to acquire global lock for monitor:", name) 165 return []error{errors.Errorf("timed out acquiring global lock for monitor %s", name)} 166 } 167 defer func() { 168 grip.Debugln("Releasing global lock for monitor", name) 169 if err := db.ReleaseGlobalLock(name); err != nil { 170 grip.Errorf("Error releasing global lock for monitor %s: %+v", name, err) 171 errs = append(errs, errors.Errorf("error releasing global lock for monitor %v: %v", name, err)) 172 } else { 173 grip.Debugln("Released global lock for monitor:", name) 174 } 175 }() 176 grip.Debugln("Acquired global lock for monitor", name) 177 errs = f() 178 return errs 179 }