github.com/justinjmoses/evergreen@v0.0.0-20170530173719-1d50e381ff0d/scheduler/scheduler.go

package scheduler

import (
	"runtime"
	"sync"
	"time"

	"github.com/evergreen-ci/evergreen"
	"github.com/evergreen-ci/evergreen/cloud"
	"github.com/evergreen-ci/evergreen/cloud/providers"
	"github.com/evergreen-ci/evergreen/model"
	"github.com/evergreen-ci/evergreen/model/distro"
	"github.com/evergreen-ci/evergreen/model/event"
	"github.com/evergreen-ci/evergreen/model/host"
	"github.com/evergreen-ci/evergreen/model/task"
	"github.com/evergreen-ci/evergreen/model/version"
	"github.com/evergreen-ci/evergreen/util"
	"github.com/mongodb/grip"
	"github.com/pkg/errors"
)

// Scheduler is responsible for prioritizing and scheduling tasks to be run, on
// a per-distro basis.
type Scheduler struct {
	*evergreen.Settings
	TaskFinder
	TaskPrioritizer
	TaskDurationEstimator
	TaskQueuePersister
	HostAllocator
}

// versionBuildVariant is used to keep track of the version/buildvariant fields
// for tasks that are to be split by distro.
type versionBuildVariant struct {
	Version, BuildVariant string
}

// Schedule schedules all of the tasks to be run. It works by finding all of
// the tasks that are ready to be run, splitting them by distro, prioritizing
// them, and saving the per-distro queues. It then determines the number of new
// hosts to spin up for each distro, and spins them up.
func (s *Scheduler) Schedule() error {
	// make sure the correct static hosts are in the database
	grip.Info("Updating static hosts...")

	err := model.UpdateStaticHosts(s.Settings)
	if err != nil {
		return errors.Wrap(err, "error updating static hosts")
	}

	// find all tasks ready to be run
	grip.Info("Finding runnable tasks...")

	runnableTasks, err := s.FindRunnableTasks()
	if err != nil {
		return errors.Wrap(err, "Error finding runnable tasks")
	}

	grip.Infof("There are %d tasks ready to be run", len(runnableTasks))

	// split the tasks by distro
	tasksByDistro, taskRunDistros, err := s.splitTasksByDistro(runnableTasks)
	if err != nil {
		return errors.Wrap(err, "Error splitting tasks by distro to run on")
	}

	// load in all of the distros
	distros, err := distro.Find(distro.All)
	if err != nil {
		return errors.Wrap(err, "Error finding distros")
	}

	// get the expected run duration of all runnable tasks
	taskExpectedDuration, err := s.GetExpectedDurations(runnableTasks)
	if err != nil {
		return errors.Wrap(err, "Error getting expected task durations")
	}

	distroInputChan := make(chan distroSchedulerInput, len(distros))

	// put all of the needed input for the distro scheduler into a channel to
	// be read by the distro scheduling loop
	for _, d := range distros {
		runnableTasksForDistro := tasksByDistro[d.Id]
		if len(runnableTasksForDistro) == 0 {
			continue
		}
		distroInputChan <- distroSchedulerInput{
			distroId:               d.Id,
			runnableTasksForDistro: runnableTasksForDistro,
		}
	}

	// close the channel to signal that the loop reading from it can terminate
	close(distroInputChan)

	workers := runtime.NumCPU()
	wg := sync.WaitGroup{}
	wg.Add(workers)

	// make a channel to collect all of the results from scheduling the distros
	distroSchedulerResultChan := make(chan *distroSchedulerResult)

	// for each worker, create a new goroutine
	for i := 0; i < workers; i++ {
		go func() {
			defer wg.Done()
			// read the inputs for scheduling this distro
			for d := range distroInputChan {
				// schedule the distro
				res := s.scheduleDistro(d.distroId, d.runnableTasksForDistro, taskExpectedDuration)
				if res.err != nil {
					grip.Error(res.err)
				}

				// write the results out to a results channel
				distroSchedulerResultChan <- res
			}
		}()
	}

	// initialize a map of scheduler events
	schedulerEvents := map[string]event.TaskQueueInfo{}

	// map of distro id -> the task queue that was persisted for that distro
	taskQueueItems := make(map[string][]model.TaskQueueItem)

	resDoneChan := make(chan struct{})
	var errResult error
	go func() {
		defer close(resDoneChan)
		for res := range distroSchedulerResultChan {
			if res.err != nil {
				errResult = errors.Wrapf(res.err, "error scheduling tasks on distro %v", res.distroId)
				return
			}
			schedulerEvents[res.distroId] = res.schedulerEvent
			taskQueueItems[res.distroId] = res.taskQueueItem
		}
	}()

	// wait for the distro scheduler goroutines to complete
	wg.Wait()

	// the wait group has terminated, so the scheduler result channel can be closed
	close(distroSchedulerResultChan)

	// wait for the results to be collected
	<-resDoneChan

	if errResult != nil {
		return errResult
	}

	// split distros by name
	distrosByName := make(map[string]distro.Distro)
	for _, d := range distros {
		distrosByName[d.Id] = d
	}

	// fetch all hosts, split by distro
	allHosts, err := host.Find(host.IsLive)
	if err != nil {
		return errors.Wrap(err, "Error finding live hosts")
	}

	// figure out all hosts we have up - per distro
	hostsByDistro := make(map[string][]host.Host)
	for _, liveHost := range allHosts {
		hostsByDistro[liveHost.Distro.Id] = append(hostsByDistro[liveHost.Distro.Id],
			liveHost)
	}

	// add the number of hosts running in each distro to the event log
	for distroId, hosts := range hostsByDistro {
		taskQueueInfo := schedulerEvents[distroId]
		taskQueueInfo.NumHostsRunning = len(hosts)
		schedulerEvents[distroId] = taskQueueInfo
	}

	// construct the data that will be needed by the host allocator
	hostAllocatorData := HostAllocatorData{
		existingDistroHosts:  hostsByDistro,
		distros:              distrosByName,
		taskQueueItems:       taskQueueItems,
		taskRunDistros:       taskRunDistros,
		projectTaskDurations: taskExpectedDuration,
	}

	// figure out how many new hosts we need
	newHostsNeeded, err := s.NewHostsNeeded(hostAllocatorData, s.Settings)
	if err != nil {
		return errors.Wrap(err, "Error determining how many new hosts are needed")
	}

	// spawn up the hosts
	hostsSpawned, err := s.spawnHosts(newHostsNeeded)
	if err != nil {
		return errors.Wrap(err, "Error spawning new hosts")
	}

	if len(hostsSpawned) != 0 {
		grip.Infof("Hosts spawned (%d distros total), by:", len(hostsSpawned))
		for distro, hosts := range hostsSpawned {
			grip.Infoln("\t", "scheduling distro:", distro)
			for _, host := range hosts {
				grip.Infoln("\t\t", host.Id)
			}

			taskQueueInfo := schedulerEvents[distro]
			taskQueueInfo.NumHostsRunning += len(hosts)
			schedulerEvents[distro] = taskQueueInfo
		}
	} else {
		grip.Info("No new hosts spawned")
	}

	for d, t := range schedulerEvents {
		eventLog := event.SchedulerEventData{
			ResourceType:  event.ResourceTypeScheduler,
			TaskQueueInfo: t,
			DistroId:      d,
		}
		event.LogSchedulerEvent(eventLog)
	}

	return nil
}

type distroSchedulerInput struct {
	distroId               string
	runnableTasksForDistro []task.Task
}

type distroSchedulerResult struct {
	distroId       string
	schedulerEvent event.TaskQueueInfo
	taskQueueItem  []model.TaskQueueItem
	err            error
}

// scheduleDistro prioritizes the runnable tasks for a single distro, persists
// the resulting task queue, and records queue statistics for the event log.
func (s *Scheduler) scheduleDistro(distroId string, runnableTasksForDistro []task.Task,
	taskExpectedDuration model.ProjectTaskDurations) *distroSchedulerResult {

	res := distroSchedulerResult{
		distroId: distroId,
	}
	grip.Infof("Prioritizing %d tasks for distro: %s", len(runnableTasksForDistro), distroId)

	prioritizedTasks, err := s.PrioritizeTasks(s.Settings, runnableTasksForDistro)
	if err != nil {
		res.err = errors.Wrap(err, "Error prioritizing tasks")
		return &res
	}

	// persist the queue of tasks
	grip.Infoln("Saving task queue for distro", distroId)
	queuedTasks, err := s.PersistTaskQueue(distroId, prioritizedTasks, taskExpectedDuration)
	if err != nil {
		res.err = errors.Wrapf(err, "Error processing distro %s saving task queue", distroId)
		return &res
	}

	// track scheduled time for prioritized tasks
	err = task.SetTasksScheduledTime(prioritizedTasks, time.Now())
	if err != nil {
		res.err = errors.Wrapf(err,
			"Error processing distro %s setting scheduled time for prioritized tasks",
			distroId)
		return &res
	}
	res.taskQueueItem = queuedTasks

	var totalDuration time.Duration
	for _, item := range queuedTasks {
		totalDuration += item.ExpectedDuration
	}

	// initialize the task queue info
	res.schedulerEvent = event.TaskQueueInfo{
		TaskQueueLength:  len(queuedTasks),
		NumHostsRunning:  0,
		ExpectedDuration: totalDuration,
	}
	return &res
}

// updateVersionBuildVarMap takes in a version id and a map of "key -> buildvariant"
// (where "key" is of type "versionBuildVariant") and
// updates the map with an entry for the buildvariants associated with "versionStr".
func (s *Scheduler) updateVersionBuildVarMap(versionStr string,
	versionBuildVarMap map[versionBuildVariant]model.BuildVariant) error {

	version, err := version.FindOne(version.ById(versionStr))
	if err != nil {
		return err
	}

	if version == nil {
		return errors.Errorf("nil version returned for version '%s'", versionStr)
	}

	project := &model.Project{}
	err = model.LoadProjectInto([]byte(version.Config), version.Identifier, project)
	if err != nil {
		return errors.Wrapf(err, "unable to load project config for version %s", versionStr)
	}

	// create the buildvariant map (for lookup purposes)
	for _, buildVariant := range project.BuildVariants {
		key := versionBuildVariant{versionStr, buildVariant.Name}
		versionBuildVarMap[key] = buildVariant
	}

	return nil
}

// splitTasksByDistro takes in a list of tasks and splits them by distro.
// It returns a map of distro name -> tasks that can be run on that distro,
// and a map of task id -> distros that the task can be run on (for tasks
// that can be run on multiple distros).
func (s *Scheduler) splitTasksByDistro(tasksToSplit []task.Task) (
	map[string][]task.Task, map[string][]string, error) {
	tasksByDistro := make(map[string][]task.Task)
	taskRunDistros := make(map[string][]string)

	// map of versionBuildVariant -> build variant
	versionBuildVarMap := make(map[versionBuildVariant]model.BuildVariant)

	// insert the tasks into the appropriate distro's queue in our map
	for _, task := range tasksToSplit {
		key := versionBuildVariant{task.Version, task.BuildVariant}
		if _, exists := versionBuildVarMap[key]; !exists {
			err := s.updateVersionBuildVarMap(task.Version, versionBuildVarMap)
			if err != nil {
				grip.Infof("skipping %s after problem getting buildvariant map for task %s: %v",
					task.Version, task.Id, err)
				continue
			}
		}

		// get the build variant for the task
		buildVariant, ok := versionBuildVarMap[key]
		if !ok {
			grip.Infof("task %s has no buildvariant called '%s' on project %s",
				task.Id, task.BuildVariant, task.Project)
			continue
		}

		// get the task specification for the build variant
		var taskSpec model.BuildVariantTask
		for _, tSpec := range buildVariant.Tasks {
			if tSpec.Name == task.DisplayName {
				taskSpec = tSpec
				break
			}
		}

		// if no matching spec was found, log it and continue
		if taskSpec.Name == "" {
			grip.Infof("task %s has no matching spec for build variant %s on project %s",
				task.Id, task.BuildVariant, task.Project)
			continue
		}

		// use the specified distros for the task, or, if none are specified,
		// the default distros for the build variant
		distrosToUse := buildVariant.RunOn
		if len(taskSpec.Distros) != 0 {
			distrosToUse = taskSpec.Distros
		}
		// remove duplicates to avoid scheduling twice
		distrosToUse = util.UniqueStrings(distrosToUse)
		for _, d := range distrosToUse {
			tasksByDistro[d] = append(tasksByDistro[d], task)
		}

		// for tasks that can run on multiple distros, keep track of which
		// distros they will be scheduled on
		if len(distrosToUse) > 1 {
			taskRunDistros[task.Id] = distrosToUse
		}
	}

	return tasksByDistro, taskRunDistros, nil
}

// spawnHosts calls out to the embedded CloudManager to spawn hosts.
// It takes in a map of distro -> number of hosts to spawn for the distro, and
// returns a map of distro -> hosts spawned, and an error if one occurs.
func (s *Scheduler) spawnHosts(newHostsNeeded map[string]int) (
	map[string][]host.Host, error) {

	// loop over the distros, spawning the appropriate number of hosts
	// for each distro
	hostsSpawnedPerDistro := make(map[string][]host.Host)
	for distroId, numHostsToSpawn := range newHostsNeeded {

		if numHostsToSpawn == 0 {
			continue
		}

		hostsSpawnedPerDistro[distroId] = make([]host.Host, 0, numHostsToSpawn)
		for i := 0; i < numHostsToSpawn; i++ {
			d, err := distro.FindOne(distro.ById(distroId))
			if err != nil {
				err = errors.Wrapf(err, "Failed to find distro '%s'", distroId)
				grip.Error(err)
				continue
			}

			allDistroHosts, err := host.Find(host.ByDistroId(distroId))
			if err != nil {
				err = errors.Wrapf(err, "Error getting hosts for distro %s", distroId)
				grip.Error(err)
				continue
			}

			if len(allDistroHosts) >= d.PoolSize {
				grip.Error(errors.Errorf("Already at max (%d) hosts for distro '%s'",
					d.PoolSize, distroId))
				continue
			}

			cloudManager, err := providers.GetCloudManager(d.Provider, s.Settings)
			if err != nil {
				err = errors.Wrapf(err, "Error getting cloud manager for distro %s", distroId)
				grip.Error(err)
				continue
			}

			hostOptions := cloud.HostOptions{
				UserName: evergreen.User,
				UserHost: false,
			}
			newHost, err := cloudManager.SpawnInstance(d, hostOptions)
			if err != nil {
				err = errors.Wrapf(err, "error spawning instance for distro %s", distroId)
				grip.Error(err)
				continue
			}
			hostsSpawnedPerDistro[distroId] =
				append(hostsSpawnedPerDistro[distroId], *newHost)
		}
		// if none were spawned successfully, drop the empty entry for the distro
		if len(hostsSpawnedPerDistro[distroId]) == 0 {
			delete(hostsSpawnedPerDistro, distroId)
		}
	}
	return hostsSpawnedPerDistro, nil
}
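
// The sketch below is not part of the original file; it is a minimal,
// hypothetical illustration of how the components embedded in Scheduler are
// assembled and driven for a single scheduling pass. The concrete TaskFinder,
// TaskPrioritizer, TaskDurationEstimator, TaskQueuePersister, and
// HostAllocator implementations are assumed to be supplied by the caller
// (for example, from elsewhere in this package); only Schedule above is
// exercised here.
func exampleSchedulerPass(settings *evergreen.Settings, finder TaskFinder,
	prioritizer TaskPrioritizer, estimator TaskDurationEstimator,
	persister TaskQueuePersister, allocator HostAllocator) error {

	// assemble the scheduler from its pluggable pieces
	s := &Scheduler{
		Settings:              settings,
		TaskFinder:            finder,
		TaskPrioritizer:       prioritizer,
		TaskDurationEstimator: estimator,
		TaskQueuePersister:    persister,
		HostAllocator:         allocator,
	}

	// run one pass: find runnable tasks, build and persist per-distro task
	// queues, then spawn any hosts needed to run them
	return s.Schedule()
}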