github.com/justinjmoses/evergreen@v0.0.0-20170530173719-1d50e381ff0d/scheduler/scheduler.go

     1  package scheduler
     2  
     3  import (
     4  	"runtime"
     5  	"sync"
     6  	"time"
     7  
     8  	"github.com/evergreen-ci/evergreen"
     9  	"github.com/evergreen-ci/evergreen/cloud"
    10  	"github.com/evergreen-ci/evergreen/cloud/providers"
    11  	"github.com/evergreen-ci/evergreen/model"
    12  	"github.com/evergreen-ci/evergreen/model/distro"
    13  	"github.com/evergreen-ci/evergreen/model/event"
    14  	"github.com/evergreen-ci/evergreen/model/host"
    15  	"github.com/evergreen-ci/evergreen/model/task"
    16  	"github.com/evergreen-ci/evergreen/model/version"
    17  	"github.com/evergreen-ci/evergreen/util"
    18  	"github.com/mongodb/grip"
    19  	"github.com/pkg/errors"
    20  )
    21  
    22  // Scheduler is responsible for prioritizing and scheduling tasks to be run, on
    23  // a per-distro basis.
    24  type Scheduler struct {
    25  	*evergreen.Settings
    26  	TaskFinder
    27  	TaskPrioritizer
    28  	TaskDurationEstimator
    29  	TaskQueuePersister
    30  	HostAllocator
    31  }
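
        // A minimal wiring sketch, for illustration only.  The concrete implementation
        // names below are assumptions (they may not match the ones defined elsewhere in
        // this package), and "settings" stands in for a loaded *evergreen.Settings:
        //
        //	sched := &Scheduler{
        //		Settings:              settings,
        //		TaskFinder:            &DBTaskFinder{},
        //		TaskPrioritizer:       &CmpBasedTaskPrioritizer{},
        //		TaskDurationEstimator: &DBTaskDurationEstimator{},
        //		TaskQueuePersister:    &DBTaskQueuePersister{},
        //		HostAllocator:         &DurationBasedHostAllocator{},
        //	}
        //	if err := sched.Schedule(); err != nil {
        //		grip.Error(err)
        //	}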
    32  
    33  // versionBuildVariant is used to keep track of the version/buildvariant fields
    34  // for tasks that are to be split by distro
    35  type versionBuildVariant struct {
    36  	Version, BuildVariant string
    37  }
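
        // A versionBuildVariant is used as a composite key into the map of cached
        // build variants (see splitTasksByDistro below); illustrative values only:
        //
        //	key := versionBuildVariant{Version: "someVersionId", BuildVariant: "linux-64"}
        //	bv, ok := versionBuildVarMap[key]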
    38  
    39  // Schedule prioritizes and schedules all of the tasks to be run.  It works by
    40  // finding all of the tasks that are ready to be run, splitting them by distro,
    41  // prioritizing them, and saving the per-distro queues.  It then determines the
    42  // number of new hosts to spin up for each distro, and spins them up.
    43  func (s *Scheduler) Schedule() error {
    44  	// make sure the correct static hosts are in the database
    45  	grip.Info("Updating static hosts...")
    46  
    47  	err := model.UpdateStaticHosts(s.Settings)
    48  	if err != nil {
    49  		return errors.Wrap(err, "error updating static hosts")
    50  	}
    51  
    52  	// find all tasks ready to be run
    53  	grip.Info("Finding runnable tasks...")
    54  
    55  	runnableTasks, err := s.FindRunnableTasks()
    56  	if err != nil {
    57  		return errors.Wrap(err, "Error finding runnable tasks")
    58  	}
    59  
    60  	grip.Infof("There are %d tasks ready to be run", len(runnableTasks))
    61  
    62  	// split the tasks by distro
    63  	tasksByDistro, taskRunDistros, err := s.splitTasksByDistro(runnableTasks)
    64  	if err != nil {
    65  		return errors.Wrap(err, "Error splitting tasks by distro to run on")
    66  	}
    67  
    68  	// load in all of the distros
    69  	distros, err := distro.Find(distro.All)
    70  	if err != nil {
    71  		return errors.Wrap(err, "Error finding distros")
    72  	}
    73  
    74  	// get the expected run duration of all runnable tasks
    75  	taskExpectedDuration, err := s.GetExpectedDurations(runnableTasks)
    76  
    77  	if err != nil {
    78  		return errors.Wrap(err, "Error getting expected task durations")
    79  	}
    80  
    81  	distroInputChan := make(chan distroSchedulerInput, len(distros))
    82  
    83  	// put all of the needed input for the distro scheduler into a channel to be read by the
    84  	// distro scheduling loop.
    85  	for _, d := range distros {
    86  		runnableTasksForDistro := tasksByDistro[d.Id]
    87  		if len(runnableTasksForDistro) == 0 {
    88  			continue
    89  		}
    90  		distroInputChan <- distroSchedulerInput{
    91  			distroId:               d.Id,
    92  			runnableTasksForDistro: runnableTasksForDistro,
    93  		}
    94  
    95  	}
    96  
    97  	// close the channel so the worker loops reading from it will terminate
    98  	close(distroInputChan)
    99  	workers := runtime.NumCPU()
   100  
   101  	wg := sync.WaitGroup{}
   102  	wg.Add(workers)
   103  
   104  	// make a channel to collect all of the results from scheduling the distros
   105  	distroSchedulerResultChan := make(chan *distroSchedulerResult)
   106  
   107  	// for each worker, create a new goroutine
   108  	for i := 0; i < workers; i++ {
   109  		go func() {
   110  			defer wg.Done()
   111  			// read the inputs for scheduling this distro
   112  			for d := range distroInputChan {
   113  				// schedule the distro
   114  				res := s.scheduleDistro(d.distroId, d.runnableTasksForDistro, taskExpectedDuration)
   115  				if res.err != nil {
   116  					grip.Error(res.err)
   117  				}
   118  
   119  				// write the results out to a results channel
   120  				distroSchedulerResultChan <- res
   121  
   122  			}
   123  		}()
   124  	}
   125  
   126  	// initialize a map of scheduler events
   127  	schedulerEvents := map[string]event.TaskQueueInfo{}
   128  
   129  	// map of distro id -> the prioritized task queue items for that distro
   130  	taskQueueItems := make(map[string][]model.TaskQueueItem)
   131  
   132  	resDoneChan := make(chan struct{})
   133  	var errResult error
   134  	go func() {
   135  		defer close(resDoneChan)
   136  		for res := range distroSchedulerResultChan {
   137  			if res.err != nil {
   138  				errResult = errors.Wrapf(res.err, "error scheduling tasks on distro %v", res.distroId)
   139  				continue // keep draining the channel so the worker goroutines are not blocked sending
   140  			}
   141  			schedulerEvents[res.distroId] = res.schedulerEvent
   142  			taskQueueItems[res.distroId] = res.taskQueueItem
   143  		}
   144  	}()
   145  
   146  	// wait for the distro scheduler goroutines to complete
   147  	wg.Wait()
   148  
   149  	// all of the workers have finished, so the results channel can be closed
   150  	close(distroSchedulerResultChan)
   151  
   152  	// wait for the results to be collected
   153  	<-resDoneChan
   154  
   155  	if errResult != nil {
   156  		return errResult
   157  	}
   158  
   159  	// map the distros by id for quick lookup
   160  	distrosByName := make(map[string]distro.Distro)
   161  	for _, d := range distros {
   162  		distrosByName[d.Id] = d
   163  	}
   164  
   165  	// fetch all hosts, split by distro
   166  	allHosts, err := host.Find(host.IsLive)
   167  	if err != nil {
   168  		return errors.Wrap(err, "Error finding live hosts")
   169  	}
   170  
   171  	// figure out all hosts we have up - per distro
   172  	hostsByDistro := make(map[string][]host.Host)
   173  	for _, liveHost := range allHosts {
   174  		hostsByDistro[liveHost.Distro.Id] = append(hostsByDistro[liveHost.Distro.Id],
   175  			liveHost)
   176  	}
   177  
   178  	// record the number of running hosts per distro in the scheduler events
   179  	for distroId, hosts := range hostsByDistro {
   180  		taskQueueInfo := schedulerEvents[distroId]
   181  		taskQueueInfo.NumHostsRunning = len(hosts)
   182  		schedulerEvents[distroId] = taskQueueInfo
   183  	}
   184  
   185  	// construct the data that will be needed by the host allocator
   186  	hostAllocatorData := HostAllocatorData{
   187  		existingDistroHosts:  hostsByDistro,
   188  		distros:              distrosByName,
   189  		taskQueueItems:       taskQueueItems,
   190  		taskRunDistros:       taskRunDistros,
   191  		projectTaskDurations: taskExpectedDuration,
   192  	}
   193  
   194  	// figure out how many new hosts we need
   195  	newHostsNeeded, err := s.NewHostsNeeded(hostAllocatorData, s.Settings)
   196  	if err != nil {
   197  		return errors.Wrap(err, "Error determining how many new hosts are needed")
   198  	}
   199  
   200  	// spawn the new hosts
   201  	hostsSpawned, err := s.spawnHosts(newHostsNeeded)
   202  	if err != nil {
   203  		return errors.Wrap(err, "Error spawning new hosts")
   204  	}
   205  
   206  	if len(hostsSpawned) != 0 {
   207  		grip.Infof("Hosts spawned (%d distros total):", len(hostsSpawned))
   208  		for distro, hosts := range hostsSpawned {
   209  			grip.Infoln("\t", "scheduling distro:", distro)
   210  			for _, host := range hosts {
   211  				grip.Infoln("\t\t", host.Id)
   212  			}
   213  
   214  			taskQueueInfo := schedulerEvents[distro]
   215  			taskQueueInfo.NumHostsRunning += len(hosts)
   216  			schedulerEvents[distro] = taskQueueInfo
   217  		}
   218  	} else {
   219  		grip.Info("No new hosts spawned")
   220  	}
   221  
   222  	for d, t := range schedulerEvents {
   223  		eventLog := event.SchedulerEventData{
   224  			ResourceType:  event.ResourceTypeScheduler,
   225  			TaskQueueInfo: t,
   226  			DistroId:      d,
   227  		}
   228  		event.LogSchedulerEvent(eventLog)
   229  	}
   230  
   231  	return nil
   232  }
   233  
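        // distroSchedulerInput is the unit of work handed to the distro scheduling
        // workers over distroInputChan: the distro to schedule and the tasks that can
        // run on it.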
   234  type distroSchedulerInput struct {
   235  	distroId               string
   236  	runnableTasksForDistro []task.Task
   237  }
   238  
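        // distroSchedulerResult carries the outcome of scheduling a single distro back
        // to Schedule over distroSchedulerResultChan: the persisted task queue items,
        // the scheduler event info, and any error encountered.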
   239  type distroSchedulerResult struct {
   240  	distroId       string
   241  	schedulerEvent event.TaskQueueInfo
   242  	taskQueueItem  []model.TaskQueueItem
   243  	err            error
   244  }
   245  
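        // scheduleDistro prioritizes the runnable tasks for a single distro, persists
        // the resulting task queue, and records the time at which the prioritized
        // tasks were scheduled.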
   246  func (s *Scheduler) scheduleDistro(distroId string, runnableTasksForDistro []task.Task,
   247  	taskExpectedDuration model.ProjectTaskDurations) *distroSchedulerResult {
   248  
   249  	res := distroSchedulerResult{
   250  		distroId: distroId,
   251  	}
   252  	grip.Infof("Prioritizing %d tasks for distro: %s", len(runnableTasksForDistro), distroId)
   253  
   254  	prioritizedTasks, err := s.PrioritizeTasks(s.Settings,
   255  		runnableTasksForDistro)
   256  	if err != nil {
   257  		res.err = errors.Wrap(err, "Error prioritizing tasks")
   258  		return &res
   259  	}
   260  
   261  	// persist the queue of tasks
   262  	grip.Infoln("Saving task queue for distro", distroId)
   263  	queuedTasks, err := s.PersistTaskQueue(distroId, prioritizedTasks,
   264  		taskExpectedDuration)
   265  	if err != nil {
   266  		res.err = errors.Wrapf(err, "Error processing distro %s saving task queue", distroId)
   267  		return &res
   268  	}
   269  
   270  	// track scheduled time for prioritized tasks
   271  	err = task.SetTasksScheduledTime(prioritizedTasks, time.Now())
   272  	if err != nil {
   273  		res.err = errors.Wrapf(err,
   274  			"Error processing distro %s setting scheduled time for prioritized tasks",
   275  			distroId)
   276  		return &res
   277  	}
   278  	res.taskQueueItem = queuedTasks
   279  
   280  	var totalDuration time.Duration
   281  	for _, item := range queuedTasks {
   282  		totalDuration += item.ExpectedDuration
   283  	}
   284  	// initialize the task queue info
   285  	res.schedulerEvent = event.TaskQueueInfo{
   286  		TaskQueueLength:  len(queuedTasks),
   287  		NumHostsRunning:  0,
   288  		ExpectedDuration: totalDuration,
   289  	}
   290  	return &res
   291  
   292  }
   293  
   294  // updateVersionBuildVarMap takes in a version id and a map of "key -> buildvariant"
   295  // (where "key" is of type "versionBuildVariant") and updates the map with an
   296  // entry for each buildvariant associated with "versionStr".
   297  func (s *Scheduler) updateVersionBuildVarMap(versionStr string,
   298  	versionBuildVarMap map[versionBuildVariant]model.BuildVariant) error {
   299  
   300  	version, err := version.FindOne(version.ById(versionStr))
   301  	if err != nil {
   302  		return err
   303  	}
   304  
   305  	if version == nil {
   306  		return errors.Errorf("nil version returned for version '%s'", versionStr)
   307  	}
   308  
   309  	project := &model.Project{}
   310  
   311  	err = model.LoadProjectInto([]byte(version.Config), version.Identifier, project)
   312  	if err != nil {
   313  		return errors.Wrapf(err, "unable to load project config for version %s", versionStr)
   314  	}
   315  
   316  	// create buildvariant map (for accessing purposes)
   317  	for _, buildVariant := range project.BuildVariants {
   318  		key := versionBuildVariant{versionStr, buildVariant.Name}
   319  		versionBuildVarMap[key] = buildVariant
   320  	}
   321  
   322  	return nil
   323  }
   324  
   325  // splitTasksByDistro takes in a list of tasks and splits them by distro.
   326  // It returns a map of distro name -> tasks that can be run on that distro
   327  // and a map of task id -> distros that the task can be run on (for tasks
   328  // that can be run on multiple distros).
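        //
        // For example (illustrative ids only), a task "t1" that can run on distros
        // "ubuntu1604" and "rhel70", plus a task "t2" pinned to "ubuntu1604", would
        // yield:
        //
        //	tasksByDistro:  {"ubuntu1604": [t1, t2], "rhel70": [t1]}
        //	taskRunDistros: {"t1": ["ubuntu1604", "rhel70"]}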
   329  func (s *Scheduler) splitTasksByDistro(tasksToSplit []task.Task) (
   330  	map[string][]task.Task, map[string][]string, error) {
   331  	tasksByDistro := make(map[string][]task.Task)
   332  	taskRunDistros := make(map[string][]string)
   333  
   334  	// map of versionBuildVariant -> build variant
   335  	versionBuildVarMap := make(map[versionBuildVariant]model.BuildVariant)
   336  
   337  	// insert the tasks into the appropriate distro's queue in our map
   338  	for _, task := range tasksToSplit {
   339  		key := versionBuildVariant{task.Version, task.BuildVariant}
   340  		if _, exists := versionBuildVarMap[key]; !exists {
   341  			err := s.updateVersionBuildVarMap(task.Version, versionBuildVarMap)
   342  			if err != nil {
   343  				grip.Infof("skipping %s after problem getting buildvariant map for task %s: %v",
   344  					task.Version, task.Id, err)
   345  				continue
   346  			}
   347  		}
   348  
   349  		// get the build variant for the task
   350  		buildVariant, ok := versionBuildVarMap[key]
   351  		if !ok {
   352  			grip.Infof("task %s has no buildvariant called '%s' on project %s",
   353  				task.Id, task.BuildVariant, task.Project)
   354  			continue
   355  		}
   356  
   357  		// get the task specification for the build variant
   358  		var taskSpec model.BuildVariantTask
   359  		for _, tSpec := range buildVariant.Tasks {
   360  			if tSpec.Name == task.DisplayName {
   361  				taskSpec = tSpec
   362  				break
   363  			}
   364  		}
   365  
   366  		// if no matching spec was found log it and continue
   367  		if taskSpec.Name == "" {
   368  			grip.Infof("task %s has no matching spec for build variant %s on project %s",
   369  				task.Id, task.BuildVariant, task.Project)
   370  			continue
   371  		}
   372  
   373  		// use the specified distros for the task, or, if none are specified,
   374  		// the default distros for the build variant
   375  		distrosToUse := buildVariant.RunOn
   376  		if len(taskSpec.Distros) != 0 {
   377  			distrosToUse = taskSpec.Distros
   378  		}
   379  		// remove duplicates to avoid scheduling twice
   380  		distrosToUse = util.UniqueStrings(distrosToUse)
   381  		for _, d := range distrosToUse {
   382  			tasksByDistro[d] = append(tasksByDistro[d], task)
   383  		}
   384  
   385  		// for tasks that can run on multiple distros, keep track of which
   386  		// distros they will be scheduled on
   387  		if len(distrosToUse) > 1 {
   388  			taskRunDistros[task.Id] = distrosToUse
   389  		}
   390  	}
   391  
   392  	return tasksByDistro, taskRunDistros, nil
   393  
   394  }
   395  
   396  // spawnHosts calls out to the embedded CloudManager to spawn hosts.  It takes
   397  // in a map of distro -> number of hosts to spawn for the distro, and returns a
   398  // map of distro -> hosts spawned, along with an error if one occurs.
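        //
        // For example (illustrative values only), an input of
        //
        //	map[string]int{"ubuntu1604": 2, "windows-64": 0}
        //
        // attempts to spawn two ubuntu1604 hosts and skips windows-64 entirely;
        // distros whose spawn attempts all fail are omitted from the returned map.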
   399  func (s *Scheduler) spawnHosts(newHostsNeeded map[string]int) (
   400  	map[string][]host.Host, error) {
   401  
   402  	// loop over the distros, spawning up the appropriate number of hosts
   403  	// for each distro
   404  	hostsSpawnedPerDistro := make(map[string][]host.Host)
   405  	for distroId, numHostsToSpawn := range newHostsNeeded {
   406  
   407  		if numHostsToSpawn == 0 {
   408  			continue
   409  		}
   410  
   411  		hostsSpawnedPerDistro[distroId] = make([]host.Host, 0, numHostsToSpawn)
   412  		for i := 0; i < numHostsToSpawn; i++ {
   413  			d, err := distro.FindOne(distro.ById(distroId))
   414  			if err != nil {
   415  				err = errors.Wrapf(err, "Failed to find distro '%s'", distroId)
   416  				grip.Error(err)
   417  				continue
   418  			}
   419  
   420  			allDistroHosts, err := host.Find(host.ByDistroId(distroId))
   421  			if err != nil {
   422  				err = errors.Wrapf(err, "Error getting hosts for distro %s", distroId)
   423  				grip.Error(err)
   424  				continue
   425  			}
   426  
   427  			if len(allDistroHosts) >= d.PoolSize {
   428  				err = errors.Errorf("Already at max (%d) hosts for distro '%s'",
   429  					d.PoolSize, distroId)
   430  				grip.Error(err)
   431  				continue
   432  			}
   433  
   434  			cloudManager, err := providers.GetCloudManager(d.Provider, s.Settings)
   435  			if err != nil {
   436  				err = errors.Wrapf(err, "Error getting cloud manager for distro %s", distroId)
   437  				grip.Error(err)
   438  				continue
   439  			}
   440  
   441  			hostOptions := cloud.HostOptions{
   442  				UserName: evergreen.User,
   443  				UserHost: false,
   444  			}
   445  			newHost, err := cloudManager.SpawnInstance(d, hostOptions)
   446  			if err != nil {
   447  				err = errors.Wrapf(err, "error spawning instance for distro %s", distroId)
   448  				grip.Error(err)
   449  				continue
   450  			}
   451  			hostsSpawnedPerDistro[distroId] =
   452  				append(hostsSpawnedPerDistro[distroId], *newHost)
   453  
   454  		}
   455  		// if none were spawned successfully
   456  		if len(hostsSpawnedPerDistro[distroId]) == 0 {
   457  			delete(hostsSpawnedPerDistro, distroId)
   458  		}
   459  	}
   460  	return hostsSpawnedPerDistro, nil
   461  }