github.com/billybanfield/evergreen@v0.0.0-20170525200750-eeee692790f7/model/audit.go (about)

     1  package model
     2  
import (
	"fmt"
	"sort"

	"github.com/evergreen-ci/evergreen/db"
	"github.com/evergreen-ci/evergreen/db/bsonutil"
	"github.com/evergreen-ci/evergreen/model/host"
	"github.com/evergreen-ci/evergreen/model/task"
	"github.com/pkg/errors"
	"gopkg.in/mgo.v2/bson"
)
    13  
    14  // HostTaskInconsistency represents a mismatch between task and host documents.
    15  // It contains both the host and task's view of their relationship.
    16  // Implements the Error interface, which returns a full string describing
    17  // the nature of the mismatch.
    18  type HostTaskInconsistency struct {
    19  	Host          string
    20  	HostTaskCache string
    21  	Task          string
    22  	TaskHostCache string
    23  }
    24  
// StuckHostInconsistency represents a host that still lists a running
// task even though that task has been marked as completed.
//
// Host is the host's id, RunningTask is the task id the host still reports
// as running, and TaskStatus is the status recorded on that task. The bson
// tags name the fields that CheckStuckHosts projects in its aggregation,
// so they must stay in sync with that pipeline.
type StuckHostInconsistency struct {
	Host        string `bson:"host_id"`
	RunningTask string `bson:"running_task"`
	TaskStatus  string `bson:"task_status"`
}
    32  
// Field names used by CheckStuckHosts when building its aggregation.
//
// StuckHostKey, StuckHostRunningTaskKey and StuckHostTaskStatusKey are
// obtained from the StuckHostInconsistency struct via bsonutil.MustHaveTag
// (defined outside this file; presumably it returns the bson tag of the
// named field -- confirm against bsonutil). They are used as the output
// field names of the pipeline's $project stage.
//
// HostTaskKey is the name under which the pipeline's $lookup stage stores
// the joined task document on each host.
var (
	StuckHostKey            = bsonutil.MustHaveTag(StuckHostInconsistency{}, "Host")
	StuckHostRunningTaskKey = bsonutil.MustHaveTag(StuckHostInconsistency{}, "RunningTask")
	StuckHostTaskStatusKey  = bsonutil.MustHaveTag(StuckHostInconsistency{}, "TaskStatus")
	HostTaskKey             = "task"
)
    39  
    40  // Error returns a human-readible explanation of a HostTaskInconsistency.
    41  func (i HostTaskInconsistency) Error() string {
    42  	switch {
    43  	case i.Task == "" && i.TaskHostCache == "":
    44  		return fmt.Sprintf("host %s says it is running task %s, which does not exist",
    45  			i.Host, i.HostTaskCache)
    46  	case i.Host == "" && i.HostTaskCache == "":
    47  		return fmt.Sprintf("task %s says it is running on host %s, which is not a running host",
    48  			i.Task, i.TaskHostCache)
    49  	case i.HostTaskCache == i.Task:
    50  		return fmt.Sprintf(
    51  			"host %s says it is running task %s, but that task says it is assigned to %s",
    52  			i.Host, i.Task, i.TaskHostCache)
    53  	case i.TaskHostCache == i.Host:
    54  		return fmt.Sprintf(
    55  			"task %s says it is running on host %s, but that host says it is running %s",
    56  			i.Task, i.Host, i.HostTaskCache)
    57  	default:
    58  		// this should never be hit
    59  		return fmt.Sprintf("inconsistent mapping: %s/%s, %s/%s",
    60  			i.Host, i.HostTaskCache, i.Task, i.TaskHostCache)
    61  	}
    62  }
    63  
    64  // AuditHostTaskConsistency finds all running tasks and running hosts and compares
    65  // their caches of what host/task they are assigned to. Returns a slice of any mappings
    66  // that are not 1:1 and any errors that occur.
    67  //
    68  // NOTE: the error returned ONLY represents issues communicating with the database.
    69  // HostTaskInconsistency implements the error interface, but it is up to the caller
    70  // to cast the inconsistencies into an error type if they desire.
    71  func AuditHostTaskConsistency() ([]HostTaskInconsistency, error) {
    72  	hostToTask, taskToHost, err := loadHostTaskMapping()
    73  	if err != nil {
    74  		return nil, err
    75  	}
    76  	return auditHostTaskMapping(hostToTask, taskToHost), nil
    77  }
    78  
    79  // loadHostTaskMapping queries the DB for hosts with tasks, the tasks assigned in the hosts'
    80  // running task fields, all running (or dispatched) tasks, and the hosts in those tasks'
    81  // host id field. Returns a mapping of host Ids to task Ids and task Ids to host Ids,
    82  // representing both directions of the relationship.
    83  func loadHostTaskMapping() (map[string]string, map[string]string, error) {
    84  	hostToTask := map[string]string{}
    85  	hostTaskIds := []string{}
    86  	taskToHost := map[string]string{}
    87  	taskHostIds := []string{}
    88  
    89  	// fetch all hosts with running tasks and then all of the tasks the hosts
    90  	// say they are running.
    91  	runningHosts, err := host.Find(host.IsRunningTask)
    92  	if err != nil {
    93  		return nil, nil, errors.Wrapf(err, "querying for running hosts:")
    94  	}
    95  
    96  	for _, h := range runningHosts {
    97  		hostTaskIds = append(hostTaskIds, h.RunningTask)
    98  	}
    99  	hostsTasks, err := task.Find(task.ByIds(hostTaskIds))
   100  	if err != nil {
   101  		return nil, nil, errors.Wrapf(err, "querying for hosts' tasks:")
   102  	}
   103  
   104  	// fetch all tasks with an assigned host and the hosts they say
   105  	// they are assigned to
   106  	runningTasks, err := task.Find(task.IsDispatchedOrStarted)
   107  	if err != nil {
   108  		return nil, nil, errors.Wrapf(err, "querying for running tasks:")
   109  	}
   110  	for _, t := range append(hostsTasks, runningTasks...) {
   111  		taskToHost[t.Id] = t.HostId
   112  		taskHostIds = append(taskHostIds, t.HostId)
   113  	}
   114  	tasksHosts, err := host.Find(host.ByIds(taskHostIds))
   115  	if err != nil {
   116  		return nil, nil, errors.Wrapf(err, "querying for tasks' hosts:")
   117  	}
   118  
   119  	// we only want to have running hosts that are not empty.
   120  	for _, h := range append(runningHosts, tasksHosts...) {
   121  		// if the running task is empty don't add it to the map
   122  		if h.RunningTask != "" {
   123  			hostToTask[h.Id] = h.RunningTask
   124  		}
   125  	}
   126  
   127  	return hostToTask, taskToHost, nil
   128  }
   129  
   130  // auditHostMapping takes a mapping of hosts->tasks and tasks->hosts and
   131  // returns descriptions of any inconsistencies.
   132  func auditHostTaskMapping(hostToTask, taskToHost map[string]string) []HostTaskInconsistency {
   133  	found := []HostTaskInconsistency{}
   134  	// cases where a host thinks its running a task that it isn't
   135  	for h, t := range hostToTask {
   136  		cachedTask, ok := taskToHost[t]
   137  		if !ok {
   138  			// host thinks it is running a task that does not exist
   139  			found = append(found, HostTaskInconsistency{
   140  				Host:          h,
   141  				HostTaskCache: t,
   142  			})
   143  		} else {
   144  			if cachedTask != h {
   145  				found = append(found, HostTaskInconsistency{
   146  					Host:          h,
   147  					HostTaskCache: t,
   148  					Task:          t,
   149  					TaskHostCache: cachedTask,
   150  				})
   151  			}
   152  		}
   153  	}
   154  	// cases where a task thinks it is running on a host that isnt running it
   155  	for t, h := range taskToHost {
   156  		cachedHost, ok := hostToTask[h]
   157  		if !ok {
   158  			// task thinks it is running on a host that does not exist
   159  			found = append(found, HostTaskInconsistency{
   160  				Task:          t,
   161  				TaskHostCache: h,
   162  			})
   163  		} else {
   164  			if cachedHost != t {
   165  				found = append(found, HostTaskInconsistency{
   166  					Task:          t,
   167  					TaskHostCache: h,
   168  					Host:          h,
   169  					HostTaskCache: cachedHost,
   170  				})
   171  			}
   172  		}
   173  	}
   174  	return found
   175  }
   176  
   177  func (shi StuckHostInconsistency) Error() string {
   178  	return fmt.Sprintf(
   179  		"host %s has a running task %s with complete status %s", shi.Host, shi.RunningTask, shi.TaskStatus)
   180  }
   181  
   182  // CheckStuckHosts queries for hosts that tasks that are
   183  // completed but that still have them as a running task
   184  func CheckStuckHosts() ([]StuckHostInconsistency, error) {
   185  	// find all hosts with tasks that are completed
   186  	pipeline := []bson.M{
   187  		{"$match": bson.M{host.RunningTaskKey: bson.M{"$exists": true}}},
   188  		{"$lookup": bson.M{"from": task.Collection, "localField": host.RunningTaskKey,
   189  			"foreignField": task.IdKey, "as": HostTaskKey}},
   190  		{"$unwind": "$" + HostTaskKey},
   191  		{"$match": bson.M{HostTaskKey + "." + task.StatusKey: bson.M{"$in": task.CompletedStatuses}}},
   192  		{"$project": bson.M{
   193  			StuckHostKey:            "$" + host.IdKey,
   194  			StuckHostRunningTaskKey: "$" + host.RunningTaskKey,
   195  			StuckHostTaskStatusKey:  "$" + HostTaskKey + "." + task.StatusKey,
   196  		}},
   197  	}
   198  	stuckHosts := []StuckHostInconsistency{}
   199  	err := db.Aggregate(host.Collection, pipeline, &stuckHosts)
   200  	return stuckHosts, err
   201  }