github.com/billybanfield/evergreen@v0.0.0-20170525200750-eeee692790f7/model/audit.go (about) 1 package model 2 3 import ( 4 "fmt" 5 6 "github.com/evergreen-ci/evergreen/db" 7 "github.com/evergreen-ci/evergreen/db/bsonutil" 8 "github.com/evergreen-ci/evergreen/model/host" 9 "github.com/evergreen-ci/evergreen/model/task" 10 "github.com/pkg/errors" 11 "gopkg.in/mgo.v2/bson" 12 ) 13 14 // HostTaskInconsistency represents a mismatch between task and host documents. 15 // It contains both the host and task's view of their relationship. 16 // Implements the Error interface, which returns a full string describing 17 // the nature of the mismatch. 18 type HostTaskInconsistency struct { 19 Host string 20 HostTaskCache string 21 Task string 22 TaskHostCache string 23 } 24 25 // StuckHostInconsistncy represents hosts that have running 26 // tasks but the tasks have been marked as completed. 27 type StuckHostInconsistency struct { 28 Host string `bson:"host_id"` 29 RunningTask string `bson:"running_task"` 30 TaskStatus string `bson:"task_status"` 31 } 32 33 var ( 34 StuckHostKey = bsonutil.MustHaveTag(StuckHostInconsistency{}, "Host") 35 StuckHostRunningTaskKey = bsonutil.MustHaveTag(StuckHostInconsistency{}, "RunningTask") 36 StuckHostTaskStatusKey = bsonutil.MustHaveTag(StuckHostInconsistency{}, "TaskStatus") 37 HostTaskKey = "task" 38 ) 39 40 // Error returns a human-readible explanation of a HostTaskInconsistency. 41 func (i HostTaskInconsistency) Error() string { 42 switch { 43 case i.Task == "" && i.TaskHostCache == "": 44 return fmt.Sprintf("host %s says it is running task %s, which does not exist", 45 i.Host, i.HostTaskCache) 46 case i.Host == "" && i.HostTaskCache == "": 47 return fmt.Sprintf("task %s says it is running on host %s, which is not a running host", 48 i.Task, i.TaskHostCache) 49 case i.HostTaskCache == i.Task: 50 return fmt.Sprintf( 51 "host %s says it is running task %s, but that task says it is assigned to %s", 52 i.Host, i.Task, i.TaskHostCache) 53 case i.TaskHostCache == i.Host: 54 return fmt.Sprintf( 55 "task %s says it is running on host %s, but that host says it is running %s", 56 i.Task, i.Host, i.HostTaskCache) 57 default: 58 // this should never be hit 59 return fmt.Sprintf("inconsistent mapping: %s/%s, %s/%s", 60 i.Host, i.HostTaskCache, i.Task, i.TaskHostCache) 61 } 62 } 63 64 // AuditHostTaskConsistency finds all running tasks and running hosts and compares 65 // their caches of what host/task they are assigned to. Returns a slice of any mappings 66 // that are not 1:1 and any errors that occur. 67 // 68 // NOTE: the error returned ONLY represents issues communicating with the database. 69 // HostTaskInconsistency implements the error interface, but it is up to the caller 70 // to cast the inconsistencies into an error type if they desire. 71 func AuditHostTaskConsistency() ([]HostTaskInconsistency, error) { 72 hostToTask, taskToHost, err := loadHostTaskMapping() 73 if err != nil { 74 return nil, err 75 } 76 return auditHostTaskMapping(hostToTask, taskToHost), nil 77 } 78 79 // loadHostTaskMapping queries the DB for hosts with tasks, the tasks assigned in the hosts' 80 // running task fields, all running (or dispatched) tasks, and the hosts in those tasks' 81 // host id field. Returns a mapping of host Ids to task Ids and task Ids to host Ids, 82 // representing both directions of the relationship. 83 func loadHostTaskMapping() (map[string]string, map[string]string, error) { 84 hostToTask := map[string]string{} 85 hostTaskIds := []string{} 86 taskToHost := map[string]string{} 87 taskHostIds := []string{} 88 89 // fetch all hosts with running tasks and then all of the tasks the hosts 90 // say they are running. 91 runningHosts, err := host.Find(host.IsRunningTask) 92 if err != nil { 93 return nil, nil, errors.Wrapf(err, "querying for running hosts:") 94 } 95 96 for _, h := range runningHosts { 97 hostTaskIds = append(hostTaskIds, h.RunningTask) 98 } 99 hostsTasks, err := task.Find(task.ByIds(hostTaskIds)) 100 if err != nil { 101 return nil, nil, errors.Wrapf(err, "querying for hosts' tasks:") 102 } 103 104 // fetch all tasks with an assigned host and the hosts they say 105 // they are assigned to 106 runningTasks, err := task.Find(task.IsDispatchedOrStarted) 107 if err != nil { 108 return nil, nil, errors.Wrapf(err, "querying for running tasks:") 109 } 110 for _, t := range append(hostsTasks, runningTasks...) { 111 taskToHost[t.Id] = t.HostId 112 taskHostIds = append(taskHostIds, t.HostId) 113 } 114 tasksHosts, err := host.Find(host.ByIds(taskHostIds)) 115 if err != nil { 116 return nil, nil, errors.Wrapf(err, "querying for tasks' hosts:") 117 } 118 119 // we only want to have running hosts that are not empty. 120 for _, h := range append(runningHosts, tasksHosts...) { 121 // if the running task is empty don't add it to the map 122 if h.RunningTask != "" { 123 hostToTask[h.Id] = h.RunningTask 124 } 125 } 126 127 return hostToTask, taskToHost, nil 128 } 129 130 // auditHostMapping takes a mapping of hosts->tasks and tasks->hosts and 131 // returns descriptions of any inconsistencies. 132 func auditHostTaskMapping(hostToTask, taskToHost map[string]string) []HostTaskInconsistency { 133 found := []HostTaskInconsistency{} 134 // cases where a host thinks its running a task that it isn't 135 for h, t := range hostToTask { 136 cachedTask, ok := taskToHost[t] 137 if !ok { 138 // host thinks it is running a task that does not exist 139 found = append(found, HostTaskInconsistency{ 140 Host: h, 141 HostTaskCache: t, 142 }) 143 } else { 144 if cachedTask != h { 145 found = append(found, HostTaskInconsistency{ 146 Host: h, 147 HostTaskCache: t, 148 Task: t, 149 TaskHostCache: cachedTask, 150 }) 151 } 152 } 153 } 154 // cases where a task thinks it is running on a host that isnt running it 155 for t, h := range taskToHost { 156 cachedHost, ok := hostToTask[h] 157 if !ok { 158 // task thinks it is running on a host that does not exist 159 found = append(found, HostTaskInconsistency{ 160 Task: t, 161 TaskHostCache: h, 162 }) 163 } else { 164 if cachedHost != t { 165 found = append(found, HostTaskInconsistency{ 166 Task: t, 167 TaskHostCache: h, 168 Host: h, 169 HostTaskCache: cachedHost, 170 }) 171 } 172 } 173 } 174 return found 175 } 176 177 func (shi StuckHostInconsistency) Error() string { 178 return fmt.Sprintf( 179 "host %s has a running task %s with complete status %s", shi.Host, shi.RunningTask, shi.TaskStatus) 180 } 181 182 // CheckStuckHosts queries for hosts that tasks that are 183 // completed but that still have them as a running task 184 func CheckStuckHosts() ([]StuckHostInconsistency, error) { 185 // find all hosts with tasks that are completed 186 pipeline := []bson.M{ 187 {"$match": bson.M{host.RunningTaskKey: bson.M{"$exists": true}}}, 188 {"$lookup": bson.M{"from": task.Collection, "localField": host.RunningTaskKey, 189 "foreignField": task.IdKey, "as": HostTaskKey}}, 190 {"$unwind": "$" + HostTaskKey}, 191 {"$match": bson.M{HostTaskKey + "." + task.StatusKey: bson.M{"$in": task.CompletedStatuses}}}, 192 {"$project": bson.M{ 193 StuckHostKey: "$" + host.IdKey, 194 StuckHostRunningTaskKey: "$" + host.RunningTaskKey, 195 StuckHostTaskStatusKey: "$" + HostTaskKey + "." + task.StatusKey, 196 }}, 197 } 198 stuckHosts := []StuckHostInconsistency{} 199 err := db.Aggregate(host.Collection, pipeline, &stuckHosts) 200 return stuckHosts, err 201 }