github.com/ryanslade/nomad@v0.2.4-0.20160128061903-fc95782f2089/command/monitor.go (about) 1 package command 2 3 import ( 4 "fmt" 5 "sync" 6 "time" 7 8 "github.com/hashicorp/nomad/api" 9 "github.com/hashicorp/nomad/nomad/structs" 10 "github.com/mitchellh/cli" 11 ) 12 13 const ( 14 // updateWait is the amount of time to wait between status 15 // updates. Because the monitor is poll-based, we use this 16 // delay to avoid overwhelming the API server. 17 updateWait = time.Second 18 ) 19 20 // evalState is used to store the current "state of the world" 21 // in the context of monitoring an evaluation. 22 type evalState struct { 23 status string 24 desc string 25 node string 26 job string 27 allocs map[string]*allocState 28 wait time.Duration 29 index uint64 30 } 31 32 // newEvalState creates and initializes a new monitorState 33 func newEvalState() *evalState { 34 return &evalState{ 35 status: structs.EvalStatusPending, 36 allocs: make(map[string]*allocState), 37 } 38 } 39 40 // allocState is used to track the state of an allocation 41 type allocState struct { 42 id string 43 group string 44 node string 45 desired string 46 desiredDesc string 47 client string 48 clientDesc string 49 index uint64 50 51 // full is the allocation struct with full details. This 52 // must be queried for explicitly so it is only included 53 // if there is important error information inside. 54 full *api.Allocation 55 } 56 57 // monitor wraps an evaluation monitor and holds metadata and 58 // state information. 59 type monitor struct { 60 ui cli.Ui 61 client *api.Client 62 state *evalState 63 64 // length determines the number of characters for identifiers in the ui. 65 length int 66 67 sync.Mutex 68 } 69 70 // newMonitor returns a new monitor. The returned monitor will 71 // write output information to the provided ui. The length parameter determines 72 // the number of characters for identifiers in the ui. 73 func newMonitor(ui cli.Ui, client *api.Client, length int) *monitor { 74 mon := &monitor{ 75 ui: &cli.PrefixedUi{ 76 InfoPrefix: "==> ", 77 OutputPrefix: " ", 78 ErrorPrefix: "==> ", 79 Ui: ui, 80 }, 81 client: client, 82 state: newEvalState(), 83 length: length, 84 } 85 return mon 86 } 87 88 // update is used to update our monitor with new state. It can be 89 // called whether the passed information is new or not, and will 90 // only dump update messages when state changes. 91 func (m *monitor) update(update *evalState) { 92 m.Lock() 93 defer m.Unlock() 94 95 existing := m.state 96 97 // Swap in the new state at the end 98 defer func() { 99 m.state = update 100 }() 101 102 // Check if the evaluation was triggered by a node 103 if existing.node == "" && update.node != "" { 104 m.ui.Output(fmt.Sprintf("Evaluation triggered by node %q", 105 limit(update.node, m.length))) 106 } 107 108 // Check if the evaluation was triggered by a job 109 if existing.job == "" && update.job != "" { 110 m.ui.Output(fmt.Sprintf("Evaluation triggered by job %q", update.job)) 111 } 112 113 // Check the allocations 114 for allocID, alloc := range update.allocs { 115 if existing, ok := existing.allocs[allocID]; !ok { 116 switch { 117 case alloc.desired == structs.AllocDesiredStatusFailed: 118 // New allocs with desired state failed indicate 119 // scheduling failure. 120 m.ui.Output(fmt.Sprintf("Scheduling error for group %q (%s)", 121 alloc.group, alloc.desiredDesc)) 122 123 // Log the client status, if any provided 124 if alloc.clientDesc != "" { 125 m.ui.Output("Client reported status: " + alloc.clientDesc) 126 } 127 128 // Generate a more descriptive error for why the allocation 129 // failed and dump it to the screen 130 if alloc.full != nil { 131 dumpAllocStatus(m.ui, alloc.full, m.length) 132 } 133 134 case alloc.index < update.index: 135 // New alloc with create index lower than the eval 136 // create index indicates modification 137 m.ui.Output(fmt.Sprintf( 138 "Allocation %q modified: node %q, group %q", 139 limit(alloc.id, m.length), limit(alloc.node, m.length), alloc.group)) 140 141 case alloc.desired == structs.AllocDesiredStatusRun: 142 // New allocation with desired status running 143 m.ui.Output(fmt.Sprintf( 144 "Allocation %q created: node %q, group %q", 145 limit(alloc.id, m.length), limit(alloc.node, m.length), alloc.group)) 146 } 147 } else { 148 switch { 149 case existing.client != alloc.client: 150 // Allocation status has changed 151 m.ui.Output(fmt.Sprintf( 152 "Allocation %q status changed: %q -> %q (%s)", 153 limit(alloc.id, m.length), existing.client, alloc.client, alloc.clientDesc)) 154 } 155 } 156 } 157 158 // Check if the status changed. We skip any transitions to pending status. 159 if existing.status != "" && 160 update.status != structs.AllocClientStatusPending && 161 existing.status != update.status { 162 m.ui.Output(fmt.Sprintf("Evaluation status changed: %q -> %q", 163 existing.status, update.status)) 164 } 165 } 166 167 // monitor is used to start monitoring the given evaluation ID. It 168 // writes output directly to the monitor's ui, and returns the 169 // exit code for the command. If allowPrefix is false, monitor will only accept 170 // exact matching evalIDs. 171 // 172 // The return code will be 0 on successful evaluation. If there are 173 // problems scheduling the job (impossible constraints, resources 174 // exhausted, etc), then the return code will be 2. For any other 175 // failures (API connectivity, internal errors, etc), the return code 176 // will be 1. 177 func (m *monitor) monitor(evalID string, allowPrefix bool) int { 178 // Track if we encounter a scheduling failure. This can only be 179 // detected while querying allocations, so we use this bool to 180 // carry that status into the return code. 181 var schedFailure bool 182 183 // The user may have specified a prefix as eval id. We need to lookup the 184 // full id from the database first. Since we do this in a loop we need a 185 // variable to keep track if we've already written the header message. 186 var headerWritten bool 187 188 // Add the initial pending state 189 m.update(newEvalState()) 190 191 for { 192 // Query the evaluation 193 eval, _, err := m.client.Evaluations().Info(evalID, nil) 194 if err != nil { 195 if !allowPrefix { 196 m.ui.Error(fmt.Sprintf("No evaluation with id %q found", evalID)) 197 return 1 198 } 199 if len(evalID) == 1 { 200 m.ui.Error(fmt.Sprintf("Identifier must contain at least two characters.")) 201 return 1 202 } 203 if len(evalID)%2 == 1 { 204 // Identifiers must be of even length, so we strip off the last byte 205 // to provide a consistent user experience. 206 evalID = evalID[:len(evalID)-1] 207 } 208 209 evals, _, err := m.client.Evaluations().PrefixList(evalID) 210 if err != nil { 211 m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err)) 212 return 1 213 } 214 if len(evals) == 0 { 215 m.ui.Error(fmt.Sprintf("No evaluation(s) with prefix or id %q found", evalID)) 216 return 1 217 } 218 if len(evals) > 1 { 219 // Format the evaluations 220 out := make([]string, len(evals)+1) 221 out[0] = "ID|Priority|Type|Triggered By|Status" 222 for i, eval := range evals { 223 out[i+1] = fmt.Sprintf("%s|%d|%s|%s|%s", 224 limit(eval.ID, m.length), 225 eval.Priority, 226 eval.Type, 227 eval.TriggeredBy, 228 eval.Status) 229 } 230 m.ui.Output(fmt.Sprintf("Prefix matched multiple evaluations\n\n%s", formatList(out))) 231 return 0 232 } 233 // Prefix lookup matched a single evaluation 234 eval, _, err = m.client.Evaluations().Info(evals[0].ID, nil) 235 if err != nil { 236 m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err)) 237 } 238 } 239 240 if !headerWritten { 241 m.ui.Info(fmt.Sprintf("Monitoring evaluation %q", limit(eval.ID, m.length))) 242 headerWritten = true 243 } 244 245 // Create the new eval state. 246 state := newEvalState() 247 state.status = eval.Status 248 state.desc = eval.StatusDescription 249 state.node = eval.NodeID 250 state.job = eval.JobID 251 state.wait = eval.Wait 252 state.index = eval.CreateIndex 253 254 // Query the allocations associated with the evaluation 255 allocs, _, err := m.client.Evaluations().Allocations(eval.ID, nil) 256 if err != nil { 257 m.ui.Error(fmt.Sprintf("Error reading allocations: %s", err)) 258 return 1 259 } 260 261 // Add the allocs to the state 262 for _, alloc := range allocs { 263 state.allocs[alloc.ID] = &allocState{ 264 id: alloc.ID, 265 group: alloc.TaskGroup, 266 node: alloc.NodeID, 267 desired: alloc.DesiredStatus, 268 desiredDesc: alloc.DesiredDescription, 269 client: alloc.ClientStatus, 270 clientDesc: alloc.ClientDescription, 271 index: alloc.CreateIndex, 272 } 273 274 // If we have a scheduling error, query the full allocation 275 // to get the details. 276 if alloc.DesiredStatus == structs.AllocDesiredStatusFailed { 277 schedFailure = true 278 failed, _, err := m.client.Allocations().Info(alloc.ID, nil) 279 if err != nil { 280 m.ui.Error(fmt.Sprintf("Error querying allocation: %s", err)) 281 return 1 282 } 283 state.allocs[alloc.ID].full = failed 284 } 285 } 286 287 // Update the state 288 m.update(state) 289 290 switch eval.Status { 291 case structs.EvalStatusComplete, structs.EvalStatusFailed: 292 m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q", 293 limit(eval.ID, m.length), eval.Status)) 294 default: 295 // Wait for the next update 296 time.Sleep(updateWait) 297 continue 298 } 299 300 // Monitor the next eval in the chain, if present 301 if eval.NextEval != "" { 302 m.ui.Info(fmt.Sprintf( 303 "Monitoring next evaluation %q in %s", 304 eval.NextEval, eval.Wait)) 305 306 // Skip some unnecessary polling 307 time.Sleep(eval.Wait) 308 309 // Reset the state and monitor the new eval 310 m.state = newEvalState() 311 return m.monitor(eval.NextEval, allowPrefix) 312 } 313 break 314 } 315 316 // Treat scheduling failures specially using a dedicated exit code. 317 // This makes it easier to detect failures from the CLI. 318 if schedFailure { 319 return 2 320 } 321 322 return 0 323 } 324 325 // dumpAllocStatus is a helper to generate a more user-friendly error message 326 // for scheduling failures, displaying a high level status of why the job 327 // could not be scheduled out. 328 func dumpAllocStatus(ui cli.Ui, alloc *api.Allocation, length int) { 329 // Print filter stats 330 ui.Output(fmt.Sprintf("Allocation %q status %q (%d/%d nodes filtered)", 331 limit(alloc.ID, length), alloc.ClientStatus, 332 alloc.Metrics.NodesFiltered, alloc.Metrics.NodesEvaluated)) 333 334 // Print a helpful message if we have an eligibility problem 335 if alloc.Metrics.NodesEvaluated == 0 { 336 ui.Output(" * No nodes were eligible for evaluation") 337 } 338 339 // Print a helpful message if the user has asked for a DC that has no 340 // available nodes. 341 for dc, available := range alloc.Metrics.NodesAvailable { 342 if available == 0 { 343 ui.Output(fmt.Sprintf(" * No nodes are available in datacenter %q", dc)) 344 } 345 } 346 347 // Print filter info 348 for class, num := range alloc.Metrics.ClassFiltered { 349 ui.Output(fmt.Sprintf(" * Class %q filtered %d nodes", class, num)) 350 } 351 for cs, num := range alloc.Metrics.ConstraintFiltered { 352 ui.Output(fmt.Sprintf(" * Constraint %q filtered %d nodes", cs, num)) 353 } 354 355 // Print exhaustion info 356 if ne := alloc.Metrics.NodesExhausted; ne > 0 { 357 ui.Output(fmt.Sprintf(" * Resources exhausted on %d nodes", ne)) 358 } 359 for class, num := range alloc.Metrics.ClassExhausted { 360 ui.Output(fmt.Sprintf(" * Class %q exhausted on %d nodes", class, num)) 361 } 362 for dim, num := range alloc.Metrics.DimensionExhausted { 363 ui.Output(fmt.Sprintf(" * Dimension %q exhausted on %d nodes", dim, num)) 364 } 365 366 // Print scores 367 for name, score := range alloc.Metrics.Scores { 368 ui.Output(fmt.Sprintf(" * Score %q = %f", name, score)) 369 } 370 }