github.com/diptanu/nomad@v0.5.7-0.20170516172507-d72e86cbe3d9/command/monitor.go

package command

import (
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/mitchellh/cli"
)

const (
	// updateWait is the amount of time to wait between status
	// updates. Because the monitor is poll-based, we use this
	// delay to avoid overwhelming the API server.
	updateWait = time.Second
)

// evalState is used to store the current "state of the world"
// in the context of monitoring an evaluation.
type evalState struct {
	status string
	desc   string
	node   string
	job    string
	allocs map[string]*allocState
	wait   time.Duration
	index  uint64
}

// newEvalState creates and initializes a new evalState
func newEvalState() *evalState {
	return &evalState{
		status: structs.EvalStatusPending,
		allocs: make(map[string]*allocState),
	}
}

// allocState is used to track the state of an allocation
type allocState struct {
	id          string
	group       string
	node        string
	desired     string
	desiredDesc string
	client      string
	clientDesc  string
	index       uint64

	// full is the allocation struct with full details. This
	// must be queried for explicitly so it is only included
	// if there is important error information inside.
	full *api.Allocation
}

// monitor wraps an evaluation monitor and holds metadata and
// state information.
type monitor struct {
	ui     cli.Ui
	client *api.Client
	state  *evalState

	// length determines the number of characters for identifiers in the ui.
	length int

	sync.Mutex
}

// newMonitor returns a new monitor. The returned monitor will
// write output information to the provided ui. The length parameter determines
// the number of characters for identifiers in the ui.
func newMonitor(ui cli.Ui, client *api.Client, length int) *monitor {
	mon := &monitor{
		ui: &cli.PrefixedUi{
			InfoPrefix:   "==> ",
			OutputPrefix: " ",
			ErrorPrefix:  "==> ",
			Ui:           ui,
		},
		client: client,
		state:  newEvalState(),
		length: length,
	}
	return mon
}

// update is used to update our monitor with new state. It can be
// called whether the passed information is new or not, and will
// only dump update messages when state changes.
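// The monitor's mutex is held for the duration of the call, so update is
// safe to call from the polling loop. As an illustration (the IDs below are
// hypothetical), a successful placement produces a message sequence such as:
//
//	Evaluation triggered by job "example"
//	Allocation "8ba85cef" created: node "171a583b", group "cache"
//	Allocation "8ba85cef" status changed: "pending" -> "running"
//	Evaluation status changed: "pending" -> "complete"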
func (m *monitor) update(update *evalState) {
	m.Lock()
	defer m.Unlock()

	existing := m.state

	// Swap in the new state at the end
	defer func() {
		m.state = update
	}()

	// Check if the evaluation was triggered by a node
	if existing.node == "" && update.node != "" {
		m.ui.Output(fmt.Sprintf("Evaluation triggered by node %q",
			limit(update.node, m.length)))
	}

	// Check if the evaluation was triggered by a job
	if existing.job == "" && update.job != "" {
		m.ui.Output(fmt.Sprintf("Evaluation triggered by job %q", update.job))
	}

	// Check the allocations
	for allocID, alloc := range update.allocs {
		if existing, ok := existing.allocs[allocID]; !ok {
			switch {
			case alloc.index < update.index:
				// New alloc with create index lower than the eval
				// create index indicates modification
				m.ui.Output(fmt.Sprintf(
					"Allocation %q modified: node %q, group %q",
					limit(alloc.id, m.length), limit(alloc.node, m.length), alloc.group))

			case alloc.desired == structs.AllocDesiredStatusRun:
				// New allocation with desired status running
				m.ui.Output(fmt.Sprintf(
					"Allocation %q created: node %q, group %q",
					limit(alloc.id, m.length), limit(alloc.node, m.length), alloc.group))
			}
		} else {
			switch {
			case existing.client != alloc.client:
				description := ""
				if alloc.clientDesc != "" {
					description = fmt.Sprintf(" (%s)", alloc.clientDesc)
				}
				// Allocation status has changed
				m.ui.Output(fmt.Sprintf(
					"Allocation %q status changed: %q -> %q%s",
					limit(alloc.id, m.length), existing.client, alloc.client, description))
			}
		}
	}

	// Check if the status changed. We skip any transitions to pending status.
	if existing.status != "" &&
		update.status != structs.AllocClientStatusPending &&
		existing.status != update.status {
		m.ui.Output(fmt.Sprintf("Evaluation status changed: %q -> %q",
			existing.status, update.status))
	}
}

// monitor is used to start monitoring the given evaluation ID. It
// writes output directly to the monitor's ui, and returns the
// exit code for the command. If allowPrefix is false, monitor will only accept
// exact matching evalIDs.
//
// The return code will be 0 on successful evaluation. If there are
// problems scheduling the job (impossible constraints, resources
// exhausted, etc), then the return code will be 2. For any other
// failures (API connectivity, internal errors, etc), the return code
// will be 1.
func (m *monitor) monitor(evalID string, allowPrefix bool) int {
	// Track if we encounter a scheduling failure. This can only be
	// detected while querying allocations, so we use this bool to
	// carry that status into the return code.
	var schedFailure bool

	// The user may have specified a prefix as the eval ID. We need to look up
	// the full ID from the database first. Since we do this in a loop, we use
	// a variable to track whether we've already written the header message.
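	// (For example, a user may pass a short prefix such as the first eight
	// characters of an evaluation ID. If the prefix matches exactly one
	// evaluation we resolve it to the full ID and keep monitoring; if it
	// matches several, the loop below prints the candidates and returns.)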
	var headerWritten bool

	// Add the initial pending state
	m.update(newEvalState())

	for {
		// Query the evaluation
		eval, _, err := m.client.Evaluations().Info(evalID, nil)
		if err != nil {
			if !allowPrefix {
				m.ui.Error(fmt.Sprintf("No evaluation with id %q found", evalID))
				return 1
			}
			if len(evalID) == 1 {
				m.ui.Error(fmt.Sprintf("Identifier must contain at least two characters."))
				return 1
			}
			if len(evalID)%2 == 1 {
				// Identifiers must be of even length, so we strip off the last byte
				// to provide a consistent user experience.
				evalID = evalID[:len(evalID)-1]
			}

			evals, _, err := m.client.Evaluations().PrefixList(evalID)
			if err != nil {
				m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err))
				return 1
			}
			if len(evals) == 0 {
				m.ui.Error(fmt.Sprintf("No evaluation(s) with prefix or id %q found", evalID))
				return 1
			}
			if len(evals) > 1 {
				// Format the evaluations
				out := make([]string, len(evals)+1)
				out[0] = "ID|Priority|Type|Triggered By|Status"
				for i, eval := range evals {
					out[i+1] = fmt.Sprintf("%s|%d|%s|%s|%s",
						limit(eval.ID, m.length),
						eval.Priority,
						eval.Type,
						eval.TriggeredBy,
						eval.Status)
				}
				m.ui.Output(fmt.Sprintf("Prefix matched multiple evaluations\n\n%s", formatList(out)))
				return 0
			}
			// Prefix lookup matched a single evaluation
			eval, _, err = m.client.Evaluations().Info(evals[0].ID, nil)
			if err != nil {
				m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err))
			}
		}

		if !headerWritten {
			m.ui.Info(fmt.Sprintf("Monitoring evaluation %q", limit(eval.ID, m.length)))
			headerWritten = true
		}

		// Create the new eval state.
		state := newEvalState()
		state.status = eval.Status
		state.desc = eval.StatusDescription
		state.node = eval.NodeID
		state.job = eval.JobID
		state.wait = eval.Wait
		state.index = eval.CreateIndex

		// Query the allocations associated with the evaluation
		allocs, _, err := m.client.Evaluations().Allocations(eval.ID, nil)
		if err != nil {
			m.ui.Error(fmt.Sprintf("Error reading allocations: %s", err))
			return 1
		}

		// Add the allocs to the state
		for _, alloc := range allocs {
			state.allocs[alloc.ID] = &allocState{
				id:          alloc.ID,
				group:       alloc.TaskGroup,
				node:        alloc.NodeID,
				desired:     alloc.DesiredStatus,
				desiredDesc: alloc.DesiredDescription,
				client:      alloc.ClientStatus,
				clientDesc:  alloc.ClientDescription,
				index:       alloc.CreateIndex,
			}
		}

		// Update the state
		m.update(state)

		switch eval.Status {
		case structs.EvalStatusComplete, structs.EvalStatusFailed, structs.EvalStatusCancelled:
			if len(eval.FailedTGAllocs) == 0 {
				m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q",
					limit(eval.ID, m.length), eval.Status))
			} else {
				// There were failures making the allocations
				schedFailure = true
				m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q but failed to place all allocations:",
					limit(eval.ID, m.length), eval.Status))

				// Print the failures per task group
				for tg, metrics := range eval.FailedTGAllocs {
					noun := "allocation"
					if metrics.CoalescedFailures > 0 {
						noun += "s"
					}
					m.ui.Output(fmt.Sprintf("Task Group %q (failed to place %d %s):", tg, metrics.CoalescedFailures+1, noun))
					metrics := formatAllocMetrics(metrics, false, " ")
					for _, line := range strings.Split(metrics, "\n") {
						m.ui.Output(line)
					}
				}

				if eval.BlockedEval != "" {
					m.ui.Output(fmt.Sprintf("Evaluation %q waiting for additional capacity to place remainder",
						limit(eval.BlockedEval, m.length)))
				}
			}
		default:
			// Wait for the next update
			time.Sleep(updateWait)
			continue
		}

		// Monitor the next eval in the chain, if present
		if eval.NextEval != "" {
			if eval.Wait.Nanoseconds() != 0 {
				m.ui.Info(fmt.Sprintf(
					"Monitoring next evaluation %q in %s",
					limit(eval.NextEval, m.length), eval.Wait))

				// Skip some unnecessary polling
				time.Sleep(eval.Wait)
			}

			// Reset the state and monitor the new eval
			m.state = newEvalState()
			return m.monitor(eval.NextEval, allowPrefix)
		}
		break
	}

	// Treat scheduling failures specially using a dedicated exit code.
	// This makes it easier to detect failures from the CLI.
	if schedFailure {
		return 2
	}

	return 0
}

// dumpAllocStatus is a helper to generate a more user-friendly error message
// for scheduling failures, displaying a high-level view of why the job
// could not be scheduled.
func dumpAllocStatus(ui cli.Ui, alloc *api.Allocation, length int) {
	// Print filter stats
	ui.Output(fmt.Sprintf("Allocation %q status %q (%d/%d nodes filtered)",
		limit(alloc.ID, length), alloc.ClientStatus,
		alloc.Metrics.NodesFiltered, alloc.Metrics.NodesEvaluated))
	ui.Output(formatAllocMetrics(alloc.Metrics, true, " "))
}

func formatAllocMetrics(metrics *api.AllocationMetric, scores bool, prefix string) string {
	// Print a helpful message if we have an eligibility problem
	var out string
	if metrics.NodesEvaluated == 0 {
		out += fmt.Sprintf("%s* No nodes were eligible for evaluation\n", prefix)
	}

	// Print a helpful message if the user has asked for a DC that has no
	// available nodes.
	for dc, available := range metrics.NodesAvailable {
		if available == 0 {
			out += fmt.Sprintf("%s* No nodes are available in datacenter %q\n", prefix, dc)
		}
	}

	// Print filter info
	for class, num := range metrics.ClassFiltered {
		out += fmt.Sprintf("%s* Class %q filtered %d nodes\n", prefix, class, num)
	}
	for cs, num := range metrics.ConstraintFiltered {
		out += fmt.Sprintf("%s* Constraint %q filtered %d nodes\n", prefix, cs, num)
	}

	// Print exhaustion info
	if ne := metrics.NodesExhausted; ne > 0 {
		out += fmt.Sprintf("%s* Resources exhausted on %d nodes\n", prefix, ne)
	}
	for class, num := range metrics.ClassExhausted {
		out += fmt.Sprintf("%s* Class %q exhausted on %d nodes\n", prefix, class, num)
	}
	for dim, num := range metrics.DimensionExhausted {
		out += fmt.Sprintf("%s* Dimension %q exhausted on %d nodes\n", prefix, dim, num)
	}

	// Print scores
	if scores {
		for name, score := range metrics.Scores {
			out += fmt.Sprintf("%s* Score %q = %f\n", prefix, name, score)
		}
	}

	out = strings.TrimSuffix(out, "\n")
	return out
}
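
// exampleMonitorUsage is an illustrative sketch, not part of the original
// file: it shows how a command would typically drive the monitor, assuming
// an existing cli.Ui and api.Client. The function name, the shortId constant,
// and the evalID parameter are hypothetical.
func exampleMonitorUsage(ui cli.Ui, client *api.Client, evalID string) int {
	// shortId controls identifier truncation in the output; 8 characters is
	// a common choice for Nomad's shortened IDs.
	const shortId = 8

	mon := newMonitor(ui, client, shortId)

	// Block until the evaluation (and any chained evaluations) reach a
	// terminal state. The result follows monitor's contract: 0 on success,
	// 2 on placement failures, 1 on other errors.
	return mon.monitor(evalID, false)
}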