package command

import (
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/mitchellh/cli"
)

const (
	// updateWait is the amount of time to wait between status
	// updates. Because the monitor is poll-based, we use this
	// delay to avoid overwhelming the API server.
	updateWait = time.Second
)

// evalState is used to store the current "state of the world"
// in the context of monitoring an evaluation.
type evalState struct {
	status     string                 // current evaluation status
	desc       string                 // human-readable status description
	node       string                 // ID of the node that triggered the eval, if any
	deployment string                 // ID of the deployment the eval is part of, if any
	job        string                 // ID of the job that triggered the eval, if any
	allocs     map[string]*allocState // allocations keyed by allocation ID
	wait       time.Duration          // wait before the next eval in the chain is processed
	index      uint64                 // create index of the evaluation
}

// newEvalState creates and initializes a new evalState with a
// pending status and an empty allocation map.
func newEvalState() *evalState {
	return &evalState{
		status: structs.EvalStatusPending,
		allocs: make(map[string]*allocState),
	}
}

// allocState is used to track the state of an allocation
type allocState struct {
	id          string // allocation ID
	group       string // task group the allocation belongs to
	node        string // ID of the node the allocation was placed on
	desired     string // desired status of the allocation
	desiredDesc string // human-readable description of the desired status
	client      string // client-reported status of the allocation
	clientDesc  string // human-readable description of the client status
	index       uint64 // create index of the allocation

	// full is the allocation struct with full details. This
	// must be queried for explicitly so it is only included
	// if there is important error information inside.
	full *api.Allocation
}

// monitor wraps an evaluation monitor and holds metadata and
// state information.
type monitor struct {
	ui     cli.Ui
	client *api.Client
	state  *evalState

	// length determines the number of characters for identifiers in the ui.
	length int

	// Mutex guards state, which update() swaps while monitor() polls.
	sync.Mutex
}
75 func newMonitor(ui cli.Ui, client *api.Client, length int) *monitor { 76 mon := &monitor{ 77 ui: &cli.PrefixedUi{ 78 InfoPrefix: "==> ", 79 OutputPrefix: " ", 80 ErrorPrefix: "==> ", 81 Ui: ui, 82 }, 83 client: client, 84 state: newEvalState(), 85 length: length, 86 } 87 return mon 88 } 89 90 // update is used to update our monitor with new state. It can be 91 // called whether the passed information is new or not, and will 92 // only dump update messages when state changes. 93 func (m *monitor) update(update *evalState) { 94 m.Lock() 95 defer m.Unlock() 96 97 existing := m.state 98 99 // Swap in the new state at the end 100 defer func() { 101 m.state = update 102 }() 103 104 // Check if the evaluation was triggered by a node 105 if existing.node == "" && update.node != "" { 106 m.ui.Output(fmt.Sprintf("Evaluation triggered by node %q", 107 limit(update.node, m.length))) 108 } 109 110 // Check if the evaluation was triggered by a job 111 if existing.job == "" && update.job != "" { 112 m.ui.Output(fmt.Sprintf("Evaluation triggered by job %q", update.job)) 113 } 114 115 // Check if the evaluation was triggered by a deployment 116 if existing.deployment == "" && update.deployment != "" { 117 m.ui.Output(fmt.Sprintf("Evaluation within deployment: %q", limit(update.deployment, m.length))) 118 } 119 120 // Check the allocations 121 for allocID, alloc := range update.allocs { 122 if existing, ok := existing.allocs[allocID]; !ok { 123 switch { 124 case alloc.index < update.index: 125 // New alloc with create index lower than the eval 126 // create index indicates modification 127 m.ui.Output(fmt.Sprintf( 128 "Allocation %q modified: node %q, group %q", 129 limit(alloc.id, m.length), limit(alloc.node, m.length), alloc.group)) 130 131 case alloc.desired == structs.AllocDesiredStatusRun: 132 // New allocation with desired status running 133 m.ui.Output(fmt.Sprintf( 134 "Allocation %q created: node %q, group %q", 135 limit(alloc.id, m.length), limit(alloc.node, m.length), 
alloc.group)) 136 } 137 } else { 138 switch { 139 case existing.client != alloc.client: 140 description := "" 141 if alloc.clientDesc != "" { 142 description = fmt.Sprintf(" (%s)", alloc.clientDesc) 143 } 144 // Allocation status has changed 145 m.ui.Output(fmt.Sprintf( 146 "Allocation %q status changed: %q -> %q%s", 147 limit(alloc.id, m.length), existing.client, alloc.client, description)) 148 } 149 } 150 } 151 152 // Check if the status changed. We skip any transitions to pending status. 153 if existing.status != "" && 154 update.status != structs.AllocClientStatusPending && 155 existing.status != update.status { 156 m.ui.Output(fmt.Sprintf("Evaluation status changed: %q -> %q", 157 existing.status, update.status)) 158 } 159 } 160 161 // monitor is used to start monitoring the given evaluation ID. It 162 // writes output directly to the monitor's ui, and returns the 163 // exit code for the command. If allowPrefix is false, monitor will only accept 164 // exact matching evalIDs. 165 // 166 // The return code will be 0 on successful evaluation. If there are 167 // problems scheduling the job (impossible constraints, resources 168 // exhausted, etc), then the return code will be 2. For any other 169 // failures (API connectivity, internal errors, etc), the return code 170 // will be 1. 171 func (m *monitor) monitor(evalID string, allowPrefix bool) int { 172 // Track if we encounter a scheduling failure. This can only be 173 // detected while querying allocations, so we use this bool to 174 // carry that status into the return code. 175 var schedFailure bool 176 177 // The user may have specified a prefix as eval id. We need to lookup the 178 // full id from the database first. Since we do this in a loop we need a 179 // variable to keep track if we've already written the header message. 
180 var headerWritten bool 181 182 // Add the initial pending state 183 m.update(newEvalState()) 184 185 for { 186 // Query the evaluation 187 eval, _, err := m.client.Evaluations().Info(evalID, nil) 188 if err != nil { 189 if !allowPrefix { 190 m.ui.Error(fmt.Sprintf("No evaluation with id %q found", evalID)) 191 return 1 192 } 193 if len(evalID) == 1 { 194 m.ui.Error(fmt.Sprintf("Identifier must contain at least two characters.")) 195 return 1 196 } 197 if len(evalID)%2 == 1 { 198 // Identifiers must be of even length, so we strip off the last byte 199 // to provide a consistent user experience. 200 evalID = evalID[:len(evalID)-1] 201 } 202 203 evals, _, err := m.client.Evaluations().PrefixList(evalID) 204 if err != nil { 205 m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err)) 206 return 1 207 } 208 if len(evals) == 0 { 209 m.ui.Error(fmt.Sprintf("No evaluation(s) with prefix or id %q found", evalID)) 210 return 1 211 } 212 if len(evals) > 1 { 213 // Format the evaluations 214 out := make([]string, len(evals)+1) 215 out[0] = "ID|Priority|Type|Triggered By|Status" 216 for i, eval := range evals { 217 out[i+1] = fmt.Sprintf("%s|%d|%s|%s|%s", 218 limit(eval.ID, m.length), 219 eval.Priority, 220 eval.Type, 221 eval.TriggeredBy, 222 eval.Status) 223 } 224 m.ui.Output(fmt.Sprintf("Prefix matched multiple evaluations\n\n%s", formatList(out))) 225 return 0 226 } 227 // Prefix lookup matched a single evaluation 228 eval, _, err = m.client.Evaluations().Info(evals[0].ID, nil) 229 if err != nil { 230 m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err)) 231 } 232 } 233 234 if !headerWritten { 235 m.ui.Info(fmt.Sprintf("Monitoring evaluation %q", limit(eval.ID, m.length))) 236 headerWritten = true 237 } 238 239 // Create the new eval state. 
240 state := newEvalState() 241 state.status = eval.Status 242 state.desc = eval.StatusDescription 243 state.node = eval.NodeID 244 state.job = eval.JobID 245 state.deployment = eval.DeploymentID 246 state.wait = eval.Wait 247 state.index = eval.CreateIndex 248 249 // Query the allocations associated with the evaluation 250 allocs, _, err := m.client.Evaluations().Allocations(eval.ID, nil) 251 if err != nil { 252 m.ui.Error(fmt.Sprintf("Error reading allocations: %s", err)) 253 return 1 254 } 255 256 // Add the allocs to the state 257 for _, alloc := range allocs { 258 state.allocs[alloc.ID] = &allocState{ 259 id: alloc.ID, 260 group: alloc.TaskGroup, 261 node: alloc.NodeID, 262 desired: alloc.DesiredStatus, 263 desiredDesc: alloc.DesiredDescription, 264 client: alloc.ClientStatus, 265 clientDesc: alloc.ClientDescription, 266 index: alloc.CreateIndex, 267 } 268 } 269 270 // Update the state 271 m.update(state) 272 273 switch eval.Status { 274 case structs.EvalStatusComplete, structs.EvalStatusFailed, structs.EvalStatusCancelled: 275 if len(eval.FailedTGAllocs) == 0 { 276 m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q", 277 limit(eval.ID, m.length), eval.Status)) 278 } else { 279 // There were failures making the allocations 280 schedFailure = true 281 m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q but failed to place all allocations:", 282 limit(eval.ID, m.length), eval.Status)) 283 284 // Print the failures per task group 285 for tg, metrics := range eval.FailedTGAllocs { 286 noun := "allocation" 287 if metrics.CoalescedFailures > 0 { 288 noun += "s" 289 } 290 m.ui.Output(fmt.Sprintf("Task Group %q (failed to place %d %s):", tg, metrics.CoalescedFailures+1, noun)) 291 metrics := formatAllocMetrics(metrics, false, " ") 292 for _, line := range strings.Split(metrics, "\n") { 293 m.ui.Output(line) 294 } 295 } 296 297 if eval.BlockedEval != "" { 298 m.ui.Output(fmt.Sprintf("Evaluation %q waiting for additional capacity to place remainder", 
299 limit(eval.BlockedEval, m.length))) 300 } 301 } 302 default: 303 // Wait for the next update 304 time.Sleep(updateWait) 305 continue 306 } 307 308 // Monitor the next eval in the chain, if present 309 if eval.NextEval != "" { 310 if eval.Wait.Nanoseconds() != 0 { 311 m.ui.Info(fmt.Sprintf( 312 "Monitoring next evaluation %q in %s", 313 limit(eval.NextEval, m.length), eval.Wait)) 314 315 // Skip some unnecessary polling 316 time.Sleep(eval.Wait) 317 } 318 319 // Reset the state and monitor the new eval 320 m.state = newEvalState() 321 return m.monitor(eval.NextEval, allowPrefix) 322 } 323 break 324 } 325 326 // Treat scheduling failures specially using a dedicated exit code. 327 // This makes it easier to detect failures from the CLI. 328 if schedFailure { 329 return 2 330 } 331 332 return 0 333 } 334 335 // dumpAllocStatus is a helper to generate a more user-friendly error message 336 // for scheduling failures, displaying a high level status of why the job 337 // could not be scheduled out. 338 func dumpAllocStatus(ui cli.Ui, alloc *api.Allocation, length int) { 339 // Print filter stats 340 ui.Output(fmt.Sprintf("Allocation %q status %q (%d/%d nodes filtered)", 341 limit(alloc.ID, length), alloc.ClientStatus, 342 alloc.Metrics.NodesFiltered, alloc.Metrics.NodesEvaluated)) 343 ui.Output(formatAllocMetrics(alloc.Metrics, true, " ")) 344 } 345 346 func formatAllocMetrics(metrics *api.AllocationMetric, scores bool, prefix string) string { 347 // Print a helpful message if we have an eligibility problem 348 var out string 349 if metrics.NodesEvaluated == 0 { 350 out += fmt.Sprintf("%s* No nodes were eligible for evaluation\n", prefix) 351 } 352 353 // Print a helpful message if the user has asked for a DC that has no 354 // available nodes. 
355 for dc, available := range metrics.NodesAvailable { 356 if available == 0 { 357 out += fmt.Sprintf("%s* No nodes are available in datacenter %q\n", prefix, dc) 358 } 359 } 360 361 // Print filter info 362 for class, num := range metrics.ClassFiltered { 363 out += fmt.Sprintf("%s* Class %q filtered %d nodes\n", prefix, class, num) 364 } 365 for cs, num := range metrics.ConstraintFiltered { 366 out += fmt.Sprintf("%s* Constraint %q filtered %d nodes\n", prefix, cs, num) 367 } 368 369 // Print exhaustion info 370 if ne := metrics.NodesExhausted; ne > 0 { 371 out += fmt.Sprintf("%s* Resources exhausted on %d nodes\n", prefix, ne) 372 } 373 for class, num := range metrics.ClassExhausted { 374 out += fmt.Sprintf("%s* Class %q exhausted on %d nodes\n", prefix, class, num) 375 } 376 for dim, num := range metrics.DimensionExhausted { 377 out += fmt.Sprintf("%s* Dimension %q exhausted on %d nodes\n", prefix, dim, num) 378 } 379 380 // Print scores 381 if scores { 382 for name, score := range metrics.Scores { 383 out += fmt.Sprintf("%s* Score %q = %f\n", prefix, name, score) 384 } 385 } 386 387 out = strings.TrimSuffix(out, "\n") 388 return out 389 }