github.com/blixtra/nomad@v0.7.2-0.20171221000451-da9a1d7bb050/command/monitor.go

package command

import (
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/mitchellh/cli"
)

const (
	// updateWait is the amount of time to wait between status
	// updates. Because the monitor is poll-based, we use this
	// delay to avoid overwhelming the API server.
	updateWait = time.Second
)

// evalState is used to store the current "state of the world"
// in the context of monitoring an evaluation.
type evalState struct {
	status     string
	desc       string
	node       string
	deployment string
	job        string
	allocs     map[string]*allocState
	wait       time.Duration
	index      uint64
}

// newEvalState creates and initializes a new evalState.
func newEvalState() *evalState {
	return &evalState{
		status: structs.EvalStatusPending,
		allocs: make(map[string]*allocState),
	}
}

// allocState is used to track the state of an allocation
type allocState struct {
	id          string
	group       string
	node        string
	desired     string
	desiredDesc string
	client      string
	clientDesc  string
	index       uint64
}

// monitor wraps an evaluation monitor and holds metadata and
// state information.
type monitor struct {
	ui     cli.Ui
	client *api.Client
	state  *evalState

	// length determines the number of characters for identifiers in the ui.
	length int

	sync.Mutex
}

// newMonitor returns a new monitor. The returned monitor will
// write output information to the provided ui. The length parameter determines
// the number of characters for identifiers in the ui.
func newMonitor(ui cli.Ui, client *api.Client, length int) *monitor {
	mon := &monitor{
		ui: &cli.PrefixedUi{
			InfoPrefix:   "==> ",
			OutputPrefix: "    ",
			ErrorPrefix:  "==> ",
			Ui:           ui,
		},
		client: client,
		state:  newEvalState(),
		length: length,
	}
	return mon
}
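
// A minimal sketch, added for illustration and not part of the original file:
// it assumes a cli.Ui and an api.Client are already in hand and uses a
// hypothetical identifier length of 8 and made-up IDs. It only shows what the
// constructor above buys you: Info and Error lines carry the "==> " prefix,
// Output lines are indented four spaces, and limit() truncates identifiers to
// length characters before they are printed.
//
//	mon := newMonitor(ui, client, 8)
//	mon.ui.Info("Monitoring evaluation \"26bfda2d\"")    // prints: ==> Monitoring evaluation "26bfda2d"
//	mon.ui.Output("Allocation \"9a1f6c2e\" created")     // prints:     Allocation "9a1f6c2e" created
//	_ = limit("26bfda2d-5ec1-8c11-4f12-90a1f6c2e7b3", 8) // "26bfda2d"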

// update is used to update our monitor with new state. It can be
// called whether the passed information is new or not, and will
// only dump update messages when state changes.
func (m *monitor) update(update *evalState) {
	m.Lock()
	defer m.Unlock()

	existing := m.state

	// Swap in the new state at the end
	defer func() {
		m.state = update
	}()

	// Check if the evaluation was triggered by a node
	if existing.node == "" && update.node != "" {
		m.ui.Output(fmt.Sprintf("Evaluation triggered by node %q",
			limit(update.node, m.length)))
	}

	// Check if the evaluation was triggered by a job
	if existing.job == "" && update.job != "" {
		m.ui.Output(fmt.Sprintf("Evaluation triggered by job %q", update.job))
	}

	// Check if the evaluation was triggered by a deployment
	if existing.deployment == "" && update.deployment != "" {
		m.ui.Output(fmt.Sprintf("Evaluation within deployment: %q", limit(update.deployment, m.length)))
	}

	// Check the allocations
	for allocID, alloc := range update.allocs {
		if existing, ok := existing.allocs[allocID]; !ok {
			switch {
			case alloc.index < update.index:
				// New alloc with create index lower than the eval
				// create index indicates modification
				m.ui.Output(fmt.Sprintf(
					"Allocation %q modified: node %q, group %q",
					limit(alloc.id, m.length), limit(alloc.node, m.length), alloc.group))

			case alloc.desired == structs.AllocDesiredStatusRun:
				// New allocation with desired status running
				m.ui.Output(fmt.Sprintf(
					"Allocation %q created: node %q, group %q",
					limit(alloc.id, m.length), limit(alloc.node, m.length), alloc.group))
			}
		} else {
			switch {
			case existing.client != alloc.client:
				description := ""
				if alloc.clientDesc != "" {
					description = fmt.Sprintf(" (%s)", alloc.clientDesc)
				}
				// Allocation status has changed
				m.ui.Output(fmt.Sprintf(
					"Allocation %q status changed: %q -> %q%s",
					limit(alloc.id, m.length), existing.client, alloc.client, description))
			}
		}
	}

	// Check if the status changed. We skip any transitions to pending status.
	if existing.status != "" &&
		update.status != structs.AllocClientStatusPending &&
		existing.status != update.status {
		m.ui.Output(fmt.Sprintf("Evaluation status changed: %q -> %q",
			existing.status, update.status))
	}
}
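
// A minimal sketch of the diffing behavior above, added for illustration; the
// ui, client, job name, and identifier length are made up. update only prints
// the transitions between the previous snapshot and the new one, so seeding
// the baseline pending state is silent, and a later snapshot that adds a job
// and a final status produces exactly two lines:
//
//	m := newMonitor(ui, client, 8)
//	m.update(newEvalState()) // baseline "pending" state, prints nothing
//	m.update(&evalState{
//		status: structs.EvalStatusComplete,
//		job:    "example",
//	})
//	// prints:
//	//     Evaluation triggered by job "example"
//	//     Evaluation status changed: "pending" -> "complete"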

// monitor is used to start monitoring the given evaluation ID. It
// writes output directly to the monitor's ui, and returns the
// exit code for the command. If allowPrefix is false, monitor will only accept
// exact matching evalIDs.
//
// The return code will be 0 on successful evaluation. If there are
// problems scheduling the job (impossible constraints, resources
// exhausted, etc), then the return code will be 2. For any other
// failures (API connectivity, internal errors, etc), the return code
// will be 1.
func (m *monitor) monitor(evalID string, allowPrefix bool) int {
	// Track if we encounter a scheduling failure. This can only be
	// detected while querying allocations, so we use this bool to
	// carry that status into the return code.
	var schedFailure bool

	// The user may have specified a prefix as the eval id. We need to look up
	// the full id from the database first. Since we do this in a loop we need
	// a variable to keep track of whether we've already written the header
	// message.
	var headerWritten bool

	// Add the initial pending state
	m.update(newEvalState())

	for {
		// Query the evaluation
		eval, _, err := m.client.Evaluations().Info(evalID, nil)
		if err != nil {
			if !allowPrefix {
				m.ui.Error(fmt.Sprintf("No evaluation with id %q found", evalID))
				return 1
			}
			if len(evalID) == 1 {
				m.ui.Error("Identifier must contain at least two characters.")
				return 1
			}

			evalID = sanatizeUUIDPrefix(evalID)
			evals, _, err := m.client.Evaluations().PrefixList(evalID)
			if err != nil {
				m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err))
				return 1
			}
			if len(evals) == 0 {
				m.ui.Error(fmt.Sprintf("No evaluation(s) with prefix or id %q found", evalID))
				return 1
			}
			if len(evals) > 1 {
				// Format the evaluations
				out := make([]string, len(evals)+1)
				out[0] = "ID|Priority|Type|Triggered By|Status"
				for i, eval := range evals {
					out[i+1] = fmt.Sprintf("%s|%d|%s|%s|%s",
						limit(eval.ID, m.length),
						eval.Priority,
						eval.Type,
						eval.TriggeredBy,
						eval.Status)
				}
				m.ui.Output(fmt.Sprintf("Prefix matched multiple evaluations\n\n%s", formatList(out)))
				return 0
			}
			// Prefix lookup matched a single evaluation
			eval, _, err = m.client.Evaluations().Info(evals[0].ID, nil)
			if err != nil {
				m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err))
				return 1
			}
		}

		if !headerWritten {
			m.ui.Info(fmt.Sprintf("Monitoring evaluation %q", limit(eval.ID, m.length)))
			headerWritten = true
		}

		// Create the new eval state.
		state := newEvalState()
		state.status = eval.Status
		state.desc = eval.StatusDescription
		state.node = eval.NodeID
		state.job = eval.JobID
		state.deployment = eval.DeploymentID
		state.wait = eval.Wait
		state.index = eval.CreateIndex

		// Query the allocations associated with the evaluation
		allocs, _, err := m.client.Evaluations().Allocations(eval.ID, nil)
		if err != nil {
			m.ui.Error(fmt.Sprintf("Error reading allocations: %s", err))
			return 1
		}

		// Add the allocs to the state
		for _, alloc := range allocs {
			state.allocs[alloc.ID] = &allocState{
				id:          alloc.ID,
				group:       alloc.TaskGroup,
				node:        alloc.NodeID,
				desired:     alloc.DesiredStatus,
				desiredDesc: alloc.DesiredDescription,
				client:      alloc.ClientStatus,
				clientDesc:  alloc.ClientDescription,
				index:       alloc.CreateIndex,
			}
		}

		// Update the state
		m.update(state)

		switch eval.Status {
		case structs.EvalStatusComplete, structs.EvalStatusFailed, structs.EvalStatusCancelled:
			if len(eval.FailedTGAllocs) == 0 {
				m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q",
					limit(eval.ID, m.length), eval.Status))
			} else {
				// There were failures making the allocations
				schedFailure = true
				m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q but failed to place all allocations:",
					limit(eval.ID, m.length), eval.Status))

				// Print the failures per task group
				for tg, metrics := range eval.FailedTGAllocs {
					noun := "allocation"
					if metrics.CoalescedFailures > 0 {
						noun += "s"
					}
					m.ui.Output(fmt.Sprintf("Task Group %q (failed to place %d %s):", tg, metrics.CoalescedFailures+1, noun))
					metrics := formatAllocMetrics(metrics, false, "  ")
					for _, line := range strings.Split(metrics, "\n") {
						m.ui.Output(line)
					}
				}

				if eval.BlockedEval != "" {
					m.ui.Output(fmt.Sprintf("Evaluation %q waiting for additional capacity to place remainder",
						limit(eval.BlockedEval, m.length)))
				}
			}
		default:
			// Wait for the next update
			time.Sleep(updateWait)
			continue
		}

		// Monitor the next eval in the chain, if present
		if eval.NextEval != "" {
			if eval.Wait.Nanoseconds() != 0 {
				m.ui.Info(fmt.Sprintf(
					"Monitoring next evaluation %q in %s",
					limit(eval.NextEval, m.length), eval.Wait))

				// Skip some unnecessary polling
				time.Sleep(eval.Wait)
			}

			// Reset the state and monitor the new eval
			m.state = newEvalState()
			return m.monitor(eval.NextEval, allowPrefix)
		}
		break
	}

	// Treat scheduling failures specially using a dedicated exit code.
	// This makes it easier to detect failures from the CLI.
	if schedFailure {
		return 2
	}

	return 0
}
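
// A hedged sketch of how the exit code above is meant to be consumed; the
// real callers live in other files of this package, and c.Ui, client, length,
// and resp.EvalID below are placeholders rather than names taken from this
// file:
//
//	mon := newMonitor(c.Ui, client, length)
//	code := mon.monitor(resp.EvalID, false)
//	// code == 0: evaluation finished and all allocations were placed
//	// code == 2: evaluation finished but some allocations failed to place
//	// code == 1: API or other error while monitoring
//	return code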

func formatAllocMetrics(metrics *api.AllocationMetric, scores bool, prefix string) string {
	// Print a helpful message if we have an eligibility problem
	var out string
	if metrics.NodesEvaluated == 0 {
		out += fmt.Sprintf("%s* No nodes were eligible for evaluation\n", prefix)
	}

	// Print a helpful message if the user has asked for a DC that has no
	// available nodes.
	for dc, available := range metrics.NodesAvailable {
		if available == 0 {
			out += fmt.Sprintf("%s* No nodes are available in datacenter %q\n", prefix, dc)
		}
	}

	// Print filter info
	for class, num := range metrics.ClassFiltered {
		out += fmt.Sprintf("%s* Class %q filtered %d nodes\n", prefix, class, num)
	}
	for cs, num := range metrics.ConstraintFiltered {
		out += fmt.Sprintf("%s* Constraint %q filtered %d nodes\n", prefix, cs, num)
	}

	// Print exhaustion info
	if ne := metrics.NodesExhausted; ne > 0 {
		out += fmt.Sprintf("%s* Resources exhausted on %d nodes\n", prefix, ne)
	}
	for class, num := range metrics.ClassExhausted {
		out += fmt.Sprintf("%s* Class %q exhausted on %d nodes\n", prefix, class, num)
	}
	for dim, num := range metrics.DimensionExhausted {
		out += fmt.Sprintf("%s* Dimension %q exhausted on %d nodes\n", prefix, dim, num)
	}

	// Print quota info
	for _, dim := range metrics.QuotaExhausted {
		out += fmt.Sprintf("%s* Quota limit hit %q\n", prefix, dim)
	}

	// Print scores
	if scores {
		for name, score := range metrics.Scores {
			out += fmt.Sprintf("%s* Score %q = %f\n", prefix, name, score)
		}
	}

	out = strings.TrimSuffix(out, "\n")
	return out
}
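
// A rough illustration of the formatting above, using hypothetical metric
// values added for this writeup: a task group whose three evaluated nodes all
// ran out of memory renders as two bullet lines under the given prefix.
//
//	metrics := &api.AllocationMetric{
//		NodesEvaluated:     3,
//		NodesExhausted:     3,
//		DimensionExhausted: map[string]int{"memory": 3},
//	}
//	fmt.Println(formatAllocMetrics(metrics, false, "  "))
//	// prints:
//	//   * Resources exhausted on 3 nodes
//	//   * Dimension "memory" exhausted on 3 nodes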