github.com/hernad/nomad@v1.6.112/command/monitor.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package command 5 6 import ( 7 "fmt" 8 "sort" 9 "strings" 10 "sync" 11 "time" 12 13 "github.com/hernad/nomad/api" 14 "github.com/mitchellh/cli" 15 ) 16 17 const ( 18 // updateWait is the amount of time to wait between status 19 // updates. Because the monitor is poll-based, we use this 20 // delay to avoid overwhelming the API server. 21 updateWait = time.Second 22 ) 23 24 // evalState is used to store the current "state of the world" 25 // in the context of monitoring an evaluation. 26 type evalState struct { 27 status string 28 desc string 29 node string 30 deployment string 31 job string 32 allocs map[string]*allocState 33 wait time.Duration 34 index uint64 35 } 36 37 // newEvalState creates and initializes a new monitorState 38 func newEvalState() *evalState { 39 return &evalState{ 40 status: api.EvalStatusPending, 41 allocs: make(map[string]*allocState), 42 } 43 } 44 45 // allocState is used to track the state of an allocation 46 type allocState struct { 47 id string 48 group string 49 node string 50 desired string 51 desiredDesc string 52 client string 53 clientDesc string 54 index uint64 55 } 56 57 // monitor wraps an evaluation monitor and holds metadata and 58 // state information. 59 type monitor struct { 60 ui cli.Ui 61 client *api.Client 62 state *evalState 63 64 // length determines the number of characters for identifiers in the ui. 65 length int 66 67 sync.Mutex 68 } 69 70 // newMonitor returns a new monitor. The returned monitor will 71 // write output information to the provided ui. The length parameter determines 72 // the number of characters for identifiers in the ui. 73 func newMonitor(ui cli.Ui, client *api.Client, length int) *monitor { 74 if colorUi, ok := ui.(*cli.ColoredUi); ok { 75 // Disable Info color for monitored output 76 ui = &cli.ColoredUi{ 77 ErrorColor: colorUi.ErrorColor, 78 WarnColor: colorUi.WarnColor, 79 InfoColor: cli.UiColorNone, 80 Ui: colorUi.Ui, 81 } 82 } 83 mon := &monitor{ 84 ui: &cli.PrefixedUi{ 85 InfoPrefix: "==> ", 86 OutputPrefix: " ", 87 ErrorPrefix: "==> ", 88 Ui: ui, 89 }, 90 client: client, 91 state: newEvalState(), 92 length: length, 93 } 94 return mon 95 } 96 97 // update is used to update our monitor with new state. It can be 98 // called whether the passed information is new or not, and will 99 // only dump update messages when state changes. 100 func (m *monitor) update(update *evalState) { 101 m.Lock() 102 defer m.Unlock() 103 104 existing := m.state 105 106 // Swap in the new state at the end 107 defer func() { 108 m.state = update 109 }() 110 111 // Check if the evaluation was triggered by a node 112 if existing.node == "" && update.node != "" { 113 m.ui.Output(fmt.Sprintf("%s: Evaluation triggered by node %q", 114 formatTime(time.Now()), limit(update.node, m.length))) 115 } 116 117 // Check if the evaluation was triggered by a job 118 if existing.job == "" && update.job != "" { 119 m.ui.Output(fmt.Sprintf("%s: Evaluation triggered by job %q", 120 formatTime(time.Now()), update.job)) 121 } 122 123 // Check if the evaluation was triggered by a deployment 124 if existing.deployment == "" && update.deployment != "" { 125 m.ui.Output(fmt.Sprintf("%s: Evaluation within deployment: %q", 126 formatTime(time.Now()), limit(update.deployment, m.length))) 127 } 128 129 // Check the allocations 130 for allocID, alloc := range update.allocs { 131 if existing, ok := existing.allocs[allocID]; !ok { 132 switch { 133 case alloc.index < update.index: 134 // New alloc with create index lower than the eval 135 // create index indicates modification 136 m.ui.Output(fmt.Sprintf( 137 "%s: Allocation %q modified: node %q, group %q", 138 formatTime(time.Now()), limit(alloc.id, m.length), 139 limit(alloc.node, m.length), alloc.group)) 140 141 case alloc.desired == api.AllocDesiredStatusRun: 142 // New allocation with desired status running 143 m.ui.Output(fmt.Sprintf( 144 "%s: Allocation %q created: node %q, group %q", 145 formatTime(time.Now()), limit(alloc.id, m.length), 146 limit(alloc.node, m.length), alloc.group)) 147 } 148 } else { 149 switch { 150 case existing.client != alloc.client: 151 description := "" 152 if alloc.clientDesc != "" { 153 description = fmt.Sprintf(" (%s)", alloc.clientDesc) 154 } 155 // Allocation status has changed 156 m.ui.Output(fmt.Sprintf( 157 "%s: Allocation %q status changed: %q -> %q%s", 158 formatTime(time.Now()), limit(alloc.id, m.length), 159 existing.client, alloc.client, description)) 160 } 161 } 162 } 163 164 // Check if the status changed. We skip any transitions to pending status. 165 if existing.status != "" && 166 update.status != api.AllocClientStatusPending && 167 existing.status != update.status { 168 m.ui.Output(fmt.Sprintf("%s: Evaluation status changed: %q -> %q", 169 formatTime(time.Now()), existing.status, update.status)) 170 } 171 } 172 173 // monitor is used to start monitoring the given evaluation ID. It 174 // writes output directly to the monitor's ui, and returns the 175 // exit code for the command. 176 // 177 // The return code will be 0 on successful evaluation. If there are 178 // problems scheduling the job (impossible constraints, resources 179 // exhausted, etc), then the return code will be 2. For any other 180 // failures (API connectivity, internal errors, etc), the return code 181 // will be 1. 182 func (m *monitor) monitor(evalID string) int { 183 // Track if we encounter a scheduling failure. This can only be 184 // detected while querying allocations, so we use this bool to 185 // carry that status into the return code. 186 var schedFailure bool 187 188 // Add the initial pending state 189 m.update(newEvalState()) 190 191 m.ui.Info(fmt.Sprintf("%s: Monitoring evaluation %q", 192 formatTime(time.Now()), limit(evalID, m.length))) 193 194 for { 195 // Query the evaluation 196 eval, _, err := m.client.Evaluations().Info(evalID, nil) 197 if err != nil { 198 m.ui.Error(fmt.Sprintf("No evaluation with id %q found", evalID)) 199 return 1 200 } 201 202 // Create the new eval state. 203 state := newEvalState() 204 state.status = eval.Status 205 state.desc = eval.StatusDescription 206 state.node = eval.NodeID 207 state.job = eval.JobID 208 state.deployment = eval.DeploymentID 209 state.wait = eval.Wait 210 state.index = eval.CreateIndex 211 212 // Query the allocations associated with the evaluation 213 allocs, _, err := m.client.Evaluations().Allocations(eval.ID, nil) 214 if err != nil { 215 m.ui.Error(fmt.Sprintf("%s: Error reading allocations: %s", formatTime(time.Now()), err)) 216 return 1 217 } 218 219 // Add the allocs to the state 220 for _, alloc := range allocs { 221 state.allocs[alloc.ID] = &allocState{ 222 id: alloc.ID, 223 group: alloc.TaskGroup, 224 node: alloc.NodeID, 225 desired: alloc.DesiredStatus, 226 desiredDesc: alloc.DesiredDescription, 227 client: alloc.ClientStatus, 228 clientDesc: alloc.ClientDescription, 229 index: alloc.CreateIndex, 230 } 231 } 232 233 // Update the state 234 m.update(state) 235 236 switch eval.Status { 237 case api.EvalStatusComplete, api.EvalStatusFailed, api.EvalStatusCancelled: 238 if len(eval.FailedTGAllocs) == 0 { 239 m.ui.Info(fmt.Sprintf("%s: Evaluation %q finished with status %q", 240 formatTime(time.Now()), limit(eval.ID, m.length), eval.Status)) 241 } else { 242 // There were failures making the allocations 243 schedFailure = true 244 m.ui.Info(fmt.Sprintf("%s: Evaluation %q finished with status %q but failed to place all allocations:", 245 formatTime(time.Now()), limit(eval.ID, m.length), eval.Status)) 246 247 // Print the failures per task group 248 for tg, metrics := range eval.FailedTGAllocs { 249 noun := "allocation" 250 if metrics.CoalescedFailures > 0 { 251 noun += "s" 252 } 253 m.ui.Output(fmt.Sprintf("%s: Task Group %q (failed to place %d %s):", 254 formatTime(time.Now()), tg, metrics.CoalescedFailures+1, noun)) 255 metrics := formatAllocMetrics(metrics, false, " ") 256 for _, line := range strings.Split(metrics, "\n") { 257 m.ui.Output(line) 258 } 259 } 260 261 if eval.BlockedEval != "" { 262 m.ui.Output(fmt.Sprintf("%s: Evaluation %q waiting for additional capacity to place remainder", 263 formatTime(time.Now()), limit(eval.BlockedEval, m.length))) 264 } 265 } 266 default: 267 // Wait for the next update 268 time.Sleep(updateWait) 269 continue 270 } 271 272 // Monitor the next eval in the chain, if present 273 if eval.NextEval != "" { 274 if eval.Wait.Nanoseconds() != 0 { 275 m.ui.Info(fmt.Sprintf( 276 "%s: Monitoring next evaluation %q in %s", 277 formatTime(time.Now()), limit(eval.NextEval, m.length), eval.Wait)) 278 279 // Skip some unnecessary polling 280 time.Sleep(eval.Wait) 281 } 282 283 // Reset the state and monitor the new eval 284 m.state = newEvalState() 285 return m.monitor(eval.NextEval) 286 } 287 break 288 } 289 290 // Monitor the deployment if it exists 291 dID := m.state.deployment 292 if dID != "" { 293 m.ui.Info(fmt.Sprintf("%s: Monitoring deployment %q", formatTime(time.Now()), limit(dID, m.length))) 294 295 var verbose bool 296 if m.length == fullId { 297 verbose = true 298 } else { 299 verbose = false 300 } 301 302 meta := new(Meta) 303 meta.Ui = m.ui 304 cmd := &DeploymentStatusCommand{Meta: *meta} 305 status, err := cmd.monitor(m.client, dID, 0, m.state.wait, verbose) 306 if err != nil || status != api.DeploymentStatusSuccessful { 307 return 1 308 } 309 } 310 311 // Treat scheduling failures specially using a dedicated exit code. 312 // This makes it easier to detect failures from the CLI. 313 if schedFailure { 314 return 2 315 } 316 317 return 0 318 } 319 320 func formatAllocMetrics(metrics *api.AllocationMetric, scores bool, prefix string) string { 321 // Print a helpful message if we have an eligibility problem 322 var out string 323 if metrics.NodesEvaluated == 0 { 324 out += fmt.Sprintf("%s* No nodes were eligible for evaluation\n", prefix) 325 } 326 327 // Print a helpful message if the user has asked for a DC that has no 328 // available nodes. 329 for dc, available := range metrics.NodesAvailable { 330 if available == 0 { 331 out += fmt.Sprintf("%s* No nodes are available in datacenter %q\n", prefix, dc) 332 } 333 } 334 335 // Print filter info 336 for class, num := range metrics.ClassFiltered { 337 out += fmt.Sprintf("%s* Class %q: %d nodes excluded by filter\n", prefix, class, num) 338 } 339 for cs, num := range metrics.ConstraintFiltered { 340 out += fmt.Sprintf("%s* Constraint %q: %d nodes excluded by filter\n", prefix, cs, num) 341 } 342 343 // Print exhaustion info 344 if ne := metrics.NodesExhausted; ne > 0 { 345 out += fmt.Sprintf("%s* Resources exhausted on %d nodes\n", prefix, ne) 346 } 347 for class, num := range metrics.ClassExhausted { 348 out += fmt.Sprintf("%s* Class %q exhausted on %d nodes\n", prefix, class, num) 349 } 350 for dim, num := range metrics.DimensionExhausted { 351 out += fmt.Sprintf("%s* Dimension %q exhausted on %d nodes\n", prefix, dim, num) 352 } 353 354 // Print quota info 355 for _, dim := range metrics.QuotaExhausted { 356 out += fmt.Sprintf("%s* Quota limit hit %q\n", prefix, dim) 357 } 358 359 // Print scores 360 if scores { 361 if len(metrics.ScoreMetaData) > 0 { 362 scoreOutput := make([]string, len(metrics.ScoreMetaData)+1) 363 364 // Find all possible scores and build header row. 365 allScores := make(map[string]struct{}) 366 for _, scoreMeta := range metrics.ScoreMetaData { 367 for score := range scoreMeta.Scores { 368 allScores[score] = struct{}{} 369 } 370 } 371 // Sort scores alphabetically. 372 scores := make([]string, 0, len(allScores)) 373 for score := range allScores { 374 scores = append(scores, score) 375 } 376 sort.Strings(scores) 377 scoreOutput[0] = fmt.Sprintf("Node|%s|final score", strings.Join(scores, "|")) 378 379 // Build row for each score. 380 for i, scoreMeta := range metrics.ScoreMetaData { 381 scoreOutput[i+1] = fmt.Sprintf("%v|", scoreMeta.NodeID) 382 for _, scorerName := range scores { 383 scoreVal := scoreMeta.Scores[scorerName] 384 scoreOutput[i+1] += fmt.Sprintf("%.3g|", scoreVal) 385 } 386 scoreOutput[i+1] += fmt.Sprintf("%.3g", scoreMeta.NormScore) 387 } 388 389 out += formatList(scoreOutput) 390 } else { 391 // Backwards compatibility for old allocs 392 for name, score := range metrics.Scores { 393 out += fmt.Sprintf("%s* Score %q = %f\n", prefix, name, score) 394 } 395 } 396 } 397 398 out = strings.TrimSuffix(out, "\n") 399 return out 400 }