github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/command/monitor.go

package command

import (
	"fmt"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/mitchellh/cli"
)

const (
	// updateWait is the amount of time to wait between status
	// updates. Because the monitor is poll-based, we use this
	// delay to avoid overwhelming the API server.
	updateWait = time.Second
)

// evalState is used to store the current "state of the world"
// in the context of monitoring an evaluation.
type evalState struct {
	status     string
	desc       string
	node       string
	deployment string
	job        string
	allocs     map[string]*allocState
	wait       time.Duration
	index      uint64
}

// newEvalState creates and initializes a new evalState
func newEvalState() *evalState {
	return &evalState{
		status: structs.EvalStatusPending,
		allocs: make(map[string]*allocState),
	}
}

// allocState is used to track the state of an allocation
type allocState struct {
	id          string
	group       string
	node        string
	desired     string
	desiredDesc string
	client      string
	clientDesc  string
	index       uint64
}

// monitor wraps an evaluation monitor and holds metadata and
// state information.
type monitor struct {
	ui     cli.Ui
	client *api.Client
	state  *evalState

	// length determines the number of characters for identifiers in the ui.
	length int

	sync.Mutex
}

// newMonitor returns a new monitor. The returned monitor will
// write output information to the provided ui. The length parameter determines
// the number of characters for identifiers in the ui.
func newMonitor(ui cli.Ui, client *api.Client, length int) *monitor {
	if colorUi, ok := ui.(*cli.ColoredUi); ok {
		// Disable Info color for monitored output
		ui = &cli.ColoredUi{
			ErrorColor: colorUi.ErrorColor,
			WarnColor:  colorUi.WarnColor,
			InfoColor:  cli.UiColorNone,
			Ui:         colorUi.Ui,
		}
	}
	mon := &monitor{
		ui: &cli.PrefixedUi{
			InfoPrefix:   "==> ",
			OutputPrefix: "    ",
			ErrorPrefix:  "==> ",
			Ui:           ui,
		},
		client: client,
		state:  newEvalState(),
		length: length,
	}
	return mon
}

// update is used to update our monitor with new state. It can be
// called whether the passed information is new or not, and will
// only dump update messages when state changes.
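//
// Specifically, it reports the evaluation's trigger source (node, job,
// or deployment) the first time one appears, newly created or modified
// allocations, allocation client-status transitions, and evaluation
// status transitions (transitions into the pending status are skipped).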
func (m *monitor) update(update *evalState) {
	m.Lock()
	defer m.Unlock()

	existing := m.state

	// Swap in the new state at the end
	defer func() {
		m.state = update
	}()

	// Check if the evaluation was triggered by a node
	if existing.node == "" && update.node != "" {
		m.ui.Output(fmt.Sprintf("Evaluation triggered by node %q",
			limit(update.node, m.length)))
	}

	// Check if the evaluation was triggered by a job
	if existing.job == "" && update.job != "" {
		m.ui.Output(fmt.Sprintf("Evaluation triggered by job %q", update.job))
	}

	// Check if the evaluation was triggered by a deployment
	if existing.deployment == "" && update.deployment != "" {
		m.ui.Output(fmt.Sprintf("Evaluation within deployment: %q", limit(update.deployment, m.length)))
	}

	// Check the allocations
	for allocID, alloc := range update.allocs {
		if existing, ok := existing.allocs[allocID]; !ok {
			switch {
			case alloc.index < update.index:
				// New alloc with create index lower than the eval
				// create index indicates modification
				m.ui.Output(fmt.Sprintf(
					"Allocation %q modified: node %q, group %q",
					limit(alloc.id, m.length), limit(alloc.node, m.length), alloc.group))

			case alloc.desired == structs.AllocDesiredStatusRun:
				// New allocation with desired status running
				m.ui.Output(fmt.Sprintf(
					"Allocation %q created: node %q, group %q",
					limit(alloc.id, m.length), limit(alloc.node, m.length), alloc.group))
			}
		} else {
			switch {
			case existing.client != alloc.client:
				description := ""
				if alloc.clientDesc != "" {
					description = fmt.Sprintf(" (%s)", alloc.clientDesc)
				}
				// Allocation status has changed
				m.ui.Output(fmt.Sprintf(
					"Allocation %q status changed: %q -> %q%s",
					limit(alloc.id, m.length), existing.client, alloc.client, description))
			}
		}
	}

	// Check if the status changed. We skip any transitions to pending status.
	if existing.status != "" &&
		update.status != structs.AllocClientStatusPending &&
		existing.status != update.status {
		m.ui.Output(fmt.Sprintf("Evaluation status changed: %q -> %q",
			existing.status, update.status))
	}
}

// monitor is used to start monitoring the given evaluation ID. It
// writes output directly to the monitor's ui, and returns the
// exit code for the command.
//
// The return code will be 0 on successful evaluation. If there are
// problems scheduling the job (impossible constraints, resources
// exhausted, etc), then the return code will be 2. For any other
// failures (API connectivity, internal errors, etc), the return code
// will be 1.
func (m *monitor) monitor(evalID string) int {
	// Track if we encounter a scheduling failure. This can only be
	// detected while querying allocations, so we use this bool to
	// carry that status into the return code.
	var schedFailure bool

	// Add the initial pending state
	m.update(newEvalState())

	for {
		// Query the evaluation
		eval, _, err := m.client.Evaluations().Info(evalID, nil)
		if err != nil {
			m.ui.Error(fmt.Sprintf("No evaluation with id %q found", evalID))
			return 1
		}

		m.ui.Info(fmt.Sprintf("Monitoring evaluation %q", limit(eval.ID, m.length)))

		// Create the new eval state.
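		// Each poll rebuilds this snapshot from scratch; update() then
		// diffs it against the previously stored state and emits only
		// the lines that changed.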
		state := newEvalState()
		state.status = eval.Status
		state.desc = eval.StatusDescription
		state.node = eval.NodeID
		state.job = eval.JobID
		state.deployment = eval.DeploymentID
		state.wait = eval.Wait
		state.index = eval.CreateIndex

		// Query the allocations associated with the evaluation
		allocs, _, err := m.client.Evaluations().Allocations(eval.ID, nil)
		if err != nil {
			m.ui.Error(fmt.Sprintf("Error reading allocations: %s", err))
			return 1
		}

		// Add the allocs to the state
		for _, alloc := range allocs {
			state.allocs[alloc.ID] = &allocState{
				id:          alloc.ID,
				group:       alloc.TaskGroup,
				node:        alloc.NodeID,
				desired:     alloc.DesiredStatus,
				desiredDesc: alloc.DesiredDescription,
				client:      alloc.ClientStatus,
				clientDesc:  alloc.ClientDescription,
				index:       alloc.CreateIndex,
			}
		}

		// Update the state
		m.update(state)

		switch eval.Status {
		case structs.EvalStatusComplete, structs.EvalStatusFailed, structs.EvalStatusCancelled:
			if len(eval.FailedTGAllocs) == 0 {
				m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q",
					limit(eval.ID, m.length), eval.Status))
			} else {
				// There were failures making the allocations
				schedFailure = true
				m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q but failed to place all allocations:",
					limit(eval.ID, m.length), eval.Status))

				// Print the failures per task group
				for tg, metrics := range eval.FailedTGAllocs {
					noun := "allocation"
					if metrics.CoalescedFailures > 0 {
						noun += "s"
					}
					m.ui.Output(fmt.Sprintf("Task Group %q (failed to place %d %s):", tg, metrics.CoalescedFailures+1, noun))
					metrics := formatAllocMetrics(metrics, false, "  ")
					for _, line := range strings.Split(metrics, "\n") {
						m.ui.Output(line)
					}
				}

				if eval.BlockedEval != "" {
					m.ui.Output(fmt.Sprintf("Evaluation %q waiting for additional capacity to place remainder",
						limit(eval.BlockedEval, m.length)))
				}
			}
		default:
			// Wait for the next update
			time.Sleep(updateWait)
			continue
		}

		// Monitor the next eval in the chain, if present
		if eval.NextEval != "" {
			if eval.Wait.Nanoseconds() != 0 {
				m.ui.Info(fmt.Sprintf(
					"Monitoring next evaluation %q in %s",
					limit(eval.NextEval, m.length), eval.Wait))

				// Skip some unnecessary polling
				time.Sleep(eval.Wait)
			}

			// Reset the state and monitor the new eval
			m.state = newEvalState()
			return m.monitor(eval.NextEval)
		}
		break
	}

	// Treat scheduling failures specially using a dedicated exit code.
	// This makes it easier to detect failures from the CLI.
	if schedFailure {
		return 2
	}

	return 0
}

func formatAllocMetrics(metrics *api.AllocationMetric, scores bool, prefix string) string {
	// Print a helpful message if we have an eligibility problem
	var out string
	if metrics.NodesEvaluated == 0 {
		out += fmt.Sprintf("%s* No nodes were eligible for evaluation\n", prefix)
	}

	// Print a helpful message if the user has asked for a DC that has no
	// available nodes.
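	// (NodesAvailable maps each datacenter considered during scheduling
	// to the number of nodes available in it.)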
	for dc, available := range metrics.NodesAvailable {
		if available == 0 {
			out += fmt.Sprintf("%s* No nodes are available in datacenter %q\n", prefix, dc)
		}
	}

	// Print filter info
	for class, num := range metrics.ClassFiltered {
		out += fmt.Sprintf("%s* Class %q: %d nodes excluded by filter\n", prefix, class, num)
	}
	for cs, num := range metrics.ConstraintFiltered {
		out += fmt.Sprintf("%s* Constraint %q: %d nodes excluded by filter\n", prefix, cs, num)
	}

	// Print exhaustion info
	if ne := metrics.NodesExhausted; ne > 0 {
		out += fmt.Sprintf("%s* Resources exhausted on %d nodes\n", prefix, ne)
	}
	for class, num := range metrics.ClassExhausted {
		out += fmt.Sprintf("%s* Class %q exhausted on %d nodes\n", prefix, class, num)
	}
	for dim, num := range metrics.DimensionExhausted {
		out += fmt.Sprintf("%s* Dimension %q exhausted on %d nodes\n", prefix, dim, num)
	}

	// Print quota info
	for _, dim := range metrics.QuotaExhausted {
		out += fmt.Sprintf("%s* Quota limit hit %q\n", prefix, dim)
	}

	// Print scores
	if scores {
		if len(metrics.ScoreMetaData) > 0 {
			scoreOutput := make([]string, len(metrics.ScoreMetaData)+1)
			var scorerNames []string
			for i, scoreMeta := range metrics.ScoreMetaData {
				// Add header as first row
				if i == 0 {
					scoreOutput[0] = "Node|"

					// sort scores alphabetically
					scores := make([]string, 0, len(scoreMeta.Scores))
					for score := range scoreMeta.Scores {
						scores = append(scores, score)
					}
					sort.Strings(scores)

					// build score header output
					for _, scorerName := range scores {
						scoreOutput[0] += fmt.Sprintf("%v|", scorerName)
						scorerNames = append(scorerNames, scorerName)
					}
					scoreOutput[0] += "final score"
				}
				scoreOutput[i+1] = fmt.Sprintf("%v|", scoreMeta.NodeID)
				for _, scorerName := range scorerNames {
					scoreVal := scoreMeta.Scores[scorerName]
					scoreOutput[i+1] += fmt.Sprintf("%.3g|", scoreVal)
				}
				scoreOutput[i+1] += fmt.Sprintf("%.3g", scoreMeta.NormScore)
			}
			out += formatList(scoreOutput)
		} else {
			// Backwards compatibility for old allocs
			for name, score := range metrics.Scores {
				out += fmt.Sprintf("%s* Score %q = %f\n", prefix, name, score)
			}
		}
	}

	out = strings.TrimSuffix(out, "\n")
	return out
}
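// Usage sketch (illustrative; not part of the original file): a CLI
// command that registers a job and wants to follow the resulting
// evaluation would typically drive the monitor roughly like this,
// assuming a configured *api.Client and a cli.Ui are in scope and
// shortId is the identifier width used for truncated IDs (Nomad
// commands commonly use 8):
//
//	mon := newMonitor(ui, client, shortId)
//	return mon.monitor(resp.EvalID) // resp from client.Jobs().Register(...)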