package command

import (
	"fmt"
	"sync"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/mitchellh/cli"
)

const (
	// updateWait is the amount of time to wait between status
	// updates. Because the monitor is poll-based, we use this
	// delay to avoid overwhelming the API server.
	updateWait = time.Second
)

// evalState is used to store the current "state of the world"
// in the context of monitoring an evaluation.
type evalState struct {
	status string                 // evaluation status (e.g. pending/complete/failed)
	desc   string                 // human-readable status description
	node   string                 // ID of the node that triggered the evaluation, if any
	job    string                 // ID of the job that triggered the evaluation, if any
	allocs map[string]*allocState // allocations placed by the evaluation, keyed by alloc ID
	wait   time.Duration          // delay before a chained follow-up evaluation runs
	index  uint64                 // create index of the evaluation
}

// newEvalState creates and initializes a new monitorState.
// The status starts as pending and the alloc map is pre-made so
// callers may insert into it without a nil check.
func newEvalState() *evalState {
	return &evalState{
		status: structs.EvalStatusPending,
		allocs: make(map[string]*allocState),
	}
}

// allocState is used to track the state of an allocation
type allocState struct {
	id          string // allocation ID
	group       string // task group the allocation belongs to
	node        string // node the allocation was placed on
	desired     string // desired status set by the scheduler
	desiredDesc string // description accompanying the desired status
	client      string // status reported by the client
	clientDesc  string // description accompanying the client status
	index       uint64 // create index of the allocation

	// full is the allocation struct with full details. This
	// must be queried for explicitly so it is only included
	// if there is important error information inside.
	full *api.Allocation
}

// monitor wraps an evaluation monitor and holds metadata and
// state information.
type monitor struct {
	ui     cli.Ui
	client *api.Client
	state  *evalState

	// Guards state; update() takes this lock before reading or
	// swapping the current evalState.
	sync.Mutex
}

// newMonitor returns a new monitor. The returned monitor will
// write output information to the provided ui.
69 func newMonitor(ui cli.Ui, client *api.Client) *monitor { 70 mon := &monitor{ 71 ui: &cli.PrefixedUi{ 72 InfoPrefix: "==> ", 73 OutputPrefix: " ", 74 ErrorPrefix: "==> ", 75 Ui: ui, 76 }, 77 client: client, 78 state: newEvalState(), 79 } 80 return mon 81 } 82 83 // update is used to update our monitor with new state. It can be 84 // called whether the passed information is new or not, and will 85 // only dump update messages when state changes. 86 func (m *monitor) update(update *evalState) { 87 m.Lock() 88 defer m.Unlock() 89 90 existing := m.state 91 92 // Swap in the new state at the end 93 defer func() { 94 m.state = update 95 }() 96 97 // Check if the evaluation was triggered by a node 98 if existing.node == "" && update.node != "" { 99 m.ui.Output(fmt.Sprintf("Evaluation triggered by node %q", 100 update.node)) 101 } 102 103 // Check if the evaluation was triggered by a job 104 if existing.job == "" && update.job != "" { 105 m.ui.Output(fmt.Sprintf("Evaluation triggered by job %q", update.job)) 106 } 107 108 // Check the allocations 109 for allocID, alloc := range update.allocs { 110 if existing, ok := existing.allocs[allocID]; !ok { 111 switch { 112 case alloc.desired == structs.AllocDesiredStatusFailed: 113 // New allocs with desired state failed indicate 114 // scheduling failure. 
115 m.ui.Output(fmt.Sprintf("Scheduling error for group %q (%s)", 116 alloc.group, alloc.desiredDesc)) 117 118 // Log the client status, if any provided 119 if alloc.clientDesc != "" { 120 m.ui.Output("Client reported status: " + alloc.clientDesc) 121 } 122 123 // Generate a more descriptive error for why the allocation 124 // failed and dump it to the screen 125 if alloc.full != nil { 126 dumpAllocStatus(m.ui, alloc.full) 127 } 128 129 case alloc.index < update.index: 130 // New alloc with create index lower than the eval 131 // create index indicates modification 132 m.ui.Output(fmt.Sprintf( 133 "Allocation %q modified: node %q, group %q", 134 alloc.id, alloc.node, alloc.group)) 135 136 case alloc.desired == structs.AllocDesiredStatusRun: 137 // New allocation with desired status running 138 m.ui.Output(fmt.Sprintf( 139 "Allocation %q created: node %q, group %q", 140 alloc.id, alloc.node, alloc.group)) 141 } 142 } else { 143 switch { 144 case existing.client != alloc.client: 145 // Allocation status has changed 146 m.ui.Output(fmt.Sprintf( 147 "Allocation %q status changed: %q -> %q (%s)", 148 alloc.id, existing.client, alloc.client, alloc.clientDesc)) 149 } 150 } 151 } 152 153 // Check if the status changed. We skip any transitions to pending status. 154 if existing.status != "" && 155 update.status != structs.AllocClientStatusPending && 156 existing.status != update.status { 157 m.ui.Output(fmt.Sprintf("Evaluation status changed: %q -> %q", 158 existing.status, update.status)) 159 } 160 } 161 162 // monitor is used to start monitoring the given evaluation ID. It 163 // writes output directly to the monitor's ui, and returns the 164 // exit code for the command. 165 // 166 // The return code will be 0 on successful evaluation. If there are 167 // problems scheduling the job (impossible constraints, resources 168 // exhausted, etc), then the return code will be 2. 
For any other 169 // failures (API connectivity, internal errors, etc), the return code 170 // will be 1. 171 func (m *monitor) monitor(evalID string) int { 172 // Track if we encounter a scheduling failure. This can only be 173 // detected while querying allocations, so we use this bool to 174 // carry that status into the return code. 175 var schedFailure bool 176 177 // Add the initial pending state 178 m.update(newEvalState()) 179 180 m.ui.Info(fmt.Sprintf("Monitoring evaluation %q", evalID)) 181 for { 182 // Query the evaluation 183 eval, _, err := m.client.Evaluations().Info(evalID, nil) 184 if err != nil { 185 m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err)) 186 return 1 187 } 188 189 // Create the new eval state. 190 state := newEvalState() 191 state.status = eval.Status 192 state.desc = eval.StatusDescription 193 state.node = eval.NodeID 194 state.job = eval.JobID 195 state.wait = eval.Wait 196 state.index = eval.CreateIndex 197 198 // Query the allocations associated with the evaluation 199 allocs, _, err := m.client.Evaluations().Allocations(evalID, nil) 200 if err != nil { 201 m.ui.Error(fmt.Sprintf("Error reading allocations: %s", err)) 202 return 1 203 } 204 205 // Add the allocs to the state 206 for _, alloc := range allocs { 207 state.allocs[alloc.ID] = &allocState{ 208 id: alloc.ID, 209 group: alloc.TaskGroup, 210 node: alloc.NodeID, 211 desired: alloc.DesiredStatus, 212 desiredDesc: alloc.DesiredDescription, 213 client: alloc.ClientStatus, 214 clientDesc: alloc.ClientDescription, 215 index: alloc.CreateIndex, 216 } 217 218 // If we have a scheduling error, query the full allocation 219 // to get the details. 
220 if alloc.DesiredStatus == structs.AllocDesiredStatusFailed { 221 schedFailure = true 222 failed, _, err := m.client.Allocations().Info(alloc.ID, nil) 223 if err != nil { 224 m.ui.Error(fmt.Sprintf("Error querying allocation: %s", err)) 225 return 1 226 } 227 state.allocs[alloc.ID].full = failed 228 } 229 } 230 231 // Update the state 232 m.update(state) 233 234 switch eval.Status { 235 case structs.EvalStatusComplete, structs.EvalStatusFailed: 236 m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q", 237 eval.ID, eval.Status)) 238 default: 239 // Wait for the next update 240 time.Sleep(updateWait) 241 continue 242 } 243 244 // Monitor the next eval in the chain, if present 245 if eval.NextEval != "" { 246 m.ui.Info(fmt.Sprintf( 247 "Monitoring next evaluation %q in %s", 248 eval.NextEval, eval.Wait)) 249 250 // Skip some unnecessary polling 251 time.Sleep(eval.Wait) 252 253 // Reset the state and monitor the new eval 254 m.state = newEvalState() 255 return m.monitor(eval.NextEval) 256 } 257 break 258 } 259 260 // Treat scheduling failures specially using a dedicated exit code. 261 // This makes it easier to detect failures from the CLI. 262 if schedFailure { 263 return 2 264 } 265 266 return 0 267 } 268 269 // dumpAllocStatus is a helper to generate a more user-friendly error message 270 // for scheduling failures, displaying a high level status of why the job 271 // could not be scheduled out. 
272 func dumpAllocStatus(ui cli.Ui, alloc *api.Allocation) { 273 // Print filter stats 274 ui.Output(fmt.Sprintf("Allocation %q status %q (%d/%d nodes filtered)", 275 alloc.ID, alloc.ClientStatus, 276 alloc.Metrics.NodesFiltered, alloc.Metrics.NodesEvaluated)) 277 278 // Print a helpful message if we have an eligibility problem 279 if alloc.Metrics.NodesEvaluated == 0 { 280 ui.Output(" * No nodes were eligible for evaluation") 281 } 282 283 // Print filter info 284 for class, num := range alloc.Metrics.ClassFiltered { 285 ui.Output(fmt.Sprintf(" * Class %q filtered %d nodes", class, num)) 286 } 287 for cs, num := range alloc.Metrics.ConstraintFiltered { 288 ui.Output(fmt.Sprintf(" * Constraint %q filtered %d nodes", cs, num)) 289 } 290 291 // Print exhaustion info 292 if ne := alloc.Metrics.NodesExhausted; ne > 0 { 293 ui.Output(fmt.Sprintf(" * Resources exhausted on %d nodes", ne)) 294 } 295 for class, num := range alloc.Metrics.ClassExhausted { 296 ui.Output(fmt.Sprintf(" * Class %q exhausted on %d nodes", class, num)) 297 } 298 for dim, num := range alloc.Metrics.DimensionExhausted { 299 ui.Output(fmt.Sprintf(" * Dimension %q exhausted on %d nodes", dim, num)) 300 } 301 302 // Print scores 303 for name, score := range alloc.Metrics.Scores { 304 ui.Output(fmt.Sprintf(" * Score %q = %f", name, score)) 305 } 306 }