// Source: github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/nomad/deploymentwatcher/deployment_watcher.go (about)

package deploymentwatcher

import (
	"context"
	"log"
	"sync"
	"time"

	"golang.org/x/time/rate"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// perJobEvalBatchPeriod is the batching length before creating an evaluation to
	// trigger the scheduler when allocations are marked as healthy.
	perJobEvalBatchPeriod = 1 * time.Second
)

// deploymentTriggers are the set of functions required to trigger changes on
// behalf of a deployment. Callers in this file treat the uint64 returned by
// each method as the index produced by the change.
type deploymentTriggers interface {
	// createEvaluation is used to create an evaluation.
	createEvaluation(eval *structs.Evaluation) (uint64, error)

	// upsertJob is used to roll back a job when autoreverting for a deployment.
	upsertJob(job *structs.Job) (uint64, error)

	// upsertDeploymentStatusUpdate is used to upsert a deployment status update
	// and an optional evaluation and job to upsert. Callers in this file pass a
	// nil eval and/or job when there is nothing to create or roll back to.
	upsertDeploymentStatusUpdate(u *structs.DeploymentStatusUpdate, eval *structs.Evaluation, job *structs.Job) (uint64, error)

	// upsertDeploymentPromotion is used to promote canaries in a deployment.
	upsertDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error)

	// upsertDeploymentAllocHealth is used to set the health of allocations in a
	// deployment.
	upsertDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error)
}

// deploymentWatcher is used to watch a single deployment and trigger the
// scheduler when allocation health transitions.
type deploymentWatcher struct {
	// queryLimiter is used to limit the rate of blocking queries
	queryLimiter *rate.Limiter

	// deploymentTriggers holds the methods required to trigger changes on behalf of the
	// deployment. It is embedded, so its methods are callable directly on the
	// watcher.
	deploymentTriggers

	// watchers holds the methods required to watch objects for
	// changes on behalf of the deployment
	watchers DeploymentStateWatchers

	// d is the deployment being watched
	d *structs.Deployment

	// j is the job the deployment is for
	j *structs.Job

	// outstandingBatch marks whether an outstanding function exists to create
	// the evaluation. Access should be done through the lock
	outstandingBatch bool

	// latestEval is the latest eval for the job. It is updated by the watch
	// loop and any time an evaluation is created. The field should be accessed
	// by holding the lock or using the setter and getter methods.
	latestEval uint64

	// logger receives error messages from the watch loop and the batched
	// evaluation timer.
	logger *log.Logger

	// ctx is passed to the query limiter and checked by the watch loop to
	// detect cancellation.
	ctx context.Context

	// exitFn is the cancel function for the watcher's context.
	exitFn context.CancelFunc

	// l is the lock referred to above; it guards outstandingBatch and
	// latestEval.
	l sync.RWMutex
}

// newDeploymentWatcher returns a deployment watcher that is used to watch
// deployments and trigger the scheduler as needed.
79 func newDeploymentWatcher(parent context.Context, queryLimiter *rate.Limiter, 80 logger *log.Logger, watchers DeploymentStateWatchers, d *structs.Deployment, 81 j *structs.Job, triggers deploymentTriggers) *deploymentWatcher { 82 83 ctx, exitFn := context.WithCancel(parent) 84 w := &deploymentWatcher{ 85 queryLimiter: queryLimiter, 86 d: d, 87 j: j, 88 watchers: watchers, 89 deploymentTriggers: triggers, 90 logger: logger, 91 ctx: ctx, 92 exitFn: exitFn, 93 } 94 95 // Start the long lived watcher that scans for allocation updates 96 go w.watch() 97 98 return w 99 } 100 101 func (w *deploymentWatcher) SetAllocHealth( 102 req *structs.DeploymentAllocHealthRequest, 103 resp *structs.DeploymentUpdateResponse) error { 104 105 // If we are failing the deployment, update the status and potentially 106 // rollback 107 var j *structs.Job 108 var u *structs.DeploymentStatusUpdate 109 110 // If there are unhealthy allocations we need to mark the deployment as 111 // failed and check if we should roll back to a stable job. 
112 if l := len(req.UnhealthyAllocationIDs); l != 0 { 113 unhealthy := make(map[string]struct{}, l) 114 for _, alloc := range req.UnhealthyAllocationIDs { 115 unhealthy[alloc] = struct{}{} 116 } 117 118 // Get the allocations for the deployment 119 args := &structs.DeploymentSpecificRequest{DeploymentID: req.DeploymentID} 120 var resp structs.AllocListResponse 121 if err := w.watchers.Allocations(args, &resp); err != nil { 122 return err 123 } 124 125 // Determine if we should autorevert to an older job 126 desc := structs.DeploymentStatusDescriptionFailedAllocations 127 for _, alloc := range resp.Allocations { 128 // Check that the alloc has been marked unhealthy 129 if _, ok := unhealthy[alloc.ID]; !ok { 130 continue 131 } 132 133 // Check if the group has autorevert set 134 group, ok := w.d.TaskGroups[alloc.TaskGroup] 135 if !ok || !group.AutoRevert { 136 continue 137 } 138 139 var err error 140 j, err = w.latestStableJob() 141 if err != nil { 142 return err 143 } 144 145 if j != nil { 146 desc = structs.DeploymentStatusDescriptionRollback(desc, j.Version) 147 } 148 break 149 } 150 151 u = w.getDeploymentStatusUpdate(structs.DeploymentStatusFailed, desc) 152 } 153 154 // Create the request 155 areq := &structs.ApplyDeploymentAllocHealthRequest{ 156 DeploymentAllocHealthRequest: *req, 157 Eval: w.getEval(), 158 DeploymentUpdate: u, 159 Job: j, 160 } 161 162 index, err := w.upsertDeploymentAllocHealth(areq) 163 if err != nil { 164 return err 165 } 166 167 // Build the response 168 resp.EvalID = areq.Eval.ID 169 resp.EvalCreateIndex = index 170 resp.DeploymentModifyIndex = index 171 resp.Index = index 172 if j != nil { 173 resp.RevertedJobVersion = helper.Uint64ToPtr(j.Version) 174 } 175 w.setLatestEval(index) 176 return nil 177 } 178 179 func (w *deploymentWatcher) PromoteDeployment( 180 req *structs.DeploymentPromoteRequest, 181 resp *structs.DeploymentUpdateResponse) error { 182 183 // Create the request 184 areq := &structs.ApplyDeploymentPromoteRequest{ 185 
DeploymentPromoteRequest: *req, 186 Eval: w.getEval(), 187 } 188 189 index, err := w.upsertDeploymentPromotion(areq) 190 if err != nil { 191 return err 192 } 193 194 // Build the response 195 resp.EvalID = areq.Eval.ID 196 resp.EvalCreateIndex = index 197 resp.DeploymentModifyIndex = index 198 resp.Index = index 199 w.setLatestEval(index) 200 return nil 201 } 202 203 func (w *deploymentWatcher) PauseDeployment( 204 req *structs.DeploymentPauseRequest, 205 resp *structs.DeploymentUpdateResponse) error { 206 // Determine the status we should transistion to and if we need to create an 207 // evaluation 208 status, desc := structs.DeploymentStatusPaused, structs.DeploymentStatusDescriptionPaused 209 var eval *structs.Evaluation 210 evalID := "" 211 if !req.Pause { 212 status, desc = structs.DeploymentStatusRunning, structs.DeploymentStatusDescriptionRunning 213 eval = w.getEval() 214 evalID = eval.ID 215 } 216 update := w.getDeploymentStatusUpdate(status, desc) 217 218 // Commit the change 219 i, err := w.upsertDeploymentStatusUpdate(update, eval, nil) 220 if err != nil { 221 return err 222 } 223 224 // Build the response 225 if evalID != "" { 226 resp.EvalID = evalID 227 resp.EvalCreateIndex = i 228 } 229 resp.DeploymentModifyIndex = i 230 resp.Index = i 231 w.setLatestEval(i) 232 return nil 233 } 234 235 func (w *deploymentWatcher) FailDeployment( 236 req *structs.DeploymentFailRequest, 237 resp *structs.DeploymentUpdateResponse) error { 238 239 status, desc := structs.DeploymentStatusFailed, structs.DeploymentStatusDescriptionFailedByUser 240 241 // Determine if we should rollback 242 rollback := false 243 for _, state := range w.d.TaskGroups { 244 if state.AutoRevert { 245 rollback = true 246 break 247 } 248 } 249 250 var rollbackJob *structs.Job 251 if rollback { 252 var err error 253 rollbackJob, err = w.latestStableJob() 254 if err != nil { 255 return err 256 } 257 258 if rollbackJob != nil { 259 desc = structs.DeploymentStatusDescriptionRollback(desc, 
rollbackJob.Version) 260 } 261 } 262 263 // Commit the change 264 update := w.getDeploymentStatusUpdate(status, desc) 265 eval := w.getEval() 266 i, err := w.upsertDeploymentStatusUpdate(update, eval, rollbackJob) 267 if err != nil { 268 return err 269 } 270 271 // Build the response 272 resp.EvalID = eval.ID 273 resp.EvalCreateIndex = i 274 resp.DeploymentModifyIndex = i 275 resp.Index = i 276 if rollbackJob != nil { 277 resp.RevertedJobVersion = helper.Uint64ToPtr(rollbackJob.Version) 278 } 279 w.setLatestEval(i) 280 return nil 281 } 282 283 // StopWatch stops watching the deployment. This should be called whenever a 284 // deployment is completed or the watcher is no longer needed. 285 func (w *deploymentWatcher) StopWatch() { 286 w.exitFn() 287 } 288 289 // watch is the long running watcher that takes actions upon allocation changes 290 func (w *deploymentWatcher) watch() { 291 allocIndex := uint64(1) 292 for { 293 // Block getting all allocations that are part of the deployment using 294 // the last evaluation index. This will have us block waiting for 295 // something to change past what the scheduler has evaluated. 296 allocResp, err := w.getAllocs(allocIndex) 297 if err != nil { 298 if err == context.Canceled || w.ctx.Err() == context.Canceled { 299 return 300 } 301 302 w.logger.Printf("[ERR] nomad.deployment_watcher: failed to retrieve allocations for deployment %q: %v", w.d.ID, err) 303 return 304 } 305 allocIndex = allocResp.Index 306 307 // Get the latest evaluation index 308 latestEval, err := w.latestEvalIndex() 309 if err != nil { 310 if err == context.Canceled || w.ctx.Err() == context.Canceled { 311 return 312 } 313 314 w.logger.Printf("[ERR] nomad.deployment_watcher: failed to determine last evaluation index for job %q: %v", w.d.JobID, err) 315 return 316 } 317 318 // Create an evaluation trigger if there is any allocation whose 319 // deployment status has been updated past the latest eval index. 
320 createEval, failDeployment, rollback := false, false, false 321 for _, alloc := range allocResp.Allocations { 322 if alloc.DeploymentStatus == nil || alloc.DeploymentStatus.ModifyIndex <= latestEval { 323 continue 324 } 325 326 // We need to create an eval 327 createEval = true 328 329 if alloc.DeploymentStatus.IsUnhealthy() { 330 // Check if the group has autorevert set 331 group, ok := w.d.TaskGroups[alloc.TaskGroup] 332 if ok && group.AutoRevert { 333 rollback = true 334 } 335 336 // Since we have an unhealthy allocation, fail the deployment 337 failDeployment = true 338 } 339 340 // All conditions have been hit so we can break 341 if createEval && failDeployment && rollback { 342 break 343 } 344 } 345 346 // Change the deployments status to failed 347 if failDeployment { 348 // Default description 349 desc := structs.DeploymentStatusDescriptionFailedAllocations 350 351 // Rollback to the old job if necessary 352 var j *structs.Job 353 if rollback { 354 var err error 355 j, err = w.latestStableJob() 356 if err != nil { 357 w.logger.Printf("[ERR] nomad.deployment_watcher: failed to lookup latest stable job for %q: %v", w.d.JobID, err) 358 } 359 360 // Description should include that the job is being rolled back to 361 // version N 362 if j != nil { 363 desc = structs.DeploymentStatusDescriptionRollback(desc, j.Version) 364 } 365 } 366 367 // Update the status of the deployment to failed and create an 368 // evaluation. 369 e := w.getEval() 370 u := w.getDeploymentStatusUpdate(structs.DeploymentStatusFailed, desc) 371 if index, err := w.upsertDeploymentStatusUpdate(u, e, j); err != nil { 372 w.logger.Printf("[ERR] nomad.deployment_watcher: failed to update deployment %q status: %v", w.d.ID, err) 373 } else { 374 w.setLatestEval(index) 375 } 376 } else if createEval { 377 // Create an eval to push the deployment along 378 w.createEvalBatched(allocResp.Index) 379 } 380 } 381 } 382 383 // latestStableJob returns the latest stable job. 
It may be nil if none exist 384 func (w *deploymentWatcher) latestStableJob() (*structs.Job, error) { 385 args := &structs.JobVersionsRequest{JobID: w.d.JobID} 386 var resp structs.JobVersionsResponse 387 if err := w.watchers.GetJobVersions(args, &resp); err != nil { 388 return nil, err 389 } 390 391 var stable *structs.Job 392 for _, job := range resp.Versions { 393 if job.Stable { 394 stable = job 395 break 396 } 397 } 398 399 return stable, nil 400 } 401 402 // createEvalBatched creates an eval but batches calls together 403 func (w *deploymentWatcher) createEvalBatched(forIndex uint64) { 404 w.l.Lock() 405 defer w.l.Unlock() 406 407 if w.outstandingBatch || forIndex < w.latestEval { 408 return 409 } 410 411 w.outstandingBatch = true 412 413 time.AfterFunc(perJobEvalBatchPeriod, func() { 414 // Create the eval 415 evalCreateIndex, err := w.createEvaluation(w.getEval()) 416 if err != nil { 417 w.logger.Printf("[ERR] nomad.deployment_watcher: failed to create evaluation for deployment %q: %v", w.d.ID, err) 418 } else { 419 w.setLatestEval(evalCreateIndex) 420 } 421 422 w.l.Lock() 423 w.outstandingBatch = false 424 w.l.Unlock() 425 426 }) 427 } 428 429 // getEval returns an evaluation suitable for the deployment 430 func (w *deploymentWatcher) getEval() *structs.Evaluation { 431 return &structs.Evaluation{ 432 ID: structs.GenerateUUID(), 433 Priority: w.j.Priority, 434 Type: w.j.Type, 435 TriggeredBy: structs.EvalTriggerDeploymentWatcher, 436 JobID: w.j.ID, 437 DeploymentID: w.d.ID, 438 Status: structs.EvalStatusPending, 439 } 440 } 441 442 // getDeploymentStatusUpdate returns a deployment status update 443 func (w *deploymentWatcher) getDeploymentStatusUpdate(status, desc string) *structs.DeploymentStatusUpdate { 444 return &structs.DeploymentStatusUpdate{ 445 DeploymentID: w.d.ID, 446 Status: status, 447 StatusDescription: desc, 448 } 449 } 450 451 // getAllocs retrieves the allocations that are part of the deployment blocking 452 // at the given index. 
453 func (w *deploymentWatcher) getAllocs(index uint64) (*structs.AllocListResponse, error) { 454 // Build the request 455 args := &structs.DeploymentSpecificRequest{ 456 DeploymentID: w.d.ID, 457 QueryOptions: structs.QueryOptions{ 458 MinQueryIndex: index, 459 }, 460 } 461 var resp structs.AllocListResponse 462 463 for resp.Index <= index { 464 if err := w.queryLimiter.Wait(w.ctx); err != nil { 465 return nil, err 466 } 467 468 if err := w.watchers.Allocations(args, &resp); err != nil { 469 return nil, err 470 } 471 } 472 473 return &resp, nil 474 } 475 476 // latestEvalIndex returns the index of the last evaluation created for 477 // the job. The index is used to determine if an allocation update requires an 478 // evaluation to be triggered. 479 func (w *deploymentWatcher) latestEvalIndex() (uint64, error) { 480 if err := w.queryLimiter.Wait(w.ctx); err != nil { 481 return 0, err 482 } 483 484 args := &structs.JobSpecificRequest{ 485 JobID: w.d.JobID, 486 } 487 var resp structs.JobEvaluationsResponse 488 err := w.watchers.Evaluations(args, &resp) 489 if err != nil { 490 return 0, err 491 } 492 493 if len(resp.Evaluations) == 0 { 494 w.setLatestEval(resp.Index) 495 return resp.Index, nil 496 } 497 498 // Prefer using the snapshot index. Otherwise use the create index 499 e := resp.Evaluations[0] 500 if e.SnapshotIndex != 0 { 501 w.setLatestEval(e.SnapshotIndex) 502 return e.SnapshotIndex, nil 503 } 504 505 w.setLatestEval(e.CreateIndex) 506 return e.CreateIndex, nil 507 } 508 509 // setLatestEval sets the given index as the latest eval unless the currently 510 // stored index is higher. 511 func (w *deploymentWatcher) setLatestEval(index uint64) { 512 w.l.Lock() 513 defer w.l.Unlock() 514 if index > w.latestEval { 515 w.latestEval = index 516 } 517 } 518 519 // getLatestEval returns the latest eval index. 520 func (w *deploymentWatcher) getLatestEval() uint64 { 521 w.l.Lock() 522 defer w.l.Unlock() 523 return w.latestEval 524 }