github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/nomad/deploymentwatcher/deployments_watcher.go (about) 1 package deploymentwatcher 2 3 import ( 4 "context" 5 "fmt" 6 "log" 7 "sync" 8 "time" 9 10 "golang.org/x/time/rate" 11 12 "github.com/hashicorp/nomad/nomad/structs" 13 ) 14 15 const ( 16 // LimitStateQueriesPerSecond is the number of state queries allowed per 17 // second 18 LimitStateQueriesPerSecond = 100.0 19 20 // CrossDeploymentEvalBatchDuration is the duration in which evaluations are 21 // batched across all deployment watchers before commiting to Raft. 22 CrossDeploymentEvalBatchDuration = 250 * time.Millisecond 23 ) 24 25 var ( 26 // notEnabled is the error returned when the deployment watcher is not 27 // enabled 28 notEnabled = fmt.Errorf("deployment watcher not enabled") 29 ) 30 31 // DeploymentRaftEndpoints exposes the deployment watcher to a set of functions 32 // to apply data transforms via Raft. 33 type DeploymentRaftEndpoints interface { 34 // UpsertEvals is used to upsert a set of evaluations 35 UpsertEvals([]*structs.Evaluation) (uint64, error) 36 37 // UpsertJob is used to upsert a job 38 UpsertJob(job *structs.Job) (uint64, error) 39 40 // UpdateDeploymentStatus is used to make a deployment status update 41 // and potentially create an evaluation. 42 UpdateDeploymentStatus(u *structs.DeploymentStatusUpdateRequest) (uint64, error) 43 44 // UpdateDeploymentPromotion is used to promote canaries in a deployment 45 UpdateDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error) 46 47 // UpdateDeploymentAllocHealth is used to set the health of allocations in a 48 // deployment 49 UpdateDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error) 50 } 51 52 // DeploymentStateWatchers are the set of functions required to watch objects on 53 // behalf of a deployment 54 type DeploymentStateWatchers interface { 55 // Evaluations returns the set of evaluations for the given job 56 Evaluations(args *structs.JobSpecificRequest, reply *structs.JobEvaluationsResponse) error 57 58 // Allocations returns the set of allocations that are part of the 59 // deployment. 60 Allocations(args *structs.DeploymentSpecificRequest, reply *structs.AllocListResponse) error 61 62 // List is used to list all the deployments in the system 63 List(args *structs.DeploymentListRequest, reply *structs.DeploymentListResponse) error 64 65 // GetDeployment is used to lookup a particular deployment. 66 GetDeployment(args *structs.DeploymentSpecificRequest, reply *structs.SingleDeploymentResponse) error 67 68 // GetJobVersions is used to lookup the versions of a job. This is used when 69 // rolling back to find the latest stable job 70 GetJobVersions(args *structs.JobVersionsRequest, reply *structs.JobVersionsResponse) error 71 72 // GetJob is used to lookup a particular job. 73 GetJob(args *structs.JobSpecificRequest, reply *structs.SingleJobResponse) error 74 } 75 76 // Watcher is used to watch deployments and their allocations created 77 // by the scheduler and trigger the scheduler when allocation health 78 // transistions. 79 type Watcher struct { 80 enabled bool 81 logger *log.Logger 82 83 // queryLimiter is used to limit the rate of blocking queries 84 queryLimiter *rate.Limiter 85 86 // evalBatchDuration is the duration to batch eval creation across all 87 // deployment watchers 88 evalBatchDuration time.Duration 89 90 // raft contains the set of Raft endpoints that can be used by the 91 // deployments watcher 92 raft DeploymentRaftEndpoints 93 94 // stateWatchers is the set of functions required to watch a deployment for 95 // state changes 96 stateWatchers DeploymentStateWatchers 97 98 // watchers is the set of active watchers, one per deployment 99 watchers map[string]*deploymentWatcher 100 101 // evalBatcher is used to batch the creation of evaluations 102 evalBatcher *EvalBatcher 103 104 // ctx and exitFn are used to cancel the watcher 105 ctx context.Context 106 exitFn context.CancelFunc 107 108 l sync.RWMutex 109 } 110 111 // NewDeploymentsWatcher returns a deployments watcher that is used to watch 112 // deployments and trigger the scheduler as needed. 113 func NewDeploymentsWatcher(logger *log.Logger, watchers DeploymentStateWatchers, 114 raft DeploymentRaftEndpoints, stateQueriesPerSecond float64, 115 evalBatchDuration time.Duration) *Watcher { 116 117 return &Watcher{ 118 stateWatchers: watchers, 119 raft: raft, 120 queryLimiter: rate.NewLimiter(rate.Limit(stateQueriesPerSecond), 100), 121 evalBatchDuration: evalBatchDuration, 122 logger: logger, 123 } 124 } 125 126 // SetEnabled is used to control if the watcher is enabled. The watcher 127 // should only be enabled on the active leader. 128 func (w *Watcher) SetEnabled(enabled bool) error { 129 w.l.Lock() 130 defer w.l.Unlock() 131 132 wasEnabled := w.enabled 133 w.enabled = enabled 134 135 // Flush the state to create the necessary objects 136 w.flush() 137 138 // If we are starting now, launch the watch daemon 139 if enabled && !wasEnabled { 140 go w.watchDeployments(w.ctx) 141 } 142 143 return nil 144 } 145 146 // flush is used to clear the state of the watcher 147 func (w *Watcher) flush() { 148 // Stop all the watchers and clear it 149 for _, watcher := range w.watchers { 150 watcher.StopWatch() 151 } 152 153 // Kill everything associated with the watcher 154 if w.exitFn != nil { 155 w.exitFn() 156 } 157 158 w.watchers = make(map[string]*deploymentWatcher, 32) 159 w.ctx, w.exitFn = context.WithCancel(context.Background()) 160 w.evalBatcher = NewEvalBatcher(w.evalBatchDuration, w.raft, w.ctx) 161 } 162 163 // watchDeployments is the long lived go-routine that watches for deployments to 164 // add and remove watchers on. 165 func (w *Watcher) watchDeployments(ctx context.Context) { 166 dindex := uint64(1) 167 for { 168 // Block getting all deployments using the last deployment index. 169 resp, err := w.getDeploys(ctx, dindex) 170 if err != nil { 171 if err == context.Canceled || ctx.Err() == context.Canceled { 172 return 173 } 174 175 w.logger.Printf("[ERR] nomad.deployments_watcher: failed to retrieve deploylements: %v", err) 176 } 177 178 // Guard against npe 179 if resp == nil { 180 continue 181 } 182 183 // Ensure we are tracking the things we should and not tracking what we 184 // shouldn't be 185 for _, d := range resp.Deployments { 186 if d.Active() { 187 if err := w.add(d); err != nil { 188 w.logger.Printf("[ERR] nomad.deployments_watcher: failed to track deployment %q: %v", d.ID, err) 189 } 190 } else { 191 w.remove(d) 192 } 193 } 194 195 // Update the latest index 196 dindex = resp.Index 197 } 198 } 199 200 // getDeploys retrieves all deployments blocking at the given index. 201 func (w *Watcher) getDeploys(ctx context.Context, index uint64) (*structs.DeploymentListResponse, error) { 202 // Build the request 203 args := &structs.DeploymentListRequest{ 204 QueryOptions: structs.QueryOptions{ 205 MinQueryIndex: index, 206 }, 207 } 208 var resp structs.DeploymentListResponse 209 210 for resp.Index <= index { 211 if err := w.queryLimiter.Wait(ctx); err != nil { 212 return nil, err 213 } 214 215 if err := w.stateWatchers.List(args, &resp); err != nil { 216 return nil, err 217 } 218 } 219 220 return &resp, nil 221 } 222 223 // add adds a deployment to the watch list 224 func (w *Watcher) add(d *structs.Deployment) error { 225 w.l.Lock() 226 defer w.l.Unlock() 227 _, err := w.addLocked(d) 228 return err 229 } 230 231 // addLocked adds a deployment to the watch list and should only be called when 232 // locked. 233 func (w *Watcher) addLocked(d *structs.Deployment) (*deploymentWatcher, error) { 234 // Not enabled so no-op 235 if !w.enabled { 236 return nil, nil 237 } 238 239 if !d.Active() { 240 return nil, fmt.Errorf("deployment %q is terminal", d.ID) 241 } 242 243 // Already watched so no-op 244 if _, ok := w.watchers[d.ID]; ok { 245 return nil, nil 246 } 247 248 // Get the job the deployment is referencing 249 args := &structs.JobSpecificRequest{ 250 JobID: d.JobID, 251 } 252 var resp structs.SingleJobResponse 253 if err := w.stateWatchers.GetJob(args, &resp); err != nil { 254 return nil, err 255 } 256 if resp.Job == nil { 257 return nil, fmt.Errorf("deployment %q references unknown job %q", d.ID, d.JobID) 258 } 259 260 watcher := newDeploymentWatcher(w.ctx, w.queryLimiter, w.logger, w.stateWatchers, d, resp.Job, w) 261 w.watchers[d.ID] = watcher 262 return watcher, nil 263 } 264 265 // remove stops watching a deployment. This can be because the deployment is 266 // complete or being deleted. 267 func (w *Watcher) remove(d *structs.Deployment) { 268 w.l.Lock() 269 defer w.l.Unlock() 270 271 // Not enabled so no-op 272 if !w.enabled { 273 return 274 } 275 276 if watcher, ok := w.watchers[d.ID]; ok { 277 watcher.StopWatch() 278 delete(w.watchers, d.ID) 279 } 280 } 281 282 // forceAdd is used to force a lookup of the given deployment object and create 283 // a watcher. If the deployment does not exist or is terminal an error is 284 // returned. 285 func (w *Watcher) forceAdd(dID string) (*deploymentWatcher, error) { 286 // Build the request 287 args := &structs.DeploymentSpecificRequest{DeploymentID: dID} 288 var resp structs.SingleDeploymentResponse 289 if err := w.stateWatchers.GetDeployment(args, &resp); err != nil { 290 return nil, err 291 } 292 293 if resp.Deployment == nil { 294 return nil, fmt.Errorf("unknown deployment %q", dID) 295 } 296 297 return w.addLocked(resp.Deployment) 298 } 299 300 // getOrCreateWatcher returns the deployment watcher for the given deployment ID. 301 func (w *Watcher) getOrCreateWatcher(dID string) (*deploymentWatcher, error) { 302 w.l.Lock() 303 defer w.l.Unlock() 304 305 // Not enabled so no-op 306 if !w.enabled { 307 return nil, notEnabled 308 } 309 310 watcher, ok := w.watchers[dID] 311 if ok { 312 return watcher, nil 313 } 314 315 return w.forceAdd(dID) 316 } 317 318 // SetAllocHealth is used to set the health of allocations for a deployment. If 319 // there are any unhealthy allocations, the deployment is updated to be failed. 320 // Otherwise the allocations are updated and an evaluation is created. 321 func (w *Watcher) SetAllocHealth(req *structs.DeploymentAllocHealthRequest, resp *structs.DeploymentUpdateResponse) error { 322 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 323 if err != nil { 324 return err 325 } 326 327 return watcher.SetAllocHealth(req, resp) 328 } 329 330 // PromoteDeployment is used to promote a deployment. If promote is false, 331 // deployment is marked as failed. Otherwise the deployment is updated and an 332 // evaluation is created. 333 func (w *Watcher) PromoteDeployment(req *structs.DeploymentPromoteRequest, resp *structs.DeploymentUpdateResponse) error { 334 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 335 if err != nil { 336 return err 337 } 338 339 return watcher.PromoteDeployment(req, resp) 340 } 341 342 // PauseDeployment is used to toggle the pause state on a deployment. If the 343 // deployment is being unpaused, an evaluation is created. 344 func (w *Watcher) PauseDeployment(req *structs.DeploymentPauseRequest, resp *structs.DeploymentUpdateResponse) error { 345 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 346 if err != nil { 347 return err 348 } 349 350 return watcher.PauseDeployment(req, resp) 351 } 352 353 // FailDeployment is used to fail the deployment. 354 func (w *Watcher) FailDeployment(req *structs.DeploymentFailRequest, resp *structs.DeploymentUpdateResponse) error { 355 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 356 if err != nil { 357 return err 358 } 359 360 return watcher.FailDeployment(req, resp) 361 } 362 363 // createEvaluation commits the given evaluation to Raft but batches the commit 364 // with other calls. 365 func (w *Watcher) createEvaluation(eval *structs.Evaluation) (uint64, error) { 366 return w.evalBatcher.CreateEval(eval).Results() 367 } 368 369 // upsertJob commits the given job to Raft 370 func (w *Watcher) upsertJob(job *structs.Job) (uint64, error) { 371 return w.raft.UpsertJob(job) 372 } 373 374 // upsertDeploymentStatusUpdate commits the given deployment update and optional 375 // evaluation to Raft 376 func (w *Watcher) upsertDeploymentStatusUpdate( 377 u *structs.DeploymentStatusUpdate, 378 e *structs.Evaluation, 379 j *structs.Job) (uint64, error) { 380 return w.raft.UpdateDeploymentStatus(&structs.DeploymentStatusUpdateRequest{ 381 DeploymentUpdate: u, 382 Eval: e, 383 Job: j, 384 }) 385 } 386 387 // upsertDeploymentPromotion commits the given deployment promotion to Raft 388 func (w *Watcher) upsertDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error) { 389 return w.raft.UpdateDeploymentPromotion(req) 390 } 391 392 // upsertDeploymentAllocHealth commits the given allocation health changes to 393 // Raft 394 func (w *Watcher) upsertDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error) { 395 return w.raft.UpdateDeploymentAllocHealth(req) 396 }