github.com/uchennaokeke444/nomad@v0.11.8/nomad/deploymentwatcher/deployments_watcher.go (about) 1 package deploymentwatcher 2 3 import ( 4 "context" 5 "fmt" 6 "sync" 7 "time" 8 9 "golang.org/x/time/rate" 10 11 log "github.com/hashicorp/go-hclog" 12 memdb "github.com/hashicorp/go-memdb" 13 14 "github.com/hashicorp/nomad/nomad/state" 15 "github.com/hashicorp/nomad/nomad/structs" 16 ) 17 18 const ( 19 // LimitStateQueriesPerSecond is the number of state queries allowed per 20 // second 21 LimitStateQueriesPerSecond = 100.0 22 23 // CrossDeploymentUpdateBatchDuration is the duration in which allocation 24 // desired transition and evaluation creation updates are batched across 25 // all deployment watchers before committing to Raft. 26 CrossDeploymentUpdateBatchDuration = 250 * time.Millisecond 27 ) 28 29 var ( 30 // notEnabled is the error returned when the deployment watcher is not 31 // enabled 32 notEnabled = fmt.Errorf("deployment watcher not enabled") 33 ) 34 35 // DeploymentRaftEndpoints exposes the deployment watcher to a set of functions 36 // to apply data transforms via Raft. 37 type DeploymentRaftEndpoints interface { 38 // UpsertJob is used to upsert a job 39 UpsertJob(job *structs.Job) (uint64, error) 40 41 // UpdateDeploymentStatus is used to make a deployment status update 42 // and potentially create an evaluation. 43 UpdateDeploymentStatus(u *structs.DeploymentStatusUpdateRequest) (uint64, error) 44 45 // UpdateDeploymentPromotion is used to promote canaries in a deployment 46 UpdateDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error) 47 48 // UpdateDeploymentAllocHealth is used to set the health of allocations in a 49 // deployment 50 UpdateDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error) 51 52 // UpdateAllocDesiredTransition is used to update the desired transition 53 // for allocations. 54 UpdateAllocDesiredTransition(req *structs.AllocUpdateDesiredTransitionRequest) (uint64, error) 55 } 56 57 // Watcher is used to watch deployments and their allocations created 58 // by the scheduler and trigger the scheduler when allocation health 59 // transitions. 60 type Watcher struct { 61 enabled bool 62 logger log.Logger 63 64 // queryLimiter is used to limit the rate of blocking queries 65 queryLimiter *rate.Limiter 66 67 // updateBatchDuration is the duration to batch allocation desired 68 // transition and eval creation across all deployment watchers 69 updateBatchDuration time.Duration 70 71 // raft contains the set of Raft endpoints that can be used by the 72 // deployments watcher 73 raft DeploymentRaftEndpoints 74 75 // state is the state that is watched for state changes. 76 state *state.StateStore 77 78 // watchers is the set of active watchers, one per deployment 79 watchers map[string]*deploymentWatcher 80 81 // allocUpdateBatcher is used to batch the creation of evaluations and 82 // allocation desired transition updates 83 allocUpdateBatcher *AllocUpdateBatcher 84 85 // ctx and exitFn are used to cancel the watcher 86 ctx context.Context 87 exitFn context.CancelFunc 88 89 l sync.RWMutex 90 } 91 92 // NewDeploymentsWatcher returns a deployments watcher that is used to watch 93 // deployments and trigger the scheduler as needed. 94 func NewDeploymentsWatcher(logger log.Logger, 95 raft DeploymentRaftEndpoints, stateQueriesPerSecond float64, 96 updateBatchDuration time.Duration) *Watcher { 97 98 return &Watcher{ 99 raft: raft, 100 queryLimiter: rate.NewLimiter(rate.Limit(stateQueriesPerSecond), 100), 101 updateBatchDuration: updateBatchDuration, 102 logger: logger.Named("deployments_watcher"), 103 } 104 } 105 106 // SetEnabled is used to control if the watcher is enabled. The watcher 107 // should only be enabled on the active leader. When being enabled the state is 108 // passed in as it is no longer valid once a leader election has taken place. 109 func (w *Watcher) SetEnabled(enabled bool, state *state.StateStore) { 110 w.l.Lock() 111 defer w.l.Unlock() 112 113 wasEnabled := w.enabled 114 w.enabled = enabled 115 116 if state != nil { 117 w.state = state 118 } 119 120 // Flush the state to create the necessary objects 121 w.flush(enabled) 122 123 // If we are starting now, launch the watch daemon 124 if enabled && !wasEnabled { 125 go w.watchDeployments(w.ctx) 126 } 127 } 128 129 // flush is used to clear the state of the watcher 130 func (w *Watcher) flush(enabled bool) { 131 // Stop all the watchers and clear it 132 for _, watcher := range w.watchers { 133 watcher.StopWatch() 134 } 135 136 // Kill everything associated with the watcher 137 if w.exitFn != nil { 138 w.exitFn() 139 } 140 141 w.watchers = make(map[string]*deploymentWatcher, 32) 142 w.ctx, w.exitFn = context.WithCancel(context.Background()) 143 144 if enabled { 145 w.allocUpdateBatcher = NewAllocUpdateBatcher(w.ctx, w.updateBatchDuration, w.raft) 146 } else { 147 w.allocUpdateBatcher = nil 148 } 149 } 150 151 // watchDeployments is the long lived go-routine that watches for deployments to 152 // add and remove watchers on. 153 func (w *Watcher) watchDeployments(ctx context.Context) { 154 dindex := uint64(1) 155 for { 156 // Block getting all deployments using the last deployment index. 157 deployments, idx, err := w.getDeploys(ctx, dindex) 158 if err != nil { 159 if err == context.Canceled { 160 return 161 } 162 163 w.logger.Error("failed to retrieve deployments", "error", err) 164 } 165 166 // Update the latest index 167 dindex = idx 168 169 // Ensure we are tracking the things we should and not tracking what we 170 // shouldn't be 171 for _, d := range deployments { 172 if d.Active() { 173 if err := w.add(d); err != nil { 174 w.logger.Error("failed to track deployment", "deployment_id", d.ID, "error", err) 175 } 176 } else { 177 w.remove(d) 178 } 179 } 180 } 181 } 182 183 // getDeploys retrieves all deployments blocking at the given index. 184 func (w *Watcher) getDeploys(ctx context.Context, minIndex uint64) ([]*structs.Deployment, uint64, error) { 185 resp, index, err := w.state.BlockingQuery(w.getDeploysImpl, minIndex, ctx) 186 if err != nil { 187 return nil, 0, err 188 } 189 190 return resp.([]*structs.Deployment), index, nil 191 } 192 193 // getDeploysImpl retrieves all deployments from the passed state store. 194 func (w *Watcher) getDeploysImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { 195 196 iter, err := state.Deployments(ws) 197 if err != nil { 198 return nil, 0, err 199 } 200 201 var deploys []*structs.Deployment 202 for { 203 raw := iter.Next() 204 if raw == nil { 205 break 206 } 207 deploy := raw.(*structs.Deployment) 208 deploys = append(deploys, deploy) 209 } 210 211 // Use the last index that affected the deployment table 212 index, err := state.Index("deployment") 213 if err != nil { 214 return nil, 0, err 215 } 216 217 return deploys, index, nil 218 } 219 220 // add adds a deployment to the watch list 221 func (w *Watcher) add(d *structs.Deployment) error { 222 w.l.Lock() 223 defer w.l.Unlock() 224 _, err := w.addLocked(d) 225 return err 226 } 227 228 // addLocked adds a deployment to the watch list and should only be called when 229 // locked. Creating the deploymentWatcher starts a go routine to .watch() it 230 func (w *Watcher) addLocked(d *structs.Deployment) (*deploymentWatcher, error) { 231 // Not enabled so no-op 232 if !w.enabled { 233 return nil, nil 234 } 235 236 if !d.Active() { 237 return nil, fmt.Errorf("deployment %q is terminal", d.ID) 238 } 239 240 // Already watched so just update the deployment 241 if w, ok := w.watchers[d.ID]; ok { 242 w.updateDeployment(d) 243 return nil, nil 244 } 245 246 // Get the job the deployment is referencing 247 snap, err := w.state.Snapshot() 248 if err != nil { 249 return nil, err 250 } 251 252 job, err := snap.JobByID(nil, d.Namespace, d.JobID) 253 if err != nil { 254 return nil, err 255 } 256 if job == nil { 257 return nil, fmt.Errorf("deployment %q references unknown job %q", d.ID, d.JobID) 258 } 259 260 watcher := newDeploymentWatcher(w.ctx, w.queryLimiter, w.logger, w.state, d, job, w) 261 w.watchers[d.ID] = watcher 262 return watcher, nil 263 } 264 265 // remove stops watching a deployment. This can be because the deployment is 266 // complete or being deleted. 267 func (w *Watcher) remove(d *structs.Deployment) { 268 w.l.Lock() 269 defer w.l.Unlock() 270 271 // Not enabled so no-op 272 if !w.enabled { 273 return 274 } 275 276 if watcher, ok := w.watchers[d.ID]; ok { 277 watcher.StopWatch() 278 delete(w.watchers, d.ID) 279 } 280 } 281 282 // forceAdd is used to force a lookup of the given deployment object and create 283 // a watcher. If the deployment does not exist or is terminal an error is 284 // returned. 285 func (w *Watcher) forceAdd(dID string) (*deploymentWatcher, error) { 286 snap, err := w.state.Snapshot() 287 if err != nil { 288 return nil, err 289 } 290 291 deployment, err := snap.DeploymentByID(nil, dID) 292 if err != nil { 293 return nil, err 294 } 295 296 if deployment == nil { 297 return nil, fmt.Errorf("unknown deployment %q", dID) 298 } 299 300 return w.addLocked(deployment) 301 } 302 303 // getOrCreateWatcher returns the deployment watcher for the given deployment ID. 304 func (w *Watcher) getOrCreateWatcher(dID string) (*deploymentWatcher, error) { 305 w.l.Lock() 306 defer w.l.Unlock() 307 308 // Not enabled so no-op 309 if !w.enabled { 310 return nil, notEnabled 311 } 312 313 watcher, ok := w.watchers[dID] 314 if ok { 315 return watcher, nil 316 } 317 318 return w.forceAdd(dID) 319 } 320 321 // SetAllocHealth is used to set the health of allocations for a deployment. If 322 // there are any unhealthy allocations, the deployment is updated to be failed. 323 // Otherwise the allocations are updated and an evaluation is created. 324 func (w *Watcher) SetAllocHealth(req *structs.DeploymentAllocHealthRequest, resp *structs.DeploymentUpdateResponse) error { 325 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 326 if err != nil { 327 return err 328 } 329 330 return watcher.SetAllocHealth(req, resp) 331 } 332 333 // PromoteDeployment is used to promote a deployment. If promote is false, 334 // deployment is marked as failed. Otherwise the deployment is updated and an 335 // evaluation is created. 336 func (w *Watcher) PromoteDeployment(req *structs.DeploymentPromoteRequest, resp *structs.DeploymentUpdateResponse) error { 337 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 338 if err != nil { 339 return err 340 } 341 342 return watcher.PromoteDeployment(req, resp) 343 } 344 345 // PauseDeployment is used to toggle the pause state on a deployment. If the 346 // deployment is being unpaused, an evaluation is created. 347 func (w *Watcher) PauseDeployment(req *structs.DeploymentPauseRequest, resp *structs.DeploymentUpdateResponse) error { 348 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 349 if err != nil { 350 return err 351 } 352 353 return watcher.PauseDeployment(req, resp) 354 } 355 356 // FailDeployment is used to fail the deployment. 357 func (w *Watcher) FailDeployment(req *structs.DeploymentFailRequest, resp *structs.DeploymentUpdateResponse) error { 358 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 359 if err != nil { 360 return err 361 } 362 363 return watcher.FailDeployment(req, resp) 364 } 365 366 // createUpdate commits the given allocation desired transition and evaluation 367 // to Raft but batches the commit with other calls. 368 func (w *Watcher) createUpdate(allocs map[string]*structs.DesiredTransition, eval *structs.Evaluation) (uint64, error) { 369 b := w.allocUpdateBatcher 370 if b == nil { 371 return 0, notEnabled 372 } 373 return b.CreateUpdate(allocs, eval).Results() 374 } 375 376 // upsertJob commits the given job to Raft 377 func (w *Watcher) upsertJob(job *structs.Job) (uint64, error) { 378 return w.raft.UpsertJob(job) 379 } 380 381 // upsertDeploymentStatusUpdate commits the given deployment update and optional 382 // evaluation to Raft 383 func (w *Watcher) upsertDeploymentStatusUpdate( 384 u *structs.DeploymentStatusUpdate, 385 e *structs.Evaluation, 386 j *structs.Job) (uint64, error) { 387 return w.raft.UpdateDeploymentStatus(&structs.DeploymentStatusUpdateRequest{ 388 DeploymentUpdate: u, 389 Eval: e, 390 Job: j, 391 }) 392 } 393 394 // upsertDeploymentPromotion commits the given deployment promotion to Raft 395 func (w *Watcher) upsertDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error) { 396 return w.raft.UpdateDeploymentPromotion(req) 397 } 398 399 // upsertDeploymentAllocHealth commits the given allocation health changes to 400 // Raft 401 func (w *Watcher) upsertDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error) { 402 return w.raft.UpdateDeploymentAllocHealth(req) 403 }