github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/deploymentwatcher/deployments_watcher.go (about) 1 package deploymentwatcher 2 3 import ( 4 "context" 5 "fmt" 6 "log" 7 "sync" 8 "time" 9 10 "golang.org/x/time/rate" 11 12 memdb "github.com/hashicorp/go-memdb" 13 "github.com/hashicorp/nomad/nomad/state" 14 "github.com/hashicorp/nomad/nomad/structs" 15 ) 16 17 const ( 18 // LimitStateQueriesPerSecond is the number of state queries allowed per 19 // second 20 LimitStateQueriesPerSecond = 100.0 21 22 // CrossDeploymentUpdateBatchDuration is the duration in which allocation 23 // desired transition and evaluation creation updates are batched across 24 // all deployment watchers before committing to Raft. 25 CrossDeploymentUpdateBatchDuration = 250 * time.Millisecond 26 ) 27 28 var ( 29 // notEnabled is the error returned when the deployment watcher is not 30 // enabled 31 notEnabled = fmt.Errorf("deployment watcher not enabled") 32 ) 33 34 // DeploymentRaftEndpoints exposes the deployment watcher to a set of functions 35 // to apply data transforms via Raft. 36 type DeploymentRaftEndpoints interface { 37 // UpsertJob is used to upsert a job 38 UpsertJob(job *structs.Job) (uint64, error) 39 40 // UpdateDeploymentStatus is used to make a deployment status update 41 // and potentially create an evaluation. 42 UpdateDeploymentStatus(u *structs.DeploymentStatusUpdateRequest) (uint64, error) 43 44 // UpdateDeploymentPromotion is used to promote canaries in a deployment 45 UpdateDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error) 46 47 // UpdateDeploymentAllocHealth is used to set the health of allocations in a 48 // deployment 49 UpdateDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error) 50 51 // UpdateAllocDesiredTransition is used to update the desired transition 52 // for allocations. 53 UpdateAllocDesiredTransition(req *structs.AllocUpdateDesiredTransitionRequest) (uint64, error) 54 } 55 56 // Watcher is used to watch deployments and their allocations created 57 // by the scheduler and trigger the scheduler when allocation health 58 // transitions. 59 type Watcher struct { 60 enabled bool 61 logger *log.Logger 62 63 // queryLimiter is used to limit the rate of blocking queries 64 queryLimiter *rate.Limiter 65 66 // updateBatchDuration is the duration to batch allocation desired 67 // transition and eval creation across all deployment watchers 68 updateBatchDuration time.Duration 69 70 // raft contains the set of Raft endpoints that can be used by the 71 // deployments watcher 72 raft DeploymentRaftEndpoints 73 74 // state is the state that is watched for state changes. 75 state *state.StateStore 76 77 // watchers is the set of active watchers, one per deployment 78 watchers map[string]*deploymentWatcher 79 80 // allocUpdateBatcher is used to batch the creation of evaluations and 81 // allocation desired transition updates 82 allocUpdateBatcher *AllocUpdateBatcher 83 84 // ctx and exitFn are used to cancel the watcher 85 ctx context.Context 86 exitFn context.CancelFunc 87 88 l sync.RWMutex 89 } 90 91 // NewDeploymentsWatcher returns a deployments watcher that is used to watch 92 // deployments and trigger the scheduler as needed. 93 func NewDeploymentsWatcher(logger *log.Logger, 94 raft DeploymentRaftEndpoints, stateQueriesPerSecond float64, 95 updateBatchDuration time.Duration) *Watcher { 96 97 return &Watcher{ 98 raft: raft, 99 queryLimiter: rate.NewLimiter(rate.Limit(stateQueriesPerSecond), 100), 100 updateBatchDuration: updateBatchDuration, 101 logger: logger, 102 } 103 } 104 105 // SetEnabled is used to control if the watcher is enabled. The watcher 106 // should only be enabled on the active leader. When being enabled the state is 107 // passed in as it is no longer valid once a leader election has taken place. 108 func (w *Watcher) SetEnabled(enabled bool, state *state.StateStore) { 109 w.l.Lock() 110 defer w.l.Unlock() 111 112 wasEnabled := w.enabled 113 w.enabled = enabled 114 115 if state != nil { 116 w.state = state 117 } 118 119 // Flush the state to create the necessary objects 120 w.flush() 121 122 // If we are starting now, launch the watch daemon 123 if enabled && !wasEnabled { 124 go w.watchDeployments(w.ctx) 125 } 126 } 127 128 // flush is used to clear the state of the watcher 129 func (w *Watcher) flush() { 130 // Stop all the watchers and clear it 131 for _, watcher := range w.watchers { 132 watcher.StopWatch() 133 } 134 135 // Kill everything associated with the watcher 136 if w.exitFn != nil { 137 w.exitFn() 138 } 139 140 w.watchers = make(map[string]*deploymentWatcher, 32) 141 w.ctx, w.exitFn = context.WithCancel(context.Background()) 142 w.allocUpdateBatcher = NewAllocUpdateBatcher(w.updateBatchDuration, w.raft, w.ctx) 143 } 144 145 // watchDeployments is the long lived go-routine that watches for deployments to 146 // add and remove watchers on. 147 func (w *Watcher) watchDeployments(ctx context.Context) { 148 dindex := uint64(1) 149 for { 150 // Block getting all deployments using the last deployment index. 151 deployments, idx, err := w.getDeploys(ctx, dindex) 152 if err != nil { 153 if err == context.Canceled { 154 return 155 } 156 157 w.logger.Printf("[ERR] nomad.deployments_watcher: failed to retrieve deployments: %v", err) 158 } 159 160 // Update the latest index 161 dindex = idx 162 163 // Ensure we are tracking the things we should and not tracking what we 164 // shouldn't be 165 for _, d := range deployments { 166 if d.Active() { 167 if err := w.add(d); err != nil { 168 w.logger.Printf("[ERR] nomad.deployments_watcher: failed to track deployment %q: %v", d.ID, err) 169 } 170 } else { 171 w.remove(d) 172 } 173 } 174 } 175 } 176 177 // getDeploys retrieves all deployments blocking at the given index. 178 func (w *Watcher) getDeploys(ctx context.Context, minIndex uint64) ([]*structs.Deployment, uint64, error) { 179 resp, index, err := w.state.BlockingQuery(w.getDeploysImpl, minIndex, ctx) 180 if err != nil { 181 return nil, 0, err 182 } 183 184 return resp.([]*structs.Deployment), index, nil 185 } 186 187 // getDeploysImpl retrieves all deployments from the passed state store. 188 func (w *Watcher) getDeploysImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { 189 190 iter, err := state.Deployments(ws) 191 if err != nil { 192 return nil, 0, err 193 } 194 195 var deploys []*structs.Deployment 196 for { 197 raw := iter.Next() 198 if raw == nil { 199 break 200 } 201 deploy := raw.(*structs.Deployment) 202 deploys = append(deploys, deploy) 203 } 204 205 // Use the last index that affected the deployment table 206 index, err := state.Index("deployment") 207 if err != nil { 208 return nil, 0, err 209 } 210 211 return deploys, index, nil 212 } 213 214 // add adds a deployment to the watch list 215 func (w *Watcher) add(d *structs.Deployment) error { 216 w.l.Lock() 217 defer w.l.Unlock() 218 _, err := w.addLocked(d) 219 return err 220 } 221 222 // addLocked adds a deployment to the watch list and should only be called when 223 // locked. 224 func (w *Watcher) addLocked(d *structs.Deployment) (*deploymentWatcher, error) { 225 // Not enabled so no-op 226 if !w.enabled { 227 return nil, nil 228 } 229 230 if !d.Active() { 231 return nil, fmt.Errorf("deployment %q is terminal", d.ID) 232 } 233 234 // Already watched so just update the deployment 235 if w, ok := w.watchers[d.ID]; ok { 236 w.updateDeployment(d) 237 return nil, nil 238 } 239 240 // Get the job the deployment is referencing 241 snap, err := w.state.Snapshot() 242 if err != nil { 243 return nil, err 244 } 245 246 job, err := snap.JobByID(nil, d.Namespace, d.JobID) 247 if err != nil { 248 return nil, err 249 } 250 if job == nil { 251 return nil, fmt.Errorf("deployment %q references unknown job %q", d.ID, d.JobID) 252 } 253 254 watcher := newDeploymentWatcher(w.ctx, w.queryLimiter, w.logger, w.state, d, job, w) 255 w.watchers[d.ID] = watcher 256 return watcher, nil 257 } 258 259 // remove stops watching a deployment. This can be because the deployment is 260 // complete or being deleted. 261 func (w *Watcher) remove(d *structs.Deployment) { 262 w.l.Lock() 263 defer w.l.Unlock() 264 265 // Not enabled so no-op 266 if !w.enabled { 267 return 268 } 269 270 if watcher, ok := w.watchers[d.ID]; ok { 271 watcher.StopWatch() 272 delete(w.watchers, d.ID) 273 } 274 } 275 276 // forceAdd is used to force a lookup of the given deployment object and create 277 // a watcher. If the deployment does not exist or is terminal an error is 278 // returned. 279 func (w *Watcher) forceAdd(dID string) (*deploymentWatcher, error) { 280 snap, err := w.state.Snapshot() 281 if err != nil { 282 return nil, err 283 } 284 285 deployment, err := snap.DeploymentByID(nil, dID) 286 if err != nil { 287 return nil, err 288 } 289 290 if deployment == nil { 291 return nil, fmt.Errorf("unknown deployment %q", dID) 292 } 293 294 return w.addLocked(deployment) 295 } 296 297 // getOrCreateWatcher returns the deployment watcher for the given deployment ID. 298 func (w *Watcher) getOrCreateWatcher(dID string) (*deploymentWatcher, error) { 299 w.l.Lock() 300 defer w.l.Unlock() 301 302 // Not enabled so no-op 303 if !w.enabled { 304 return nil, notEnabled 305 } 306 307 watcher, ok := w.watchers[dID] 308 if ok { 309 return watcher, nil 310 } 311 312 return w.forceAdd(dID) 313 } 314 315 // SetAllocHealth is used to set the health of allocations for a deployment. If 316 // there are any unhealthy allocations, the deployment is updated to be failed. 317 // Otherwise the allocations are updated and an evaluation is created. 318 func (w *Watcher) SetAllocHealth(req *structs.DeploymentAllocHealthRequest, resp *structs.DeploymentUpdateResponse) error { 319 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 320 if err != nil { 321 return err 322 } 323 324 return watcher.SetAllocHealth(req, resp) 325 } 326 327 // PromoteDeployment is used to promote a deployment. If promote is false, 328 // deployment is marked as failed. Otherwise the deployment is updated and an 329 // evaluation is created. 330 func (w *Watcher) PromoteDeployment(req *structs.DeploymentPromoteRequest, resp *structs.DeploymentUpdateResponse) error { 331 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 332 if err != nil { 333 return err 334 } 335 336 return watcher.PromoteDeployment(req, resp) 337 } 338 339 // PauseDeployment is used to toggle the pause state on a deployment. If the 340 // deployment is being unpaused, an evaluation is created. 341 func (w *Watcher) PauseDeployment(req *structs.DeploymentPauseRequest, resp *structs.DeploymentUpdateResponse) error { 342 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 343 if err != nil { 344 return err 345 } 346 347 return watcher.PauseDeployment(req, resp) 348 } 349 350 // FailDeployment is used to fail the deployment. 351 func (w *Watcher) FailDeployment(req *structs.DeploymentFailRequest, resp *structs.DeploymentUpdateResponse) error { 352 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 353 if err != nil { 354 return err 355 } 356 357 return watcher.FailDeployment(req, resp) 358 } 359 360 // createUpdate commits the given allocation desired transition and evaluation 361 // to Raft but batches the commit with other calls. 362 func (w *Watcher) createUpdate(allocs map[string]*structs.DesiredTransition, eval *structs.Evaluation) (uint64, error) { 363 return w.allocUpdateBatcher.CreateUpdate(allocs, eval).Results() 364 } 365 366 // upsertJob commits the given job to Raft 367 func (w *Watcher) upsertJob(job *structs.Job) (uint64, error) { 368 return w.raft.UpsertJob(job) 369 } 370 371 // upsertDeploymentStatusUpdate commits the given deployment update and optional 372 // evaluation to Raft 373 func (w *Watcher) upsertDeploymentStatusUpdate( 374 u *structs.DeploymentStatusUpdate, 375 e *structs.Evaluation, 376 j *structs.Job) (uint64, error) { 377 return w.raft.UpdateDeploymentStatus(&structs.DeploymentStatusUpdateRequest{ 378 DeploymentUpdate: u, 379 Eval: e, 380 Job: j, 381 }) 382 } 383 384 // upsertDeploymentPromotion commits the given deployment promotion to Raft 385 func (w *Watcher) upsertDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error) { 386 return w.raft.UpdateDeploymentPromotion(req) 387 } 388 389 // upsertDeploymentAllocHealth commits the given allocation health changes to 390 // Raft 391 func (w *Watcher) upsertDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error) { 392 return w.raft.UpdateDeploymentAllocHealth(req) 393 }