github.com/djenriquez/nomad-1@v0.8.1/nomad/deploymentwatcher/deployments_watcher.go

package deploymentwatcher

import (
	"context"
	"fmt"
	"log"
	"sync"
	"time"

	"golang.org/x/time/rate"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// LimitStateQueriesPerSecond is the number of state queries allowed per
	// second.
	LimitStateQueriesPerSecond = 100.0

	// CrossDeploymentEvalBatchDuration is the duration in which evaluations are
	// batched across all deployment watchers before committing to Raft.
	CrossDeploymentEvalBatchDuration = 250 * time.Millisecond
)

var (
	// notEnabled is the error returned when the deployment watcher is not
	// enabled.
	notEnabled = fmt.Errorf("deployment watcher not enabled")
)

// DeploymentRaftEndpoints exposes the deployment watcher to a set of functions
// to apply data transforms via Raft.
type DeploymentRaftEndpoints interface {
	// UpsertEvals is used to upsert a set of evaluations.
	UpsertEvals([]*structs.Evaluation) (uint64, error)

	// UpsertJob is used to upsert a job.
	UpsertJob(job *structs.Job) (uint64, error)

	// UpdateDeploymentStatus is used to make a deployment status update
	// and potentially create an evaluation.
	UpdateDeploymentStatus(u *structs.DeploymentStatusUpdateRequest) (uint64, error)

	// UpdateDeploymentPromotion is used to promote canaries in a deployment.
	UpdateDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error)

	// UpdateDeploymentAllocHealth is used to set the health of allocations in a
	// deployment.
	UpdateDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error)
}

// Watcher is used to watch deployments and their allocations created
// by the scheduler and trigger the scheduler when allocation health
// transitions.
type Watcher struct {
	enabled bool
	logger  *log.Logger

	// queryLimiter is used to limit the rate of blocking queries
	queryLimiter *rate.Limiter

	// evalBatchDuration is the duration to batch eval creation across all
	// deployment watchers
	evalBatchDuration time.Duration

	// raft contains the set of Raft endpoints that can be used by the
	// deployments watcher
	raft DeploymentRaftEndpoints

	// state is the state that is watched for state changes.
	state *state.StateStore

	// watchers is the set of active watchers, one per deployment
	watchers map[string]*deploymentWatcher

	// evalBatcher is used to batch the creation of evaluations
	evalBatcher *EvalBatcher

	// ctx and exitFn are used to cancel the watcher
	ctx    context.Context
	exitFn context.CancelFunc

	l sync.RWMutex
}

// NewDeploymentsWatcher returns a deployments watcher that is used to watch
// deployments and trigger the scheduler as needed.
func NewDeploymentsWatcher(logger *log.Logger,
	raft DeploymentRaftEndpoints, stateQueriesPerSecond float64,
	evalBatchDuration time.Duration) *Watcher {

	return &Watcher{
		raft:              raft,
		queryLimiter:      rate.NewLimiter(rate.Limit(stateQueriesPerSecond), 100),
		evalBatchDuration: evalBatchDuration,
		logger:            logger,
	}
}

// SetEnabled is used to control if the watcher is enabled. The watcher
// should only be enabled on the active leader. When being enabled, the state
// store is passed in because the previous one is no longer valid once a leader
// election has taken place.
func (w *Watcher) SetEnabled(enabled bool, state *state.StateStore) {
	w.l.Lock()
	defer w.l.Unlock()

	wasEnabled := w.enabled
	w.enabled = enabled

	if state != nil {
		w.state = state
	}

	// Flush the state to create the necessary objects
	w.flush()

	// If we are starting now, launch the watch daemon
	if enabled && !wasEnabled {
		go w.watchDeployments(w.ctx)
	}
}

// flush is used to clear the state of the watcher
func (w *Watcher) flush() {
	// Stop all the watchers and clear it
	for _, watcher := range w.watchers {
		watcher.StopWatch()
	}

	// Kill everything associated with the watcher
	if w.exitFn != nil {
		w.exitFn()
	}

	w.watchers = make(map[string]*deploymentWatcher, 32)
	w.ctx, w.exitFn = context.WithCancel(context.Background())
	w.evalBatcher = NewEvalBatcher(w.evalBatchDuration, w.raft, w.ctx)
}

// watchDeployments is the long-lived goroutine that watches for deployments to
// add and remove watchers on.
func (w *Watcher) watchDeployments(ctx context.Context) {
	dindex := uint64(1)
	for {
		// Block getting all deployments using the last deployment index.
		deployments, idx, err := w.getDeploys(ctx, dindex)
		if err != nil {
			if err == context.Canceled {
				return
			}

			w.logger.Printf("[ERR] nomad.deployments_watcher: failed to retrieve deployments: %v", err)
		}

		// Update the latest index
		dindex = idx

		// Ensure we are tracking the things we should and not tracking what we
		// shouldn't be
		for _, d := range deployments {
			if d.Active() {
				if err := w.add(d); err != nil {
					w.logger.Printf("[ERR] nomad.deployments_watcher: failed to track deployment %q: %v", d.ID, err)
				}
			} else {
				w.remove(d)
			}
		}
	}
}

// getDeploys retrieves all deployments blocking at the given index.
func (w *Watcher) getDeploys(ctx context.Context, minIndex uint64) ([]*structs.Deployment, uint64, error) {
	resp, index, err := w.state.BlockingQuery(w.getDeploysImpl, minIndex, ctx)
	if err != nil {
		return nil, 0, err
	}

	return resp.([]*structs.Deployment), index, nil
}

// getDeploysImpl retrieves all deployments from the passed state store.
func (w *Watcher) getDeploysImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	iter, err := state.Deployments(ws)
	if err != nil {
		return nil, 0, err
	}

	var deploys []*structs.Deployment
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		deploy := raw.(*structs.Deployment)
		deploys = append(deploys, deploy)
	}

	// Use the last index that affected the deployment table
	index, err := state.Index("deployment")
	if err != nil {
		return nil, 0, err
	}

	return deploys, index, nil
}

// add adds a deployment to the watch list
func (w *Watcher) add(d *structs.Deployment) error {
	w.l.Lock()
	defer w.l.Unlock()
	_, err := w.addLocked(d)
	return err
}

// addLocked adds a deployment to the watch list and should only be called when
// locked.
func (w *Watcher) addLocked(d *structs.Deployment) (*deploymentWatcher, error) {
	// Not enabled so no-op
	if !w.enabled {
		return nil, nil
	}

	if !d.Active() {
		return nil, fmt.Errorf("deployment %q is terminal", d.ID)
	}

	// Already watched so no-op
	if _, ok := w.watchers[d.ID]; ok {
		return nil, nil
	}

	// Get the job the deployment is referencing
	snap, err := w.state.Snapshot()
	if err != nil {
		return nil, err
	}

	job, err := snap.JobByID(nil, d.Namespace, d.JobID)
	if err != nil {
		return nil, err
	}
	if job == nil {
		return nil, fmt.Errorf("deployment %q references unknown job %q", d.ID, d.JobID)
	}

	watcher := newDeploymentWatcher(w.ctx, w.queryLimiter, w.logger, w.state, d, job, w)
	w.watchers[d.ID] = watcher
	return watcher, nil
}

// remove stops watching a deployment. This can be because the deployment is
// complete or being deleted.
func (w *Watcher) remove(d *structs.Deployment) {
	w.l.Lock()
	defer w.l.Unlock()

	// Not enabled so no-op
	if !w.enabled {
		return
	}

	if watcher, ok := w.watchers[d.ID]; ok {
		watcher.StopWatch()
		delete(w.watchers, d.ID)
	}
}

// forceAdd is used to force a lookup of the given deployment object and create
// a watcher. If the deployment does not exist or is terminal an error is
// returned. It should only be called while holding the lock.
func (w *Watcher) forceAdd(dID string) (*deploymentWatcher, error) {
	snap, err := w.state.Snapshot()
	if err != nil {
		return nil, err
	}

	deployment, err := snap.DeploymentByID(nil, dID)
	if err != nil {
		return nil, err
	}

	if deployment == nil {
		return nil, fmt.Errorf("unknown deployment %q", dID)
	}

	return w.addLocked(deployment)
}

// getOrCreateWatcher returns the deployment watcher for the given deployment ID.
func (w *Watcher) getOrCreateWatcher(dID string) (*deploymentWatcher, error) {
	w.l.Lock()
	defer w.l.Unlock()

	// Not enabled so no-op
	if !w.enabled {
		return nil, notEnabled
	}

	watcher, ok := w.watchers[dID]
	if ok {
		return watcher, nil
	}

	return w.forceAdd(dID)
}

// SetAllocHealth is used to set the health of allocations for a deployment. If
// there are any unhealthy allocations, the deployment is updated to be failed.
// Otherwise the allocations are updated and an evaluation is created.
func (w *Watcher) SetAllocHealth(req *structs.DeploymentAllocHealthRequest, resp *structs.DeploymentUpdateResponse) error {
	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
	if err != nil {
		return err
	}

	return watcher.SetAllocHealth(req, resp)
}

// PromoteDeployment is used to promote a deployment. If promote is false, the
// deployment is marked as failed. Otherwise the deployment is updated and an
// evaluation is created.
func (w *Watcher) PromoteDeployment(req *structs.DeploymentPromoteRequest, resp *structs.DeploymentUpdateResponse) error {
	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
	if err != nil {
		return err
	}

	return watcher.PromoteDeployment(req, resp)
}

// PauseDeployment is used to toggle the pause state on a deployment. If the
// deployment is being unpaused, an evaluation is created.
func (w *Watcher) PauseDeployment(req *structs.DeploymentPauseRequest, resp *structs.DeploymentUpdateResponse) error {
	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
	if err != nil {
		return err
	}

	return watcher.PauseDeployment(req, resp)
}

// FailDeployment is used to fail the deployment.
func (w *Watcher) FailDeployment(req *structs.DeploymentFailRequest, resp *structs.DeploymentUpdateResponse) error {
	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
	if err != nil {
		return err
	}

	return watcher.FailDeployment(req, resp)
}

// createEvaluation commits the given evaluation to Raft but batches the commit
// with other calls.
func (w *Watcher) createEvaluation(eval *structs.Evaluation) (uint64, error) {
	return w.evalBatcher.CreateEval(eval).Results()
}

// upsertJob commits the given job to Raft.
func (w *Watcher) upsertJob(job *structs.Job) (uint64, error) {
	return w.raft.UpsertJob(job)
}

// upsertDeploymentStatusUpdate commits the given deployment update and optional
// evaluation to Raft.
func (w *Watcher) upsertDeploymentStatusUpdate(
	u *structs.DeploymentStatusUpdate,
	e *structs.Evaluation,
	j *structs.Job) (uint64, error) {
	return w.raft.UpdateDeploymentStatus(&structs.DeploymentStatusUpdateRequest{
		DeploymentUpdate: u,
		Eval:             e,
		Job:              j,
	})
}

// upsertDeploymentPromotion commits the given deployment promotion to Raft.
func (w *Watcher) upsertDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error) {
	return w.raft.UpdateDeploymentPromotion(req)
}

// upsertDeploymentAllocHealth commits the given allocation health changes to
// Raft.
func (w *Watcher) upsertDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error) {
	return w.raft.UpdateDeploymentAllocHealth(req)
}
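
// ---------------------------------------------------------------------------
// Illustrative usage sketch, not part of the original file. It shows how a
// caller might construct the Watcher and toggle it on leadership changes,
// using the exported constants above as the default tuning values. The names
// raftShim (a DeploymentRaftEndpoints implementation) and stateStore (a
// *state.StateStore) are hypothetical placeholders, not identifiers from this
// package.
//
//	logger := log.New(os.Stderr, "", log.LstdFlags)
//	w := NewDeploymentsWatcher(logger, raftShim,
//		LimitStateQueriesPerSecond, CrossDeploymentEvalBatchDuration)
//
//	// On gaining leadership: enable the watcher with a fresh state store,
//	// since the previous one is no longer valid across a leader election.
//	w.SetEnabled(true, stateStore)
//
//	// On losing leadership: disable the watcher. flush stops every
//	// per-deployment watcher and cancels the shared context.
//	w.SetEnabled(false, nil)
// ---------------------------------------------------------------------------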
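
// ---------------------------------------------------------------------------
// Minimal sketch of a DeploymentRaftEndpoints stub, not part of the original
// file, of the kind a test might use to exercise the watcher without a real
// Raft layer. The noopRaft type and the fixed index value 1 are assumptions
// made only for illustration.
//
//	type noopRaft struct{}
//
//	func (noopRaft) UpsertEvals([]*structs.Evaluation) (uint64, error) { return 1, nil }
//	func (noopRaft) UpsertJob(*structs.Job) (uint64, error)            { return 1, nil }
//	func (noopRaft) UpdateDeploymentStatus(*structs.DeploymentStatusUpdateRequest) (uint64, error) {
//		return 1, nil
//	}
//	func (noopRaft) UpdateDeploymentPromotion(*structs.ApplyDeploymentPromoteRequest) (uint64, error) {
//		return 1, nil
//	}
//	func (noopRaft) UpdateDeploymentAllocHealth(*structs.ApplyDeploymentAllocHealthRequest) (uint64, error) {
//		return 1, nil
//	}
//
//	// Compile-time check that the stub satisfies the interface.
//	var _ DeploymentRaftEndpoints = noopRaft{}
// ---------------------------------------------------------------------------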