github.com/blixtra/nomad@v0.7.2-0.20171221000451-da9a1d7bb050/nomad/deploymentwatcher/deployments_watcher.go (about) 1 package deploymentwatcher 2 3 import ( 4 "context" 5 "fmt" 6 "log" 7 "sync" 8 "time" 9 10 "golang.org/x/time/rate" 11 12 memdb "github.com/hashicorp/go-memdb" 13 "github.com/hashicorp/nomad/nomad/state" 14 "github.com/hashicorp/nomad/nomad/structs" 15 ) 16 17 const ( 18 // LimitStateQueriesPerSecond is the number of state queries allowed per 19 // second 20 LimitStateQueriesPerSecond = 100.0 21 22 // CrossDeploymentEvalBatchDuration is the duration in which evaluations are 23 // batched across all deployment watchers before committing to Raft. 24 CrossDeploymentEvalBatchDuration = 250 * time.Millisecond 25 ) 26 27 var ( 28 // notEnabled is the error returned when the deployment watcher is not 29 // enabled 30 notEnabled = fmt.Errorf("deployment watcher not enabled") 31 ) 32 33 // DeploymentRaftEndpoints exposes the deployment watcher to a set of functions 34 // to apply data transforms via Raft. 35 type DeploymentRaftEndpoints interface { 36 // UpsertEvals is used to upsert a set of evaluations 37 UpsertEvals([]*structs.Evaluation) (uint64, error) 38 39 // UpsertJob is used to upsert a job 40 UpsertJob(job *structs.Job) (uint64, error) 41 42 // UpdateDeploymentStatus is used to make a deployment status update 43 // and potentially create an evaluation. 44 UpdateDeploymentStatus(u *structs.DeploymentStatusUpdateRequest) (uint64, error) 45 46 // UpdateDeploymentPromotion is used to promote canaries in a deployment 47 UpdateDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error) 48 49 // UpdateDeploymentAllocHealth is used to set the health of allocations in a 50 // deployment 51 UpdateDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error) 52 } 53 54 // Watcher is used to watch deployments and their allocations created 55 // by the scheduler and trigger the scheduler when allocation health 56 // transistions. 57 type Watcher struct { 58 enabled bool 59 logger *log.Logger 60 61 // queryLimiter is used to limit the rate of blocking queries 62 queryLimiter *rate.Limiter 63 64 // evalBatchDuration is the duration to batch eval creation across all 65 // deployment watchers 66 evalBatchDuration time.Duration 67 68 // raft contains the set of Raft endpoints that can be used by the 69 // deployments watcher 70 raft DeploymentRaftEndpoints 71 72 // state is the state that is watched for state changes. 73 state *state.StateStore 74 75 // watchers is the set of active watchers, one per deployment 76 watchers map[string]*deploymentWatcher 77 78 // evalBatcher is used to batch the creation of evaluations 79 evalBatcher *EvalBatcher 80 81 // ctx and exitFn are used to cancel the watcher 82 ctx context.Context 83 exitFn context.CancelFunc 84 85 l sync.RWMutex 86 } 87 88 // NewDeploymentsWatcher returns a deployments watcher that is used to watch 89 // deployments and trigger the scheduler as needed. 90 func NewDeploymentsWatcher(logger *log.Logger, 91 raft DeploymentRaftEndpoints, stateQueriesPerSecond float64, 92 evalBatchDuration time.Duration) *Watcher { 93 94 return &Watcher{ 95 raft: raft, 96 queryLimiter: rate.NewLimiter(rate.Limit(stateQueriesPerSecond), 100), 97 evalBatchDuration: evalBatchDuration, 98 logger: logger, 99 } 100 } 101 102 // SetEnabled is used to control if the watcher is enabled. The watcher 103 // should only be enabled on the active leader. When being enabled the state is 104 // passsed in as it is no longer valid once a leader election has taken place. 105 func (w *Watcher) SetEnabled(enabled bool, state *state.StateStore) error { 106 w.l.Lock() 107 defer w.l.Unlock() 108 109 wasEnabled := w.enabled 110 w.enabled = enabled 111 112 if state != nil { 113 w.state = state 114 } 115 116 // Flush the state to create the necessary objects 117 w.flush() 118 119 // If we are starting now, launch the watch daemon 120 if enabled && !wasEnabled { 121 go w.watchDeployments(w.ctx) 122 } 123 124 return nil 125 } 126 127 // flush is used to clear the state of the watcher 128 func (w *Watcher) flush() { 129 // Stop all the watchers and clear it 130 for _, watcher := range w.watchers { 131 watcher.StopWatch() 132 } 133 134 // Kill everything associated with the watcher 135 if w.exitFn != nil { 136 w.exitFn() 137 } 138 139 w.watchers = make(map[string]*deploymentWatcher, 32) 140 w.ctx, w.exitFn = context.WithCancel(context.Background()) 141 w.evalBatcher = NewEvalBatcher(w.evalBatchDuration, w.raft, w.ctx) 142 } 143 144 // watchDeployments is the long lived go-routine that watches for deployments to 145 // add and remove watchers on. 146 func (w *Watcher) watchDeployments(ctx context.Context) { 147 dindex := uint64(1) 148 for { 149 // Block getting all deployments using the last deployment index. 150 deployments, idx, err := w.getDeploys(ctx, dindex) 151 if err != nil { 152 if err == context.Canceled { 153 return 154 } 155 156 w.logger.Printf("[ERR] nomad.deployments_watcher: failed to retrieve deploylements: %v", err) 157 } 158 159 // Update the latest index 160 dindex = idx 161 162 // Ensure we are tracking the things we should and not tracking what we 163 // shouldn't be 164 for _, d := range deployments { 165 if d.Active() { 166 if err := w.add(d); err != nil { 167 w.logger.Printf("[ERR] nomad.deployments_watcher: failed to track deployment %q: %v", d.ID, err) 168 } 169 } else { 170 w.remove(d) 171 } 172 } 173 } 174 } 175 176 // getDeploys retrieves all deployments blocking at the given index. 177 func (w *Watcher) getDeploys(ctx context.Context, minIndex uint64) ([]*structs.Deployment, uint64, error) { 178 resp, index, err := w.state.BlockingQuery(w.getDeploysImpl, minIndex, ctx) 179 if err != nil { 180 return nil, 0, err 181 } 182 183 return resp.([]*structs.Deployment), index, nil 184 } 185 186 // getDeploysImpl retrieves all deployments from the passed state store. 187 func (w *Watcher) getDeploysImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { 188 189 iter, err := state.Deployments(ws) 190 if err != nil { 191 return nil, 0, err 192 } 193 194 var deploys []*structs.Deployment 195 for { 196 raw := iter.Next() 197 if raw == nil { 198 break 199 } 200 deploy := raw.(*structs.Deployment) 201 deploys = append(deploys, deploy) 202 } 203 204 // Use the last index that affected the deployment table 205 index, err := state.Index("deployment") 206 if err != nil { 207 return nil, 0, err 208 } 209 210 return deploys, index, nil 211 } 212 213 // add adds a deployment to the watch list 214 func (w *Watcher) add(d *structs.Deployment) error { 215 w.l.Lock() 216 defer w.l.Unlock() 217 _, err := w.addLocked(d) 218 return err 219 } 220 221 // addLocked adds a deployment to the watch list and should only be called when 222 // locked. 223 func (w *Watcher) addLocked(d *structs.Deployment) (*deploymentWatcher, error) { 224 // Not enabled so no-op 225 if !w.enabled { 226 return nil, nil 227 } 228 229 if !d.Active() { 230 return nil, fmt.Errorf("deployment %q is terminal", d.ID) 231 } 232 233 // Already watched so no-op 234 if _, ok := w.watchers[d.ID]; ok { 235 return nil, nil 236 } 237 238 // Get the job the deployment is referencing 239 snap, err := w.state.Snapshot() 240 if err != nil { 241 return nil, err 242 } 243 244 job, err := snap.JobByID(nil, d.Namespace, d.JobID) 245 if err != nil { 246 return nil, err 247 } 248 if job == nil { 249 return nil, fmt.Errorf("deployment %q references unknown job %q", d.ID, d.JobID) 250 } 251 252 watcher := newDeploymentWatcher(w.ctx, w.queryLimiter, w.logger, w.state, d, job, w) 253 w.watchers[d.ID] = watcher 254 return watcher, nil 255 } 256 257 // remove stops watching a deployment. This can be because the deployment is 258 // complete or being deleted. 259 func (w *Watcher) remove(d *structs.Deployment) { 260 w.l.Lock() 261 defer w.l.Unlock() 262 263 // Not enabled so no-op 264 if !w.enabled { 265 return 266 } 267 268 if watcher, ok := w.watchers[d.ID]; ok { 269 watcher.StopWatch() 270 delete(w.watchers, d.ID) 271 } 272 } 273 274 // forceAdd is used to force a lookup of the given deployment object and create 275 // a watcher. If the deployment does not exist or is terminal an error is 276 // returned. 277 func (w *Watcher) forceAdd(dID string) (*deploymentWatcher, error) { 278 snap, err := w.state.Snapshot() 279 if err != nil { 280 return nil, err 281 } 282 283 deployment, err := snap.DeploymentByID(nil, dID) 284 if err != nil { 285 return nil, err 286 } 287 288 if deployment == nil { 289 return nil, fmt.Errorf("unknown deployment %q", dID) 290 } 291 292 return w.addLocked(deployment) 293 } 294 295 // getOrCreateWatcher returns the deployment watcher for the given deployment ID. 296 func (w *Watcher) getOrCreateWatcher(dID string) (*deploymentWatcher, error) { 297 w.l.Lock() 298 defer w.l.Unlock() 299 300 // Not enabled so no-op 301 if !w.enabled { 302 return nil, notEnabled 303 } 304 305 watcher, ok := w.watchers[dID] 306 if ok { 307 return watcher, nil 308 } 309 310 return w.forceAdd(dID) 311 } 312 313 // SetAllocHealth is used to set the health of allocations for a deployment. If 314 // there are any unhealthy allocations, the deployment is updated to be failed. 315 // Otherwise the allocations are updated and an evaluation is created. 316 func (w *Watcher) SetAllocHealth(req *structs.DeploymentAllocHealthRequest, resp *structs.DeploymentUpdateResponse) error { 317 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 318 if err != nil { 319 return err 320 } 321 322 return watcher.SetAllocHealth(req, resp) 323 } 324 325 // PromoteDeployment is used to promote a deployment. If promote is false, 326 // deployment is marked as failed. Otherwise the deployment is updated and an 327 // evaluation is created. 328 func (w *Watcher) PromoteDeployment(req *structs.DeploymentPromoteRequest, resp *structs.DeploymentUpdateResponse) error { 329 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 330 if err != nil { 331 return err 332 } 333 334 return watcher.PromoteDeployment(req, resp) 335 } 336 337 // PauseDeployment is used to toggle the pause state on a deployment. If the 338 // deployment is being unpaused, an evaluation is created. 339 func (w *Watcher) PauseDeployment(req *structs.DeploymentPauseRequest, resp *structs.DeploymentUpdateResponse) error { 340 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 341 if err != nil { 342 return err 343 } 344 345 return watcher.PauseDeployment(req, resp) 346 } 347 348 // FailDeployment is used to fail the deployment. 349 func (w *Watcher) FailDeployment(req *structs.DeploymentFailRequest, resp *structs.DeploymentUpdateResponse) error { 350 watcher, err := w.getOrCreateWatcher(req.DeploymentID) 351 if err != nil { 352 return err 353 } 354 355 return watcher.FailDeployment(req, resp) 356 } 357 358 // createEvaluation commits the given evaluation to Raft but batches the commit 359 // with other calls. 360 func (w *Watcher) createEvaluation(eval *structs.Evaluation) (uint64, error) { 361 return w.evalBatcher.CreateEval(eval).Results() 362 } 363 364 // upsertJob commits the given job to Raft 365 func (w *Watcher) upsertJob(job *structs.Job) (uint64, error) { 366 return w.raft.UpsertJob(job) 367 } 368 369 // upsertDeploymentStatusUpdate commits the given deployment update and optional 370 // evaluation to Raft 371 func (w *Watcher) upsertDeploymentStatusUpdate( 372 u *structs.DeploymentStatusUpdate, 373 e *structs.Evaluation, 374 j *structs.Job) (uint64, error) { 375 return w.raft.UpdateDeploymentStatus(&structs.DeploymentStatusUpdateRequest{ 376 DeploymentUpdate: u, 377 Eval: e, 378 Job: j, 379 }) 380 } 381 382 // upsertDeploymentPromotion commits the given deployment promotion to Raft 383 func (w *Watcher) upsertDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error) { 384 return w.raft.UpdateDeploymentPromotion(req) 385 } 386 387 // upsertDeploymentAllocHealth commits the given allocation health changes to 388 // Raft 389 func (w *Watcher) upsertDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error) { 390 return w.raft.UpdateDeploymentAllocHealth(req) 391 }