github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/multiwatcher/worker.go (about) 1 // Copyright 2019 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package multiwatcher 5 6 import ( 7 "sync" 8 "time" 9 10 "github.com/juju/collections/deque" 11 "github.com/juju/errors" 12 "github.com/juju/worker/v3" 13 "github.com/prometheus/client_golang/prometheus" 14 "gopkg.in/tomb.v2" 15 16 "github.com/juju/juju/core/multiwatcher" 17 "github.com/juju/juju/state" 18 "github.com/juju/juju/state/watcher" 19 ) 20 21 // Config is an argument struct used to create a Worker. 22 type Config struct { 23 Clock Clock 24 Logger Logger 25 Backing state.AllWatcherBacking 26 PrometheusRegisterer prometheus.Registerer 27 Cleanup func() 28 } 29 30 // Validate validates the worker configuration. 31 func (config Config) Validate() error { 32 if config.Clock == nil { 33 return errors.NotValidf("missing Clock") 34 } 35 if config.Logger == nil { 36 return errors.NotValidf("missing Logger") 37 } 38 if config.Backing == nil { 39 return errors.NotValidf("missing Backing") 40 } 41 if config.PrometheusRegisterer == nil { 42 return errors.NotValidf("missing PrometheusRegisterer") 43 } 44 return nil 45 } 46 47 // Worker runs the primary goroutine for managing the multiwatchers. 48 type Worker struct { 49 config Config 50 51 tomb tomb.Tomb 52 metrics *Collector 53 54 // store holds information about all known entities. 55 store multiwatcher.Store 56 57 // request receives requests from Multiwatcher clients. 58 request chan *request 59 60 // Each entry in the waiting map holds a linked list of Next requests 61 // outstanding for the associated params. 62 waiting map[*Watcher]*request 63 64 mu sync.Mutex 65 watchers []*Watcher 66 restartCount int 67 // remember the last five errors that caused us to restart the internal loop 68 errors []error 69 70 // The worker should not block incoming events from the watcher on the 71 // processing of those events. Use a queue to store the events that are 72 // needed to be processed. 73 pending *deque.Deque 74 data chan struct{} 75 closed chan struct{} 76 } 77 78 // request holds a message from the Multiwatcher to the 79 // storeManager for some changes. The request will be 80 // replied to when some changes are available. 81 type request struct { 82 // w holds the Multiwatcher that has originated the request. 83 watcher *Watcher 84 85 // reply receives a message when deltas are ready. If reply is 86 // nil, the Multiwatcher will be stopped. If the reply is true, 87 // the request has been processed; if false, the Multiwatcher 88 // has been stopped, 89 reply chan bool 90 91 // noChanges receives a message when the manager checks for changes 92 // and there are none. 93 noChanges chan struct{} 94 95 // changes is populated as part of the reply and will hold changes that have 96 // occurred since the last replied-to Next request. 97 changes []multiwatcher.Delta 98 99 // next points to the next request in the list of outstanding 100 // requests on a given watcher. It is used only by the central 101 // storeManager goroutine. 102 next *request 103 } 104 105 type queueEntry struct { 106 change *watcher.Change 107 created time.Time 108 } 109 110 // NewWorkerShim is a method used for hooking up the specific NewWorker 111 // to the manifold NewWorker config arg. This allows other tests to use 112 // the NewWorker to get something that acts as a multiwatcher.Factory 113 // without having to cast the worker. 114 func NewWorkerShim(config Config) (worker.Worker, error) { 115 return NewWorker(config) 116 } 117 118 // NewWorker creates the worker and starts the loop goroutine. 119 func NewWorker(config Config) (*Worker, error) { 120 if err := config.Validate(); err != nil { 121 return nil, errors.Trace(err) 122 } 123 closed := make(chan struct{}) 124 close(closed) 125 w := &Worker{ 126 config: config, 127 // There always needs to be a valid request channel. 128 request: make(chan *request), 129 waiting: make(map[*Watcher]*request), 130 store: multiwatcher.NewStore(config.Logger), 131 pending: deque.New(), 132 data: make(chan struct{}, 1), 133 closed: closed, 134 } 135 w.metrics = NewMetricsCollector(w) 136 w.tomb.Go(w.loop) 137 return w, nil 138 } 139 140 const ( 141 // Keys used in the Report method are used to retrieve from 142 // the map in the metrics code, so define constants for the keys. 143 reportWatcherKey = "num-watchers" 144 reportStoreKey = "store-size" 145 reportRestartKey = "restart-count" 146 reportQueueSizeKey = "queue-size" 147 reportQueueAgeKey = "queue-age" 148 reportErrorsKey = "errors" 149 ) 150 151 // Report is shown up in the engine report of the agent. 152 func (w *Worker) Report() map[string]interface{} { 153 w.mu.Lock() 154 count := len(w.watchers) 155 store := w.store.Size() 156 restart := w.restartCount 157 var errs []string 158 for _, err := range w.errors { 159 errs = append(errs, err.Error()) 160 } 161 var queueAge float64 162 var queueSize int 163 if w.pending != nil { 164 queueSize = w.pending.Len() 165 if front, exists := w.pending.Front(); exists { 166 entry := front.(*queueEntry) 167 queueAge = w.config.Clock.Now().Sub(entry.created).Seconds() 168 } 169 } 170 w.mu.Unlock() 171 172 report := map[string]interface{}{ 173 reportWatcherKey: count, 174 reportStoreKey: store, 175 reportRestartKey: restart, 176 reportQueueSizeKey: queueSize, 177 reportQueueAgeKey: queueAge, 178 } 179 if len(errs) > 0 { 180 report[reportErrorsKey] = errs 181 } 182 return report 183 } 184 185 // WatchController returns entity delta events for all models in the controller. 186 func (w *Worker) WatchController() multiwatcher.Watcher { 187 return w.newWatcher(nil) 188 } 189 190 // WatchModel returns entity delta events just for the specified model. 191 func (w *Worker) WatchModel(modelUUID string) multiwatcher.Watcher { 192 return w.newWatcher( 193 func(in []multiwatcher.Delta) []multiwatcher.Delta { 194 // Returns an empty slice if there is nothing to match with the 195 // implementation of the Watcher for noChanges. Both could potentially 196 // be updated to return a nil slice. 197 result := make([]multiwatcher.Delta, 0, len(in)) 198 for _, delta := range in { 199 if delta.Entity.EntityID().ModelUUID == modelUUID { 200 result = append(result, delta) 201 } 202 } 203 return result 204 }) 205 } 206 207 func (w *Worker) newWatcher(filter func([]multiwatcher.Delta) []multiwatcher.Delta) *Watcher { 208 w.mu.Lock() 209 watch := &Watcher{ 210 request: w.request, 211 control: &w.tomb, 212 logger: w.config.Logger, 213 // Buffered err channel as if there is a fetch error on the all watcher backing 214 // the error is passed to the watcher. 215 err: make(chan error, 1), 216 filter: filter, 217 } 218 w.watchers = append(w.watchers, watch) 219 w.mu.Unlock() 220 return watch 221 } 222 223 func (w *Worker) loop() error { 224 w.config.Logger.Tracef("worker loop started") 225 defer w.config.Logger.Tracef("worker loop completed") 226 defer func() { 227 if w.config.Cleanup != nil { 228 w.config.Cleanup() 229 } 230 }() 231 232 _ = w.config.PrometheusRegisterer.Register(w.metrics) 233 defer w.config.PrometheusRegisterer.Unregister(w.metrics) 234 235 for { 236 err := w.inner() 237 select { 238 case <-w.tomb.Dying(): 239 return nil 240 default: 241 w.mu.Lock() 242 w.restartCount++ 243 w.errors = append(w.errors, err) 244 if len(w.errors) > 5 { 245 // Remembering the last five errors is somewhat of an arbitrary number, 246 // but we want more than just the last one, and we do want a cap. 247 // Since we only ever add one at a time, we know that removing just 248 // the first one will get us back to five. 249 w.errors = w.errors[1:] 250 } 251 w.store = multiwatcher.NewStore(w.config.Logger) 252 w.request = make(chan *request) 253 w.waiting = make(map[*Watcher]*request) 254 // Since the worker itself isn't dying, we need to manually stop all 255 // the watchers. 256 for _, watch := range w.watchers { 257 watch.err <- err 258 } 259 w.watchers = nil 260 w.mu.Unlock() 261 } 262 } 263 } 264 265 // We don't want to restart the worker just because the backing has raised an error. 266 // If it does, we record the error, and start again with a new store. 267 func (w *Worker) inner() error { 268 w.config.Logger.Tracef("worker inner started") 269 defer w.config.Logger.Tracef("worker inner completed") 270 271 // Create the wait group, and set up the defer before the watching 272 // the backing, as we want the backing unwatch to happen before the 273 // waitgroup wait call. This is to ensure we aren't blocking the 274 // backing event generator. 275 var wg sync.WaitGroup 276 wg.Add(1) 277 defer wg.Wait() 278 279 backing := w.config.Backing 280 in := make(chan watcher.Change) 281 backing.Watch(in) 282 defer backing.Unwatch(in) 283 284 processError := make(chan error) 285 286 go func() { 287 err := w.process(backing, w.tomb.Dying()) 288 select { 289 case <-w.tomb.Dying(): 290 case processError <- err: 291 } 292 wg.Done() 293 }() 294 295 for { 296 select { 297 case <-w.tomb.Dying(): 298 return errors.Trace(tomb.ErrDying) 299 case err := <-processError: 300 return errors.Trace(err) 301 case change := <-in: 302 w.append(&change) 303 } 304 } 305 } 306 307 func (w *Worker) append(change *watcher.Change) { 308 start := w.config.Clock.Now() 309 w.mu.Lock() 310 if inQueue := w.changePending(change); inQueue { 311 w.metrics.dupe.Inc() 312 } else { 313 element := &queueEntry{ 314 change: change, 315 created: start, 316 } 317 w.pending.PushBack(element) 318 if w.pending.Len() == 1 { 319 select { 320 // In all normal cases, we can push something onto sm.data 321 // as it is a buffered channel. And if the length is one, 322 // then data should be empty. However paranoia and all that. 323 case w.data <- struct{}{}: 324 default: 325 } 326 } 327 } 328 w.mu.Unlock() 329 finish := w.config.Clock.Now() 330 w.metrics.append.Observe(float64(finish.Sub(start).Milliseconds())) 331 } 332 333 func (w *Worker) changePending(change *watcher.Change) bool { 334 // Function assumes lock already held by append function. 335 336 // Look to see if there is already an entry in the pending queue 337 // for the same collection and id. 338 var entry *queueEntry 339 iter := w.pending.Iterator() 340 for iter.Next(&entry) { 341 if entry.change.C == change.C && entry.change.Id == change.Id { 342 return true 343 } 344 } 345 return false 346 } 347 348 func (w *Worker) process(backing state.AllWatcherBacking, done <-chan struct{}) error { 349 // We have no idea what changes the watcher might be trying to 350 // send us while getAll proceeds, but we don't mind, because 351 // backing.Changed is idempotent with respect to both updates 352 // and removals. 353 if err := backing.GetAll(w.store); err != nil { 354 return errors.Trace(err) 355 } 356 var next <-chan struct{} 357 358 for { 359 select { 360 case <-done: 361 return nil 362 case <-w.data: 363 // Has new data been pushed on? 364 w.config.Logger.Tracef("new data pushed on queue") 365 case <-next: 366 // If there was already data, next is a closed channel. 367 // Otherwise it is nil, so won't pass through. 368 w.config.Logger.Tracef("process data on queue") 369 case req := <-w.request: 370 // If we get a watcher request to handle while we are 371 // waiting for changes, handle it, and respond. 372 w.config.Logger.Tracef("handle request: %#v", req) 373 w.handle(req) 374 } 375 change, empty := w.popOne() 376 if empty { 377 next = nil 378 } else { 379 next = w.closed 380 } 381 if change != nil { 382 start := w.config.Clock.Now() 383 if err := backing.Changed(w.store, *change); err != nil { 384 return errors.Trace(err) 385 } 386 // We don't care about observing changes that error out. 387 // They shouldn't happen very often at all. 388 finish := w.config.Clock.Now() 389 w.metrics.process.Observe(float64(finish.Sub(start).Milliseconds())) 390 } 391 w.respond() 392 } 393 } 394 395 func (w *Worker) popOne() (*watcher.Change, bool) { 396 w.mu.Lock() 397 defer w.mu.Unlock() 398 val, ok := w.pending.PopFront() 399 if !ok { 400 // nothing to do 401 return nil, true 402 } 403 empty := w.pending.Len() == 0 404 entry := val.(*queueEntry) 405 return entry.change, empty 406 } 407 408 // Kill implements worker.Worker.Kill. 409 func (w *Worker) Kill() { 410 w.tomb.Kill(nil) 411 } 412 413 // Wait implements worker.Worker.Wait. 414 func (w *Worker) Wait() error { 415 return errors.Trace(w.tomb.Wait()) 416 } 417 418 // handle processes a request from a Multiwatcher. 419 func (w *Worker) handle(req *request) { 420 w.config.Logger.Tracef("start handle") 421 defer w.config.Logger.Tracef("finish handle") 422 if req.watcher.stopped { 423 w.config.Logger.Tracef("watcher %p is stopped", req.watcher) 424 // The watcher has previously been stopped. 425 if req.reply != nil { 426 select { 427 case req.reply <- false: 428 case <-w.tomb.Dying(): 429 } 430 } 431 return 432 } 433 if req.reply == nil { 434 w.config.Logger.Tracef("request to stop watcher %p", req.watcher) 435 // This is a request to stop the watcher. 436 for req := w.waiting[req.watcher]; req != nil; req = req.next { 437 select { 438 case req.reply <- false: 439 case <-w.tomb.Dying(): 440 } 441 } 442 delete(w.waiting, req.watcher) 443 req.watcher.stopped = true 444 w.store.DecReference(req.watcher.revno) 445 return 446 } 447 // Add request to head of list. 448 w.config.Logger.Tracef("add watcher %p request to waiting", req.watcher) 449 req.next = w.waiting[req.watcher] 450 w.waiting[req.watcher] = req 451 } 452 453 // respond responds to all outstanding requests that are satisfiable. 454 func (w *Worker) respond() { 455 w.config.Logger.Tracef("start respond") 456 defer w.config.Logger.Tracef("finish respond") 457 for watch, req := range w.waiting { 458 revno := watch.revno 459 changes, latestRevno := w.store.ChangesSince(revno) 460 w.config.Logger.Tracef("%d changes since %d for watcher %p", len(changes), revno, watch) 461 if len(changes) == 0 { 462 if req.noChanges != nil { 463 w.config.Logger.Tracef("sending down noChanges for watcher %p", watch) 464 select { 465 case req.noChanges <- struct{}{}: 466 case <-w.tomb.Dying(): 467 return 468 } 469 470 w.removeWaitingReq(watch, req) 471 } 472 continue 473 } 474 475 req.changes = changes 476 watch.revno = latestRevno 477 478 w.config.Logger.Tracef("sending changes down reply channel for watcher %p", watch) 479 select { 480 case req.reply <- true: 481 case <-w.tomb.Dying(): 482 return 483 } 484 485 w.removeWaitingReq(watch, req) 486 w.store.AddReference(revno) 487 } 488 } 489 490 func (w *Worker) removeWaitingReq(watcher *Watcher, req *request) { 491 if next := req.next; next == nil { 492 // Last request for this watcher. 493 delete(w.waiting, watcher) 494 } else { 495 w.waiting[watcher] = next 496 } 497 }