github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/multiwatcher/worker.go

     1  // Copyright 2019 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package multiwatcher
     5  
     6  import (
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/juju/collections/deque"
    11  	"github.com/juju/errors"
    12  	"github.com/juju/worker/v3"
    13  	"github.com/prometheus/client_golang/prometheus"
    14  	"gopkg.in/tomb.v2"
    15  
    16  	"github.com/juju/juju/core/multiwatcher"
    17  	"github.com/juju/juju/state"
    18  	"github.com/juju/juju/state/watcher"
    19  )
    20  
    21  // Config is an argument struct used to create a Worker.
    22  type Config struct {
    23  	Clock                Clock
    24  	Logger               Logger
    25  	Backing              state.AllWatcherBacking
    26  	PrometheusRegisterer prometheus.Registerer
    27  	Cleanup              func()
    28  }
    29  
    30  // Validate validates the worker configuration.
    31  func (config Config) Validate() error {
    32  	if config.Clock == nil {
    33  		return errors.NotValidf("missing Clock")
    34  	}
    35  	if config.Logger == nil {
    36  		return errors.NotValidf("missing Logger")
    37  	}
    38  	if config.Backing == nil {
    39  		return errors.NotValidf("missing Backing")
    40  	}
    41  	if config.PrometheusRegisterer == nil {
    42  		return errors.NotValidf("missing PrometheusRegisterer")
    43  	}
    44  	return nil
    45  }
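
// Illustrative sketch, not part of the original file: Validate reports the
// first missing dependency as a juju/errors NotValid error, so callers can
// check for it with errors.IsNotValid. For example, an empty Config fails on
// the Clock field first:
//
//	err := Config{}.Validate()
//	// err.Error() is roughly "missing Clock not valid"
//	// errors.IsNotValid(err) == true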
    46  
    47  // Worker runs the primary goroutine for managing the multiwatchers.
    48  type Worker struct {
    49  	config Config
    50  
    51  	tomb    tomb.Tomb
    52  	metrics *Collector
    53  
    54  	// store holds information about all known entities.
    55  	store multiwatcher.Store
    56  
    57  	// request receives requests from Multiwatcher clients.
    58  	request chan *request
    59  
    60  	// Each entry in the waiting map holds a linked list of Next requests
    61  	// outstanding for the associated watcher.
    62  	waiting map[*Watcher]*request
    63  
    64  	mu           sync.Mutex
    65  	watchers     []*Watcher
    66  	restartCount int
    67  	// remember the last five errors that caused us to restart the internal loop
    68  	errors []error
    69  
    70  	// The worker should not block incoming events from the watcher on the
    71  	// processing of those events. Use a queue to store the events that
    72  	// need to be processed.
    73  	pending *deque.Deque
    74  	data    chan struct{}
    75  	closed  chan struct{}
    76  }
    77  
    78  // request holds a message from the Multiwatcher to the
    79  // storeManager requesting changes. The request will be
    80  // replied to when changes are available.
    81  type request struct {
    82  	// watcher holds the Multiwatcher that originated the request.
    83  	watcher *Watcher
    84  
    85  	// reply receives a message when deltas are ready.  If reply is
    86  	// nil, the Multiwatcher will be stopped.  If the reply is true,
    87  	// the request has been processed; if false, the Multiwatcher
    88  	// has been stopped.
    89  	reply chan bool
    90  
    91  	// noChanges receives a message when the manager checks for changes
    92  	// and there are none.
    93  	noChanges chan struct{}
    94  
    95  	// changes is populated as part of the reply and will hold changes that have
    96  	// occurred since the last replied-to Next request.
    97  	changes []multiwatcher.Delta
    98  
    99  	// next points to the next request in the list of outstanding
   100  	// requests on a given watcher.  It is used only by the central
   101  	// storeManager goroutine.
   102  	next *request
   103  }
   104  
   105  type queueEntry struct {
   106  	change  *watcher.Change
   107  	created time.Time
   108  }
   109  
   110  // NewWorkerShim is a function used for hooking up the specific NewWorker
   111  // to the manifold's NewWorker config arg. This allows tests to use
   112  // NewWorker to get something that acts as a multiwatcher.Factory
   113  // without having to cast the worker.
   114  func NewWorkerShim(config Config) (worker.Worker, error) {
   115  	return NewWorker(config)
   116  }
   117  
   118  // NewWorker creates the worker and starts the loop goroutine.
   119  func NewWorker(config Config) (*Worker, error) {
   120  	if err := config.Validate(); err != nil {
   121  		return nil, errors.Trace(err)
   122  	}
   123  	closed := make(chan struct{})
   124  	close(closed)
   125  	w := &Worker{
   126  		config: config,
   127  		// There always needs to be a valid request channel.
   128  		request: make(chan *request),
   129  		waiting: make(map[*Watcher]*request),
   130  		store:   multiwatcher.NewStore(config.Logger),
   131  		pending: deque.New(),
   132  		data:    make(chan struct{}, 1),
   133  		closed:  closed,
   134  	}
   135  	w.metrics = NewMetricsCollector(w)
   136  	w.tomb.Go(w.loop)
   137  	return w, nil
   138  }
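
// Illustrative sketch, not part of the original file: constructing the worker
// directly, assuming clk, logger, backing and registerer are suitable
// implementations already in scope (the names are placeholders).
//
//	w, err := NewWorker(Config{
//		Clock:                clk,
//		Logger:               logger,
//		Backing:              backing, // a state.AllWatcherBacking
//		PrometheusRegisterer: registerer,
//		Cleanup:              func() {},
//	})
//	if err != nil {
//		return nil, errors.Trace(err)
//	}
//	// worker.Stop kills the worker and waits for the loop to finish.
//	defer func() { _ = worker.Stop(w) }()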
   139  
   140  const (
   141  	// The keys used in the Report method are also used to retrieve values
   142  	// from the map in the metrics code, so define constants for the keys.
   143  	reportWatcherKey   = "num-watchers"
   144  	reportStoreKey     = "store-size"
   145  	reportRestartKey   = "restart-count"
   146  	reportQueueSizeKey = "queue-size"
   147  	reportQueueAgeKey  = "queue-age"
   148  	reportErrorsKey    = "errors"
   149  )
   150  
   151  // Report returns information that is shown in the engine report of the agent.
   152  func (w *Worker) Report() map[string]interface{} {
   153  	w.mu.Lock()
   154  	count := len(w.watchers)
   155  	store := w.store.Size()
   156  	restart := w.restartCount
   157  	var errs []string
   158  	for _, err := range w.errors {
   159  		errs = append(errs, err.Error())
   160  	}
   161  	var queueAge float64
   162  	var queueSize int
   163  	if w.pending != nil {
   164  		queueSize = w.pending.Len()
   165  		if front, exists := w.pending.Front(); exists {
   166  			entry := front.(*queueEntry)
   167  			queueAge = w.config.Clock.Now().Sub(entry.created).Seconds()
   168  		}
   169  	}
   170  	w.mu.Unlock()
   171  
   172  	report := map[string]interface{}{
   173  		reportWatcherKey:   count,
   174  		reportStoreKey:     store,
   175  		reportRestartKey:   restart,
   176  		reportQueueSizeKey: queueSize,
   177  		reportQueueAgeKey:  queueAge,
   178  	}
   179  	if len(errs) > 0 {
   180  		report[reportErrorsKey] = errs
   181  	}
   182  	return report
   183  }
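
// Illustrative sketch, not part of the original file: the Report map is
// rendered into the agent's engine report, so an entry for this worker might
// look roughly like this (the values are made up):
//
//	num-watchers: 3
//	store-size: 1542
//	restart-count: 0
//	queue-size: 2
//	queue-age: 0.04
//
// The errors key is only included when the inner loop has restarted with an
// error.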
   184  
   185  // WatchController returns entity delta events for all models in the controller.
   186  func (w *Worker) WatchController() multiwatcher.Watcher {
   187  	return w.newWatcher(nil)
   188  }
   189  
   190  // WatchModel returns entity delta events just for the specified model.
   191  func (w *Worker) WatchModel(modelUUID string) multiwatcher.Watcher {
   192  	return w.newWatcher(
   193  		func(in []multiwatcher.Delta) []multiwatcher.Delta {
   194  			// Return an empty slice rather than nil if nothing matches, to be
   195  			// consistent with the Watcher's noChanges handling. Both could
   196  			// potentially be updated to return a nil slice.
   197  			result := make([]multiwatcher.Delta, 0, len(in))
   198  			for _, delta := range in {
   199  				if delta.Entity.EntityID().ModelUUID == modelUUID {
   200  					result = append(result, delta)
   201  				}
   202  			}
   203  			return result
   204  		})
   205  }
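
// Illustrative sketch, not part of the original file: consuming model-scoped
// deltas. This assumes the core multiwatcher.Watcher interface exposes
// blocking Next and Stop methods, and handleDelta is a hypothetical helper.
//
//	watcher := w.WatchModel(modelUUID)
//	defer func() { _ = watcher.Stop() }()
//	for {
//		deltas, err := watcher.Next()
//		if err != nil {
//			return errors.Trace(err)
//		}
//		for _, delta := range deltas {
//			// Every delta belongs to modelUUID thanks to the filter
//			// installed by WatchModel.
//			handleDelta(delta)
//		}
//	}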
   206  
   207  func (w *Worker) newWatcher(filter func([]multiwatcher.Delta) []multiwatcher.Delta) *Watcher {
   208  	w.mu.Lock()
   209  	watch := &Watcher{
   210  		request: w.request,
   211  		control: &w.tomb,
   212  		logger:  w.config.Logger,
   213  		// The err channel is buffered so that if there is a fetch error on the
   214  		// all watcher backing, the error can be passed to the watcher without blocking.
   215  		err:    make(chan error, 1),
   216  		filter: filter,
   217  	}
   218  	w.watchers = append(w.watchers, watch)
   219  	w.mu.Unlock()
   220  	return watch
   221  }
   222  
   223  func (w *Worker) loop() error {
   224  	w.config.Logger.Tracef("worker loop started")
   225  	defer w.config.Logger.Tracef("worker loop completed")
   226  	defer func() {
   227  		if w.config.Cleanup != nil {
   228  			w.config.Cleanup()
   229  		}
   230  	}()
   231  
   232  	_ = w.config.PrometheusRegisterer.Register(w.metrics)
   233  	defer w.config.PrometheusRegisterer.Unregister(w.metrics)
   234  
   235  	for {
   236  		err := w.inner()
   237  		select {
   238  		case <-w.tomb.Dying():
   239  			return nil
   240  		default:
   241  			w.mu.Lock()
   242  			w.restartCount++
   243  			w.errors = append(w.errors, err)
   244  			if len(w.errors) > 5 {
   245  				// Keeping the last five errors is a somewhat arbitrary choice,
   246  				// but we want more than just the last one, and we do want a cap.
   247  				// Since we only ever add one at a time, we know that removing just
   248  				// the first one will get us back to five.
   249  				w.errors = w.errors[1:]
   250  			}
   251  			w.store = multiwatcher.NewStore(w.config.Logger)
   252  			w.request = make(chan *request)
   253  			w.waiting = make(map[*Watcher]*request)
   254  			// Since the worker itself isn't dying, we need to manually stop all
   255  			// the watchers.
   256  			for _, watch := range w.watchers {
   257  				watch.err <- err
   258  			}
   259  			w.watchers = nil
   260  			w.mu.Unlock()
   261  		}
   262  	}
   263  }
   264  
   265  // We don't want to restart the worker just because the backing has raised an error.
   266  // If it does, we record the error, and start again with a new store.
   267  func (w *Worker) inner() error {
   268  	w.config.Logger.Tracef("worker inner started")
   269  	defer w.config.Logger.Tracef("worker inner completed")
   270  
   271  	// Create the wait group, and set up the defer before watching
   272  	// the backing, as we want the backing unwatch to happen before the
   273  	// waitgroup wait call. This is to ensure we aren't blocking the
   274  	// backing event generator.
   275  	var wg sync.WaitGroup
   276  	wg.Add(1)
   277  	defer wg.Wait()
   278  
   279  	backing := w.config.Backing
   280  	in := make(chan watcher.Change)
   281  	backing.Watch(in)
   282  	defer backing.Unwatch(in)
   283  
   284  	processError := make(chan error)
   285  
   286  	go func() {
   287  		err := w.process(backing, w.tomb.Dying())
   288  		select {
   289  		case <-w.tomb.Dying():
   290  		case processError <- err:
   291  		}
   292  		wg.Done()
   293  	}()
   294  
   295  	for {
   296  		select {
   297  		case <-w.tomb.Dying():
   298  			return errors.Trace(tomb.ErrDying)
   299  		case err := <-processError:
   300  			return errors.Trace(err)
   301  		case change := <-in:
   302  			w.append(&change)
   303  		}
   304  	}
   305  }
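
// Illustrative sketch, not part of the original file: the defer ordering in
// inner relies on deferred calls running last-in, first-out, so the
// backing.Unwatch defer (registered second) runs before wg.Wait (registered
// first), and the backing event generator stops being fed before we wait for
// the process goroutine. The ordering in isolation:
//
//	func deferOrdering() {
//		defer fmt.Println("second: wait for the goroutine") // registered first
//		defer fmt.Println("first: unwatch the backing")     // registered second
//	}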
   306  
   307  func (w *Worker) append(change *watcher.Change) {
   308  	start := w.config.Clock.Now()
   309  	w.mu.Lock()
   310  	if inQueue := w.changePending(change); inQueue {
   311  		w.metrics.dupe.Inc()
   312  	} else {
   313  		element := &queueEntry{
   314  			change:  change,
   315  			created: start,
   316  		}
   317  		w.pending.PushBack(element)
   318  		if w.pending.Len() == 1 {
   319  			select {
   320  			// In all normal cases, we can push something onto w.data
   321  			// as it is a buffered channel, and if the length is one,
   322  			// then data should be empty. However, paranoia and all that.
   323  			case w.data <- struct{}{}:
   324  			default:
   325  			}
   326  		}
   327  	}
   328  	w.mu.Unlock()
   329  	finish := w.config.Clock.Now()
   330  	w.metrics.append.Observe(float64(finish.Sub(start).Milliseconds()))
   331  }
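
// Illustrative sketch, not part of the original file: append wakes the
// processing goroutine with a non-blocking send on the one-slot data channel,
// so bursts of incoming changes coalesce into a single wakeup and never block
// the watcher delivering events. The pattern in isolation:
//
//	wake := make(chan struct{}, 1)
//	notify := func() {
//		select {
//		case wake <- struct{}{}: // first notification fills the buffer
//		default:                 // later ones coalesce and never block
//		}
//	}
//	notify()
//	notify() // coalesced: wake still holds a single token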
   332  
   333  func (w *Worker) changePending(change *watcher.Change) bool {
   334  	// Function assumes lock already held by append function.
   335  
   336  	// Look to see if there is already an entry in the pending queue
   337  	// for the same collection and id.
   338  	var entry *queueEntry
   339  	iter := w.pending.Iterator()
   340  	for iter.Next(&entry) {
   341  		if entry.change.C == change.C && entry.change.Id == change.Id {
   342  			return true
   343  		}
   344  	}
   345  	return false
   346  }
   347  
   348  func (w *Worker) process(backing state.AllWatcherBacking, done <-chan struct{}) error {
   349  	// We have no idea what changes the watcher might be trying to
   350  	// send us while GetAll proceeds, but we don't mind, because
   351  	// backing.Changed is idempotent with respect to both updates
   352  	// and removals.
   353  	if err := backing.GetAll(w.store); err != nil {
   354  		return errors.Trace(err)
   355  	}
   356  	var next <-chan struct{}
   357  
   358  	for {
   359  		select {
   360  		case <-done:
   361  			return nil
   362  		case <-w.data:
   363  			// Has new data been pushed on?
   364  			w.config.Logger.Tracef("new data pushed on queue")
   365  		case <-next:
   366  			// If there was already data, next is a closed channel.
   367  			// Otherwise it is nil, so won't pass through.
   368  			w.config.Logger.Tracef("process data on queue")
   369  		case req := <-w.request:
   370  			// If we get a watcher request to handle while we are
   371  			// waiting for changes, handle it, and respond.
   372  			w.config.Logger.Tracef("handle request: %#v", req)
   373  			w.handle(req)
   374  		}
   375  		change, empty := w.popOne()
   376  		if empty {
   377  			next = nil
   378  		} else {
   379  			next = w.closed
   380  		}
   381  		if change != nil {
   382  			start := w.config.Clock.Now()
   383  			if err := backing.Changed(w.store, *change); err != nil {
   384  				return errors.Trace(err)
   385  			}
   386  			// We don't care about observing changes that error out.
   387  			// They shouldn't happen very often at all.
   388  			finish := w.config.Clock.Now()
   389  			w.metrics.process.Observe(float64(finish.Sub(start).Milliseconds()))
   390  		}
   391  		w.respond()
   392  	}
   393  }
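
// Illustrative sketch, not part of the original file: process gates queue
// draining with the nil-versus-closed channel idiom. Receiving from a nil
// channel blocks forever, which disables that select case; receiving from an
// already closed channel succeeds immediately. In isolation:
//
//	var next <-chan struct{} // nil: the <-next case never fires
//	closed := make(chan struct{})
//	close(closed)
//	next = closed // now the <-next case is always ready
//	select {
//	case <-next:
//		// drain another queued change
//	}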
   394  
   395  func (w *Worker) popOne() (*watcher.Change, bool) {
   396  	w.mu.Lock()
   397  	defer w.mu.Unlock()
   398  	val, ok := w.pending.PopFront()
   399  	if !ok {
   400  		// nothing to do
   401  		return nil, true
   402  	}
   403  	empty := w.pending.Len() == 0
   404  	entry := val.(*queueEntry)
   405  	return entry.change, empty
   406  }
   407  
   408  // Kill implements worker.Worker.Kill.
   409  func (w *Worker) Kill() {
   410  	w.tomb.Kill(nil)
   411  }
   412  
   413  // Wait implements worker.Worker.Wait.
   414  func (w *Worker) Wait() error {
   415  	return errors.Trace(w.tomb.Wait())
   416  }
   417  
   418  // handle processes a request from a Multiwatcher.
   419  func (w *Worker) handle(req *request) {
   420  	w.config.Logger.Tracef("start handle")
   421  	defer w.config.Logger.Tracef("finish handle")
   422  	if req.watcher.stopped {
   423  		w.config.Logger.Tracef("watcher %p is stopped", req.watcher)
   424  		// The watcher has previously been stopped.
   425  		if req.reply != nil {
   426  			select {
   427  			case req.reply <- false:
   428  			case <-w.tomb.Dying():
   429  			}
   430  		}
   431  		return
   432  	}
   433  	if req.reply == nil {
   434  		w.config.Logger.Tracef("request to stop watcher %p", req.watcher)
   435  		// This is a request to stop the watcher.
   436  		for req := w.waiting[req.watcher]; req != nil; req = req.next {
   437  			select {
   438  			case req.reply <- false:
   439  			case <-w.tomb.Dying():
   440  			}
   441  		}
   442  		delete(w.waiting, req.watcher)
   443  		req.watcher.stopped = true
   444  		w.store.DecReference(req.watcher.revno)
   445  		return
   446  	}
   447  	// Add request to head of list.
   448  	w.config.Logger.Tracef("add watcher %p request to waiting", req.watcher)
   449  	req.next = w.waiting[req.watcher]
   450  	w.waiting[req.watcher] = req
   451  }
   452  
   453  // respond responds to all outstanding requests that are satisfiable.
   454  func (w *Worker) respond() {
   455  	w.config.Logger.Tracef("start respond")
   456  	defer w.config.Logger.Tracef("finish respond")
   457  	for watch, req := range w.waiting {
   458  		revno := watch.revno
   459  		changes, latestRevno := w.store.ChangesSince(revno)
   460  		w.config.Logger.Tracef("%d changes since %d for watcher %p", len(changes), revno, watch)
   461  		if len(changes) == 0 {
   462  			if req.noChanges != nil {
   463  				w.config.Logger.Tracef("sending down noChanges for watcher %p", watch)
   464  				select {
   465  				case req.noChanges <- struct{}{}:
   466  				case <-w.tomb.Dying():
   467  					return
   468  				}
   469  
   470  				w.removeWaitingReq(watch, req)
   471  			}
   472  			continue
   473  		}
   474  
   475  		req.changes = changes
   476  		watch.revno = latestRevno
   477  
   478  		w.config.Logger.Tracef("sending changes down reply channel for watcher %p", watch)
   479  		select {
   480  		case req.reply <- true:
   481  		case <-w.tomb.Dying():
   482  			return
   483  		}
   484  
   485  		w.removeWaitingReq(watch, req)
   486  		w.store.AddReference(revno)
   487  	}
   488  }
   489  
   490  func (w *Worker) removeWaitingReq(watcher *Watcher, req *request) {
   491  	if next := req.next; next == nil {
   492  		// Last request for this watcher.
   493  		delete(w.waiting, watcher)
   494  	} else {
   495  		w.waiting[watcher] = next
   496  	}
   497  }
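
// Illustrative sketch, not part of the original file: a rough end-to-end
// lifecycle, assuming cfg is a valid Config. The first Next call on a watcher
// typically returns the full current state as deltas; subsequent calls block
// until something changes.
//
//	w, err := NewWorker(cfg)
//	if err != nil {
//		return errors.Trace(err)
//	}
//	watcher := w.WatchController()
//	deltas, err := watcher.Next() // first call: the full current state
//	_ = deltas
//	_ = watcher.Stop()
//	w.Kill()
//	return errors.Trace(w.Wait())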