github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/metrics/matcher/cache/cache.go

// Copyright (c) 2017 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package cache

import (
	"errors"
	"math/rand"
	"sync"
	"time"

	"github.com/m3db/m3/src/metrics/matcher/namespace"
	"github.com/m3db/m3/src/metrics/metric/id"
	"github.com/m3db/m3/src/metrics/rules"
	"github.com/m3db/m3/src/x/clock"

	"github.com/uber-go/tally"
)

const (
	numOngoingTasks          = 2
	deletionThrottleInterval = 100 * time.Millisecond
)

var (
	errCacheClosed = errors.New("cache is already closed")
)
// Cache caches the rule matching result associated with metrics.
type Cache interface {
	rules.Matcher

	// Register sets the source for a given namespace.
	Register(namespace []byte, source rules.Matcher)

	// Refresh clears the cached results for a given namespace, but only if
	// the given source matches the source currently registered for it.
	Refresh(namespace []byte, source rules.Matcher)

	// Unregister deletes the cached results for a given namespace.
	Unregister(namespace []byte)

	// Close closes the cache.
	Close() error
}
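
// A minimal usage sketch (illustrative only: NewOptions is assumed to be this
// package's Options constructor, and myMatcher, metricID, fromNanos and
// toNanos are hypothetical placeholders):
//
//	c := NewCache(NewOptions())
//	c.Register([]byte("ns"), myMatcher)
//	res, err := c.ForwardMatch(metricID, fromNanos, toNanos, rules.MatchOptions{})
//	// ... use res ...
//	c.Unregister([]byte("ns"))
//	err = c.Close() // stops the background eviction and deletion goroutines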

type setType int

const (
	// dontSetIfNotFound only looks up the cached result, without computing
	// and caching it on a miss.
	dontSetIfNotFound setType = iota
	// setIfNotFound computes and caches the result on a miss.
	setIfNotFound
)

type sleepFn func(time.Duration)

type elementPtr *element

// results holds the cached elements for a namespace along with the source
// used to compute match results for that namespace.
type results struct {
	elems  *elemMap
	source rules.Matcher
}

func newResults(source rules.Matcher) results {
	return results{
		elems:  newElemMap(elemMapOptions{}),
		source: source,
	}
}

type cacheMetrics struct {
	hits                tally.Counter
	misses              tally.Counter
	expires             tally.Counter
	registers           tally.Counter
	registerExists      tally.Counter
	updates             tally.Counter
	updateNotExists     tally.Counter
	updateStaleSource   tally.Counter
	unregisters         tally.Counter
	unregisterNotExists tally.Counter
	promotions          tally.Counter
	evictions           tally.Counter
	deletions           tally.Counter
}

func newCacheMetrics(scope tally.Scope) cacheMetrics {
	return cacheMetrics{
		hits:                scope.Counter("hits"),
		misses:              scope.Counter("misses"),
		expires:             scope.Counter("expires"),
		registers:           scope.Counter("registers"),
		registerExists:      scope.Counter("register-exists"),
		updates:             scope.Counter("updates"),
		updateNotExists:     scope.Counter("update-not-exists"),
		updateStaleSource:   scope.Counter("update-stale-source"),
		unregisters:         scope.Counter("unregisters"),
		unregisterNotExists: scope.Counter("unregister-not-exists"),
		promotions:          scope.Counter("promotions"),
		evictions:           scope.Counter("evictions"),
		deletions:           scope.Counter("deletions"),
	}
}

// cache is an LRU-based read-through cache.
type cache struct {
	sync.RWMutex

	capacity          int
	nowFn             clock.NowFn
	freshDuration     time.Duration
	stutterDuration   time.Duration
	evictionBatchSize int
	deletionBatchSize int
	invalidationMode  InvalidationMode
	sleepFn           sleepFn
	nsResolver        namespace.Resolver

	namespaces *namespaceResultsMap
	list       lockedList
	evictCh    chan struct{}
	deleteCh   chan struct{}
	toDelete   []*elemMap
	wgWorker   sync.WaitGroup
	closed     bool
	closedCh   chan struct{}
	metrics    cacheMetrics
}

// NewCache creates a new cache.
func NewCache(opts Options) Cache {
	clockOpts := opts.ClockOptions()
	instrumentOpts := opts.InstrumentOptions()
	c := &cache{
		capacity:          opts.Capacity(),
		nowFn:             clockOpts.NowFn(),
		freshDuration:     opts.FreshDuration(),
		stutterDuration:   opts.StutterDuration(),
		evictionBatchSize: opts.EvictionBatchSize(),
		deletionBatchSize: opts.DeletionBatchSize(),
		invalidationMode:  opts.InvalidationMode(),
		sleepFn:           time.Sleep,
		namespaces:        newNamespaceResultsMap(namespaceResultsMapOptions{}),
		evictCh:           make(chan struct{}, 1),
		deleteCh:          make(chan struct{}, 1),
		closedCh:          make(chan struct{}),
		metrics:           newCacheMetrics(instrumentOpts.MetricsScope()),
		nsResolver:        opts.NamespaceResolver(),
	}

	c.wgWorker.Add(numOngoingTasks)
	go c.evict()
	go c.delete()

	return c
}

func (c *cache) ForwardMatch(id id.ID, fromNanos, toNanos int64,
	opts rules.MatchOptions) (rules.MatchResult, error) {
	namespace := c.nsResolver.Resolve(id)
	// Fast path: look up the cached result under the shared read lock.
	c.RLock()
	res, found, err := c.tryGetWithLock(namespace, id, fromNanos, toNanos, dontSetIfNotFound, opts)
	c.RUnlock()
	if err != nil {
		return rules.MatchResult{}, err
	}
	if found {
		return res, nil
	}

	// Slow path: take the exclusive lock to compute and cache the result.
	c.Lock()
	res, _, err = c.tryGetWithLock(namespace, id, fromNanos, toNanos, setIfNotFound, opts)
	c.Unlock()
	if err != nil {
		return rules.MatchResult{}, err
	}
	return res, nil
}
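
// lookupDoubleChecked is a self-contained sketch of the optimistic
// double-checked locking pattern ForwardMatch uses above: try the cheap
// shared read lock first, and only take the exclusive lock, re-checking
// before computing, on a miss. It is illustrative only and is not called
// anywhere in this package.
func lookupDoubleChecked(mu *sync.RWMutex, m map[string]int, key string, compute func() int) int {
	mu.RLock()
	v, ok := m[key]
	mu.RUnlock()
	if ok {
		return v
	}
	mu.Lock()
	defer mu.Unlock()
	// Re-check: another goroutine may have stored the value while we
	// were between the two locks.
	if v, ok = m[key]; !ok {
		v = compute()
		m[key] = v
	}
	return v
}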

func (c *cache) Register(namespace []byte, source rules.Matcher) {
	c.Lock()
	defer c.Unlock()

	if results, exist := c.namespaces.Get(namespace); !exist {
		c.namespaces.Set(namespace, newResults(source))
		c.metrics.registers.Inc(1)
	} else {
		c.refreshWithLock(namespace, source, results)
		c.metrics.registerExists.Inc(1)
	}
}

func (c *cache) Refresh(namespace []byte, source rules.Matcher) {
	c.Lock()
	defer c.Unlock()

	results, exist := c.namespaces.Get(namespace)
	// NB: The namespace does not exist yet. This could happen if the source update
	// arrives before its namespace is registered. It is safe to ignore this premature
	// update because the namespace will eventually register itself and refresh the cache.
	if !exist {
		c.metrics.updateNotExists.Inc(1)
		return
	}
	// NB: The source to update is different from what's stored in the cache. This could
	// happen if the namespace is changed, removed, and then revived before the rule change
	// could be processed. It is safe to ignore this stale update because the last rule
	// change update will eventually be processed and the cache will be refreshed.
	if results.source != source {
		c.metrics.updateStaleSource.Inc(1)
		return
	}
	c.refreshWithLock(namespace, source, results)
	c.metrics.updates.Inc(1)
}

func (c *cache) Unregister(namespace []byte) {
	c.Lock()
	defer c.Unlock()

	results, exists := c.namespaces.Get(namespace)
	if !exists {
		c.metrics.unregisterNotExists.Inc(1)
		return
	}
	c.namespaces.Delete(namespace)
	c.toDelete = append(c.toDelete, results.elems)
	c.notifyDeletion()
	c.metrics.unregisters.Inc(1)
}

func (c *cache) Close() error {
	c.Lock()
	if c.closed {
		c.Unlock()
		return errCacheClosed
	}
	c.closed = true
	c.Unlock()

	close(c.closedCh)
	c.wgWorker.Wait()
	return nil
}

// tryGetWithLock attempts to get the match result, returning true if a match
// result is successfully determined and no further processing is required,
// and false otherwise.
func (c *cache) tryGetWithLock(
	namespace []byte,
	id id.ID,
	fromNanos, toNanos int64,
	setType setType,
	matchOpts rules.MatchOptions,
) (rules.MatchResult, bool, error) {
	res := rules.EmptyMatchResult
	results, exists := c.namespaces.Get(namespace)
	if !exists {
		c.metrics.hits.Inc(1)
		return res, true, nil
	}
	entry, exists := results.elems.Get(id.Bytes())
	if exists {
		elem := (*element)(entry)
		res = elem.result
		// NB(xichen): the cached match result expires when a new rule takes effect.
		// Therefore we need to check if the cached result is valid up to the end
		// of the match time range, a.k.a. toNanos.
		if !res.HasExpired(toNanos) {
			// NB(xichen): in order to avoid the overhead of acquiring an exclusive
			// lock to perform a promotion that moves the element to the front of the
			// list, we set an expiry time for each promotion and do not perform
			// another promotion if the previous one is still fresh. This should be
			// good enough because if the cache is sufficiently large, the frequently
			// accessed items should still be near the front of the list. Additionally,
			// we can still achieve exact LRU semantics by setting the fresh duration
			// and stutter duration to 0.
			now := c.nowFn()
			if elem.ShouldPromote(now) {
				c.promote(now, elem)
			}
			c.metrics.hits.Inc(1)
			return res, true, nil
		}
		c.metrics.expires.Inc(1)
	}
	if setType == dontSetIfNotFound {
		return res, false, nil
	}
	// NB(xichen): the result is either not cached, or cached but invalid; in both
	// cases we should use the source to compute the result and set it in the cache.
	res, err := c.setWithLock(namespace, id, fromNanos, toNanos, results, exists, matchOpts)
	if err != nil {
		return rules.MatchResult{}, false, err
	}
	return res, true, nil
}

func (c *cache) setWithLock(
	namespace []byte,
	id id.ID,
	fromNanos, toNanos int64,
	results results,
	invalidate bool,
	matchOpts rules.MatchOptions,
) (rules.MatchResult, error) {
	// NB(xichen): if a cached result is invalid, it's very likely that we've reached
	// a new cutover time and the old cached results are now invalid, therefore it's
	// preferable to invalidate everything to save the overhead of multiple invalidations.
	if invalidate {
		results = c.invalidateWithLock(namespace, id.Bytes(), results)
	}
	res, err := results.source.ForwardMatch(id, fromNanos, toNanos, matchOpts)
	if err != nil {
		return rules.MatchResult{}, err
	}
	newElem := newElement(namespace, id.Bytes(), res)
	newElem.SetPromotionExpiry(c.newPromotionExpiry(c.nowFn()))
	results.elems.Set(id.Bytes(), newElem)
	// NB(xichen): we don't evict until the number of cached items goes
	// above the capacity by at least the eviction batch size to amortize
	// the eviction overhead.
	if newSize := c.add(newElem); newSize > c.capacity+c.evictionBatchSize {
		c.notifyEviction()
	}
	c.metrics.misses.Inc(1)
	return res, nil
}
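
// To make the amortization above concrete (illustrative numbers, not
// defaults): with capacity = 1000 and evictionBatchSize = 100, the eviction
// goroutine is only signaled once the list grows beyond 1100 entries, at
// which point doEvict trims the list back down to 1000 in a single pass
// instead of evicting one element on every insertion.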

// refreshWithLock clears the existing cached results for the given namespace
// and associates the namespace results with a new source.
func (c *cache) refreshWithLock(namespace []byte, source rules.Matcher, results results) {
	c.toDelete = append(c.toDelete, results.elems)
	c.notifyDeletion()
	results.source = source
	results.elems = newElemMap(elemMapOptions{})
	c.namespaces.Set(namespace, results)
}

func (c *cache) add(elem *element) int {
	c.list.Lock()
	c.list.PushFront(elem)
	size := c.list.Len()
	c.list.Unlock()
	return size
}

func (c *cache) promote(now time.Time, elem *element) {
	c.list.Lock()
	// Bail if someone else got ahead of us and promoted this element.
	if !elem.ShouldPromote(now) {
		c.list.Unlock()
		return
	}
	// Otherwise proceed with promotion.
	elem.SetPromotionExpiry(c.newPromotionExpiry(now))
	c.list.MoveToFront(elem)
	c.list.Unlock()
	c.metrics.promotions.Inc(1)
}

func (c *cache) invalidateWithLock(namespace, id []byte, results results) results {
	if c.invalidationMode == InvalidateAll {
		c.toDelete = append(c.toDelete, results.elems)
		c.notifyDeletion()
		results.elems = newElemMap(elemMapOptions{})
		c.namespaces.Set(namespace, results)
	} else {
		// Guaranteed to be in the map when invalidateWithLock is called.
		elem, _ := results.elems.Get(id)
		results.elems.Delete(id)
		c.list.Lock()
		c.list.Remove(elem)
		c.list.Unlock()
	}
	return results
}
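
// The two invalidation modes trade immediate cost against cache warmth:
// InvalidateAll swaps in a fresh element map and hands the old one to the
// background deletion goroutine, which is cheap now but leaves the namespace
// cold, whereas the targeted mode (the else branch above) removes just the
// single stale element from both the map and the LRU list.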

func (c *cache) evict() {
	defer c.wgWorker.Done()

	for {
		select {
		case <-c.evictCh:
			c.doEvict()
		case <-c.closedCh:
			return
		}
	}
}

func (c *cache) doEvict() {
	c.Lock()
	c.list.Lock()
	numEvicted := 0
	for c.list.Len() > c.capacity {
		elem := c.list.Back()
		c.list.Remove(elem)
		numEvicted++
		// NB(xichen): the namespace owning this element may have been deleted,
		// in which case we simply continue. This is okay because the element is
		// marked as deleted, so when the deletion goroutine sees it and tries to
		// delete it again, the operation is a no-op, at which point the element
		// is removed from the owning map as well.
		results, exists := c.namespaces.Get(elem.namespace)
		if !exists {
			continue
		}
		results.elems.Delete(elem.id)
	}
	c.list.Unlock()
	c.Unlock()
	c.metrics.evictions.Inc(int64(numEvicted))
}

func (c *cache) delete() {
	defer c.wgWorker.Done()

	for {
		select {
		case <-c.deleteCh:
			c.doDelete()
		case <-c.closedCh:
			return
		}
	}
}

func (c *cache) doDelete() {
	c.Lock()
	if len(c.toDelete) == 0 {
		c.Unlock()
		return
	}

	// NB(xichen): add pooling if deletion happens frequently enough.
	toDelete := c.toDelete
	c.toDelete = nil
	c.Unlock()

	allDeleted := 0
	deleted := 0
	c.list.Lock()
	for _, elems := range toDelete {
		for _, entry := range elems.Iter() {
			elem := entry.Value()
			c.list.Remove(elem)
			allDeleted++
			deleted++
			// If we have deleted enough elements, release the lock and give
			// other goroutines a chance to acquire it, since deletion does
			// not need to be fast.
			if deleted >= c.deletionBatchSize {
				c.list.Unlock()
				c.sleepFn(deletionThrottleInterval)
				deleted = 0
				c.list.Lock()
			}
		}
	}
	c.list.Unlock()
	c.metrics.deletions.Inc(int64(allDeleted))
}
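
// Worked example of the throttling above (illustrative numbers): with
// deletionBatchSize = 1024, removing 10,000 stale elements releases the list
// lock and sleeps for deletionThrottleInterval (100ms) nine times, once per
// full batch. Deletion takes roughly a second longer, but readers contending
// for the list lock are never blocked for more than one batch's worth of work.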

// notifyEviction signals the eviction goroutine without blocking.
func (c *cache) notifyEviction() {
	select {
	case c.evictCh <- struct{}{}:
	default:
	}
}

// notifyDeletion signals the deletion goroutine without blocking.
func (c *cache) notifyDeletion() {
	select {
	case c.deleteCh <- struct{}{}:
	default:
	}
}
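
// notifyCoalesced is a standalone sketch of the idiom used by both notify
// methods above (illustrative only; not called anywhere in this package):
// a non-blocking send onto a channel buffered to capacity 1, as evictCh and
// deleteCh are in NewCache. Concurrent signals collapse into at most one
// pending wakeup, and the worker drains all accumulated work when it runs.
func notifyCoalesced(ch chan struct{}) {
	select {
	case ch <- struct{}{}: // worker will wake up
	default: // a wakeup is already pending; coalesce
	}
}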

func (c *cache) newPromotionExpiry(now time.Time) time.Time {
	expiry := now.Add(c.freshDuration)
	if c.stutterDuration > 0 {
		expiry = expiry.Add(time.Duration(rand.Int63n(int64(c.stutterDuration))))
	}
	return expiry
}
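
// For example (illustrative durations): with freshDuration = 5m and
// stutterDuration = 1m, newPromotionExpiry returns a time uniformly
// distributed in [now+5m, now+6m), since rand.Int63n(n) yields [0, n).
// The random stutter spreads promotion expiries out so that elements cached
// together do not all demand the exclusive list lock at the same instant;
// setting both durations to 0 promotes on every hit, recovering exact LRU.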