github.com/outbrain/consul@v1.4.5/agent/cache/cache.go

// Package cache provides caching features for data from a Consul server.
//
// While this is similar in some ways to the "agent/ae" package, a key
// difference is that with anti-entropy, the agent is the authoritative
// source so it resolves differences the server may have. With caching (this
// package), the server is the authoritative source and we do our best to
// balance performance and correctness, depending on the type of data being
// requested.
//
// The types of data that can be cached are configurable via the Type
// interface. This allows specialized behavior for certain types of data.
// Each type of Consul data (CA roots, leaf certs, intentions, KV, catalog,
// etc.) has to be implemented manually. This usually is not much work; see
// the "agent/cache-types" package.
package cache

import (
	"container/heap"
	"fmt"
	"sync"
	"sync/atomic"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/consul/lib"
)

//go:generate mockery -all -inpkg

// Constants related to refresh backoff. We probably don't ever need to
// make these configurable knobs since they primarily exist to lower load.
const (
	CacheRefreshBackoffMin = 3               // 3 attempts before backing off
	CacheRefreshMaxWait    = 1 * time.Minute // maximum backoff wait time
)

// Cache is an agent-local cache of Consul data. Create a Cache using the
// New function. A zero-value Cache is not ready for use and will result
// in a panic.
//
// The types of data to be cached must be registered via RegisterType. Then,
// calls to Get specify the type and a Request implementation. The
// implementation of Request is usually done directly on the standard RPC
// struct in agent/structs. This API makes cache usage a mostly drop-in
// replacement for non-cached RPC calls.
//
// The cache is partitioned by ACL and datacenter. This allows the cache
// to be safe for multi-DC queries and for queries where the data is modified
// due to ACLs, all without the cache needing any clever logic, at the
// slight expense of a less perfect cache.
//
// The Cache exposes various metrics via go-metrics. Please view the source,
// searching for "metrics.", to see the various metrics exposed. These can be
// used to explore the performance of the cache.
type Cache struct {
	// types stores the list of data types that the cache knows how to service.
	// These can be dynamically registered with RegisterType.
	typesLock sync.RWMutex
	types     map[string]typeEntry

	// entries contains the actual cache data. Access to entries and
	// entriesExpiryHeap must be protected by entriesLock.
	//
	// entriesExpiryHeap is a heap of *cacheEntry values ordered by
	// expiry, with the soonest to expire being first in the list (index 0).
	//
	// NOTE(mitchellh): The entry map key is currently a string in the format
	// of "<Type>/<DC>/<ACL token>/<Request key>" (see entryKey) in order to
	// properly partition requests to different datacenters and ACL tokens.
	// This format has some big drawbacks: we can't evict by datacenter, ACL
	// token, etc. For an initial implementation this works, and the tests are
	// agnostic to the internal storage format, so changing this should be
	// possible safely.
	entriesLock       sync.RWMutex
	entries           map[string]cacheEntry
	entriesExpiryHeap *expiryHeap

	// stopped is used as an atomic flag to signal that the Cache has been
	// discarded so background fetches and expiry processing should stop.
	stopped uint32
	// stopCh is closed when Close is called
	stopCh chan struct{}
}

// typeEntry is a single type that is registered with a Cache.
type typeEntry struct {
	Type Type
	Opts *RegisterOptions
}

// ResultMeta is returned from Get calls along with the value and can be used
// to expose information about the cache status for debugging or testing.
type ResultMeta struct {
	// Hit indicates whether or not the request was a cache hit.
	Hit bool

	// Age identifies how "stale" the result is. Its semantics differ based on
	// whether or not the cache type performs background refresh, as defined
	// in https://www.consul.io/api/index.html#agent-caching.
	//
	// For background refresh types, Age is 0 unless the background blocking
	// query is currently in a failed state and so not keeping up with the
	// server's values. If it is non-zero it represents the time since the
	// first failure to connect during background refresh, and is reset after
	// a background request does manage to reconnect and either return
	// successfully, or block for at least the yamux keepalive timeout of 30
	// seconds (which indicates the connection is OK but blocked as expected).
	//
	// For simple cache types, Age is the time since the result being returned
	// was fetched from the servers.
	Age time.Duration

	// Index is the internal ModifyIndex for the cache entry. Not all types
	// support blocking, and those that do will likely have this in their
	// result type already, but this allows generic code to reason about
	// whether cache values have changed.
	Index uint64
}
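
// A usage sketch for interpreting ResultMeta, assuming a hypothetical
// registered type named "catalog-services" and request value req:
//
//	val, meta, err := c.Get("catalog-services", req)
//	if err == nil && meta.Hit {
//		// Served from cache; Age indicates how stale the value may be.
//		if meta.Age > 5*time.Minute {
//			// e.g. log, or fall back to a direct query
//		}
//	}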

// Options are options for the Cache.
type Options struct {
	// Nothing currently, reserved.
}

// New creates a new cache with the given options and reasonable defaults.
// Further settings can be tweaked on the returned value.
func New(*Options) *Cache {
	// Initialize the heap. The buffer of 1 is really important because
	// it's possible for the expiry loop to trigger the heap to update
	// itself and it'd block forever otherwise.
	h := &expiryHeap{NotifyCh: make(chan struct{}, 1)}
	heap.Init(h)

	c := &Cache{
		types:             make(map[string]typeEntry),
		entries:           make(map[string]cacheEntry),
		entriesExpiryHeap: h,
		stopCh:            make(chan struct{}),
	}

	// Start the expiry watcher
	go c.runExpiryLoop()

	return c
}

// RegisterOptions are options that can be associated with a type being
// registered for the cache. This changes the behavior of the cache for
// this type.
type RegisterOptions struct {
	// LastGetTTL is the time that the values returned by this type remain
	// in the cache after the last get operation. If a value isn't accessed
	// within this duration, the value is purged from the cache and
	// background refreshing will cease.
	LastGetTTL time.Duration

	// Refresh configures whether the data is actively refreshed or if
	// the data is only refreshed on an explicit Get. The default (false)
	// is to only request data on explicit Get.
	Refresh bool

	// RefreshTimer is the time between attempts to refresh data.
	// If this is zero, then data is refreshed immediately when a fetch
	// returns.
	//
	// RefreshTimeout determines the maximum query time for a refresh
	// operation. This is specified as part of the query options and is
	// expected to be implemented by the Type itself.
	//
	// Using these values, various "refresh" mechanisms can be implemented
	// (see the sketch following this struct):
	//
	//   * With a high timer duration and a low timeout, a timer-based
	//     refresh can be set that minimizes load on the Consul servers.
	//
	//   * With a low timer and high timeout duration, a blocking-query-based
	//     refresh can be set so that changes in server data are recognized
	//     within the cache very quickly.
	//
	RefreshTimer   time.Duration
	RefreshTimeout time.Duration
}
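
// A sketch of the two refresh configurations described above, with
// illustrative durations (these values are assumptions, not defaults):
//
//	// Timer-based refresh: low server load, data may lag by up to the timer.
//	&RegisterOptions{
//		Refresh:        true,
//		RefreshTimer:   5 * time.Minute,
//		RefreshTimeout: 10 * time.Second,
//	}
//
//	// Blocking-query-based refresh: server-side changes show up quickly.
//	&RegisterOptions{
//		Refresh:        true,
//		RefreshTimer:   0,
//		RefreshTimeout: 10 * time.Minute,
//	}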

// RegisterType registers a cacheable type.
//
// This makes the type available for Get but does not automatically perform
// any prefetching. In order to populate the cache, Get must be called.
func (c *Cache) RegisterType(n string, typ Type, opts *RegisterOptions) {
	if opts == nil {
		opts = &RegisterOptions{}
	}
	if opts.LastGetTTL == 0 {
		opts.LastGetTTL = 72 * time.Hour // reasonable default is days
	}

	c.typesLock.Lock()
	defer c.typesLock.Unlock()
	c.types[n] = typeEntry{Type: typ, Opts: opts}
}
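
// A minimal wiring sketch, assuming a hypothetical Type implementation held
// in myType and the illustrative type name "catalog-services":
//
//	c := New(nil)
//	defer c.Close()
//	c.RegisterType("catalog-services", myType, &RegisterOptions{
//		Refresh:        true,
//		RefreshTimeout: 10 * time.Minute,
//	})
//	// Populate and read through the cache; req is a Request whose
//	// CacheInfo carries the datacenter, token, and key.
//	val, meta, err := c.Get("catalog-services", req)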

// Get loads the data for the given type and request. If data satisfying the
// minimum index is present in the cache, it is returned immediately. Otherwise,
// this will block until the data is available or the request timeout is
// reached.
//
// Multiple Get calls for the same Request (matching CacheKey value) will
// block on a single network request.
//
// The timeout specified by the Request will be the timeout on the cache
// Get, and does not correspond to the timeout of any background data
// fetching. If the timeout is reached before data satisfying the minimum
// index is retrieved, the last known value (which may be nil) is returned.
// No error is returned on timeout. This matches the behavior of Consul
// blocking queries.
func (c *Cache) Get(t string, r Request) (interface{}, ResultMeta, error) {
	return c.getWithIndex(t, r, r.CacheInfo().MinIndex)
}
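
// Because timeouts return the last known value without error, a caller can
// implement a watch as a simple loop. A sketch, assuming a hypothetical
// request type whose CacheInfo reports its MinQueryIndex field as MinIndex:
//
//	var index uint64
//	for {
//		req.MinQueryIndex = index
//		val, meta, err := c.Get("catalog-services", req)
//		if err != nil {
//			continue // back off and retry in real code
//		}
//		index = meta.Index // next Get blocks until data newer than this
//		process(val)
//	}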

// getWithIndex implements the main Get functionality but allows internal
// callers (Watch) to manipulate the blocking index separately from the actual
// request object.
func (c *Cache) getWithIndex(t string, r Request, minIndex uint64) (interface{}, ResultMeta, error) {
	info := r.CacheInfo()
	if info.Key == "" {
		metrics.IncrCounter([]string{"consul", "cache", "bypass"}, 1)

		// If no key is specified, then we do not cache this request.
		// Pass directly through to the backend.
		return c.fetchDirect(t, r, minIndex)
	}

	// Get the actual key for our entry
	key := c.entryKey(t, &info)

	// First time through
	first := true

	// timeoutCh for watching our timeout
	var timeoutCh <-chan time.Time

RETRY_GET:
	// Get the type that we're fetching
	c.typesLock.RLock()
	tEntry, ok := c.types[t]
	c.typesLock.RUnlock()
	if !ok {
		// Shouldn't happen given that we successfully fetched this at least
		// once. But be robust against panics.
		return nil, ResultMeta{}, fmt.Errorf("unknown type in cache: %s", t)
	}

	// Get the current value
	c.entriesLock.RLock()
	entry, ok := c.entries[key]
	c.entriesLock.RUnlock()

	// Check if we have a hit
	cacheHit := ok && entry.Valid

	supportsBlocking := tEntry.Type.SupportsBlocking()

	// If a minimum index was specified and the type supports blocking, a
	// cached value at or below that index is not a hit.
	if cacheHit && supportsBlocking &&
		minIndex > 0 && minIndex >= entry.Index {
		// MinIndex was given and matches or is higher than the current value,
		// so we ignore the cache and fall through to blocking on a new value
		// below.
		cacheHit = false
	}

	// Check MaxAge is not exceeded if this is not a background refreshing type
	// and MaxAge was specified.
	if cacheHit && !tEntry.Opts.Refresh && info.MaxAge > 0 &&
		!entry.FetchedAt.IsZero() && info.MaxAge < time.Since(entry.FetchedAt) {
		cacheHit = false
	}

	// Check if we are requested to revalidate. If so, the first time round the
	// loop is not a hit, but subsequent passes are treated normally.
	if cacheHit && !tEntry.Opts.Refresh && info.MustRevalidate && first {
		cacheHit = false
	}

	if cacheHit {
		meta := ResultMeta{Index: entry.Index}
		if first {
			metrics.IncrCounter([]string{"consul", "cache", t, "hit"}, 1)
			meta.Hit = true
		}

		// If refresh is enabled, calculate age based on whether the background
		// routine is still connected.
		if tEntry.Opts.Refresh {
			meta.Age = time.Duration(0)
			if !entry.RefreshLostContact.IsZero() {
				meta.Age = time.Since(entry.RefreshLostContact)
			}
		} else {
			// For non-background refresh types, the age is just how long since we
			// fetched it last.
			if !entry.FetchedAt.IsZero() {
				meta.Age = time.Since(entry.FetchedAt)
			}
		}

		// Touch the expiration and fix the heap.
		c.entriesLock.Lock()
		entry.Expiry.Reset()
		c.entriesExpiryHeap.Fix(entry.Expiry)
		c.entriesLock.Unlock()

		// We purposely do not return an error here since the cache only works with
		// fetching values that either have a value or have an error, but not both.
		// The Error may be non-nil in the entry in the case that an error has
		// occurred _since_ the last good value, but we still want to return the
		// good value to clients that are not requesting a specific version. The
		// effect of this is that blocking clients will all see an error immediately
		// without waiting a whole timeout to see it, but clients that just look up
		// the cache with an older index than the last valid result will still see
		// the result and not the error here. I.e. the error is not "cached" without
		// a new fetch attempt occurring, but the last good value can still be
		// fetched from cache.
		return entry.Value, meta, nil
	}

	// If this isn't our first time through and our last value has an error, then
	// we return the error. This has the behavior that we don't sit in a retry
	// loop getting the same error for the entire duration of the timeout.
	// Instead, we make one effort to fetch a new value, and if there was an
	// error, we return. Note that the invariant is that if both entry.Value AND
	// entry.Error are non-nil, the error _must_ be more recent than the Value. In
	// other words, valid fetches should reset the error. See
	// https://github.com/hashicorp/consul/issues/4480.
	if !first && entry.Error != nil {
		return entry.Value, ResultMeta{Index: entry.Index}, entry.Error
	}

	if first {
		// We increment two different counters for cache misses depending on
		// whether we're missing because we didn't have the data at all,
		// or if we're missing because we're blocking on a set index.
		if minIndex == 0 {
			metrics.IncrCounter([]string{"consul", "cache", t, "miss_new"}, 1)
		} else {
			metrics.IncrCounter([]string{"consul", "cache", t, "miss_block"}, 1)
		}
	}

	// No longer our first time through
	first = false

	// Set our timeout channel if we must
	if info.Timeout > 0 && timeoutCh == nil {
		timeoutCh = time.After(info.Timeout)
	}

	// At this point, we know we either don't have a value at all or the
	// value we have is too old. We need to wait for new data.
	waiterCh, err := c.fetch(t, key, r, true, 0)
	if err != nil {
		return nil, ResultMeta{Index: entry.Index}, err
	}

	select {
	case <-waiterCh:
		// Our fetch returned, retry the get from the cache.
		goto RETRY_GET

	case <-timeoutCh:
		// Timeout on the cache read, just return whatever we have.
		return entry.Value, ResultMeta{Index: entry.Index}, nil
	}
}

// entryKey returns the key for the entry in the cache. See the note
// about the entry key format in the structure docs for Cache.
func (c *Cache) entryKey(t string, r *RequestInfo) string {
	return fmt.Sprintf("%s/%s/%s/%s", t, r.Datacenter, r.Token, r.Key)
}
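
// For illustration with assumed values (type "catalog-services", datacenter
// "dc1", token "abc123", request key "web"), this yields the entry key
// "catalog-services/dc1/abc123/web":
//
//	key := c.entryKey("catalog-services", &RequestInfo{
//		Datacenter: "dc1",
//		Token:      "abc123",
//		Key:        "web",
//	})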

// fetch triggers a new background fetch for the given Request. If a
// background fetch is already running for a matching Request, the waiter
// channel for that request is returned. The effect of this is that there
// is only ever one blocking query for any matching requests.
//
// If allowNew is true then the fetch should create the cache entry
// if it doesn't exist. If this is false, then fetch will do nothing
// if the entry doesn't exist. This latter case is to support refreshing.
func (c *Cache) fetch(t, key string, r Request, allowNew bool, attempt uint) (<-chan struct{}, error) {
	// Get the type that we're fetching
	c.typesLock.RLock()
	tEntry, ok := c.types[t]
	c.typesLock.RUnlock()
	if !ok {
		return nil, fmt.Errorf("unknown type in cache: %s", t)
	}

	// We acquire a write lock because we may have to set Fetching to true.
	c.entriesLock.Lock()
	defer c.entriesLock.Unlock()
	entry, ok := c.entries[key]

	// If we aren't allowing new values and we don't have an existing value,
	// return immediately. We return an immediately-closed channel so nothing
	// blocks.
	if !ok && !allowNew {
		ch := make(chan struct{})
		close(ch)
		return ch, nil
	}

	// If we already have an entry and it is actively fetching, then return
	// the currently active waiter.
	if ok && entry.Fetching {
		return entry.Waiter, nil
	}

	// If we don't have an entry, then create it. The entry must be marked
	// as invalid so that it isn't returned as a valid value for a zero index.
	if !ok {
		entry = cacheEntry{Valid: false, Waiter: make(chan struct{})}
	}

	// Set that we're fetching to true, which makes it so that future
	// identical calls to fetch will return the same waiter rather than
	// perform multiple fetches.
	entry.Fetching = true
	c.entries[key] = entry
	metrics.SetGauge([]string{"consul", "cache", "entries_count"}, float32(len(c.entries)))

	// The actual Fetch must be performed in a goroutine.
	go func() {
		// If we have background refresh and currently are in the "disconnected"
		// state, waiting for a response might mean we mark our results as stale
		// for up to 10 minutes (max blocking timeout) after the connection is
		// restored. To reduce that window, we assume that if the fetch takes
		// more than 31 seconds then it is correctly blocking. We choose 31
		// seconds because yamux keepalives are every 30 seconds, so the RPC
		// should fail if the packets are being blackholed for more than 30
		// seconds.
		var connectedTimer *time.Timer
		if tEntry.Opts.Refresh && entry.Index > 0 &&
			tEntry.Opts.RefreshTimeout > (31*time.Second) {
			connectedTimer = time.AfterFunc(31*time.Second, func() {
				c.entriesLock.Lock()
				defer c.entriesLock.Unlock()
				entry, ok := c.entries[key]
				if !ok || entry.RefreshLostContact.IsZero() {
					return
				}
				entry.RefreshLostContact = time.Time{}
				c.entries[key] = entry
			})
		}

		fOpts := FetchOptions{}
		if tEntry.Type.SupportsBlocking() {
			fOpts.MinIndex = entry.Index
			fOpts.Timeout = tEntry.Opts.RefreshTimeout
		}
		if entry.Valid {
			fOpts.LastResult = &FetchResult{
				Value: entry.Value,
				State: entry.State,
				Index: entry.Index,
			}
		}

		// Start building the new entry by blocking on the fetch.
		result, err := tEntry.Type.Fetch(fOpts, r)
		if connectedTimer != nil {
			connectedTimer.Stop()
		}

		// Copy the existing entry to start.
		newEntry := entry
		newEntry.Fetching = false

		// Importantly, always reset the Error. Having both an Error and a Value
		// that are non-nil is allowed in the cache entry, but it indicates that
		// the Error is _newer_ than the last good value. So if the err is nil
		// then we need to reset it to replace any _older_ errors and avoid them
		// bubbling up. If the error is non-nil then we need to set it anyway.
		// See https://github.com/hashicorp/consul/issues/4480.
		newEntry.Error = err

		if result.Value != nil {
			// A new value was given, so we create a brand new entry.
			newEntry.Value = result.Value
			newEntry.State = result.State
			newEntry.Index = result.Index
			newEntry.FetchedAt = time.Now()
			if newEntry.Index < 1 {
				// Less than one is invalid unless there was an error, and in this
				// case there wasn't since a value was returned. If a badly behaved
				// RPC returns 0 when it has no data, we might get into a busy loop
				// here. We set this to a minimum of 1, which is safe because no
				// valid user data can ever be written at raft index 1 due to the
				// bootstrap process for raft. This ensures that any subsequent
				// background refresh request will always block, but allows the
				// initial request to return immediately even if there is no data.
				newEntry.Index = 1
			}

			// This is a valid entry with a result
			newEntry.Valid = true
		} else if result.State != nil && err == nil {
			// Also set state if it's non-nil but Value is nil. This is important
			// in the case we are returning nil due to a timeout or a transient
			// error like rate limiting that we want to mask from the user - there
			// is no result yet but we want to manage retrying internally before
			// we return an error to the user. The retrying state is in State, so
			// we still need to update that in the entry even if we don't have an
			// actual result yet (e.g. hit a rate limit on the first request for a
			// leaf certificate).
			newEntry.State = result.State
		}

		// Error handling
		if err == nil {
			metrics.IncrCounter([]string{"consul", "cache", "fetch_success"}, 1)
			metrics.IncrCounter([]string{"consul", "cache", t, "fetch_success"}, 1)

			if result.Index > 0 {
				// Reset the attempts counter so we don't have any backoff
				attempt = 0
			} else {
				// A result with a zero index is an implicit error case. There was
				// no actual error, but it implies the RPC found no index (nothing
				// written yet for that type) and didn't take care to return the
				// safe "1" index. We don't want to actually treat it like an error
				// by setting newEntry.Error to something non-nil, but we should
				// guard against 100% CPU burn hot loops caused by that case, which
				// will never block but also won't back off either. So we treat it
				// as a failed attempt so that at least the failure backoff will
				// save our CPU while still periodically refreshing so normal
				// service can resume when the servers actually have something to
				// return from the RPC. Getting into this state can be considered a
				// bug in the RPC implementation (it should never return a zero
				// index); however, since it can happen, this is a safety net for
				// the future.
				attempt++
			}

			// If we have refresh active, this successful response means the cache
			// is now "connected" and should not be stale. Reset the lost contact
			// timer.
			if tEntry.Opts.Refresh {
				newEntry.RefreshLostContact = time.Time{}
			}
		} else {
			metrics.IncrCounter([]string{"consul", "cache", "fetch_error"}, 1)
			metrics.IncrCounter([]string{"consul", "cache", t, "fetch_error"}, 1)

			// Increment attempt counter
			attempt++

			// If we are refreshing and just failed, update the lost contact time
			// as our cache will be stale until we get successfully reconnected.
			// We only set this on the first failure (if it's zero) so we can
			// track how long it's been since we had a valid connection/up-to-date
			// view of the state.
			if tEntry.Opts.Refresh && newEntry.RefreshLostContact.IsZero() {
				newEntry.RefreshLostContact = time.Now()
			}
		}

		// Create a new waiter that will be used for the next fetch.
		newEntry.Waiter = make(chan struct{})

		// Set our entry
		c.entriesLock.Lock()

		// If this is a new entry (not in the heap yet), then set up the
		// initial expiry information and insert it. If we're already in
		// the heap we do nothing since we're reusing the same entry.
		if newEntry.Expiry == nil || newEntry.Expiry.HeapIndex == -1 {
			newEntry.Expiry = &cacheEntryExpiry{
				Key: key,
				TTL: tEntry.Opts.LastGetTTL,
			}
			newEntry.Expiry.Reset()
			heap.Push(c.entriesExpiryHeap, newEntry.Expiry)
		}

		c.entries[key] = newEntry
		c.entriesLock.Unlock()

		// Trigger the old waiter
		close(entry.Waiter)

		// If refresh is enabled, run the refresh in due time. The refresh
		// below might block, but saves us from spawning another goroutine.
		if tEntry.Opts.Refresh {
			c.refresh(tEntry.Opts, attempt, t, key, r)
		}
	}()

	return entry.Waiter, nil
}

// fetchDirect fetches the given request with no caching. Because this
// bypasses the caching entirely, multiple matching requests will result
// in multiple actual RPC calls (unlike fetch).
func (c *Cache) fetchDirect(t string, r Request, minIndex uint64) (interface{}, ResultMeta, error) {
	// Get the type that we're fetching
	c.typesLock.RLock()
	tEntry, ok := c.types[t]
	c.typesLock.RUnlock()
	if !ok {
		return nil, ResultMeta{}, fmt.Errorf("unknown type in cache: %s", t)
	}

	// Fetch it with the min index specified directly by the request.
	result, err := tEntry.Type.Fetch(FetchOptions{
		MinIndex: minIndex,
	}, r)
	if err != nil {
		return nil, ResultMeta{}, err
	}

	// Return the result and ignore the rest
	return result.Value, ResultMeta{}, nil
}

// backOffWait returns how long to wait before the next refresh attempt given
// the number of consecutive failures so far: zero until the failure count
// exceeds CacheRefreshBackoffMin, then an exponentially growing wait capped
// at CacheRefreshMaxWait, plus a random stagger to avoid thundering herds.
func backOffWait(failures uint) time.Duration {
	if failures > CacheRefreshBackoffMin {
		shift := failures - CacheRefreshBackoffMin
		waitTime := CacheRefreshMaxWait
		if shift < 31 {
			waitTime = (1 << shift) * time.Second
		}
		if waitTime > CacheRefreshMaxWait {
			waitTime = CacheRefreshMaxWait
		}
		return waitTime + lib.RandomStagger(waitTime)
	}
	return 0
}
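
// With CacheRefreshBackoffMin = 3 and CacheRefreshMaxWait = 1 minute, the
// base wait doubles per failure beyond the third and a random stagger of up
// to the base wait is added, so for example:
//
//	backOffWait(3) // 0 (no backoff yet)
//	backOffWait(4) // 2s  + up to 2s of stagger
//	backOffWait(5) // 4s  + up to 4s of stagger
//	backOffWait(8) // 32s + up to 32s of stagger
//	backOffWait(9) // capped: 60s + up to 60s of stagger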

// refresh triggers a fetch for a specific Request according to the
// registration options.
func (c *Cache) refresh(opts *RegisterOptions, attempt uint, t string, key string, r Request) {
	// Sanity check: we should not schedule anything that has refresh disabled
	if !opts.Refresh {
		return
	}
	// Check if the cache was stopped
	if atomic.LoadUint32(&c.stopped) == 1 {
		return
	}

	// If we're over the attempt minimum, start an exponential backoff.
	if wait := backOffWait(attempt); wait > 0 {
		time.Sleep(wait)
	}

	// If we have a timer, wait for it
	if opts.RefreshTimer > 0 {
		time.Sleep(opts.RefreshTimer)
	}

	// Trigger. The "allowNew" argument is false because in the time we were
	// waiting to refresh, the entry may have expired and been evicted. If that
	// happened, we don't want to create a new entry.
	c.fetch(t, key, r, false, attempt)
}

// runExpiryLoop is a blocking function that watches the expiration
// heap and invalidates entries that have expired.
func (c *Cache) runExpiryLoop() {
	var expiryTimer *time.Timer
	for {
		// If we have a previous timer, stop it.
		if expiryTimer != nil {
			expiryTimer.Stop()
		}

		// Get the entry expiring soonest
		var entry *cacheEntryExpiry
		var expiryCh <-chan time.Time
		c.entriesLock.RLock()
		if len(c.entriesExpiryHeap.Entries) > 0 {
			entry = c.entriesExpiryHeap.Entries[0]
			expiryTimer = time.NewTimer(time.Until(entry.Expires))
			expiryCh = expiryTimer.C
		}
		c.entriesLock.RUnlock()

		select {
		case <-c.stopCh:
			return
		case <-c.entriesExpiryHeap.NotifyCh:
			// Entries changed, so the heap may have changed. Restart the loop.

		case <-expiryCh:
			c.entriesLock.Lock()

			// Entry expired! Remove it.
			delete(c.entries, entry.Key)
			heap.Remove(c.entriesExpiryHeap, entry.HeapIndex)

			// This is subtle but important: if we race and simultaneously
			// evict and fetch a new value, then we set this to -1 to
			// have it treated as a new value so that the TTL is extended.
			entry.HeapIndex = -1

			// Set some metrics
			metrics.IncrCounter([]string{"consul", "cache", "evict_expired"}, 1)
			metrics.SetGauge([]string{"consul", "cache", "entries_count"}, float32(len(c.entries)))

			c.entriesLock.Unlock()
		}
	}
}

// Close stops any background work and frees all resources for the cache.
// Current Fetch requests are allowed to run to completion, and callers may
// still access the current cache values, so no coordination with callers is
// needed; however, no background activity will continue. It's intended for
// closing the cache at agent shutdown, so no further requests should be made,
// though concurrent or in-flight ones won't break.
func (c *Cache) Close() error {
	wasStopped := atomic.SwapUint32(&c.stopped, 1)
	if wasStopped == 0 {
		// First time only, close the stop chan
		close(c.stopCh)
	}
	return nil
}