sigs.k8s.io/prow@v0.0.0-20240503223140-c5e374dc7eb1/pkg/cache/cache.go

/*
Copyright 2021 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cache

import (
	"fmt"
	"sync"

	"github.com/hashicorp/golang-lru/simplelru"
	"github.com/sirupsen/logrus"
)
// Overview
//
// LRUCache uses an off-the-shelf LRU cache library for the low-level
// caching implementation, which uses the empty interface for keys and values.
// The values are what we store in the cache, and to retrieve them, we have to
// provide a key (which must be a hashable object). We wrap this cache with a
// single lock, and use an algorithm for a concurrent non-blocking cache to make
// it both thread-safe and also resistant to so-called cache stampedes, where
// many concurrent threads all attempt to look up the same (missing) key/value
// pair from the cache (see Alan Donovan and Brian Kernighan, "The Go
// Programming Language" (Addison-Wesley, 2016), p. 277).
//
// In practical terms, this means that if 1000 requests come in at the same time
// for the same key, only the first one will perform the value construction
// while the other 999 will wait for this first goroutine to finish resolving
// the value. This property makes this cache resilient and is what is meant by
// "non-blocking".

// LRUCache is the actual concurrent non-blocking cache.
type LRUCache struct {
	*sync.Mutex
	*simplelru.LRU
	callbacks Callbacks
}

// Callbacks stores various callbacks that may fire during the lifetime of an
// LRUCache.
//
// NOTE: You must make sure that your callbacks return quickly, because slow
// callbacks will degrade cache performance (the cache invokes your callbacks
// synchronously). We invoke them synchronously, rather than spawning a
// goroutine ourselves, to give users the flexibility to decide that for
// themselves: hard-coding a `go ...` invocation at our callback call sites
// would risk costing performance unnecessarily if the callbacks are already
// optimized to return quickly.
type Callbacks struct {
	LookupsCallback         EventCallback
	HitsCallback            EventCallback
	MissesCallback          EventCallback
	ForcedEvictionsCallback simplelru.EvictCallback
	ManualEvictionsCallback EventCallback
}

// EventCallback is similar to simplelru.EvictCallback, except that it doesn't
// take a value argument.
type EventCallback func(key interface{})
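
// The function below is a hypothetical sketch (it is not part of the upstream
// package API) of one way a caller might populate Callbacks: each callback
// simply logs the event with logrus at debug level. In practice these hooks
// are more likely to increment metrics counters; the only requirement is that
// they return quickly, because the cache invokes them synchronously.
func exampleLoggingCallbacks() Callbacks {
	return Callbacks{
		LookupsCallback: func(key interface{}) {
			logrus.WithField("key", key).Debug("cache lookup")
		},
		HitsCallback: func(key interface{}) {
			logrus.WithField("key", key).Debug("cache hit")
		},
		MissesCallback: func(key interface{}) {
			logrus.WithField("key", key).Debug("cache miss")
		},
		// ForcedEvictionsCallback has the simplelru.EvictCallback signature,
		// so it also receives the evicted value.
		ForcedEvictionsCallback: func(key interface{}, value interface{}) {
			logrus.WithField("key", key).Debug("forced eviction")
		},
		ManualEvictionsCallback: func(key interface{}) {
			logrus.WithField("key", key).Debug("manual eviction")
		},
	}
}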

// ValConstructor is used to construct a value. The assumption is that this
// ValConstructor is expensive to compute, and that we need to memoize it via
// the LRUCache. The raw values of a cache are only constructed after a cache
// miss (and only the first cache miss). Using this type allows us to use any
// arbitrary function whose resulting value needs to be memoized (saved in the
// cache). This type also allows us to delay running the expensive computation
// until we actually need it (after a cache miss).
type ValConstructor func() (interface{}, error)
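
// The helper below is a hypothetical illustration (not part of the upstream
// code) of how a ValConstructor is typically built: as a closure that captures
// whatever inputs the expensive computation needs, so that the work only runs
// on the first cache miss for a key.
func exampleValConstructor(name string) ValConstructor {
	return func() (interface{}, error) {
		// Stand-in for an expensive operation such as a remote API call or a
		// large object construction.
		return fmt.Sprintf("expensive result for %q", name), nil
	}
}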

// Promise is a wrapper around cache value construction; it is used to
// synchronize the to-be-cached value between the first thread that undergoes a
// cache miss and subsequent threads that attempt to look up the same cache
// entry (cache hit). When the Promise is resolved (when the
// "valConstructionPending" channel is closed), the value is ready for
// concurrent reads.
type Promise struct {
	valConstructor         ValConstructor
	valConstructionPending chan struct{}
	val                    interface{}
	err                    error
}

func newPromise(valConstructor ValConstructor) *Promise {
	return &Promise{
		valConstructor:         valConstructor,
		valConstructionPending: make(chan struct{}),
	}
}

// waitForResolution blocks the current thread until the first thread that
// detected a cache miss has finished constructing the value (see resolve()).
func (p *Promise) waitForResolution() {
	<-p.valConstructionPending
}

// resolve resolves the Promise by constructing the value and closing the
// valConstructionPending channel, thereby unblocking any other thread that has
// been waiting for the value to be constructed.
func (p *Promise) resolve() {
	p.val, p.err = p.valConstructor()
	close(p.valConstructionPending)
}
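
// The sketch below is hypothetical (not part of the upstream code) and only
// illustrates the intended division of labor around a Promise: exactly one
// goroutine calls resolve(), while any number of other goroutines call
// waitForResolution() and only then read val and err.
func examplePromiseUsage() (interface{}, error) {
	p := newPromise(func() (interface{}, error) {
		return "constructed value", nil
	})

	// The resolver runs the (possibly expensive) construction exactly once.
	go p.resolve()

	// A waiter blocks until resolve() closes the channel, after which val and
	// err are safe to read concurrently.
	p.waitForResolution()
	return p.val, p.err
}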

// NewLRUCache returns a new LRUCache with the given size (maximum number of
// entries). The callbacks argument holds the optional event callbacks;
// callbacks.ForcedEvictionsCallback is passed to the underlying cache and is
// called when an eviction occurs there.
func NewLRUCache(size int, callbacks Callbacks) (*LRUCache, error) {
	cache, err := simplelru.NewLRU(size, callbacks.ForcedEvictionsCallback)
	if err != nil {
		return nil, err
	}

	return &LRUCache{
		&sync.Mutex{},
		cache,
		callbacks,
	}, nil
}
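
// A hypothetical construction sketch (not part of the upstream code): create a
// cache holding up to 100 entries. Nil callbacks are simply skipped, so the
// zero value Callbacks{} is a valid "no instrumentation" choice;
// exampleLoggingCallbacks() above could be passed instead.
func exampleNewLRUCache() (*LRUCache, error) {
	return NewLRUCache(100, Callbacks{})
}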

// GetOrAdd uses the cache, if possible, to retrieve the Value for a given key.
// It is assumed that Value is expensive to construct from scratch, which is the
// reason why we try to use the cache in the first place. If we do end up
// constructing a Value from scratch, we store it into the cache with a
// corresponding key, so that we can look up the Value with just the key in the
// future. The returned boolean reports whether the value came from an existing
// cache entry (a cache hit); it is false when we had to construct the value
// ourselves.
//
// This cache is resistant to cache stampedes because it uses a duplicate
// suppression strategy. This is also called request coalescing.
func (lruCache *LRUCache) GetOrAdd(
	key interface{},
	valConstructor ValConstructor) (interface{}, bool, error) {

	// Cache lookup.
	if lruCache.callbacks.LookupsCallback != nil {
		lruCache.callbacks.LookupsCallback(key)
	}
	lruCache.Lock()
	var promise *Promise
	var ok bool
	maybePromise, promisePending := lruCache.Get(key)

	if promisePending {
		// A promise exists, BUT the wrapped value inside it (p.val) might
		// not be written to yet by the thread that is actually resolving the
		// promise.
		//
		// For now we just unlock the overall lruCache itself so that it can
		// service other GetOrAdd() calls to it.
		lruCache.Unlock()

		// Record the cache "hit". To be more precise, there are actually two
		// possibilities here --- either the value is already ready to be
		// consumed (a true cache hit), or the value is being constructed and we
		// have to wait for the promise to resolve first.
		if lruCache.callbacks.HitsCallback != nil {
			lruCache.callbacks.HitsCallback(key)
		}

		// If the cached entry is not a Promise, there's no need to wait and we
		// can just return immediately with an error.
		promise, ok = maybePromise.(*Promise)
		if !ok {
			err := fmt.Errorf("Programmer error: expected cache entry type '*Promise', got '%T'", maybePromise)
			logrus.WithField("key", key).Error(err)
			return nil, false, err
		}

		// Block until the first thread that originally created this promise
		// has finished resolving it. Then it's safe to return the resolved
		// values of the promise below.
		//
		// If the original thread already resolved the promise a long time ago
		// (by calling resolve()), then this waitForResolution() will finish
		// immediately and we will not block at all.
		promise.waitForResolution()
	} else {
		// No promise exists for this key. In other words, we are the first
		// thread to ask for this key's value, so we have no choice but to
		// construct the value ourselves (this call is expensive!) and add it to
		// the cache.
		//
		// If there are other concurrent threads that call GetOrAdd() with the
		// same key and corresponding value constructor, we force them to use
		// the same value as us (so that they don't have to also call
		// valConstructor()). We do this with the following algorithm:
		//
		//  1. immediately create a Promise to construct the value;
		//  2. actually construct the value (expensive operation);
		//  3. resolve the promise so that all threads watching the same Promise
		//     get the value from Step 2.
		//
		// This mitigation strategy is a kind of "duplicate suppression", also
		// called "request coalescing". The problem of having to deal with a
		// flood of multiple requests for the same cache entry is also called
		// "cache stampede".

		// Step 1
		//
		// Let other threads know about our promise to construct the value. We
		// don't care if the underlying LRU cache had to evict an existing
		// entry.
		promise = newPromise(valConstructor)
		_ = lruCache.Add(key, promise)
		// We must unlock here so that the cache does not block other GetOrAdd()
		// calls to it for different (or same) key/value pairs.
		lruCache.Unlock()

		// Record the cache miss.
		if lruCache.callbacks.MissesCallback != nil {
			lruCache.callbacks.MissesCallback(key)
		}

		// Steps 2 & 3
		//
		// Construct the value (expensive operation), and broadcast to all
		// watchers of this promise that it is ready to be read from (no data
		// race!).
		promise.resolve()

		// If the value construction (expensive operation) failed, then we
		// delete the cached entry so that we may retry in the future (instead
		// of waiting for the LRUCache to evict it on its own over time).
		//
		// NOTE: It may be the case that the underlying lruCache itself decided
		// to evict this key by the time we try to Lock() it here and evict it
		// ourselves. I.e., it may be the case that the lruCache evicted our key
		// because there just happened to be a massive load of calls with lots
		// of different keys, forcing all old cached values to be evicted. But
		// this is a minor concern because (1) it is unlikely to happen and (2)
		// even if it does happen, our eviction will be a NOP because the key we
		// want to delete wouldn't be in the cache anyway (it's already been
		// evicted!).
		//
		// Another possibility is that by the time we attempt to delete the key
		// here, there has been not only an eviction of this same key, but also
		// the creation of another entry with the same key with valid results.
		// So at worst we would be wrongfully invalidating a cache entry.
		//
		// TODO: If our cache implementation supports a TTL mechanism, then we
		// could just set that instead and let the cached entry expire on its
		// own (we would not have to do this eviction ourselves manually).
		if promise.err != nil {
			logrus.WithField("key", key).Infof("promise was resolved, but the value construction returned an error; deleting key from cache...")

			lruCache.Lock()
			weDeletedThisKey := lruCache.Remove(key)
			lruCache.Unlock()
			if weDeletedThisKey {
				if lruCache.callbacks.ManualEvictionsCallback != nil {
					lruCache.callbacks.ManualEvictionsCallback(key)
				}
				logrus.WithField("key", key).Infof("successfully deleted")
			} else {
				err := fmt.Errorf("unexpected (non-problematic) race: key deleted by the cache without our knowledge; our own deletion of this key was a NOP but this does not constitute a problem")
				logrus.WithField("key", key).Info(err)
			}
		}
	}

	return promise.val, ok, promise.err
}
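
// The sketch below is hypothetical (not part of the upstream code) and
// demonstrates the request coalescing described above: many goroutines ask for
// the same key at once, but the expensive constructor runs only once and every
// caller observes the same value.
func exampleGetOrAddCoalescing() {
	lruCache, err := NewLRUCache(10, Callbacks{})
	if err != nil {
		logrus.WithError(err).Error("failed to create cache")
		return
	}

	// Count how many times the constructor actually runs.
	var mu sync.Mutex
	constructions := 0
	valConstructor := func() (interface{}, error) {
		mu.Lock()
		constructions++
		mu.Unlock()
		// Stand-in for an expensive computation.
		return "expensive value", nil
	}

	var wg sync.WaitGroup
	for i := 0; i < 1000; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			if _, _, err := lruCache.GetOrAdd("some-key", valConstructor); err != nil {
				logrus.WithError(err).Error("GetOrAdd failed")
			}
		}()
	}
	wg.Wait()

	// Thanks to duplicate suppression, this prints 1, not 1000.
	fmt.Printf("value constructed %d times\n", constructions)
}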