google.golang.org/grpc@v1.72.2/balancer/rls/cache.go

/*
 *
 * Copyright 2021 gRPC authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package rls

import (
	"container/list"
	"time"

	"github.com/google/uuid"
	estats "google.golang.org/grpc/experimental/stats"
	"google.golang.org/grpc/internal/backoff"
	internalgrpclog "google.golang.org/grpc/internal/grpclog"
	"google.golang.org/grpc/internal/grpcsync"
)

// cacheKey represents the key used to uniquely identify an entry in the data
// cache and in the pending requests map.
type cacheKey struct {
	// path is the full path of the incoming RPC request.
	path string
	// keys is a stringified version of the RLS request key map built using the
	// RLS keyBuilder. Since maps are not comparable in Go, a map cannot be part
	// of the key for another map (entries in the data cache and pending
	// requests map are stored in maps).
	keys string
}
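
// makeCacheKey is an illustrative sketch, not part of the original file: the
// real stringification of the RLS request key map happens in the keyBuilder
// elsewhere in this package. This sketch assumes the caller supplies "k=v"
// pairs in a fixed order so that equal key maps always produce equal strings,
// which is what makes the resulting cacheKey usable as a Go map key.
func makeCacheKey(path string, orderedKVs []string) cacheKey {
	keys := ""
	for _, kv := range orderedKVs {
		keys += kv + ","
	}
	return cacheKey{path: path, keys: keys}
}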

// cacheEntry wraps all the data to be stored in a data cache entry.
type cacheEntry struct {
	// childPolicyWrappers contains the list of child policy wrappers
	// corresponding to the targets returned by the RLS server for this entry.
	childPolicyWrappers []*childPolicyWrapper
	// headerData is received in the RLS response and is to be sent in the
	// X-Google-RLS-Data header for matching RPCs.
	headerData string
	// expiryTime is the absolute time at which this cache entry stops
	// being valid. When an RLS request succeeds, this is set to the current
	// time plus the max_age field from the LB policy config.
	expiryTime time.Time
	// staleTime is the absolute time after which this cache entry will be
	// proactively refreshed if an incoming RPC matches this entry. When an RLS
	// request succeeds, this is set to the current time plus the stale_age from
	// the LB policy config.
	staleTime time.Time
	// earliestEvictTime is the absolute time before which this entry should not
	// be evicted from the cache. When a cache entry is created, this is set to
	// the current time plus a default value of 5 seconds. This is required to
	// make sure that a new entry added to the cache is not evicted before the
	// RLS response arrives (usually when the cache is too small).
	earliestEvictTime time.Time

	// status stores the RPC status of the previous RLS request for this
	// entry. Picks for entries with a non-nil value for this field are failed
	// with the error stored here.
	status error
	// backoffState contains all backoff related state. When an RLS request
	// succeeds, backoffState is reset. This state moves between the data cache
	// and the pending requests map.
	backoffState *backoffState
	// backoffTime is the absolute time at which the backoff period for this
	// entry ends. When an RLS request fails, this is set to the current time
	// plus the backoff value returned by the backoffState. The backoff timer is
	// also set up with this value. No new RLS requests are sent out for this
	// entry until the backoff period ends.
	//
	// Set to the zero time instant upon a successful RLS response.
	backoffTime time.Time
	// backoffExpiryTime is the absolute time at which an entry which has gone
	// through backoff stops being valid. When an RLS request fails, this is
	// set to the current time plus twice the backoff time. The cache expiry
	// timer will only delete entries for which both expiryTime and
	// backoffExpiryTime are in the past.
	//
	// Set to the zero time instant upon a successful RLS response.
	backoffExpiryTime time.Time

	// size stores the size of this cache entry. Used to enforce the cache size
	// specified in the LB policy configuration.
	size int64
}
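
// setSuccessTimes is an illustrative sketch, not part of the original file:
// it shows how the field comments above translate into timestamps when an RLS
// request succeeds. maxAge and staleAge are assumed to come from the LB policy
// config, which is defined elsewhere in this package.
func setSuccessTimes(e *cacheEntry, now time.Time, maxAge, staleAge time.Duration) {
	e.expiryTime = now.Add(maxAge)
	e.staleTime = now.Add(staleAge)
	// Backoff state and timestamps are cleared on success, per the field
	// comments above.
	if e.backoffState != nil {
		e.backoffState = &backoffState{bs: e.backoffState.bs}
	}
	e.backoffTime = time.Time{}
	e.backoffExpiryTime = time.Time{}
}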

// backoffState wraps all backoff related state associated with a cache entry.
type backoffState struct {
	// retries keeps track of the number of RLS failures, to be able to
	// determine the amount of time to backoff before the next attempt.
	retries int
	// bs is the exponential backoff implementation which returns the amount of
	// time to backoff, given the number of retries.
	bs backoff.Strategy
	// timer fires when the backoff period ends; incoming requests received
	// after that trigger a new RLS request.
	timer *time.Timer
}
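
// setFailureTimes is an illustrative sketch, not part of the original file:
// it shows how backoffState feeds the cacheEntry timestamps when an RLS
// request fails, per the field comments on cacheEntry. It assumes
// e.backoffState is non-nil.
func setFailureTimes(e *cacheEntry, now time.Time) {
	d := e.backoffState.bs.Backoff(e.backoffState.retries)
	e.backoffState.retries++
	e.backoffTime = now.Add(d)           // No new RLS requests until this time.
	e.backoffExpiryTime = now.Add(2 * d) // Entry may be purged only after this.
}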

// lru is a cache implementation with a least recently used eviction policy.
// Internally it uses a doubly linked list, with the least recently used element
// at the front of the list and the most recently used element at the back of
// the list. The value stored in this cache will be of type `cacheKey`.
//
// It is not safe for concurrent access.
type lru struct {
	ll *list.List

	// A map from the value stored in the lru to its underlying list element is
	// maintained to have a clean API. Without this, a subset of the lru's API
	// would accept/return cacheKey while another subset would accept/return
	// list elements.
	m map[cacheKey]*list.Element
}

// newLRU creates a new cache with a least recently used eviction policy.
func newLRU() *lru {
	return &lru{
		ll: list.New(),
		m:  make(map[cacheKey]*list.Element),
	}
}

func (l *lru) addEntry(key cacheKey) {
	e := l.ll.PushBack(key)
	l.m[key] = e
}

func (l *lru) makeRecent(key cacheKey) {
	e := l.m[key]
	l.ll.MoveToBack(e)
}

func (l *lru) removeEntry(key cacheKey) {
	e := l.m[key]
	l.ll.Remove(e)
	delete(l.m, key)
}

func (l *lru) getLeastRecentlyUsed() cacheKey {
	e := l.ll.Front()
	if e == nil {
		return cacheKey{}
	}
	return e.Value.(cacheKey)
}
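
// exampleLRUOrder is an illustrative sketch, not part of the original file:
// it demonstrates the eviction order maintained by the lru type. Entries are
// pushed to the back of the list, so the front always holds the least
// recently used key. The paths used here are hypothetical.
func exampleLRUOrder() {
	l := newLRU()
	a := cacheKey{path: "/pkg.Service/A"}
	b := cacheKey{path: "/pkg.Service/B"}
	l.addEntry(a)
	l.addEntry(b)
	l.makeRecent(a)                    // a moves to the back (most recent).
	lruKey := l.getLeastRecentlyUsed() // Returns b, the front of the list.
	l.removeEntry(lruKey)
}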

// dataCache contains a cache of RLS data used by the LB policy to make routing
// decisions.
//
// The dataCache will be keyed by the request's path and keys, represented by
// the `cacheKey` type. It will maintain the cache keys in an `lru` and the
// cache data, represented by the `cacheEntry` type, in a native map.
//
// It is not safe for concurrent access.
type dataCache struct {
	maxSize         int64 // Maximum allowed size.
	currentSize     int64 // Current size.
	keys            *lru  // Cache keys maintained in lru order.
	entries         map[cacheKey]*cacheEntry
	logger          *internalgrpclog.PrefixLogger
	shutdown        *grpcsync.Event
	rlsServerTarget string

	// Read only after initialization.
	grpcTarget      string
	uuid            string
	metricsRecorder estats.MetricsRecorder
}

func newDataCache(size int64, logger *internalgrpclog.PrefixLogger, metricsRecorder estats.MetricsRecorder, grpcTarget string) *dataCache {
	return &dataCache{
		maxSize:         size,
		keys:            newLRU(),
		entries:         make(map[cacheKey]*cacheEntry),
		logger:          logger,
		shutdown:        grpcsync.NewEvent(),
		grpcTarget:      grpcTarget,
		uuid:            uuid.New().String(),
		metricsRecorder: metricsRecorder,
	}
}
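
// exampleDataCacheLifecycle is an illustrative sketch, not part of the
// original file: it shows the typical add/lookup flow with hypothetical key
// and size values. It assumes dc was fully constructed via newDataCache with
// a working logger and MetricsRecorder.
func exampleDataCacheLifecycle(dc *dataCache) {
	key := cacheKey{path: "/pkg.Service/Method", keys: "k1=v1,"}
	entry := &cacheEntry{size: 64, expiryTime: time.Now().Add(time.Minute)}
	if _, ok := dc.addEntry(key, entry); ok {
		_ = dc.getEntry(key) // Marks key as most recently used.
	}
}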

// updateRLSServerTarget updates the RLS server target that the RLS balancer is
// configured with.
func (dc *dataCache) updateRLSServerTarget(rlsServerTarget string) {
	dc.rlsServerTarget = rlsServerTarget
}

// resize changes the maximum allowed size of the data cache.
//
// The return value indicates if an entry with a valid backoff timer was
// evicted. This is important to the RLS LB policy which would send a new picker
// on the channel to re-process any RPCs queued as a result of this backoff
// timer.
func (dc *dataCache) resize(size int64) (backoffCancelled bool) {
	if dc.shutdown.HasFired() {
		return false
	}

	backoffCancelled = false
	for dc.currentSize > size {
		key := dc.keys.getLeastRecentlyUsed()
		entry, ok := dc.entries[key]
		if !ok {
			// This should never happen.
			dc.logger.Errorf("cacheKey %+v not found in the cache while attempting to resize it", key)
			break
		}

		// When we encounter a cache entry whose minimum expiration time is in
		// the future, we abort the LRU pass, which may temporarily leave the
		// cache being too large. This is necessary to ensure that in cases
		// where the cache is too small, when we receive an RLS Response, we
		// keep the resulting cache entry around long enough for the pending
		// incoming requests to be re-processed through the new Picker. If we
		// didn't do this, then we'd risk throwing away each RLS response as we
		// receive it, in which case we would fail to actually route any of our
		// incoming requests.
		if entry.earliestEvictTime.After(time.Now()) {
			dc.logger.Warningf("cacheKey %+v is too recent to be evicted. Stopping cache resizing for now", key)
			break
		}

		// Stop the backoff timer before evicting the entry.
		if entry.backoffState != nil && entry.backoffState.timer != nil {
			if entry.backoffState.timer.Stop() {
				entry.backoffState.timer = nil
				backoffCancelled = true
			}
		}
		dc.deleteAndCleanup(key, entry)
	}
	dc.maxSize = size
	return backoffCancelled
}

// evictExpiredEntries sweeps through the cache and deletes expired entries. An
// expired entry is one for which both the `expiryTime` and `backoffExpiryTime`
// fields are in the past.
//
// The return value indicates if any expired entries were evicted.
//
// The LB policy invokes this method periodically to purge expired entries.
func (dc *dataCache) evictExpiredEntries() bool {
	if dc.shutdown.HasFired() {
		return false
	}

	evicted := false
	for key, entry := range dc.entries {
		// Only evict entries for which both the data expiration time and
		// backoff expiration time fields are in the past.
		now := time.Now()
		if entry.expiryTime.After(now) || entry.backoffExpiryTime.After(now) {
			continue
		}
		dc.deleteAndCleanup(key, entry)
		evicted = true
	}
	return evicted
}

// resetBackoffState sweeps through the cache and, for entries with a backoff
// state, cancels the backoff timer and resets the backoff state. The return
// value indicates if any entries were mutated in this fashion.
//
// The LB policy invokes this method when the control channel moves from READY
// to TRANSIENT_FAILURE back to READY. See `monitorConnectivityState` method on
// the `controlChannel` type for more details.
func (dc *dataCache) resetBackoffState(newBackoffState *backoffState) bool {
	if dc.shutdown.HasFired() {
		return false
	}

	backoffReset := false
	for _, entry := range dc.entries {
		if entry.backoffState == nil {
			continue
		}
		if entry.backoffState.timer != nil {
			entry.backoffState.timer.Stop()
			entry.backoffState.timer = nil
		}
		entry.backoffState = &backoffState{bs: newBackoffState.bs}
		entry.backoffTime = time.Time{}
		entry.backoffExpiryTime = time.Time{}
		backoffReset = true
	}
	return backoffReset
}

// addEntry adds a cache entry for the given key.
//
// Return value backoffCancelled indicates if a cache entry with a valid backoff
// timer was evicted to make space for the current entry. This is important to
// the RLS LB policy which would send a new picker on the channel to re-process
// any RPCs queued as a result of this backoff timer.
//
// Return value ok indicates if the entry was successfully added to the cache.
func (dc *dataCache) addEntry(key cacheKey, entry *cacheEntry) (backoffCancelled bool, ok bool) {
	if dc.shutdown.HasFired() {
		return false, false
	}

	// Handle the extremely unlikely case that a single entry is bigger than the
	// size of the cache.
	if entry.size > dc.maxSize {
		return false, false
	}
	dc.entries[key] = entry
	dc.currentSize += entry.size
	dc.keys.addEntry(key)
	// If the new entry makes the cache go over its configured size, remove some
	// old entries.
	if dc.currentSize > dc.maxSize {
		backoffCancelled = dc.resize(dc.maxSize)
	}
	cacheSizeMetric.Record(dc.metricsRecorder, dc.currentSize, dc.grpcTarget, dc.rlsServerTarget, dc.uuid)
	cacheEntriesMetric.Record(dc.metricsRecorder, int64(len(dc.entries)), dc.grpcTarget, dc.rlsServerTarget, dc.uuid)
	return backoffCancelled, true
}

// updateEntrySize updates the size of a cache entry and the current size of the
// data cache. An entry's size can change upon receipt of an RLS response.
func (dc *dataCache) updateEntrySize(entry *cacheEntry, newSize int64) {
	dc.currentSize -= entry.size
	entry.size = newSize
	dc.currentSize += entry.size
	cacheSizeMetric.Record(dc.metricsRecorder, dc.currentSize, dc.grpcTarget, dc.rlsServerTarget, dc.uuid)
}

func (dc *dataCache) getEntry(key cacheKey) *cacheEntry {
	if dc.shutdown.HasFired() {
		return nil
	}

	entry, ok := dc.entries[key]
	if !ok {
		return nil
	}
	dc.keys.makeRecent(key)
	return entry
}

func (dc *dataCache) removeEntryForTesting(key cacheKey) {
	entry, ok := dc.entries[key]
	if !ok {
		return
	}
	dc.deleteAndCleanup(key, entry)
}

// deleteAndCleanup performs actions required at the time of deleting an entry
// from the data cache.
// - the entry is removed from the map of entries
// - the current size of the data cache is updated
// - the key is removed from the LRU
func (dc *dataCache) deleteAndCleanup(key cacheKey, entry *cacheEntry) {
	delete(dc.entries, key)
	dc.currentSize -= entry.size
	dc.keys.removeEntry(key)
	cacheSizeMetric.Record(dc.metricsRecorder, dc.currentSize, dc.grpcTarget, dc.rlsServerTarget, dc.uuid)
	cacheEntriesMetric.Record(dc.metricsRecorder, int64(len(dc.entries)), dc.grpcTarget, dc.rlsServerTarget, dc.uuid)
}

func (dc *dataCache) stop() {
	for key, entry := range dc.entries {
		dc.deleteAndCleanup(key, entry)
	}
	dc.shutdown.Fire()
}