k8s.io/client-go@v0.22.2/tools/record/events_cache.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package record

import (
	"encoding/json"
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/golang/groupcache/lru"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/clock"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/strategicpatch"
	"k8s.io/client-go/util/flowcontrol"
)

const (
	maxLruCacheEntries = 4096

	// if we see the same event that varies only by message
	// more than 10 times in a 10 minute period, aggregate the event
	defaultAggregateMaxEvents         = 10
	defaultAggregateIntervalInSeconds = 600

	// by default, allow a source to send 25 events about an object
	// but control the refill rate to 1 new event every 5 minutes
	// this helps control the long tail of events for things that are always
	// unhealthy
	defaultSpamBurst = 25
	defaultSpamQPS   = 1. / 300.
)

// getEventKey builds a unique event key based on source, involvedObject, reason and message
func getEventKey(event *v1.Event) string {
	return strings.Join([]string{
		event.Source.Component,
		event.Source.Host,
		event.InvolvedObject.Kind,
		event.InvolvedObject.Namespace,
		event.InvolvedObject.Name,
		event.InvolvedObject.FieldPath,
		string(event.InvolvedObject.UID),
		event.InvolvedObject.APIVersion,
		event.Type,
		event.Reason,
		event.Message,
	},
		"")
}

// getSpamKey builds a unique event key based on source and involvedObject only;
// because it omits the event type, reason and message, every event a source
// emits about a given object maps to the same key.
func getSpamKey(event *v1.Event) string {
	return strings.Join([]string{
		event.Source.Component,
		event.Source.Host,
		event.InvolvedObject.Kind,
		event.InvolvedObject.Namespace,
		event.InvolvedObject.Name,
		string(event.InvolvedObject.UID),
		event.InvolvedObject.APIVersion,
	},
		"")
}
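
// A minimal sketch of the distinction between the two keys (the events are
// hypothetical): two events that differ only in Message produce different
// getEventKey results but the same getSpamKey result, so the spam filter
// rate-limits them as one stream while de-duplication still tells them apart.
//
//	a := base // base is some fully populated v1.Event
//	b := base
//	b.Message = "a different message"
//	_ = getEventKey(&a) != getEventKey(&b) // true
//	_ = getSpamKey(&a) == getSpamKey(&b)   // true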

// EventFilterFunc is a function that returns true if the event should be skipped
type EventFilterFunc func(event *v1.Event) bool

// EventSourceObjectSpamFilter is responsible for throttling
// the number of events a source and object can produce.
type EventSourceObjectSpamFilter struct {
	sync.RWMutex

	// the cache that manages last synced state
	cache *lru.Cache

	// burst is the number of events we allow per source + object
	burst int

	// qps is the refill rate of the token bucket in queries per second
	qps float32

	// clock is used to allow for testing over a time interval
	clock clock.Clock
}

// NewEventSourceObjectSpamFilter allows burst events from a source about an object with the specified qps refill.
func NewEventSourceObjectSpamFilter(lruCacheSize, burst int, qps float32, clock clock.Clock) *EventSourceObjectSpamFilter {
	return &EventSourceObjectSpamFilter{
		cache: lru.New(lruCacheSize),
		burst: burst,
		qps:   qps,
		clock: clock,
	}
}

// spamRecord holds data used to perform spam filtering decisions.
type spamRecord struct {
	// rateLimiter controls the rate of events about this object
	rateLimiter flowcontrol.RateLimiter
}

// Filter returns true (i.e. the event should be skipped) when the given
// source+object pair is exceeding its allowed event rate.
func (f *EventSourceObjectSpamFilter) Filter(event *v1.Event) bool {
	var record spamRecord

	// controls our cached information about this event (source+object)
	eventKey := getSpamKey(event)

	// do we have a record of similar events in our cache?
	f.Lock()
	defer f.Unlock()
	value, found := f.cache.Get(eventKey)
	if found {
		record = value.(spamRecord)
	}

	// verify we have a rate limiter for this record
	if record.rateLimiter == nil {
		record.rateLimiter = flowcontrol.NewTokenBucketRateLimiterWithClock(f.qps, f.burst, f.clock)
	}

	// ensure we have available rate
	filter := !record.rateLimiter.TryAccept()

	// update the cache
	f.cache.Add(eventKey, record)

	return filter
}
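
// A minimal usage sketch (calling Filter directly is unusual; in practice it
// is installed as the filterFunc of an EventCorrelator, as in
// NewEventCorrelator below):
//
//	f := NewEventSourceObjectSpamFilter(maxLruCacheEntries, defaultSpamBurst, defaultSpamQPS, clock.RealClock{})
//	ev := &v1.Event{ /* same source+object each time */ }
//	for i := 0; i < defaultSpamBurst; i++ {
//		_ = f.Filter(ev) // false: still within the 25-event burst
//	}
//	dropped := f.Filter(ev) // true: bucket exhausted; refills at one event per 5 minutes
//	_ = dropped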

// EventAggregatorKeyFunc is responsible for grouping events for aggregation
// It returns a tuple of the following:
// aggregateKey - key that identifies the aggregate group to bucket this event
// localKey - key that makes this event unique within the local (aggregate) group
type EventAggregatorKeyFunc func(event *v1.Event) (aggregateKey string, localKey string)

// EventAggregatorByReasonFunc aggregates events by exact match on event.Source, event.InvolvedObject, event.Type,
// event.Reason, event.ReportingController and event.ReportingInstance
func EventAggregatorByReasonFunc(event *v1.Event) (string, string) {
	return strings.Join([]string{
		event.Source.Component,
		event.Source.Host,
		event.InvolvedObject.Kind,
		event.InvolvedObject.Namespace,
		event.InvolvedObject.Name,
		string(event.InvolvedObject.UID),
		event.InvolvedObject.APIVersion,
		event.Type,
		event.Reason,
		event.ReportingController,
		event.ReportingInstance,
	},
		""), event.Message
}

// EventAggregatorMessageFunc is responsible for producing an aggregation message
type EventAggregatorMessageFunc func(event *v1.Event) string

// EventAggregatorByReasonMessageFunc returns an aggregate message by prefixing the incoming message
func EventAggregatorByReasonMessageFunc(event *v1.Event) string {
	return "(combined from similar events): " + event.Message
}

// EventAggregator identifies similar events and aggregates them into a single event
type EventAggregator struct {
	sync.RWMutex

	// The cache that manages aggregation state
	cache *lru.Cache

	// The function that groups events for aggregation
	keyFunc EventAggregatorKeyFunc

	// The function that generates a message for an aggregate event
	messageFunc EventAggregatorMessageFunc

	// The maximum number of events in the specified interval before aggregation occurs
	maxEvents uint

	// The amount of time in seconds that must transpire since the last occurrence of a similar event before it's considered new
	maxIntervalInSeconds uint

	// clock is used to allow for testing over a time interval
	clock clock.Clock
}

// NewEventAggregator returns a new instance of an EventAggregator
func NewEventAggregator(lruCacheSize int, keyFunc EventAggregatorKeyFunc, messageFunc EventAggregatorMessageFunc,
	maxEvents int, maxIntervalInSeconds int, clock clock.Clock) *EventAggregator {
	return &EventAggregator{
		cache:                lru.New(lruCacheSize),
		keyFunc:              keyFunc,
		messageFunc:          messageFunc,
		maxEvents:            uint(maxEvents),
		maxIntervalInSeconds: uint(maxIntervalInSeconds),
		clock:                clock,
	}
}

// aggregateRecord holds data used to perform aggregation decisions
type aggregateRecord struct {
	// we track the number of unique local keys we have seen in the aggregate set to know when to actually aggregate
	// if the size of this set exceeds the max, we know we need to aggregate
	localKeys sets.String
	// The last time at which the aggregate was recorded
	lastTimestamp metav1.Time
}

// EventAggregate checks if a similar event has been seen according to the
// aggregation configuration (max events, max interval, etc) and returns:
//
// - The (potentially modified) event that should be created
// - The cache key for the event, for correlation purposes. This will be set to
//   the full key for normal events, and to the aggregate key (the first result
//   of the EventAggregatorKeyFunc) for aggregate events.
func (e *EventAggregator) EventAggregate(newEvent *v1.Event) (*v1.Event, string) {
	now := metav1.NewTime(e.clock.Now())
	var record aggregateRecord
	// eventKey is the full cache key for this event
	eventKey := getEventKey(newEvent)
	// aggregateKey is for the aggregate event, if one is needed.
	aggregateKey, localKey := e.keyFunc(newEvent)

	// Do we have a record of similar events in our cache?
	e.Lock()
	defer e.Unlock()
	value, found := e.cache.Get(aggregateKey)
	if found {
		record = value.(aggregateRecord)
	}

	// Is the previous record too old? If so, make a fresh one. Note: if we didn't
	// find a similar record, its lastTimestamp will be the zero value, so we
	// create a new one in that case.
	maxInterval := time.Duration(e.maxIntervalInSeconds) * time.Second
	interval := now.Time.Sub(record.lastTimestamp.Time)
	if interval > maxInterval {
		record = aggregateRecord{localKeys: sets.NewString()}
	}

	// Write the new event into the aggregation record and put it on the cache
	record.localKeys.Insert(localKey)
	record.lastTimestamp = now
	e.cache.Add(aggregateKey, record)

	// If we are not yet over the threshold for unique events, don't correlate them
	if uint(record.localKeys.Len()) < e.maxEvents {
		return newEvent, eventKey
	}

	// do not grow our local key set any larger than max
	record.localKeys.PopAny()

	// create a new aggregate event, and return the aggregateKey as the cache key
	// (so that it can be overwritten.)
	eventCopy := &v1.Event{
		ObjectMeta: metav1.ObjectMeta{
			Name:      fmt.Sprintf("%v.%x", newEvent.InvolvedObject.Name, now.UnixNano()),
			Namespace: newEvent.Namespace,
		},
		Count:          1,
		FirstTimestamp: now,
		InvolvedObject: newEvent.InvolvedObject,
		LastTimestamp:  now,
		Message:        e.messageFunc(newEvent),
		Type:           newEvent.Type,
		Reason:         newEvent.Reason,
		Source:         newEvent.Source,
	}
	return eventCopy, aggregateKey
}
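
// A minimal sketch of the aggregation threshold (the messages are
// hypothetical): the first nine events that differ only by message pass
// through unchanged, and the tenth (defaultAggregateMaxEvents) comes back as
// a combined event.
//
//	agg := NewEventAggregator(maxLruCacheEntries, EventAggregatorByReasonFunc,
//		EventAggregatorByReasonMessageFunc, defaultAggregateMaxEvents,
//		defaultAggregateIntervalInSeconds, clock.RealClock{})
//	for i := 0; i < defaultAggregateMaxEvents; i++ {
//		ev := base // base is some v1.Event; only the message varies
//		ev.Message = fmt.Sprintf("probe failed (attempt %d)", i)
//		out, key := agg.EventAggregate(&ev)
//		// on the last iteration, out.Message begins with
//		// "(combined from similar events): " and key is the aggregate key
//		_, _ = out, key
//	}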

// eventLog records data about when an event was observed
type eventLog struct {
	// The number of times the event has occurred since first occurrence.
	count uint

	// The time at which the event was first recorded.
	firstTimestamp metav1.Time

	// The unique name of the first occurrence of this event
	name string

	// Resource version returned from previous interaction with server
	resourceVersion string
}

// eventLogger logs occurrences of an event
type eventLogger struct {
	sync.RWMutex
	cache *lru.Cache
	clock clock.Clock
}

// newEventLogger observes events and counts their frequencies
func newEventLogger(lruCacheEntries int, clock clock.Clock) *eventLogger {
	return &eventLogger{cache: lru.New(lruCacheEntries), clock: clock}
}

// eventObserve records an event, or updates an existing one if key is a cache hit
func (e *eventLogger) eventObserve(newEvent *v1.Event, key string) (*v1.Event, []byte, error) {
	var (
		patch []byte
		err   error
	)
	eventCopy := *newEvent
	event := &eventCopy

	e.Lock()
	defer e.Unlock()

	// Check if there is an existing event we should update
	lastObservation := e.lastEventObservationFromCache(key)

	// If we found a result, prepare a patch
	if lastObservation.count > 0 {
		// update the event based on the last observation so patch will work as desired
		event.Name = lastObservation.name
		event.ResourceVersion = lastObservation.resourceVersion
		event.FirstTimestamp = lastObservation.firstTimestamp
		event.Count = int32(lastObservation.count) + 1

		// Zero out count, lastTimestamp and message in the "old" copy so they
		// always differ from the updated event and are therefore guaranteed to
		// appear in the two-way merge patch.
		eventCopy2 := *event
		eventCopy2.Count = 0
		eventCopy2.LastTimestamp = metav1.NewTime(time.Unix(0, 0))
		eventCopy2.Message = ""

		newData, _ := json.Marshal(event)
		oldData, _ := json.Marshal(eventCopy2)
		patch, err = strategicpatch.CreateTwoWayMergePatch(oldData, newData, event)
	}

	// record our new observation
	e.cache.Add(
		key,
		eventLog{
			count:           uint(event.Count),
			firstTimestamp:  event.FirstTimestamp,
			name:            event.Name,
			resourceVersion: event.ResourceVersion,
		},
	)
	return event, patch, err
}

// updateState updates its internal tracking information based on latest server state
func (e *eventLogger) updateState(event *v1.Event) {
	key := getEventKey(event)
	e.Lock()
	defer e.Unlock()
	// record our new observation
	e.cache.Add(
		key,
		eventLog{
			count:           uint(event.Count),
			firstTimestamp:  event.FirstTimestamp,
			name:            event.Name,
			resourceVersion: event.ResourceVersion,
		},
	)
}

// lastEventObservationFromCache returns the event from the cache; reads must be
// protected by an external lock.
func (e *eventLogger) lastEventObservationFromCache(key string) eventLog {
	value, ok := e.cache.Get(key)
	if ok {
		observationValue, ok := value.(eventLog)
		if ok {
			return observationValue
		}
	}
	return eventLog{}
}

// EventCorrelator processes all incoming events and performs analysis to avoid overwhelming the system.  It can filter
// incoming events to determine if they should be dropped from further processing.  It can aggregate similar events that occur
// frequently to protect the system from spamming events that are difficult for users to distinguish.  It performs de-duplication
// to ensure events that are observed multiple times are compacted into a single event with increasing counts.
type EventCorrelator struct {
	// the function to filter the event
	filterFunc EventFilterFunc
	// the object that performs event aggregation
	aggregator *EventAggregator
	// the object that observes events as they come through
	logger *eventLogger
}

// EventCorrelateResult is the result of a Correlate
type EventCorrelateResult struct {
	// the event after correlation
	Event *v1.Event
	// if provided, perform a strategic patch when updating the record on the server
	Patch []byte
	// if true, do no further processing of the event
	Skip bool
}

// NewEventCorrelator returns an EventCorrelator configured with default values.
//
// The EventCorrelator is responsible for event filtering, aggregating, and counting
// prior to interacting with the API server to record the event.
//
// The default behavior is as follows:
//   * Aggregation is performed if a similar event is recorded 10 times
//     in a 10 minute rolling interval.  A similar event is an event that varies only by
//     the Event.Message field.  Rather than recording the precise event, aggregation
//     will create a new event whose message reports that it has combined events with
//     the same reason.
//   * Events are incrementally counted if the exact same event is encountered multiple
//     times.
//   * A source may burst 25 events about an object, but has a refill rate budget
//     per object of 1 event every 5 minutes to control the long tail of spam.
func NewEventCorrelator(clock clock.Clock) *EventCorrelator {
	cacheSize := maxLruCacheEntries
	spamFilter := NewEventSourceObjectSpamFilter(cacheSize, defaultSpamBurst, defaultSpamQPS, clock)
	return &EventCorrelator{
		filterFunc: spamFilter.Filter,
		aggregator: NewEventAggregator(
			cacheSize,
			EventAggregatorByReasonFunc,
			EventAggregatorByReasonMessageFunc,
			defaultAggregateMaxEvents,
			defaultAggregateIntervalInSeconds,
			clock),
		logger: newEventLogger(cacheSize, clock),
	}
}

// NewEventCorrelatorWithOptions returns an EventCorrelator configured from the
// given options; zero-value fields fall back to the defaults (see populateDefaults).
func NewEventCorrelatorWithOptions(options CorrelatorOptions) *EventCorrelator {
	optionsWithDefaults := populateDefaults(options)
	spamFilter := NewEventSourceObjectSpamFilter(optionsWithDefaults.LRUCacheSize,
		optionsWithDefaults.BurstSize, optionsWithDefaults.QPS, optionsWithDefaults.Clock)
	return &EventCorrelator{
		filterFunc: spamFilter.Filter,
		aggregator: NewEventAggregator(
			optionsWithDefaults.LRUCacheSize,
			optionsWithDefaults.KeyFunc,
			optionsWithDefaults.MessageFunc,
			optionsWithDefaults.MaxEvents,
			optionsWithDefaults.MaxIntervalInSeconds,
			optionsWithDefaults.Clock),
		logger: newEventLogger(optionsWithDefaults.LRUCacheSize, optionsWithDefaults.Clock),
	}
}

// populateDefaults populates the zero value options with defaults
func populateDefaults(options CorrelatorOptions) CorrelatorOptions {
	if options.LRUCacheSize == 0 {
		options.LRUCacheSize = maxLruCacheEntries
	}
	if options.BurstSize == 0 {
		options.BurstSize = defaultSpamBurst
	}
	if options.QPS == 0 {
		options.QPS = defaultSpamQPS
	}
	if options.KeyFunc == nil {
		options.KeyFunc = EventAggregatorByReasonFunc
	}
	if options.MessageFunc == nil {
		options.MessageFunc = EventAggregatorByReasonMessageFunc
	}
	if options.MaxEvents == 0 {
		options.MaxEvents = defaultAggregateMaxEvents
	}
	if options.MaxIntervalInSeconds == 0 {
		options.MaxIntervalInSeconds = defaultAggregateIntervalInSeconds
	}
	if options.Clock == nil {
		options.Clock = clock.RealClock{}
	}
	return options
}
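
// A minimal sketch of overriding a subset of options (CorrelatorOptions is
// defined elsewhere in this package); every field left at its zero value is
// filled in by populateDefaults:
//
//	c := NewEventCorrelatorWithOptions(CorrelatorOptions{
//		MaxEvents: 5,        // aggregate after 5 similar events instead of 10
//		QPS:       1. / 60., // refill one spam-filter token per minute
//	})
//	_ = c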

// EventCorrelate filters, aggregates, counts, and de-duplicates all incoming events
func (c *EventCorrelator) EventCorrelate(newEvent *v1.Event) (*EventCorrelateResult, error) {
	if newEvent == nil {
		return nil, fmt.Errorf("event is nil")
	}
	aggregateEvent, ckey := c.aggregator.EventAggregate(newEvent)
	observedEvent, patch, err := c.logger.eventObserve(aggregateEvent, ckey)
	if c.filterFunc(observedEvent) {
		return &EventCorrelateResult{Skip: true}, nil
	}
	return &EventCorrelateResult{Event: observedEvent, Patch: patch}, err
}
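
// A minimal sketch of the intended call sequence (the server interactions are
// hypothetical placeholders; the real wiring lives in this package's event
// broadcaster/sink code):
//
//	correlator := NewEventCorrelator(clock.RealClock{})
//	result, err := correlator.EventCorrelate(event)
//	if err != nil || result.Skip {
//		return // nil event, patch failure, or dropped by the spam filter
//	}
//	if result.Patch != nil {
//		// PATCH the existing server-side event with result.Patch
//	} else {
//		// POST result.Event as a new event
//	}
//	correlator.UpdateState(eventReturnedByServer) // sync with server state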

// UpdateState updates the internal observation state based on the latest state
// observed from the server.
func (c *EventCorrelator) UpdateState(event *v1.Event) {
	c.logger.updateState(event)
}