k8s.io/client-go@v0.22.2/tools/record/events_cache.go (about) 1 /* 2 Copyright 2015 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package record 18 19 import ( 20 "encoding/json" 21 "fmt" 22 "strings" 23 "sync" 24 "time" 25 26 "github.com/golang/groupcache/lru" 27 28 v1 "k8s.io/api/core/v1" 29 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 30 "k8s.io/apimachinery/pkg/util/clock" 31 "k8s.io/apimachinery/pkg/util/sets" 32 "k8s.io/apimachinery/pkg/util/strategicpatch" 33 "k8s.io/client-go/util/flowcontrol" 34 ) 35 36 const ( 37 maxLruCacheEntries = 4096 38 39 // if we see the same event that varies only by message 40 // more than 10 times in a 10 minute period, aggregate the event 41 defaultAggregateMaxEvents = 10 42 defaultAggregateIntervalInSeconds = 600 43 44 // by default, allow a source to send 25 events about an object 45 // but control the refill rate to 1 new event every 5 minutes 46 // this helps control the long-tail of events for things that are always 47 // unhealthy 48 defaultSpamBurst = 25 49 defaultSpamQPS = 1. / 300. 50 ) 51 52 // getEventKey builds unique event key based on source, involvedObject, reason, message 53 func getEventKey(event *v1.Event) string { 54 return strings.Join([]string{ 55 event.Source.Component, 56 event.Source.Host, 57 event.InvolvedObject.Kind, 58 event.InvolvedObject.Namespace, 59 event.InvolvedObject.Name, 60 event.InvolvedObject.FieldPath, 61 string(event.InvolvedObject.UID), 62 event.InvolvedObject.APIVersion, 63 event.Type, 64 event.Reason, 65 event.Message, 66 }, 67 "") 68 } 69 70 // getSpamKey builds unique event key based on source, involvedObject 71 func getSpamKey(event *v1.Event) string { 72 return strings.Join([]string{ 73 event.Source.Component, 74 event.Source.Host, 75 event.InvolvedObject.Kind, 76 event.InvolvedObject.Namespace, 77 event.InvolvedObject.Name, 78 string(event.InvolvedObject.UID), 79 event.InvolvedObject.APIVersion, 80 }, 81 "") 82 } 83 84 // EventFilterFunc is a function that returns true if the event should be skipped 85 type EventFilterFunc func(event *v1.Event) bool 86 87 // EventSourceObjectSpamFilter is responsible for throttling 88 // the amount of events a source and object can produce. 89 type EventSourceObjectSpamFilter struct { 90 sync.RWMutex 91 92 // the cache that manages last synced state 93 cache *lru.Cache 94 95 // burst is the amount of events we allow per source + object 96 burst int 97 98 // qps is the refill rate of the token bucket in queries per second 99 qps float32 100 101 // clock is used to allow for testing over a time interval 102 clock clock.Clock 103 } 104 105 // NewEventSourceObjectSpamFilter allows burst events from a source about an object with the specified qps refill. 106 func NewEventSourceObjectSpamFilter(lruCacheSize, burst int, qps float32, clock clock.Clock) *EventSourceObjectSpamFilter { 107 return &EventSourceObjectSpamFilter{ 108 cache: lru.New(lruCacheSize), 109 burst: burst, 110 qps: qps, 111 clock: clock, 112 } 113 } 114 115 // spamRecord holds data used to perform spam filtering decisions. 116 type spamRecord struct { 117 // rateLimiter controls the rate of events about this object 118 rateLimiter flowcontrol.RateLimiter 119 } 120 121 // Filter controls that a given source+object are not exceeding the allowed rate. 122 func (f *EventSourceObjectSpamFilter) Filter(event *v1.Event) bool { 123 var record spamRecord 124 125 // controls our cached information about this event (source+object) 126 eventKey := getSpamKey(event) 127 128 // do we have a record of similar events in our cache? 129 f.Lock() 130 defer f.Unlock() 131 value, found := f.cache.Get(eventKey) 132 if found { 133 record = value.(spamRecord) 134 } 135 136 // verify we have a rate limiter for this record 137 if record.rateLimiter == nil { 138 record.rateLimiter = flowcontrol.NewTokenBucketRateLimiterWithClock(f.qps, f.burst, f.clock) 139 } 140 141 // ensure we have available rate 142 filter := !record.rateLimiter.TryAccept() 143 144 // update the cache 145 f.cache.Add(eventKey, record) 146 147 return filter 148 } 149 150 // EventAggregatorKeyFunc is responsible for grouping events for aggregation 151 // It returns a tuple of the following: 152 // aggregateKey - key the identifies the aggregate group to bucket this event 153 // localKey - key that makes this event in the local group 154 type EventAggregatorKeyFunc func(event *v1.Event) (aggregateKey string, localKey string) 155 156 // EventAggregatorByReasonFunc aggregates events by exact match on event.Source, event.InvolvedObject, event.Type, 157 // event.Reason, event.ReportingController and event.ReportingInstance 158 func EventAggregatorByReasonFunc(event *v1.Event) (string, string) { 159 return strings.Join([]string{ 160 event.Source.Component, 161 event.Source.Host, 162 event.InvolvedObject.Kind, 163 event.InvolvedObject.Namespace, 164 event.InvolvedObject.Name, 165 string(event.InvolvedObject.UID), 166 event.InvolvedObject.APIVersion, 167 event.Type, 168 event.Reason, 169 event.ReportingController, 170 event.ReportingInstance, 171 }, 172 ""), event.Message 173 } 174 175 // EventAggregatorMessageFunc is responsible for producing an aggregation message 176 type EventAggregatorMessageFunc func(event *v1.Event) string 177 178 // EventAggregatorByReasonMessageFunc returns an aggregate message by prefixing the incoming message 179 func EventAggregatorByReasonMessageFunc(event *v1.Event) string { 180 return "(combined from similar events): " + event.Message 181 } 182 183 // EventAggregator identifies similar events and aggregates them into a single event 184 type EventAggregator struct { 185 sync.RWMutex 186 187 // The cache that manages aggregation state 188 cache *lru.Cache 189 190 // The function that groups events for aggregation 191 keyFunc EventAggregatorKeyFunc 192 193 // The function that generates a message for an aggregate event 194 messageFunc EventAggregatorMessageFunc 195 196 // The maximum number of events in the specified interval before aggregation occurs 197 maxEvents uint 198 199 // The amount of time in seconds that must transpire since the last occurrence of a similar event before it's considered new 200 maxIntervalInSeconds uint 201 202 // clock is used to allow for testing over a time interval 203 clock clock.Clock 204 } 205 206 // NewEventAggregator returns a new instance of an EventAggregator 207 func NewEventAggregator(lruCacheSize int, keyFunc EventAggregatorKeyFunc, messageFunc EventAggregatorMessageFunc, 208 maxEvents int, maxIntervalInSeconds int, clock clock.Clock) *EventAggregator { 209 return &EventAggregator{ 210 cache: lru.New(lruCacheSize), 211 keyFunc: keyFunc, 212 messageFunc: messageFunc, 213 maxEvents: uint(maxEvents), 214 maxIntervalInSeconds: uint(maxIntervalInSeconds), 215 clock: clock, 216 } 217 } 218 219 // aggregateRecord holds data used to perform aggregation decisions 220 type aggregateRecord struct { 221 // we track the number of unique local keys we have seen in the aggregate set to know when to actually aggregate 222 // if the size of this set exceeds the max, we know we need to aggregate 223 localKeys sets.String 224 // The last time at which the aggregate was recorded 225 lastTimestamp metav1.Time 226 } 227 228 // EventAggregate checks if a similar event has been seen according to the 229 // aggregation configuration (max events, max interval, etc) and returns: 230 // 231 // - The (potentially modified) event that should be created 232 // - The cache key for the event, for correlation purposes. This will be set to 233 // the full key for normal events, and to the result of 234 // EventAggregatorMessageFunc for aggregate events. 235 func (e *EventAggregator) EventAggregate(newEvent *v1.Event) (*v1.Event, string) { 236 now := metav1.NewTime(e.clock.Now()) 237 var record aggregateRecord 238 // eventKey is the full cache key for this event 239 eventKey := getEventKey(newEvent) 240 // aggregateKey is for the aggregate event, if one is needed. 241 aggregateKey, localKey := e.keyFunc(newEvent) 242 243 // Do we have a record of similar events in our cache? 244 e.Lock() 245 defer e.Unlock() 246 value, found := e.cache.Get(aggregateKey) 247 if found { 248 record = value.(aggregateRecord) 249 } 250 251 // Is the previous record too old? If so, make a fresh one. Note: if we didn't 252 // find a similar record, its lastTimestamp will be the zero value, so we 253 // create a new one in that case. 254 maxInterval := time.Duration(e.maxIntervalInSeconds) * time.Second 255 interval := now.Time.Sub(record.lastTimestamp.Time) 256 if interval > maxInterval { 257 record = aggregateRecord{localKeys: sets.NewString()} 258 } 259 260 // Write the new event into the aggregation record and put it on the cache 261 record.localKeys.Insert(localKey) 262 record.lastTimestamp = now 263 e.cache.Add(aggregateKey, record) 264 265 // If we are not yet over the threshold for unique events, don't correlate them 266 if uint(record.localKeys.Len()) < e.maxEvents { 267 return newEvent, eventKey 268 } 269 270 // do not grow our local key set any larger than max 271 record.localKeys.PopAny() 272 273 // create a new aggregate event, and return the aggregateKey as the cache key 274 // (so that it can be overwritten.) 275 eventCopy := &v1.Event{ 276 ObjectMeta: metav1.ObjectMeta{ 277 Name: fmt.Sprintf("%v.%x", newEvent.InvolvedObject.Name, now.UnixNano()), 278 Namespace: newEvent.Namespace, 279 }, 280 Count: 1, 281 FirstTimestamp: now, 282 InvolvedObject: newEvent.InvolvedObject, 283 LastTimestamp: now, 284 Message: e.messageFunc(newEvent), 285 Type: newEvent.Type, 286 Reason: newEvent.Reason, 287 Source: newEvent.Source, 288 } 289 return eventCopy, aggregateKey 290 } 291 292 // eventLog records data about when an event was observed 293 type eventLog struct { 294 // The number of times the event has occurred since first occurrence. 295 count uint 296 297 // The time at which the event was first recorded. 298 firstTimestamp metav1.Time 299 300 // The unique name of the first occurrence of this event 301 name string 302 303 // Resource version returned from previous interaction with server 304 resourceVersion string 305 } 306 307 // eventLogger logs occurrences of an event 308 type eventLogger struct { 309 sync.RWMutex 310 cache *lru.Cache 311 clock clock.Clock 312 } 313 314 // newEventLogger observes events and counts their frequencies 315 func newEventLogger(lruCacheEntries int, clock clock.Clock) *eventLogger { 316 return &eventLogger{cache: lru.New(lruCacheEntries), clock: clock} 317 } 318 319 // eventObserve records an event, or updates an existing one if key is a cache hit 320 func (e *eventLogger) eventObserve(newEvent *v1.Event, key string) (*v1.Event, []byte, error) { 321 var ( 322 patch []byte 323 err error 324 ) 325 eventCopy := *newEvent 326 event := &eventCopy 327 328 e.Lock() 329 defer e.Unlock() 330 331 // Check if there is an existing event we should update 332 lastObservation := e.lastEventObservationFromCache(key) 333 334 // If we found a result, prepare a patch 335 if lastObservation.count > 0 { 336 // update the event based on the last observation so patch will work as desired 337 event.Name = lastObservation.name 338 event.ResourceVersion = lastObservation.resourceVersion 339 event.FirstTimestamp = lastObservation.firstTimestamp 340 event.Count = int32(lastObservation.count) + 1 341 342 eventCopy2 := *event 343 eventCopy2.Count = 0 344 eventCopy2.LastTimestamp = metav1.NewTime(time.Unix(0, 0)) 345 eventCopy2.Message = "" 346 347 newData, _ := json.Marshal(event) 348 oldData, _ := json.Marshal(eventCopy2) 349 patch, err = strategicpatch.CreateTwoWayMergePatch(oldData, newData, event) 350 } 351 352 // record our new observation 353 e.cache.Add( 354 key, 355 eventLog{ 356 count: uint(event.Count), 357 firstTimestamp: event.FirstTimestamp, 358 name: event.Name, 359 resourceVersion: event.ResourceVersion, 360 }, 361 ) 362 return event, patch, err 363 } 364 365 // updateState updates its internal tracking information based on latest server state 366 func (e *eventLogger) updateState(event *v1.Event) { 367 key := getEventKey(event) 368 e.Lock() 369 defer e.Unlock() 370 // record our new observation 371 e.cache.Add( 372 key, 373 eventLog{ 374 count: uint(event.Count), 375 firstTimestamp: event.FirstTimestamp, 376 name: event.Name, 377 resourceVersion: event.ResourceVersion, 378 }, 379 ) 380 } 381 382 // lastEventObservationFromCache returns the event from the cache, reads must be protected via external lock 383 func (e *eventLogger) lastEventObservationFromCache(key string) eventLog { 384 value, ok := e.cache.Get(key) 385 if ok { 386 observationValue, ok := value.(eventLog) 387 if ok { 388 return observationValue 389 } 390 } 391 return eventLog{} 392 } 393 394 // EventCorrelator processes all incoming events and performs analysis to avoid overwhelming the system. It can filter all 395 // incoming events to see if the event should be filtered from further processing. It can aggregate similar events that occur 396 // frequently to protect the system from spamming events that are difficult for users to distinguish. It performs de-duplication 397 // to ensure events that are observed multiple times are compacted into a single event with increasing counts. 398 type EventCorrelator struct { 399 // the function to filter the event 400 filterFunc EventFilterFunc 401 // the object that performs event aggregation 402 aggregator *EventAggregator 403 // the object that observes events as they come through 404 logger *eventLogger 405 } 406 407 // EventCorrelateResult is the result of a Correlate 408 type EventCorrelateResult struct { 409 // the event after correlation 410 Event *v1.Event 411 // if provided, perform a strategic patch when updating the record on the server 412 Patch []byte 413 // if true, do no further processing of the event 414 Skip bool 415 } 416 417 // NewEventCorrelator returns an EventCorrelator configured with default values. 418 // 419 // The EventCorrelator is responsible for event filtering, aggregating, and counting 420 // prior to interacting with the API server to record the event. 421 // 422 // The default behavior is as follows: 423 // * Aggregation is performed if a similar event is recorded 10 times 424 // in a 10 minute rolling interval. A similar event is an event that varies only by 425 // the Event.Message field. Rather than recording the precise event, aggregation 426 // will create a new event whose message reports that it has combined events with 427 // the same reason. 428 // * Events are incrementally counted if the exact same event is encountered multiple 429 // times. 430 // * A source may burst 25 events about an object, but has a refill rate budget 431 // per object of 1 event every 5 minutes to control long-tail of spam. 432 func NewEventCorrelator(clock clock.Clock) *EventCorrelator { 433 cacheSize := maxLruCacheEntries 434 spamFilter := NewEventSourceObjectSpamFilter(cacheSize, defaultSpamBurst, defaultSpamQPS, clock) 435 return &EventCorrelator{ 436 filterFunc: spamFilter.Filter, 437 aggregator: NewEventAggregator( 438 cacheSize, 439 EventAggregatorByReasonFunc, 440 EventAggregatorByReasonMessageFunc, 441 defaultAggregateMaxEvents, 442 defaultAggregateIntervalInSeconds, 443 clock), 444 445 logger: newEventLogger(cacheSize, clock), 446 } 447 } 448 449 func NewEventCorrelatorWithOptions(options CorrelatorOptions) *EventCorrelator { 450 optionsWithDefaults := populateDefaults(options) 451 spamFilter := NewEventSourceObjectSpamFilter(optionsWithDefaults.LRUCacheSize, 452 optionsWithDefaults.BurstSize, optionsWithDefaults.QPS, optionsWithDefaults.Clock) 453 return &EventCorrelator{ 454 filterFunc: spamFilter.Filter, 455 aggregator: NewEventAggregator( 456 optionsWithDefaults.LRUCacheSize, 457 optionsWithDefaults.KeyFunc, 458 optionsWithDefaults.MessageFunc, 459 optionsWithDefaults.MaxEvents, 460 optionsWithDefaults.MaxIntervalInSeconds, 461 optionsWithDefaults.Clock), 462 logger: newEventLogger(optionsWithDefaults.LRUCacheSize, optionsWithDefaults.Clock), 463 } 464 } 465 466 // populateDefaults populates the zero value options with defaults 467 func populateDefaults(options CorrelatorOptions) CorrelatorOptions { 468 if options.LRUCacheSize == 0 { 469 options.LRUCacheSize = maxLruCacheEntries 470 } 471 if options.BurstSize == 0 { 472 options.BurstSize = defaultSpamBurst 473 } 474 if options.QPS == 0 { 475 options.QPS = defaultSpamQPS 476 } 477 if options.KeyFunc == nil { 478 options.KeyFunc = EventAggregatorByReasonFunc 479 } 480 if options.MessageFunc == nil { 481 options.MessageFunc = EventAggregatorByReasonMessageFunc 482 } 483 if options.MaxEvents == 0 { 484 options.MaxEvents = defaultAggregateMaxEvents 485 } 486 if options.MaxIntervalInSeconds == 0 { 487 options.MaxIntervalInSeconds = defaultAggregateIntervalInSeconds 488 } 489 if options.Clock == nil { 490 options.Clock = clock.RealClock{} 491 } 492 return options 493 } 494 495 // EventCorrelate filters, aggregates, counts, and de-duplicates all incoming events 496 func (c *EventCorrelator) EventCorrelate(newEvent *v1.Event) (*EventCorrelateResult, error) { 497 if newEvent == nil { 498 return nil, fmt.Errorf("event is nil") 499 } 500 aggregateEvent, ckey := c.aggregator.EventAggregate(newEvent) 501 observedEvent, patch, err := c.logger.eventObserve(aggregateEvent, ckey) 502 if c.filterFunc(observedEvent) { 503 return &EventCorrelateResult{Skip: true}, nil 504 } 505 return &EventCorrelateResult{Event: observedEvent, Patch: patch}, err 506 } 507 508 // UpdateState based on the latest observed state from server 509 func (c *EventCorrelator) UpdateState(event *v1.Event) { 510 c.logger.updateState(event) 511 }