k8s.io/apiserver@v0.29.3/pkg/storage/cacher/cache_watcher.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cacher

import (
	"context"
	"fmt"
	"sync"
	"time"

	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/schema"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/apiserver/pkg/storage"
	"k8s.io/apiserver/pkg/storage/cacher/metrics"
	utilflowcontrol "k8s.io/apiserver/pkg/util/flowcontrol"

	"k8s.io/klog/v2"
)

// possible states of the cache watcher
const (
	// cacheWatcherWaitingForBookmark indicates the cacher
	// is waiting for a bookmark event with a specific RV set
	cacheWatcherWaitingForBookmark = iota

	// cacheWatcherBookmarkReceived indicates that the cacher
	// has received a bookmark event with required RV
	cacheWatcherBookmarkReceived

	// cacheWatcherBookmarkSent indicates that the cacher
	// has already sent a bookmark event to a client
	cacheWatcherBookmarkSent
)
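
// The states above form a simple progression driven by the helpers further
// down in this file: setBookmarkAfterResourceVersion starts a watcher in
// cacheWatcherWaitingForBookmark when a non-zero RV is requested (and directly
// in cacheWatcherBookmarkSent otherwise), markBookmarkAfterRvAsReceived moves
// it to cacheWatcherBookmarkReceived once a bookmark with a sufficiently high
// RV is accepted into the input channel, and markBookmarkAfterRvSent moves it
// to cacheWatcherBookmarkSent once such a bookmark has been put on the result
// channel for the client.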

// cacheWatcher implements watch.Interface
// this is not thread-safe
type cacheWatcher struct {
	input     chan *watchCacheEvent
	result    chan watch.Event
	done      chan struct{}
	filter    filterWithAttrsFunc
	stopped   bool
	forget    func(bool)
	versioner storage.Versioner
	// The watcher will be closed by server after the deadline,
	// save it here to send bookmark events before that.
	deadline            time.Time
	allowWatchBookmarks bool
	groupResource       schema.GroupResource

	// human readable identifier that helps associating a cacheWatcher
	// instance with a request
	identifier string

	// drainInputBuffer indicates whether we should delay closing this watcher
	// and send all events in the input buffer.
	drainInputBuffer bool

	// bookmarkAfterResourceVersion holds an RV that indicates
	// when we should start delivering bookmark events.
	// If this field holds the value 0, it means
	// we don't have any special preference toward delivering bookmark events.
	// Note that this field is used in conjunction with the state field.
	// It should not be changed once the watcher has been started.
	bookmarkAfterResourceVersion uint64

	// stateMutex protects state
	stateMutex sync.Mutex

	// state holds a numeric value indicating the current state of the watcher
	state int
}

func newCacheWatcher(
	chanSize int,
	filter filterWithAttrsFunc,
	forget func(bool),
	versioner storage.Versioner,
	deadline time.Time,
	allowWatchBookmarks bool,
	groupResource schema.GroupResource,
	identifier string,
) *cacheWatcher {
	return &cacheWatcher{
		input:               make(chan *watchCacheEvent, chanSize),
		result:              make(chan watch.Event, chanSize),
		done:                make(chan struct{}),
		filter:              filter,
		stopped:             false,
		forget:              forget,
		versioner:           versioner,
		deadline:            deadline,
		allowWatchBookmarks: allowWatchBookmarks,
		groupResource:       groupResource,
		identifier:          identifier,
	}
}
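
// A minimal construction sketch (hypothetical values, for illustration only;
// the filter literal assumes filterWithAttrsFunc's (key, labels.Set,
// fields.Set) shape visible in convertToWatchEvent below, and in practice the
// Cacher supplies the filter, forget callback, deadline and identifier when
// serving a watch request):
//
//	w := newCacheWatcher(
//		10, // input/result channel buffer size
//		func(_ string, _ labels.Set, _ fields.Set) bool { return true }, // accept everything
//		func(drainWatcher bool) {},             // forget callback provided by the owner
//		storage.APIObjectVersioner{},           // versioner
//		time.Now().Add(time.Minute),            // server-side deadline
//		true,                                   // allowWatchBookmarks
//		schema.GroupResource{Resource: "pods"}, // groupResource
//		"example watcher",                      // identifier
//	)
//	go w.processInterval(ctx, cacheInterval, rv) // started by the owner with a watchCacheInterval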

// Implements watch.Interface.
func (c *cacheWatcher) ResultChan() <-chan watch.Event {
	return c.result
}

// Implements watch.Interface.
func (c *cacheWatcher) Stop() {
	c.forget(false)
}

// we rely on the fact that stopLocked is actually protected by Cacher.Lock()
func (c *cacheWatcher) stopLocked() {
	if !c.stopped {
		c.stopped = true
		// stop without draining the input channel was requested.
		if !c.drainInputBuffer {
			close(c.done)
		}
		close(c.input)
	}

	// Even if the watcher was already stopped, if it was previously in
	// draining mode and is not anymore, we need to close the done channel
	// now. Otherwise we could leak the processing goroutine: if it tries to
	// put more objects into the result channel, the channel will be full and
	// there will be no one processing the events on the receiving end.
	if !c.drainInputBuffer && !c.isDoneChannelClosedLocked() {
		close(c.done)
	}
}

func (c *cacheWatcher) nonblockingAdd(event *watchCacheEvent) bool {
	// if the bookmarkAfterResourceVersion hasn't been seen
	// we will try to deliver a bookmark event every second.
	// the following check discards a bookmark event
	// if its RV is lower than the bookmarkAfterResourceVersion
	// so that we don't pollute the input channel
	if event.Type == watch.Bookmark && event.ResourceVersion < c.bookmarkAfterResourceVersion {
		return false
	}
	select {
	case c.input <- event:
		c.markBookmarkAfterRvAsReceived(event)
		return true
	default:
		return false
	}
}
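
// To summarize the two admission paths: nonblockingAdd above is the cheap
// path that never blocks, while add below first retries nonblockingAdd and
// then, if a timer is supplied, falls back to a blocking send bounded by that
// timer. If even that fails, the watcher is terminated via forget; the
// termination is graceful (input buffer drained) only if the awaited bookmark
// has been received but not yet sent to the client.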

// Nil timer means that add will not block (if it can't send the event immediately, it will break the watcher)
//
// Note that bookmark events are never added via the add method, only via nonblockingAdd.
// Changing this behaviour will require moving the markBookmarkAfterRvAsReceived method
func (c *cacheWatcher) add(event *watchCacheEvent, timer *time.Timer) bool {
	// Try to send the event immediately, without blocking.
	if c.nonblockingAdd(event) {
		return true
	}

	closeFunc := func() {
		// This means that we couldn't send the event to that watcher.
		// Since we don't want to block on it infinitely,
		// we simply terminate it.
		metrics.TerminatedWatchersCounter.WithLabelValues(c.groupResource.String()).Inc()

		// we are graceful = false, when:
		//
		// (a) The bookmarkAfterResourceVersion hasn't been received yet.
		// We can safely terminate the watcher, because the client is waiting
		// for this specific bookmark and we haven't even received one.
		// (b) We have seen the bookmarkAfterResourceVersion, and it was already sent to the client.
		// We can simply terminate the watcher.

		// we are graceful = true, when:
		//
		// (a) We have seen a bookmark, but it hasn't been sent to the client yet.
		// That means we should drain the input buffer which contains
		// the bookmarkAfterResourceVersion we want. We do that to make progress,
		// as clients can re-establish a new watch with the given RV and receive
		// further notifications.
		graceful := func() bool {
			c.stateMutex.Lock()
			defer c.stateMutex.Unlock()
			return c.state == cacheWatcherBookmarkReceived
		}()
		klog.V(1).Infof("Forcing %v watcher close due to unresponsiveness: %v. len(c.input) = %v, len(c.result) = %v, graceful = %v", c.groupResource.String(), c.identifier, len(c.input), len(c.result), graceful)
		c.forget(graceful)
	}

	if timer == nil {
		closeFunc()
		return false
	}

	// OK, block sending, but only until timer fires.
	select {
	case c.input <- event:
		return true
	case <-timer.C:
		closeFunc()
		return false
	}
}

func (c *cacheWatcher) nextBookmarkTime(now time.Time, bookmarkFrequency time.Duration) (time.Time, bool) {
	// We try to send bookmarks:
	//
	// (a) right before the watcher timeout - for now we simply set it 2s before
	// the deadline
	//
	// (b) roughly every minute
	//
	// (c) immediately, when the bookmarkAfterResourceVersion hasn't been confirmed yet.
	// In this scenario the client has already seen (or is in the process of being sent)
	// all initial data and is interested in seeing
	// a specific RV value (aka the bookmarkAfterResourceVersion).
	// Since we don't know when the cacher will see that RV, we increase the frequency.
	//
	// (b) gives us periodicity if the watch breaks due to unexpected
	// conditions, (a) ensures that on timeout the watcher is as close to
	// now as possible - this covers 99% of cases.

	if !c.wasBookmarkAfterRvReceived() {
		return time.Time{}, true // schedule immediately
	}

	heartbeatTime := now.Add(bookmarkFrequency)
	if c.deadline.IsZero() {
		// Timeout is set by our client libraries (e.g. reflector) as well as defaulted by
		// apiserver if properly configured. So this shouldn't happen in practice.
		return heartbeatTime, true
	}
	if pretimeoutTime := c.deadline.Add(-2 * time.Second); pretimeoutTime.Before(heartbeatTime) {
		heartbeatTime = pretimeoutTime
	}

	if heartbeatTime.Before(now) {
		return time.Time{}, false
	}
	return heartbeatTime, true
}
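
// A worked example of the scheduling above (hypothetical numbers): with
// bookmarkFrequency = 1m and a deadline 30s from now, heartbeatTime starts at
// now+1m, the pre-timeout time is now+28s, and because the pre-timeout time is
// earlier the next bookmark is scheduled for now+28s. If the deadline minus 2s
// is already in the past, nextBookmarkTime returns false and no bookmark is
// scheduled.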

// wasBookmarkAfterRvReceived is the same as wasBookmarkAfterRvReceivedLocked, it just acquires the lock
func (c *cacheWatcher) wasBookmarkAfterRvReceived() bool {
	c.stateMutex.Lock()
	defer c.stateMutex.Unlock()
	return c.wasBookmarkAfterRvReceivedLocked()
}

// wasBookmarkAfterRvReceivedLocked checks if the given cacheWatcher
// has seen a bookmark event >= bookmarkAfterResourceVersion
func (c *cacheWatcher) wasBookmarkAfterRvReceivedLocked() bool {
	return c.state != cacheWatcherWaitingForBookmark
}

// markBookmarkAfterRvAsReceived indicates that the given cacheWatcher
// has seen a bookmark event >= bookmarkAfterResourceVersion
func (c *cacheWatcher) markBookmarkAfterRvAsReceived(event *watchCacheEvent) {
	if event.Type == watch.Bookmark {
		c.stateMutex.Lock()
		defer c.stateMutex.Unlock()
		if c.wasBookmarkAfterRvReceivedLocked() {
			return
		}
		// bookmark events are scheduled by the startDispatchingBookmarkEvents method.
		// since we received a bookmark event, that means we have
		// converged towards the expected RV and it is okay to update the state so that
		// this watcher can be scheduled for regular bookmark events
		c.state = cacheWatcherBookmarkReceived
	}
}

// wasBookmarkAfterRvSentLocked checks if a bookmark event
// with an RV >= the bookmarkAfterResourceVersion has been sent by this watcher
func (c *cacheWatcher) wasBookmarkAfterRvSentLocked() bool {
	return c.state == cacheWatcherBookmarkSent
}

// wasBookmarkAfterRvSent is the same as wasBookmarkAfterRvSentLocked, it just acquires the lock
func (c *cacheWatcher) wasBookmarkAfterRvSent() bool {
	c.stateMutex.Lock()
	defer c.stateMutex.Unlock()
	return c.wasBookmarkAfterRvSentLocked()
}

// markBookmarkAfterRvSent indicates that the given cacheWatcher
// has sent a bookmark event with an RV >= the bookmarkAfterResourceVersion
//
// this function relies on the fact that the nonblockingAdd method
// won't admit a bookmark event with an RV < the bookmarkAfterResourceVersion,
// so the first received bookmark event is considered to match the bookmarkAfterResourceVersion
func (c *cacheWatcher) markBookmarkAfterRvSent(event *watchCacheEvent) {
	// note that bookmark events are not so common, so we will acquire the lock only every ~60 seconds or so
	if event.Type == watch.Bookmark {
		c.stateMutex.Lock()
		defer c.stateMutex.Unlock()
		if !c.wasBookmarkAfterRvSentLocked() {
			c.state = cacheWatcherBookmarkSent
		}
	}
}

// setBookmarkAfterResourceVersion sets the bookmarkAfterResourceVersion and the state associated with it
func (c *cacheWatcher) setBookmarkAfterResourceVersion(bookmarkAfterResourceVersion uint64) {
	state := cacheWatcherWaitingForBookmark
	if bookmarkAfterResourceVersion == 0 {
		state = cacheWatcherBookmarkSent // if no specific RV was requested we assume no-op
	}
	c.state = state
	c.bookmarkAfterResourceVersion = bookmarkAfterResourceVersion
}
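
// Note that setBookmarkAfterResourceVersion above writes state and
// bookmarkAfterResourceVersion without holding stateMutex. This is presumably
// safe only because, as the bookmarkAfterResourceVersion field comment says,
// it must not be called once the watcher has been started; after startup all
// state transitions go through the mutex-guarded helpers above.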

// setDrainInputBufferLocked, if drain is set to true, indicates that we should delay closing this watcher
// until we send all events residing in the input buffer.
func (c *cacheWatcher) setDrainInputBufferLocked(drain bool) {
	c.drainInputBuffer = drain
}

// isDoneChannelClosedLocked checks if the c.done channel is closed
func (c *cacheWatcher) isDoneChannelClosedLocked() bool {
	select {
	case <-c.done:
		return true
	default:
	}
	return false
}

func getMutableObject(object runtime.Object) runtime.Object {
	if _, ok := object.(*cachingObject); ok {
		// It is safe to return without deep-copy, because the underlying
		// object will lazily perform deep-copy on the first try to change
		// any of its fields.
		return object
	}
	return object.DeepCopyObject()
}

func updateResourceVersion(object runtime.Object, versioner storage.Versioner, resourceVersion uint64) {
	if err := versioner.UpdateObject(object, resourceVersion); err != nil {
		utilruntime.HandleError(fmt.Errorf("failure to version api object (%d) %#v: %v", resourceVersion, object, err))
	}
}

func (c *cacheWatcher) convertToWatchEvent(event *watchCacheEvent) *watch.Event {
	if event.Type == watch.Bookmark {
		e := &watch.Event{Type: watch.Bookmark, Object: event.Object.DeepCopyObject()}
		if !c.wasBookmarkAfterRvSent() {
			if err := storage.AnnotateInitialEventsEndBookmark(e.Object); err != nil {
				utilruntime.HandleError(fmt.Errorf("error while accessing object's metadata gr: %v, identifier: %v, obj: %#v, err: %v", c.groupResource, c.identifier, e.Object, err))
				return nil
			}
		}
		return e
	}

	curObjPasses := event.Type != watch.Deleted && c.filter(event.Key, event.ObjLabels, event.ObjFields)
	oldObjPasses := false
	if event.PrevObject != nil {
		oldObjPasses = c.filter(event.Key, event.PrevObjLabels, event.PrevObjFields)
	}
	if !curObjPasses && !oldObjPasses {
		// Watcher is not interested in that object.
		return nil
	}

	switch {
	case curObjPasses && !oldObjPasses:
		return &watch.Event{Type: watch.Added, Object: getMutableObject(event.Object)}
	case curObjPasses && oldObjPasses:
		return &watch.Event{Type: watch.Modified, Object: getMutableObject(event.Object)}
	case !curObjPasses && oldObjPasses:
		// return a delete event with the previous object content, but with the event's resource version
		oldObj := getMutableObject(event.PrevObject)
		// We know that if oldObj is cachingObject (which can only be set via
		// setCachingObjects), its resourceVersion is already set correctly and
		// we don't need to update it. However, since cachingObject efficiently
		// handles noop updates, we avoid this microoptimization here.
		updateResourceVersion(oldObj, c.versioner, event.ResourceVersion)
		return &watch.Event{Type: watch.Deleted, Object: oldObj}
	}

	return nil
}
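
// To summarize the filtering in convertToWatchEvent, with "cur"/"old" denoting
// whether the current and previous object pass the watcher's filter:
//
//	cur  old  result
//	yes  no   Added    (object entered the watcher's selection)
//	yes  yes  Modified
//	no   yes  Deleted  (previous object content, event's resource version)
//	no   no   nil      (watcher not interested)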

// NOTE: sendWatchCacheEvent is assumed to not modify <event> !!!
func (c *cacheWatcher) sendWatchCacheEvent(event *watchCacheEvent) {
	watchEvent := c.convertToWatchEvent(event)
	if watchEvent == nil {
		// Watcher is not interested in that object.
		return
	}

	// We need to ensure that if we put event X into c.result, all
	// previous events were already put into it before, no matter whether
	// c.done is closed or not.
	// Thus we cannot simply select from c.done and c.result, as that
	// would give us non-determinism.
	// At the same time, we don't want to block infinitely on putting
	// to c.result, when c.done is already closed.
	//
	// This ensures that with c.done already closed, we enter the next
	// select below at most once. With that, no matter which case we
	// choose there, we will deliver only consecutive events.
	select {
	case <-c.done:
		return
	default:
	}

	select {
	case c.result <- *watchEvent:
		c.markBookmarkAfterRvSent(event)
	case <-c.done:
	}
}

func (c *cacheWatcher) processInterval(ctx context.Context, cacheInterval *watchCacheInterval, resourceVersion uint64) {
	defer utilruntime.HandleCrash()
	defer close(c.result)
	defer c.Stop()

	// Check how long we are processing initEvents.
	// As long as these are not processed, we are not processing
	// any incoming events, so if it takes long, we may actually
	// block all watchers for some time.
	// TODO: From the logs it seems that processing times of even
	// up to 1s happen, which is very long. However, this doesn't
	// depend that much on the number of initEvents. E.g. from the
	// 2000-node Kubemark run we have logs like this, e.g.:
	// ... processing 13862 initEvents took 66.808689ms
	// ... processing 14040 initEvents took 993.532539ms
	// We should understand what is blocking us in those cases (e.g.
	// is it lack of CPU, network, or something else) and potentially
	// consider increasing the size of the result buffer in those cases.
	const initProcessThreshold = 500 * time.Millisecond
	startTime := time.Now()

	initEventCount := 0
	for {
		event, err := cacheInterval.Next()
		if err != nil {
			// An error indicates that the cache interval
			// has been invalidated and can no longer serve
			// events.
			//
			// Initially we considered sending an "out-of-history"
			// Error event in this case, but because historically
			// such events weren't sent out of the watchCache, we
			// decided not to. This is still ok, because on watch
			// closure, the watcher will try to re-instantiate the
			// watch and then will get an explicit "out-of-history"
			// window. There is potential for optimization, but for
			// now, in order to be on the safe side and not break
			// custom clients, the cost of it is something that we
			// are fully accepting.
			klog.Warningf("couldn't retrieve watch event to serve: %#v", err)
			return
		}
		if event == nil {
			break
		}
		c.sendWatchCacheEvent(event)

		// With some events already sent, update resourceVersion so that
		// events that were buffered and not yet processed won't be delivered
		// to this watcher a second time, which would cause it to go back in time.
		//
		// There is one case where events are not necessarily ordered by
		// resourceVersion: watching from resourceVersion=0, which at the
		// beginning returns the state of each object.
		// To handle it, we need to max it with the resource version
		// that we have seen so far.
		if event.ResourceVersion > resourceVersion {
			resourceVersion = event.ResourceVersion
		}
		initEventCount++
	}

	if initEventCount > 0 {
		metrics.InitCounter.WithLabelValues(c.groupResource.String()).Add(float64(initEventCount))
	}
	processingTime := time.Since(startTime)
	if processingTime > initProcessThreshold {
		klog.V(2).Infof("processing %d initEvents of %s (%s) took %v", initEventCount, c.groupResource, c.identifier, processingTime)
	}

	c.process(ctx, resourceVersion)
}

func (c *cacheWatcher) process(ctx context.Context, resourceVersion uint64) {
	// At this point we already start processing incoming watch events.
	// However, the init events can still be processed, because their serialization
	// and sending to the client happen asynchronously.
	// TODO: As described in the KEP, we would like to estimate that by delaying
	// the initialization signal proportionally to the number of events to
	// process, but we're leaving this to the tuning phase.
	utilflowcontrol.WatchInitialized(ctx)

	for {
		select {
		case event, ok := <-c.input:
			if !ok {
				return
			}
			// only send events newer than resourceVersion,
			// or a bookmark event with an RV equal to resourceVersion
			// if we haven't sent one to the client yet
			if event.ResourceVersion > resourceVersion || (event.Type == watch.Bookmark && event.ResourceVersion == resourceVersion && !c.wasBookmarkAfterRvSent()) {
				c.sendWatchCacheEvent(event)
			}
		case <-ctx.Done():
			return
		}
	}
}
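
// A minimal consumer sketch (illustrative only; in the apiserver the watch
// handler is the real consumer and serves these events over the wire). The
// cacheWatcher behaves like any other watch.Interface: range over ResultChan
// until it is closed and stop the watcher when done.
//
//	w := ... // a started *cacheWatcher, i.e. one whose processInterval is running
//	defer w.Stop()
//	for ev := range w.ResultChan() {
//		switch ev.Type {
//		case watch.Added, watch.Modified, watch.Deleted:
//			// handle ev.Object
//		case watch.Bookmark:
//			// only the resourceVersion of ev.Object is meaningful
//		}
//	}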