k8s.io/apiserver@v0.29.3/pkg/storage/cacher/cache_watcher.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cacher

import (
	"context"
	"fmt"
	"sync"
	"time"

	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/schema"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/apiserver/pkg/storage"
	"k8s.io/apiserver/pkg/storage/cacher/metrics"
	utilflowcontrol "k8s.io/apiserver/pkg/util/flowcontrol"

	"k8s.io/klog/v2"
)

// possible states of the cache watcher
const (
	// cacheWatcherWaitingForBookmark indicates the cacher
	// is waiting for a bookmark event with a specific RV set
	cacheWatcherWaitingForBookmark = iota

	// cacheWatcherBookmarkReceived indicates that the cacher
	// has received a bookmark event with required RV
	cacheWatcherBookmarkReceived

	// cacheWatcherBookmarkSent indicates that the cacher
	// has already sent a bookmark event to a client
	cacheWatcherBookmarkSent
)

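// State transitions, as implemented by the methods below:
// setBookmarkAfterResourceVersion starts the watcher in
// cacheWatcherWaitingForBookmark (or directly in cacheWatcherBookmarkSent when
// no specific RV was requested), markBookmarkAfterRvAsReceived advances it to
// cacheWatcherBookmarkReceived, and markBookmarkAfterRvSent advances it to
// cacheWatcherBookmarkSent.
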
// cacheWatcher implements watch.Interface
// this is not thread-safe
type cacheWatcher struct {
	input     chan *watchCacheEvent
	result    chan watch.Event
	done      chan struct{}
	filter    filterWithAttrsFunc
	stopped   bool
	forget    func(bool)
	versioner storage.Versioner
	// The watcher will be closed by server after the deadline,
	// save it here to send bookmark events before that.
	deadline            time.Time
	allowWatchBookmarks bool
	groupResource       schema.GroupResource

	// human-readable identifier that helps associate a cacheWatcher
	// instance with a request
	identifier string

	// drainInputBuffer indicates whether we should delay closing this watcher
	// and send all events in the input buffer.
	drainInputBuffer bool

	// bookmarkAfterResourceVersion holds an RV that indicates
	// when we should start delivering bookmark events.
	// If this field holds the value of 0, we don't have any special
	// preference about delivering bookmark events.
	// Note that this field is used in conjunction with the state field.
	// It should not be changed once the watcher has been started.
	bookmarkAfterResourceVersion uint64

	// stateMutex protects state
	stateMutex sync.Mutex

	// state holds a numeric value indicating the current state of the watcher
	state int
}

func newCacheWatcher(
	chanSize int,
	filter filterWithAttrsFunc,
	forget func(bool),
	versioner storage.Versioner,
	deadline time.Time,
	allowWatchBookmarks bool,
	groupResource schema.GroupResource,
	identifier string,
) *cacheWatcher {
	return &cacheWatcher{
		input:               make(chan *watchCacheEvent, chanSize),
		result:              make(chan watch.Event, chanSize),
		done:                make(chan struct{}),
		filter:              filter,
		stopped:             false,
		forget:              forget,
		versioner:           versioner,
		deadline:            deadline,
		allowWatchBookmarks: allowWatchBookmarks,
		groupResource:       groupResource,
		identifier:          identifier,
	}
}

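// Illustrative call pattern (a sketch of how the cacher drives this type; the
// real call sites live in cacher.go and may differ in detail):
//
//	w := newCacheWatcher(chanSize, filter, forget, versioner, deadline, true, groupResource, identifier)
//	go w.processInterval(ctx, cacheInterval, resourceVersion)
//	for ev := range w.ResultChan() {
//		// deliver ev to the client
//	}
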
// Implements watch.Interface.
func (c *cacheWatcher) ResultChan() <-chan watch.Event {
	return c.result
}

// Implements watch.Interface.
func (c *cacheWatcher) Stop() {
	c.forget(false)
}

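// Note that Stop does not close any channel directly: the forget function is
// supplied by the cacher and, as the next comment notes, ends up invoking
// stopLocked under the Cacher's lock.
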
// we rely on the fact that stopLocked is actually protected by Cacher.Lock()
func (c *cacheWatcher) stopLocked() {
	if !c.stopped {
		c.stopped = true
		// stop without draining the input channel was requested.
		if !c.drainInputBuffer {
			close(c.done)
		}
		close(c.input)
	}

	// Even if the watcher was already stopped, if it was previously using
	// draining mode and it's not using it now, we need to close the done
	// channel now. Otherwise we could leak the processing goroutine: it could
	// keep trying to put more objects into the result channel while that
	// channel is full and there is no longer anyone processing events on the
	// receiving end.
	if !c.drainInputBuffer && !c.isDoneChannelClosedLocked() {
		close(c.done)
	}
}

func (c *cacheWatcher) nonblockingAdd(event *watchCacheEvent) bool {
	// if the bookmarkAfterResourceVersion hasn't been seen,
	// we will try to deliver a bookmark event every second.
	// the following check discards a bookmark event
	// if its RV is < the bookmarkAfterResourceVersion,
	// so that we don't pollute the input channel
	if event.Type == watch.Bookmark && event.ResourceVersion < c.bookmarkAfterResourceVersion {
		return false
	}
	select {
	case c.input <- event:
		c.markBookmarkAfterRvAsReceived(event)
		return true
	default:
		return false
	}
}

// Nil timer means that add will not block (if it can't send the event
// immediately, it will break the watcher).
//
// Note that bookmark events are never added via the add method,
// only via nonblockingAdd. Changing this behaviour will require
// moving the markBookmarkAfterRvAsReceived method.
func (c *cacheWatcher) add(event *watchCacheEvent, timer *time.Timer) bool {
	// Try to send the event immediately, without blocking.
	if c.nonblockingAdd(event) {
		return true
	}

	closeFunc := func() {
		// This means that we couldn't send the event to that watcher.
		// Since we don't want to block on it infinitely,
		// we simply terminate it.
		metrics.TerminatedWatchersCounter.WithLabelValues(c.groupResource.String()).Inc()

		// we are graceful = false, when:
		//
		// (a) The bookmarkAfterResourceVersion hasn't been received yet;
		//     we can safely terminate the watcher, because the client is
		//     waiting for this specific bookmark and we haven't even received one.
		// (b) We have seen the bookmarkAfterResourceVersion, and it has already
		//     been sent to the client. We can simply terminate the watcher.

		// we are graceful = true, when:
		//
		// (a) We have seen a bookmark, but it hasn't been sent to the client yet.
		//     That means we should drain the input buffer, which contains
		//     the bookmarkAfterResourceVersion we want. We do that to make progress,
		//     as clients can re-establish a new watch with the given RV and receive
		//     further notifications.
		graceful := func() bool {
			c.stateMutex.Lock()
			defer c.stateMutex.Unlock()
			return c.state == cacheWatcherBookmarkReceived
		}()
		klog.V(1).Infof("Forcing %v watcher close due to unresponsiveness: %v. len(c.input) = %v, len(c.result) = %v, graceful = %v", c.groupResource.String(), c.identifier, len(c.input), len(c.result), graceful)
		c.forget(graceful)
	}

	if timer == nil {
		closeFunc()
		return false
	}

	// OK, block sending, but only until timer fires.
	select {
	case c.input <- event:
		return true
	case <-timer.C:
		closeFunc()
		return false
	}
}

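// A note on the timer: the cacher's dispatching loop typically passes a single
// timer, funded by a shared time budget, to add for every watcher in a
// dispatch round, so the total time spent blocking on slow watchers in a
// round is bounded, rather than bounded per watcher. This description is based
// on the surrounding cacher code and is meant as context, not a guarantee.
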
func (c *cacheWatcher) nextBookmarkTime(now time.Time, bookmarkFrequency time.Duration) (time.Time, bool) {
	// We try to send bookmarks:
	//
	// (a) right before the watcher timeout - for now we simply set it 2s before
	//     the deadline
	//
	// (b) roughly every minute
	//
	// (c) immediately, when the bookmarkAfterResourceVersion hasn't been confirmed;
	//     in this scenario the client has already seen (or is being sent)
	//     all initial data and is interested in seeing
	//     a specific RV value (aka. the bookmarkAfterResourceVersion).
	//     since we don't know when the cacher will see that RV, we increase the frequency
	//
	// (b) gives us periodicity if the watch breaks due to unexpected
	// conditions, (a) ensures that on timeout the watcher is as close to
	// now as possible - this covers 99% of cases.

	if !c.wasBookmarkAfterRvReceived() {
		return time.Time{}, true // schedule immediately
	}

	heartbeatTime := now.Add(bookmarkFrequency)
	if c.deadline.IsZero() {
		// Timeout is set by our client libraries (e.g. reflector) as well as defaulted by
		// apiserver if properly configured. So this shouldn't happen in practice.
		return heartbeatTime, true
	}
	if pretimeoutTime := c.deadline.Add(-2 * time.Second); pretimeoutTime.Before(heartbeatTime) {
		heartbeatTime = pretimeoutTime
	}

	if heartbeatTime.Before(now) {
		return time.Time{}, false
	}
	return heartbeatTime, true
}

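// Worked example: with bookmarkFrequency = 1m, a watcher whose deadline is 30s
// away gets its next bookmark scheduled at deadline-2s (28s from now), while a
// watcher whose deadline is 5m away gets one a full minute from now. If the
// deadline-2s mark already lies in the past, (time.Time{}, false) is returned
// and no bookmark is scheduled at all.
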
// wasBookmarkAfterRvReceived is the same as wasBookmarkAfterRvReceivedLocked,
// it just acquires the lock first
func (c *cacheWatcher) wasBookmarkAfterRvReceived() bool {
	c.stateMutex.Lock()
	defer c.stateMutex.Unlock()
	return c.wasBookmarkAfterRvReceivedLocked()
}

// wasBookmarkAfterRvReceivedLocked checks if the given cacheWatcher
// has seen a bookmark event >= bookmarkAfterResourceVersion
func (c *cacheWatcher) wasBookmarkAfterRvReceivedLocked() bool {
	return c.state != cacheWatcherWaitingForBookmark
}

// markBookmarkAfterRvAsReceived indicates that the given cacheWatcher
// has seen a bookmark event >= bookmarkAfterResourceVersion
func (c *cacheWatcher) markBookmarkAfterRvAsReceived(event *watchCacheEvent) {
	if event.Type == watch.Bookmark {
		c.stateMutex.Lock()
		defer c.stateMutex.Unlock()
		if c.wasBookmarkAfterRvReceivedLocked() {
			return
		}
		// bookmark events are scheduled by the startDispatchingBookmarkEvents method.
		// since we received a bookmark event, that means we have
		// converged towards the expected RV and it is okay to update the state so that
		// this cacheWatcher can be scheduled for regular bookmark events
		c.state = cacheWatcherBookmarkReceived
	}
}

// wasBookmarkAfterRvSentLocked checks if a bookmark event
// with an RV >= the bookmarkAfterResourceVersion has been sent by this watcher
func (c *cacheWatcher) wasBookmarkAfterRvSentLocked() bool {
	return c.state == cacheWatcherBookmarkSent
}

// wasBookmarkAfterRvSent is the same as wasBookmarkAfterRvSentLocked,
// it just acquires the lock first
func (c *cacheWatcher) wasBookmarkAfterRvSent() bool {
	c.stateMutex.Lock()
	defer c.stateMutex.Unlock()
	return c.wasBookmarkAfterRvSentLocked()
}

// markBookmarkAfterRvSent indicates that the given cacheWatcher
// has sent a bookmark event with an RV >= the bookmarkAfterResourceVersion
//
// this function relies on the fact that the nonblockingAdd method
// won't admit a bookmark event with an RV < the bookmarkAfterResourceVersion,
// so the first received bookmark event is considered to match the bookmarkAfterResourceVersion
func (c *cacheWatcher) markBookmarkAfterRvSent(event *watchCacheEvent) {
	// note that bookmark events are not very common, so we will acquire
	// the lock only every ~60 seconds or so
	if event.Type == watch.Bookmark {
		c.stateMutex.Lock()
		defer c.stateMutex.Unlock()
		if !c.wasBookmarkAfterRvSentLocked() {
			c.state = cacheWatcherBookmarkSent
		}
	}
}

// setBookmarkAfterResourceVersion sets the bookmarkAfterResourceVersion and the state associated with it
func (c *cacheWatcher) setBookmarkAfterResourceVersion(bookmarkAfterResourceVersion uint64) {
	state := cacheWatcherWaitingForBookmark
	if bookmarkAfterResourceVersion == 0 {
		state = cacheWatcherBookmarkSent // if no specific RV was requested we assume no-op
	}
	c.state = state
	c.bookmarkAfterResourceVersion = bookmarkAfterResourceVersion
}

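// Note that setBookmarkAfterResourceVersion writes c.state without taking
// stateMutex; this is safe only because, per the field comment above, it is
// called before the watcher has been started.
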
// setDrainInputBufferLocked if set to true indicates that we should delay closing this watcher
// until we send all events residing in the input buffer.
func (c *cacheWatcher) setDrainInputBufferLocked(drain bool) {
	c.drainInputBuffer = drain
}

// isDoneChannelClosedLocked checks if the c.done channel is closed
func (c *cacheWatcher) isDoneChannelClosedLocked() bool {
	select {
	case <-c.done:
		return true
	default:
	}
	return false
}

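// (A receive with a default case is the standard non-blocking way to test
// whether a channel has been closed without blocking on it; c.done never
// carries values, it is only ever closed.)
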
func getMutableObject(object runtime.Object) runtime.Object {
	if _, ok := object.(*cachingObject); ok {
		// It is safe to return without deep-copy, because the underlying
		// object will lazily perform deep-copy on the first try to change
		// any of its fields.
		return object
	}
	return object.DeepCopyObject()
}

func updateResourceVersion(object runtime.Object, versioner storage.Versioner, resourceVersion uint64) {
	if err := versioner.UpdateObject(object, resourceVersion); err != nil {
		utilruntime.HandleError(fmt.Errorf("failure to version api object (%d) %#v: %v", resourceVersion, object, err))
	}
}

func (c *cacheWatcher) convertToWatchEvent(event *watchCacheEvent) *watch.Event {
	if event.Type == watch.Bookmark {
		e := &watch.Event{Type: watch.Bookmark, Object: event.Object.DeepCopyObject()}
		if !c.wasBookmarkAfterRvSent() {
			if err := storage.AnnotateInitialEventsEndBookmark(e.Object); err != nil {
				utilruntime.HandleError(fmt.Errorf("error while accessing object's metadata gr: %v, identifier: %v, obj: %#v, err: %v", c.groupResource, c.identifier, e.Object, err))
				return nil
			}
		}
		return e
	}

	curObjPasses := event.Type != watch.Deleted && c.filter(event.Key, event.ObjLabels, event.ObjFields)
	oldObjPasses := false
	if event.PrevObject != nil {
		oldObjPasses = c.filter(event.Key, event.PrevObjLabels, event.PrevObjFields)
	}
	if !curObjPasses && !oldObjPasses {
		// Watcher is not interested in that object.
		return nil
	}

	switch {
	case curObjPasses && !oldObjPasses:
		return &watch.Event{Type: watch.Added, Object: getMutableObject(event.Object)}
	case curObjPasses && oldObjPasses:
		return &watch.Event{Type: watch.Modified, Object: getMutableObject(event.Object)}
	case !curObjPasses && oldObjPasses:
		// return a delete event with the previous object content, but with the event's resource version
		oldObj := getMutableObject(event.PrevObject)
		// We know that if oldObj is cachingObject (which can only be set via
		// setCachingObjects), its resourceVersion is already set correctly and
		// we don't need to update it. However, since cachingObject efficiently
		// handles noop updates, we avoid this microoptimization here.
		updateResourceVersion(oldObj, c.versioner, event.ResourceVersion)
		return &watch.Event{Type: watch.Deleted, Object: oldObj}
	}

	return nil
}

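// Summarizing the filtering above relative to the watcher's selector: an
// object that newly matches is reported as Added, one that keeps matching as
// Modified, one that stops matching as Deleted (carrying the previous object's
// content), and one that never matched is dropped entirely.
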
// NOTE: sendWatchCacheEvent is assumed to not modify <event> !!!
func (c *cacheWatcher) sendWatchCacheEvent(event *watchCacheEvent) {
	watchEvent := c.convertToWatchEvent(event)
	if watchEvent == nil {
		// Watcher is not interested in that object.
		return
	}

	// We need to ensure that if we put event X to the c.result, all
	// previous events were already put into it before, no matter whether
	// c.done is closed or not.
	// Thus we cannot simply select from c.done and c.result, because that
	// would give us non-determinism.
	// At the same time, we don't want to block infinitely on putting
	// to c.result, when c.done is already closed.
	//
	// This ensures that with c.done already closed, we will go into the
	// next select after this at most once. With that, no matter which
	// statement we choose there, we will deliver only consecutive
	// events.
	select {
	case <-c.done:
		return
	default:
	}

	select {
	case c.result <- *watchEvent:
		c.markBookmarkAfterRvSent(event)
	case <-c.done:
	}
}

func (c *cacheWatcher) processInterval(ctx context.Context, cacheInterval *watchCacheInterval, resourceVersion uint64) {
	defer utilruntime.HandleCrash()
	defer close(c.result)
	defer c.Stop()

	// Check how long we are processing initEvents.
	// As long as these are not processed, we are not processing
	// any incoming events, so if it takes long, we may actually
	// block all watchers for some time.
	// TODO: From the logs it seems that processing times can reach
	// even up to 1s, which is very long. However, this doesn't
	// depend that much on the number of initEvents. E.g. from the
	// 2000-node Kubemark run we have logs like this:
	// ... processing 13862 initEvents took 66.808689ms
	// ... processing 14040 initEvents took 993.532539ms
	// We should understand what is blocking us in those cases (e.g.
	// is it lack of CPU, network, or something else) and potentially
	// consider increasing the size of the result buffer in those cases.
	const initProcessThreshold = 500 * time.Millisecond
	startTime := time.Now()

	initEventCount := 0
	for {
		event, err := cacheInterval.Next()
		if err != nil {
			// An error indicates that the cache interval
			// has been invalidated and can no longer serve
			// events.
			//
			// Initially we considered sending an "out-of-history"
			// Error event in this case, but because historically
			// such events weren't sent out of the watchCache, we
			// decided not to. This is still ok, because on watch
			// closure, the watcher will try to re-instantiate the
			// watch and then will get an explicit "out-of-history"
			// window. There is potential for optimization, but for
			// now, in order to be on the safe side and not break
			// custom clients, the cost of it is something that we
			// are fully accepting.
			klog.Warningf("couldn't retrieve watch event to serve: %#v", err)
			return
		}
		if event == nil {
			break
		}
		c.sendWatchCacheEvent(event)

		// With some events already sent, update resourceVersion so that
		// events that were buffered and not yet processed won't be delivered
		// to this watcher a second time, causing it to go back in time.
		//
		// There is one case where events are not necessarily ordered by
		// resourceVersion: watching from resourceVersion=0, which at the
		// beginning returns the state of each object. To account for that,
		// we need to take the max with the resource version we have seen so far.
		if event.ResourceVersion > resourceVersion {
			resourceVersion = event.ResourceVersion
		}
		initEventCount++
	}

	if initEventCount > 0 {
		metrics.InitCounter.WithLabelValues(c.groupResource.String()).Add(float64(initEventCount))
	}
	processingTime := time.Since(startTime)
	if processingTime > initProcessThreshold {
		klog.V(2).Infof("processing %d initEvents of %s (%s) took %v", initEventCount, c.groupResource, c.identifier, processingTime)
	}

	c.process(ctx, resourceVersion)
}

func (c *cacheWatcher) process(ctx context.Context, resourceVersion uint64) {
	// At this point we already start processing incoming watch events.
	// However, the init events can still be being processed, because their
	// serialization and sending to the client happen asynchronously.
	// TODO: As described in the KEP, we would like to estimate that by delaying
	//   the initialization signal proportionally to the number of events to
	//   process, but we're leaving this to the tuning phase.
	utilflowcontrol.WatchInitialized(ctx)

	for {
		select {
		case event, ok := <-c.input:
			if !ok {
				return
			}
			// only send events newer than resourceVersion,
			// or a bookmark event with an RV equal to resourceVersion
			// if we haven't sent one to the client yet
			if event.ResourceVersion > resourceVersion || (event.Type == watch.Bookmark && event.ResourceVersion == resourceVersion && !c.wasBookmarkAfterRvSent()) {
				c.sendWatchCacheEvent(event)
			}
		case <-ctx.Done():
			return
		}
	}
}