k8s.io/client-go@v0.31.1/tools/watch/retrywatcher.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package watch

import (
	"context"
	"errors"
	"fmt"
	"io"
	"net/http"
	"sync"
	"time"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/dump"
	"k8s.io/apimachinery/pkg/util/net"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/client-go/tools/cache"
	"k8s.io/klog/v2"
)

// resourceVersionGetter is an interface used to get the resource version from events.
// We can't reuse an interface from meta because that would create a cyclic dependency, and we only need this one method.
type resourceVersionGetter interface {
	GetResourceVersion() string
}

// RetryWatcher will make sure that in case the underlying watcher is closed (e.g. due to API timeout or etcd timeout)
// it gets restarted from the last point without the consumer even knowing about it.
// RetryWatcher does that by inspecting events and keeping track of resourceVersion.
// It is especially useful when using watch.UntilWithoutRetry, where premature termination causes issues and flakes.
// Please note that this is not resilient to the etcd cache no longer having the resource version - you would need to
// use Informers for that.
type RetryWatcher struct {
	lastResourceVersion string
	watcherClient       cache.Watcher
	resultChan          chan watch.Event
	stopChan            chan struct{}
	doneChan            chan struct{}
	minRestartDelay     time.Duration
	stopChanLock        sync.Mutex
}

// NewRetryWatcher creates a new RetryWatcher.
// It will make sure that the watch gets restarted in case of recoverable errors.
// The initialResourceVersion will be given to the watch method when first called.
func NewRetryWatcher(initialResourceVersion string, watcherClient cache.Watcher) (*RetryWatcher, error) {
	return newRetryWatcher(initialResourceVersion, watcherClient, 1*time.Second)
}
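
// exampleWaitForDeletion is an illustrative sketch, not part of the upstream
// file. It shows the pairing the type documentation refers to: RetryWatcher
// keeps the watch alive so that UntilWithoutRetry (defined in this same
// package) can wait for a condition without flaking when the server closes
// the underlying watch. The watcherClient and initialRV arguments are assumed
// to be supplied by the caller, e.g. from a preceding List call.
func exampleWaitForDeletion(ctx context.Context, watcherClient cache.Watcher, initialRV string) error {
	rw, err := NewRetryWatcher(initialRV, watcherClient)
	if err != nil {
		return err
	}
	defer rw.Stop()

	// Block until an object is deleted, the context is cancelled, or the
	// watcher forwards an unrecoverable error (e.g. 410 Gone).
	_, err = UntilWithoutRetry(ctx, rw, func(event watch.Event) (bool, error) {
		return event.Type == watch.Deleted, nil
	})
	return err
}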

func newRetryWatcher(initialResourceVersion string, watcherClient cache.Watcher, minRestartDelay time.Duration) (*RetryWatcher, error) {
	switch initialResourceVersion {
	case "", "0":
		// TODO: revisit this if we ever get WATCH v2 where it means start "now"
		//       without doing the synthetic list of objects at the beginning (see #74022)
		return nil, fmt.Errorf("initial RV %q is not supported due to issues with underlying WATCH", initialResourceVersion)
	default:
		break
	}

	rw := &RetryWatcher{
		lastResourceVersion: initialResourceVersion,
		watcherClient:       watcherClient,
		stopChan:            make(chan struct{}),
		doneChan:            make(chan struct{}),
		resultChan:          make(chan watch.Event, 0),
		minRestartDelay:     minRestartDelay,
	}

	go rw.receive()
	return rw, nil
}
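
// exampleNewRetryWatcherFromWatchFunc is an illustrative sketch, not part of
// the upstream file. The constructors above only require a cache.Watcher; a
// common way to supply one is to wrap a typed client's Watch call in a
// cache.ListWatch. Here watchFn stands in for such a call (e.g. a client's
// Watch method adapted to this signature) and is an assumption, not something
// defined by this package.
func exampleNewRetryWatcherFromWatchFunc(initialRV string, watchFn func(options metav1.ListOptions) (watch.Interface, error)) (*RetryWatcher, error) {
	lw := &cache.ListWatch{
		// Only WatchFunc is needed here; RetryWatcher never lists.
		WatchFunc: watchFn,
	}
	return NewRetryWatcher(initialRV, lw)
}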

func (rw *RetryWatcher) send(event watch.Event) bool {
	// Writing to an unbuffered channel is a blocking operation,
	// so we need to check whether stop was requested while doing so.
	select {
	case rw.resultChan <- event:
		return true
	case <-rw.stopChan:
		return false
	}
}

// doReceive returns true when it is done, false otherwise.
// If it is not done, the second return value holds the time to wait before calling it again.
func (rw *RetryWatcher) doReceive() (bool, time.Duration) {
	watcher, err := rw.watcherClient.Watch(metav1.ListOptions{
		ResourceVersion:     rw.lastResourceVersion,
		AllowWatchBookmarks: true,
	})
	// We are very unlikely to hit EOF here since we are just establishing the call,
	// but it may happen that the apiserver is just shutting down (e.g. being restarted).
	// This is consistent with how it is handled for informers.
	switch err {
	case nil:
		break

	case io.EOF:
		// watch closed normally
		return false, 0

	case io.ErrUnexpectedEOF:
		klog.V(1).InfoS("Watch closed with unexpected EOF", "err", err)
		return false, 0

	default:
		msg := "Watch failed"
		if net.IsProbableEOF(err) || net.IsTimeout(err) {
			klog.V(5).InfoS(msg, "err", err)
			// Retry
			return false, 0
		}

		klog.ErrorS(err, msg)
		// Retry
		return false, 0
	}

	if watcher == nil {
		klog.ErrorS(nil, "Watch returned nil watcher")
		// Retry
		return false, 0
	}

	ch := watcher.ResultChan()
	defer watcher.Stop()

	for {
		select {
		case <-rw.stopChan:
			klog.V(4).InfoS("Stopping RetryWatcher.")
			return true, 0
		case event, ok := <-ch:
			if !ok {
				klog.V(4).InfoS("Failed to get event! Re-creating the watcher.", "resourceVersion", rw.lastResourceVersion)
				return false, 0
			}

			// We need to inspect the event and get the ResourceVersion out of it.
			switch event.Type {
			case watch.Added, watch.Modified, watch.Deleted, watch.Bookmark:
				metaObject, ok := event.Object.(resourceVersionGetter)
				if !ok {
					_ = rw.send(watch.Event{
						Type:   watch.Error,
						Object: &apierrors.NewInternalError(errors.New("retryWatcher: event object doesn't support resourceVersion")).ErrStatus,
					})
					// We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data!
					return true, 0
				}

				resourceVersion := metaObject.GetResourceVersion()
				if resourceVersion == "" {
					_ = rw.send(watch.Event{
						Type:   watch.Error,
						Object: &apierrors.NewInternalError(fmt.Errorf("retryWatcher: object %#v has an empty resourceVersion", event.Object)).ErrStatus,
					})
					// We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data!
					return true, 0
				}

				// All is fine; send the non-bookmark events and update the resource version.
				if event.Type != watch.Bookmark {
					ok = rw.send(event)
					if !ok {
						return true, 0
					}
				}
				rw.lastResourceVersion = resourceVersion

				continue

			case watch.Error:
				// This round trip allows us to handle unstructured status.
				errObject := apierrors.FromObject(event.Object)
				statusErr, ok := errObject.(*apierrors.StatusError)
				if !ok {
					klog.Error(fmt.Sprintf("Received an error which is not *metav1.Status but %s", dump.Pretty(event.Object)))
					// Retry unknown errors
					return false, 0
				}

				status := statusErr.ErrStatus

				statusDelay := time.Duration(0)
				if status.Details != nil {
					statusDelay = time.Duration(status.Details.RetryAfterSeconds) * time.Second
				}

				switch status.Code {
				case http.StatusGone:
					// Never retry RV too old errors
					_ = rw.send(event)
					return true, 0

				case http.StatusGatewayTimeout, http.StatusInternalServerError:
					// Retry
					return false, statusDelay

				default:
					// We retry by default. RetryWatcher is meant to proceed unless it is certain
					// that it can't. If we are not certain, we proceed with retry and leave it
					// up to the user to time out if needed.

					// Log here so we have a record of hitting the unexpected error
					// and we can whitelist some error codes if we missed any that are expected.
					klog.V(5).Info(fmt.Sprintf("Retrying after unexpected error: %s", dump.Pretty(event.Object)))

					// Retry
					return false, statusDelay
				}

			default:
				klog.Errorf("Failed to recognize Event type %q", event.Type)
				_ = rw.send(watch.Event{
					Type:   watch.Error,
					Object: &apierrors.NewInternalError(fmt.Errorf("retryWatcher failed to recognize Event type %q", event.Type)).ErrStatus,
				})
				// We are unable to restart the watch and have to stop the loop or this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data!
				return true, 0
			}
		}
	}
}

// receive reads the result from a watcher, restarting it if necessary.
func (rw *RetryWatcher) receive() {
	defer close(rw.doneChan)
	defer close(rw.resultChan)

	klog.V(4).Info("Starting RetryWatcher.")
	defer klog.V(4).Info("Stopping RetryWatcher.")

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go func() {
		select {
		case <-rw.stopChan:
			cancel()
			return
		case <-ctx.Done():
			return
		}
	}()

	// We use a non-sliding until so we don't introduce delays on the happy path when the WATCH call
	// times out or gets closed and we need to reestablish it, while also avoiding hot loops.
	wait.NonSlidingUntilWithContext(ctx, func(ctx context.Context) {
		done, retryAfter := rw.doReceive()
		if done {
			cancel()
			return
		}

		timer := time.NewTimer(retryAfter)
		select {
		case <-ctx.Done():
			timer.Stop()
			return
		case <-timer.C:
		}

		klog.V(4).Infof("Restarting RetryWatcher at RV=%q", rw.lastResourceVersion)
	}, rw.minRestartDelay)
}

// ResultChan implements Interface.
func (rw *RetryWatcher) ResultChan() <-chan watch.Event {
	return rw.resultChan
}

// Stop implements Interface.
func (rw *RetryWatcher) Stop() {
	rw.stopChanLock.Lock()
	defer rw.stopChanLock.Unlock()

	// Avoid closing an already closed channel, which would panic.
	select {
	case <-rw.stopChan:
	default:
		close(rw.stopChan)
	}
}

// Done allows the caller to be notified when the RetryWatcher stops.
func (rw *RetryWatcher) Done() <-chan struct{} {
	return rw.doneChan
}
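
// exampleStopAndWait is an illustrative sketch, not part of the upstream
// file. Stop only signals the watcher to shut down; Done lets the caller
// block until the internal goroutine has exited and ResultChan has been
// closed.
func exampleStopAndWait(rw *RetryWatcher) {
	rw.Stop()
	<-rw.Done()
}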