k8s.io/client-go@v0.22.2/tools/watch/retrywatcher.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package watch

import (
	"context"
	"errors"
	"fmt"
	"io"
	"net/http"
	"time"

	"github.com/davecgh/go-spew/spew"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/net"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/client-go/tools/cache"
	"k8s.io/klog/v2"
)

// resourceVersionGetter is an interface used to get the resource version from events.
// We can't reuse an interface from meta because that would create a cyclic dependency,
// and we need just this one method.
type resourceVersionGetter interface {
	GetResourceVersion() string
}

// RetryWatcher will make sure that in case the underlying watcher is closed (e.g. due to API timeout or etcd timeout)
// it will get restarted from the last point without the consumer even knowing about it.
// RetryWatcher does that by inspecting events and keeping track of resourceVersion.
// It is especially useful when using watch.UntilWithoutRetry, where premature termination causes issues and flakes.
// Please note that this is not resilient to the etcd cache no longer having the resource version; you would need to
// use Informers for that.
type RetryWatcher struct {
	lastResourceVersion string
	watcherClient       cache.Watcher
	resultChan          chan watch.Event
	stopChan            chan struct{}
	doneChan            chan struct{}
	minRestartDelay     time.Duration
}

// NewRetryWatcher creates a new RetryWatcher.
// It will make sure that watches get restarted in case of recoverable errors.
// The initialResourceVersion will be given to the watch method when first called.
func NewRetryWatcher(initialResourceVersion string, watcherClient cache.Watcher) (*RetryWatcher, error) {
	return newRetryWatcher(initialResourceVersion, watcherClient, 1*time.Second)
}

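// A minimal usage sketch, assuming a typed client named "clientset" (hypothetical)
// and a placeholder starting resource version, typically taken from a previous List:
//
//	lw := &cache.ListWatch{
//		WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
//			return clientset.CoreV1().Pods("default").Watch(context.TODO(), options)
//		},
//	}
//	rw, err := NewRetryWatcher("42", lw)
//	if err != nil {
//		// handle the error
//	}
//	defer rw.Stop()
//	for event := range rw.ResultChan() {
//		_ = event // process the event
//	}
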
func newRetryWatcher(initialResourceVersion string, watcherClient cache.Watcher, minRestartDelay time.Duration) (*RetryWatcher, error) {
	switch initialResourceVersion {
	case "", "0":
		// TODO: revisit this if we ever get WATCH v2 where it means start "now"
		//       without doing the synthetic list of objects at the beginning (see #74022)
		return nil, fmt.Errorf("initial RV %q is not supported due to issues with underlying WATCH", initialResourceVersion)
	}

	rw := &RetryWatcher{
		lastResourceVersion: initialResourceVersion,
		watcherClient:       watcherClient,
		stopChan:            make(chan struct{}),
		doneChan:            make(chan struct{}),
		resultChan:          make(chan watch.Event),
		minRestartDelay:     minRestartDelay,
	}

	go rw.receive()
	return rw, nil
}

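// Because "" and "0" are rejected as the initial resource version, callers usually
// seed a RetryWatcher from a List call. A hedged sketch ("clientset" is assumed,
// "lw" as in the sketch above):
//
//	pods, err := clientset.CoreV1().Pods("default").List(context.TODO(), metav1.ListOptions{})
//	if err != nil {
//		// handle the error
//	}
//	rw, err := NewRetryWatcher(pods.ResourceVersion, lw)
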
func (rw *RetryWatcher) send(event watch.Event) bool {
	// Writing to an unbuffered channel is a blocking operation,
	// and we need to check whether stop was requested while doing so.
	select {
	case rw.resultChan <- event:
		return true
	case <-rw.stopChan:
		return false
	}
}

// doReceive returns true when it is done, false otherwise.
// If it is not done, the second return value holds the time to wait before calling it again.
func (rw *RetryWatcher) doReceive() (bool, time.Duration) {
	watcher, err := rw.watcherClient.Watch(metav1.ListOptions{
		ResourceVersion:     rw.lastResourceVersion,
		AllowWatchBookmarks: true,
	})
	// We are very unlikely to hit EOF here since we are just establishing the call,
	// but it may happen that the apiserver is just shutting down (e.g. being restarted).
	// This is consistent with how it is handled for informers.
	switch err {
	case nil:
		// established successfully; proceed

	case io.EOF:
		// watch closed normally
		return false, 0

	case io.ErrUnexpectedEOF:
		klog.V(1).InfoS("Watch closed with unexpected EOF", "err", err)
		return false, 0

	default:
		msg := "Watch failed"
		if net.IsProbableEOF(err) || net.IsTimeout(err) {
			klog.V(5).InfoS(msg, "err", err)
			// Retry
			return false, 0
		}

		klog.ErrorS(err, msg)
		// Retry
		return false, 0
	}

	if watcher == nil {
		klog.ErrorS(nil, "Watch returned nil watcher")
		// Retry
		return false, 0
	}

	ch := watcher.ResultChan()
	defer watcher.Stop()

	for {
		select {
		case <-rw.stopChan:
			klog.V(4).InfoS("Stopping RetryWatcher.")
			return true, 0
		case event, ok := <-ch:
			if !ok {
				klog.V(4).InfoS("Failed to get event! Re-creating the watcher.", "resourceVersion", rw.lastResourceVersion)
				return false, 0
			}

			// We need to inspect the event and get the ResourceVersion out of it.
			switch event.Type {
			case watch.Added, watch.Modified, watch.Deleted, watch.Bookmark:
				metaObject, ok := event.Object.(resourceVersionGetter)
				if !ok {
					_ = rw.send(watch.Event{
						Type:   watch.Error,
						Object: &apierrors.NewInternalError(errors.New("retryWatcher: doesn't support resourceVersion")).ErrStatus,
					})
					// We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data!
					return true, 0
				}

				resourceVersion := metaObject.GetResourceVersion()
				if resourceVersion == "" {
					_ = rw.send(watch.Event{
						Type:   watch.Error,
						Object: &apierrors.NewInternalError(fmt.Errorf("retryWatcher: object %#v has empty resourceVersion", event.Object)).ErrStatus,
					})
					// We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data!
					return true, 0
				}

				// All is fine; send the non-bookmark events and update the resource version.
				if event.Type != watch.Bookmark {
					ok = rw.send(event)
					if !ok {
						return true, 0
					}
				}
				rw.lastResourceVersion = resourceVersion

				continue

			case watch.Error:
				// This round trip allows us to handle unstructured status.
				errObject := apierrors.FromObject(event.Object)
				statusErr, ok := errObject.(*apierrors.StatusError)
				if !ok {
					klog.Error(spew.Sprintf("Received an error which is not *metav1.Status but %#+v", event.Object))
					// Retry unknown errors
					return false, 0
				}

				status := statusErr.ErrStatus

				statusDelay := time.Duration(0)
				if status.Details != nil {
					statusDelay = time.Duration(status.Details.RetryAfterSeconds) * time.Second
				}

				switch status.Code {
				case http.StatusGone:
					// Never retry RV too old errors
					_ = rw.send(event)
					return true, 0

				case http.StatusGatewayTimeout, http.StatusInternalServerError:
					// Retry
					return false, statusDelay

				default:
					// We retry by default. RetryWatcher is meant to proceed unless it is certain
					// that it can't. If we are not certain, we proceed with the retry and leave it
					// up to the user to time out if needed.

					// Log here so we have a record of hitting the unexpected error,
					// and so we can allowlist some error codes if we missed any that are expected.
					klog.V(5).Info(spew.Sprintf("Retrying after unexpected error: %#+v", event.Object))

					// Retry
					return false, statusDelay
				}

			default:
				klog.Errorf("Failed to recognize Event type %q", event.Type)
				_ = rw.send(watch.Event{
					Type:   watch.Error,
					Object: &apierrors.NewInternalError(fmt.Errorf("retryWatcher failed to recognize Event type %q", event.Type)).ErrStatus,
				})
				// We are unable to restart the watch and have to stop the loop, or this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data!
				return true, 0
			}
		}
	}
}

// receive reads the result from a watcher, restarting it if necessary.
func (rw *RetryWatcher) receive() {
	defer close(rw.doneChan)
	defer close(rw.resultChan)

	klog.V(4).Info("Starting RetryWatcher.")
	defer klog.V(4).Info("Stopping RetryWatcher.")

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go func() {
		select {
		case <-rw.stopChan:
			cancel()
			return
		case <-ctx.Done():
			return
		}
	}()

	// We use non-sliding until so we don't introduce delays on the happy path when the WATCH call
	// times out or gets closed and we need to reestablish it, while also avoiding hot loops.
	wait.NonSlidingUntilWithContext(ctx, func(ctx context.Context) {
		done, retryAfter := rw.doReceive()
		if done {
			cancel()
			return
		}

		// Wait out any status-provided retry delay, but stop waiting early if the watcher is stopped.
		timer := time.NewTimer(retryAfter)
		select {
		case <-ctx.Done():
			timer.Stop()
			return
		case <-timer.C:
		}

		klog.V(4).Infof("Restarting RetryWatcher at RV=%q", rw.lastResourceVersion)
	}, rw.minRestartDelay)
}

// ResultChan implements Interface.
func (rw *RetryWatcher) ResultChan() <-chan watch.Event {
	return rw.resultChan
}

// Stop implements Interface.
func (rw *RetryWatcher) Stop() {
	close(rw.stopChan)
}

// Done allows the caller to be notified when the RetryWatcher stops.
func (rw *RetryWatcher) Done() <-chan struct{} {
	return rw.doneChan
}
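
// A hedged sketch of coordinated shutdown, continuing the examples above:
//
//	rw.Stop()   // request termination; ResultChan will be closed
//	<-rw.Done() // block until the internal goroutine has fully finished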