/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package watch

import (
	"context"
	"errors"
	"fmt"
	"io"
	"net/http"
	"time"

	"github.com/davecgh/go-spew/spew"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/net"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/client-go/tools/cache"
	"k8s.io/klog/v2"
)

// resourceVersionGetter is the minimal interface used to read the resource
// version out of the object carried by a watch event.
// We can't reuse an interface from meta, otherwise it would be a cyclic
// dependency, and this one method is all we need.
type resourceVersionGetter interface {
	GetResourceVersion() string
}

// RetryWatcher makes sure that when the underlying watcher is closed (e.g. due
// to an API timeout or an etcd timeout) it gets restarted from the last point,
// without the consumer even knowing about it.
// RetryWatcher does that by inspecting events and keeping track of resourceVersion.
// Especially useful when using watch.UntilWithoutRetry where premature
// termination is causing issues and flakes.
// Please note that this is not resilient to the etcd cache not having the
// resource version anymore - you would need to use Informers for that.
50 type RetryWatcher struct { 51 lastResourceVersion string 52 watcherClient cache.Watcher 53 resultChan chan watch.Event 54 stopChan chan struct{} 55 doneChan chan struct{} 56 minRestartDelay time.Duration 57 } 58 59 // NewRetryWatcher creates a new RetryWatcher. 60 // It will make sure that watches gets restarted in case of recoverable errors. 61 // The initialResourceVersion will be given to watch method when first called. 62 func NewRetryWatcher(initialResourceVersion string, watcherClient cache.Watcher) (*RetryWatcher, error) { 63 return newRetryWatcher(initialResourceVersion, watcherClient, 1*time.Second) 64 } 65 66 func newRetryWatcher(initialResourceVersion string, watcherClient cache.Watcher, minRestartDelay time.Duration) (*RetryWatcher, error) { 67 switch initialResourceVersion { 68 case "", "0": 69 // TODO: revisit this if we ever get WATCH v2 where it means start "now" 70 // without doing the synthetic list of objects at the beginning (see #74022) 71 return nil, fmt.Errorf("initial RV %q is not supported due to issues with underlying WATCH", initialResourceVersion) 72 default: 73 break 74 } 75 76 rw := &RetryWatcher{ 77 lastResourceVersion: initialResourceVersion, 78 watcherClient: watcherClient, 79 stopChan: make(chan struct{}), 80 doneChan: make(chan struct{}), 81 resultChan: make(chan watch.Event, 0), 82 minRestartDelay: minRestartDelay, 83 } 84 85 go rw.receive() 86 return rw, nil 87 } 88 89 func (rw *RetryWatcher) send(event watch.Event) bool { 90 // Writing to an unbuffered channel is blocking operation 91 // and we need to check if stop wasn't requested while doing so. 92 select { 93 case rw.resultChan <- event: 94 return true 95 case <-rw.stopChan: 96 return false 97 } 98 } 99 100 // doReceive returns true when it is done, false otherwise. 101 // If it is not done the second return value holds the time to wait before calling it again. 
102 func (rw *RetryWatcher) doReceive() (bool, time.Duration) { 103 watcher, err := rw.watcherClient.Watch(metav1.ListOptions{ 104 ResourceVersion: rw.lastResourceVersion, 105 AllowWatchBookmarks: true, 106 }) 107 // We are very unlikely to hit EOF here since we are just establishing the call, 108 // but it may happen that the apiserver is just shutting down (e.g. being restarted) 109 // This is consistent with how it is handled for informers 110 switch err { 111 case nil: 112 break 113 114 case io.EOF: 115 // watch closed normally 116 return false, 0 117 118 case io.ErrUnexpectedEOF: 119 klog.V(1).InfoS("Watch closed with unexpected EOF", "err", err) 120 return false, 0 121 122 default: 123 msg := "Watch failed" 124 if net.IsProbableEOF(err) || net.IsTimeout(err) { 125 klog.V(5).InfoS(msg, "err", err) 126 // Retry 127 return false, 0 128 } 129 130 klog.ErrorS(err, msg) 131 // Retry 132 return false, 0 133 } 134 135 if watcher == nil { 136 klog.ErrorS(nil, "Watch returned nil watcher") 137 // Retry 138 return false, 0 139 } 140 141 ch := watcher.ResultChan() 142 defer watcher.Stop() 143 144 for { 145 select { 146 case <-rw.stopChan: 147 klog.V(4).InfoS("Stopping RetryWatcher.") 148 return true, 0 149 case event, ok := <-ch: 150 if !ok { 151 klog.V(4).InfoS("Failed to get event! Re-creating the watcher.", "resourceVersion", rw.lastResourceVersion) 152 return false, 0 153 } 154 155 // We need to inspect the event and get ResourceVersion out of it 156 switch event.Type { 157 case watch.Added, watch.Modified, watch.Deleted, watch.Bookmark: 158 metaObject, ok := event.Object.(resourceVersionGetter) 159 if !ok { 160 _ = rw.send(watch.Event{ 161 Type: watch.Error, 162 Object: &apierrors.NewInternalError(errors.New("retryWatcher: doesn't support resourceVersion")).ErrStatus, 163 }) 164 // We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! 
165 return true, 0 166 } 167 168 resourceVersion := metaObject.GetResourceVersion() 169 if resourceVersion == "" { 170 _ = rw.send(watch.Event{ 171 Type: watch.Error, 172 Object: &apierrors.NewInternalError(fmt.Errorf("retryWatcher: object %#v doesn't support resourceVersion", event.Object)).ErrStatus, 173 }) 174 // We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! 175 return true, 0 176 } 177 178 // All is fine; send the non-bookmark events and update resource version. 179 if event.Type != watch.Bookmark { 180 ok = rw.send(event) 181 if !ok { 182 return true, 0 183 } 184 } 185 rw.lastResourceVersion = resourceVersion 186 187 continue 188 189 case watch.Error: 190 // This round trip allows us to handle unstructured status 191 errObject := apierrors.FromObject(event.Object) 192 statusErr, ok := errObject.(*apierrors.StatusError) 193 if !ok { 194 klog.Error(spew.Sprintf("Received an error which is not *metav1.Status but %#+v", event.Object)) 195 // Retry unknown errors 196 return false, 0 197 } 198 199 status := statusErr.ErrStatus 200 201 statusDelay := time.Duration(0) 202 if status.Details != nil { 203 statusDelay = time.Duration(status.Details.RetryAfterSeconds) * time.Second 204 } 205 206 switch status.Code { 207 case http.StatusGone: 208 // Never retry RV too old errors 209 _ = rw.send(event) 210 return true, 0 211 212 case http.StatusGatewayTimeout, http.StatusInternalServerError: 213 // Retry 214 return false, statusDelay 215 216 default: 217 // We retry by default. RetryWatcher is meant to proceed unless it is certain 218 // that it can't. If we are not certain, we proceed with retry and leave it 219 // up to the user to timeout if needed. 220 221 // Log here so we have a record of hitting the unexpected error 222 // and we can whitelist some error codes if we missed any that are expected. 
223 klog.V(5).Info(spew.Sprintf("Retrying after unexpected error: %#+v", event.Object)) 224 225 // Retry 226 return false, statusDelay 227 } 228 229 default: 230 klog.Errorf("Failed to recognize Event type %q", event.Type) 231 _ = rw.send(watch.Event{ 232 Type: watch.Error, 233 Object: &apierrors.NewInternalError(fmt.Errorf("retryWatcher failed to recognize Event type %q", event.Type)).ErrStatus, 234 }) 235 // We are unable to restart the watch and have to stop the loop or this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data! 236 return true, 0 237 } 238 } 239 } 240 } 241 242 // receive reads the result from a watcher, restarting it if necessary. 243 func (rw *RetryWatcher) receive() { 244 defer close(rw.doneChan) 245 defer close(rw.resultChan) 246 247 klog.V(4).Info("Starting RetryWatcher.") 248 defer klog.V(4).Info("Stopping RetryWatcher.") 249 250 ctx, cancel := context.WithCancel(context.Background()) 251 defer cancel() 252 go func() { 253 select { 254 case <-rw.stopChan: 255 cancel() 256 return 257 case <-ctx.Done(): 258 return 259 } 260 }() 261 262 // We use non sliding until so we don't introduce delays on happy path when WATCH call 263 // timeouts or gets closed and we need to reestablish it while also avoiding hot loops. 264 wait.NonSlidingUntilWithContext(ctx, func(ctx context.Context) { 265 done, retryAfter := rw.doReceive() 266 if done { 267 cancel() 268 return 269 } 270 271 time.Sleep(retryAfter) 272 273 klog.V(4).Infof("Restarting RetryWatcher at RV=%q", rw.lastResourceVersion) 274 }, rw.minRestartDelay) 275 } 276 277 // ResultChan implements Interface. 278 func (rw *RetryWatcher) ResultChan() <-chan watch.Event { 279 return rw.resultChan 280 } 281 282 // Stop implements Interface. 283 func (rw *RetryWatcher) Stop() { 284 close(rw.stopChan) 285 } 286 287 // Done allows the caller to be notified when Retry watcher stops. 288 func (rw *RetryWatcher) Done() <-chan struct{} { 289 return rw.doneChan 290 }