k8s.io/client-go@v0.31.1/tools/watch/retrywatcher.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package watch

import (
	"context"
	"errors"
	"fmt"
	"io"
	"net/http"
	"sync"
	"time"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/dump"
	"k8s.io/apimachinery/pkg/util/net"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/client-go/tools/cache"
	"k8s.io/klog/v2"
)

// resourceVersionGetter is an interface used to get the resource version from events.
// We can't reuse an interface from meta, as that would be a cyclic dependency, and we need just this one method.
type resourceVersionGetter interface {
	GetResourceVersion() string
}

// RetryWatcher will make sure that in case the underlying watcher is closed (e.g. due to API timeout or etcd timeout)
// it will get restarted from the last point without the consumer even knowing about it.
// RetryWatcher does that by inspecting events and keeping track of resourceVersion.
// Especially useful when using watch.UntilWithoutRetry, where premature termination causes issues and flakes.
// Please note that this is not resilient to the etcd cache not having the resource version anymore - you would need to
// use Informers for that.
type RetryWatcher struct {
	lastResourceVersion string
	watcherClient       cache.Watcher
	resultChan          chan watch.Event
	stopChan            chan struct{}
	doneChan            chan struct{}
	minRestartDelay     time.Duration
	stopChanLock        sync.Mutex
}
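
// The cache.Watcher stored in watcherClient only needs to provide Watch, so a
// plain cache.ListWatch with just its WatchFunc set is sufficient. A minimal,
// hypothetical sketch (not part of the upstream file) of building one, assuming
// an illustrative typed clientset and pods in the "default" namespace:
//
//	lw := &cache.ListWatch{
//		// RetryWatcher fills in ResourceVersion and AllowWatchBookmarks itself.
//		WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
//			return clientset.CoreV1().Pods("default").Watch(context.TODO(), options)
//		},
//	}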

// NewRetryWatcher creates a new RetryWatcher.
// It will make sure that watches get restarted in case of recoverable errors.
// The initialResourceVersion will be given to the watch method when first called.
func NewRetryWatcher(initialResourceVersion string, watcherClient cache.Watcher) (*RetryWatcher, error) {
	return newRetryWatcher(initialResourceVersion, watcherClient, 1*time.Second)
}

func newRetryWatcher(initialResourceVersion string, watcherClient cache.Watcher, minRestartDelay time.Duration) (*RetryWatcher, error) {
	switch initialResourceVersion {
	case "", "0":
		// TODO: revisit this if we ever get WATCH v2 where it means start "now"
		//       without doing the synthetic list of objects at the beginning (see #74022)
		return nil, fmt.Errorf("initial RV %q is not supported due to issues with underlying WATCH", initialResourceVersion)
	default:
		break
	}

	rw := &RetryWatcher{
		lastResourceVersion: initialResourceVersion,
		watcherClient:       watcherClient,
		stopChan:            make(chan struct{}),
		doneChan:            make(chan struct{}),
		resultChan:          make(chan watch.Event, 0),
		minRestartDelay:     minRestartDelay,
	}

	go rw.receive()
	return rw, nil
}

func (rw *RetryWatcher) send(event watch.Event) bool {
	// Writing to an unbuffered channel is a blocking operation,
	// and we need to check if stop wasn't requested while doing so.
	select {
	case rw.resultChan <- event:
		return true
	case <-rw.stopChan:
		return false
	}
}
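
// Because newRetryWatcher above rejects "" and "0", callers need a concrete
// resource version to start from. A minimal, hypothetical sketch (not part of
// the upstream file), assuming an illustrative typed clientset and the lw
// ListWatch from the sketch above:
//
//	list, err := clientset.CoreV1().Pods("default").List(ctx, metav1.ListOptions{})
//	if err != nil {
//		return err
//	}
//	// The list's ResourceVersion is a valid point to start watching from.
//	rw, err := NewRetryWatcher(list.ResourceVersion, lw)
//	if err != nil {
//		return err
//	}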

// doReceive returns true when it is done, false otherwise.
// If it is not done the second return value holds the time to wait before calling it again.
func (rw *RetryWatcher) doReceive() (bool, time.Duration) {
	watcher, err := rw.watcherClient.Watch(metav1.ListOptions{
		ResourceVersion:     rw.lastResourceVersion,
		AllowWatchBookmarks: true,
	})
	// We are very unlikely to hit EOF here since we are just establishing the call,
	// but it may happen that the apiserver is just shutting down (e.g. being restarted)
	// This is consistent with how it is handled for informers
	switch err {
	case nil:
		break

	case io.EOF:
		// watch closed normally
		return false, 0

	case io.ErrUnexpectedEOF:
		klog.V(1).InfoS("Watch closed with unexpected EOF", "err", err)
		return false, 0

	default:
		msg := "Watch failed"
		if net.IsProbableEOF(err) || net.IsTimeout(err) {
			klog.V(5).InfoS(msg, "err", err)
			// Retry
			return false, 0
		}

		klog.ErrorS(err, msg)
		// Retry
		return false, 0
	}

	if watcher == nil {
		klog.ErrorS(nil, "Watch returned nil watcher")
		// Retry
		return false, 0
	}

	ch := watcher.ResultChan()
	defer watcher.Stop()

	for {
		select {
		case <-rw.stopChan:
			klog.V(4).InfoS("Stopping RetryWatcher.")
			return true, 0
		case event, ok := <-ch:
			if !ok {
				klog.V(4).InfoS("Failed to get event! Re-creating the watcher.", "resourceVersion", rw.lastResourceVersion)
				return false, 0
			}

			// We need to inspect the event and get ResourceVersion out of it
			switch event.Type {
			case watch.Added, watch.Modified, watch.Deleted, watch.Bookmark:
				metaObject, ok := event.Object.(resourceVersionGetter)
				if !ok {
					_ = rw.send(watch.Event{
						Type:   watch.Error,
						Object: &apierrors.NewInternalError(errors.New("retryWatcher: doesn't support resourceVersion")).ErrStatus,
					})
					// We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data!
					return true, 0
				}

				resourceVersion := metaObject.GetResourceVersion()
				if resourceVersion == "" {
					_ = rw.send(watch.Event{
						Type:   watch.Error,
						Object: &apierrors.NewInternalError(fmt.Errorf("retryWatcher: object %#v doesn't support resourceVersion", event.Object)).ErrStatus,
					})
					// We have to abort here because this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data!
					return true, 0
				}

				// All is fine; send the non-bookmark events and update resource version.
				if event.Type != watch.Bookmark {
					ok = rw.send(event)
					if !ok {
						return true, 0
					}
				}
				rw.lastResourceVersion = resourceVersion

				continue

			case watch.Error:
				// This round trip allows us to handle unstructured status
				errObject := apierrors.FromObject(event.Object)
				statusErr, ok := errObject.(*apierrors.StatusError)
				if !ok {
					klog.Error(fmt.Sprintf("Received an error which is not *metav1.Status but %s", dump.Pretty(event.Object)))
					// Retry unknown errors
					return false, 0
				}

				status := statusErr.ErrStatus

				statusDelay := time.Duration(0)
				if status.Details != nil {
					statusDelay = time.Duration(status.Details.RetryAfterSeconds) * time.Second
				}

				switch status.Code {
				case http.StatusGone:
					// Never retry RV too old errors
					_ = rw.send(event)
					return true, 0

				case http.StatusGatewayTimeout, http.StatusInternalServerError:
					// Retry
					return false, statusDelay

				default:
					// We retry by default. RetryWatcher is meant to proceed unless it is certain
					// that it can't. If we are not certain, we proceed with retry and leave it
					// up to the user to timeout if needed.

					// Log here so we have a record of hitting the unexpected error
					// and we can whitelist some error codes if we missed any that are expected.
					klog.V(5).Info(fmt.Sprintf("Retrying after unexpected error: %s", dump.Pretty(event.Object)))

					// Retry
					return false, statusDelay
				}

			default:
				klog.Errorf("Failed to recognize Event type %q", event.Type)
				_ = rw.send(watch.Event{
					Type:   watch.Error,
					Object: &apierrors.NewInternalError(fmt.Errorf("retryWatcher failed to recognize Event type %q", event.Type)).ErrStatus,
				})
				// We are unable to restart the watch and have to stop the loop or this might cause lastResourceVersion inconsistency by skipping a potential RV with valid data!
				return true, 0
			}
		}
	}
}
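
// doReceive never retries "410 Gone": the Error event is forwarded to the
// consumer and the watcher terminates. A minimal, hypothetical helper sketch
// (not part of the upstream file; the name isExpired is illustrative) that a
// consumer could use to detect that case and re-list before starting a new
// RetryWatcher:
//
//	// isExpired reports whether a forwarded error event means the requested
//	// resource version is no longer available and a fresh List is needed.
//	func isExpired(event watch.Event) bool {
//		return apierrors.IsGone(apierrors.FromObject(event.Object))
//	}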

// receive reads the result from a watcher, restarting it if necessary.
func (rw *RetryWatcher) receive() {
	defer close(rw.doneChan)
	defer close(rw.resultChan)

	klog.V(4).Info("Starting RetryWatcher.")
	defer klog.V(4).Info("Stopping RetryWatcher.")

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go func() {
		select {
		case <-rw.stopChan:
			cancel()
			return
		case <-ctx.Done():
			return
		}
	}()

	// We use a non-sliding until so we don't introduce delays on the happy path when the WATCH call
	// times out or gets closed and we need to reestablish it, while also avoiding hot loops.
	wait.NonSlidingUntilWithContext(ctx, func(ctx context.Context) {
		done, retryAfter := rw.doReceive()
		if done {
			cancel()
			return
		}

		timer := time.NewTimer(retryAfter)
		select {
		case <-ctx.Done():
			timer.Stop()
			return
		case <-timer.C:
		}

		klog.V(4).Infof("Restarting RetryWatcher at RV=%q", rw.lastResourceVersion)
	}, rw.minRestartDelay)
}

// ResultChan implements Interface.
func (rw *RetryWatcher) ResultChan() <-chan watch.Event {
	return rw.resultChan
}

// Stop implements Interface.
func (rw *RetryWatcher) Stop() {
	rw.stopChanLock.Lock()
	defer rw.stopChanLock.Unlock()

	// Prevent closing an already closed channel to prevent a panic
	select {
	case <-rw.stopChan:
	default:
		close(rw.stopChan)
	}
}

// Done allows the caller to be notified when the RetryWatcher stops.
func (rw *RetryWatcher) Done() <-chan struct{} {
	return rw.doneChan
}
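
// A minimal, hypothetical end-to-end sketch (not part of the upstream file) of
// the consumer side tying ResultChan, Stop and Done together, assuming rw was
// created as in the sketch above doReceive and corev1 is k8s.io/api/core/v1:
//
//	defer rw.Stop()
//
//	for event := range rw.ResultChan() {
//		switch event.Type {
//		case watch.Added, watch.Modified, watch.Deleted:
//			// Bookmark events are consumed internally, so only real changes
//			// arrive here.
//			pod := event.Object.(*corev1.Pod)
//			fmt.Println(event.Type, pod.Name)
//		case watch.Error:
//			// Internal errors and "410 Gone" are forwarded before the watcher
//			// stops itself; see the isExpired sketch above receive.
//		}
//	}
//
//	// ResultChan is closed only once the watcher has fully terminated,
//	// which Done reports as well.
//	<-rw.Done()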