k8s.io/apiserver@v0.29.3/pkg/storage/etcd3/watcher.go (about) 1 /* 2 Copyright 2016 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package etcd3 18 19 import ( 20 "context" 21 "errors" 22 "fmt" 23 "os" 24 "strconv" 25 "strings" 26 "sync" 27 "time" 28 29 clientv3 "go.etcd.io/etcd/client/v3" 30 grpccodes "google.golang.org/grpc/codes" 31 grpcstatus "google.golang.org/grpc/status" 32 33 apierrors "k8s.io/apimachinery/pkg/api/errors" 34 "k8s.io/apimachinery/pkg/runtime" 35 "k8s.io/apimachinery/pkg/runtime/schema" 36 "k8s.io/apimachinery/pkg/util/wait" 37 "k8s.io/apimachinery/pkg/watch" 38 "k8s.io/apiserver/pkg/features" 39 "k8s.io/apiserver/pkg/storage" 40 "k8s.io/apiserver/pkg/storage/etcd3/metrics" 41 "k8s.io/apiserver/pkg/storage/value" 42 utilfeature "k8s.io/apiserver/pkg/util/feature" 43 utilflowcontrol "k8s.io/apiserver/pkg/util/flowcontrol" 44 "k8s.io/klog/v2" 45 ) 46 47 const ( 48 // We have set a buffer in order to reduce times of context switches. 49 incomingBufSize = 100 50 outgoingBufSize = 100 51 ) 52 53 // defaultWatcherMaxLimit is used to facilitate construction tests 54 var defaultWatcherMaxLimit int64 = maxLimit 55 56 // fatalOnDecodeError is used during testing to panic the server if watcher encounters a decoding error 57 var fatalOnDecodeError = false 58 59 func init() { 60 // check to see if we are running in a test environment 61 TestOnlySetFatalOnDecodeError(true) 62 fatalOnDecodeError, _ = strconv.ParseBool(os.Getenv("KUBE_PANIC_WATCH_DECODE_ERROR")) 63 } 64 65 // TestOnlySetFatalOnDecodeError should only be used for cases where decode errors are expected and need to be tested. e.g. conversion webhooks. 66 func TestOnlySetFatalOnDecodeError(b bool) { 67 fatalOnDecodeError = b 68 } 69 70 type watcher struct { 71 client *clientv3.Client 72 codec runtime.Codec 73 newFunc func() runtime.Object 74 objectType string 75 groupResource schema.GroupResource 76 versioner storage.Versioner 77 transformer value.Transformer 78 getCurrentStorageRV func(context.Context) (uint64, error) 79 } 80 81 // watchChan implements watch.Interface. 82 type watchChan struct { 83 watcher *watcher 84 key string 85 initialRev int64 86 recursive bool 87 progressNotify bool 88 internalPred storage.SelectionPredicate 89 ctx context.Context 90 cancel context.CancelFunc 91 incomingEventChan chan *event 92 resultChan chan watch.Event 93 errChan chan error 94 } 95 96 // Watch watches on a key and returns a watch.Interface that transfers relevant notifications. 97 // If rev is zero, it will return the existing object(s) and then start watching from 98 // the maximum revision+1 from returned objects. 99 // If rev is non-zero, it will watch events happened after given revision. 100 // If opts.Recursive is false, it watches on given key. 101 // If opts.Recursive is true, it watches any children and directories under the key, excluding the root key itself. 102 // pred must be non-nil. Only if opts.Predicate matches the change, it will be returned. 103 func (w *watcher) Watch(ctx context.Context, key string, rev int64, opts storage.ListOptions) (watch.Interface, error) { 104 if opts.Recursive && !strings.HasSuffix(key, "/") { 105 key += "/" 106 } 107 if opts.ProgressNotify && w.newFunc == nil { 108 return nil, apierrors.NewInternalError(errors.New("progressNotify for watch is unsupported by the etcd storage because no newFunc was provided")) 109 } 110 startWatchRV, err := w.getStartWatchResourceVersion(ctx, rev, opts) 111 if err != nil { 112 return nil, err 113 } 114 wc := w.createWatchChan(ctx, key, startWatchRV, opts.Recursive, opts.ProgressNotify, opts.Predicate) 115 go wc.run(isInitialEventsEndBookmarkRequired(opts), areInitialEventsRequired(rev, opts)) 116 117 // For etcd watch we don't have an easy way to answer whether the watch 118 // has already caught up. So in the initial version (given that watchcache 119 // is by default enabled for all resources but Events), we just deliver 120 // the initialization signal immediately. Improving this will be explored 121 // in the future. 122 utilflowcontrol.WatchInitialized(ctx) 123 124 return wc, nil 125 } 126 127 func (w *watcher) createWatchChan(ctx context.Context, key string, rev int64, recursive, progressNotify bool, pred storage.SelectionPredicate) *watchChan { 128 wc := &watchChan{ 129 watcher: w, 130 key: key, 131 initialRev: rev, 132 recursive: recursive, 133 progressNotify: progressNotify, 134 internalPred: pred, 135 incomingEventChan: make(chan *event, incomingBufSize), 136 resultChan: make(chan watch.Event, outgoingBufSize), 137 errChan: make(chan error, 1), 138 } 139 if pred.Empty() { 140 // The filter doesn't filter out any object. 141 wc.internalPred = storage.Everything 142 } 143 wc.ctx, wc.cancel = context.WithCancel(ctx) 144 return wc 145 } 146 147 // getStartWatchResourceVersion returns a ResourceVersion 148 // the watch will be started from. 149 // Depending on the input parameters the semantics of the returned ResourceVersion are: 150 // - start at Exact (return resourceVersion) 151 // - start at Most Recent (return an RV from etcd) 152 func (w *watcher) getStartWatchResourceVersion(ctx context.Context, resourceVersion int64, opts storage.ListOptions) (int64, error) { 153 if resourceVersion > 0 { 154 return resourceVersion, nil 155 } 156 if !utilfeature.DefaultFeatureGate.Enabled(features.WatchList) { 157 return 0, nil 158 } 159 if opts.SendInitialEvents == nil || *opts.SendInitialEvents { 160 // note that when opts.SendInitialEvents=true 161 // we will be issuing a consistent LIST request 162 // against etcd followed by the special bookmark event 163 return 0, nil 164 } 165 // at this point the clients is interested 166 // only in getting a stream of events 167 // starting at the MostRecent point in time (RV) 168 currentStorageRV, err := w.getCurrentStorageRV(ctx) 169 if err != nil { 170 return 0, err 171 } 172 // currentStorageRV is taken from resp.Header.Revision (int64) 173 // and cast to uint64, so it is safe to do reverse 174 // at some point we should unify the interface but that 175 // would require changing Versioner.UpdateList 176 return int64(currentStorageRV), nil 177 } 178 179 // isInitialEventsEndBookmarkRequired since there is no way to directly set 180 // opts.ProgressNotify from the API and the etcd3 impl doesn't support 181 // notification for external clients we simply return initialEventsEndBookmarkRequired 182 // to only send the bookmark event after the initial list call. 183 // 184 // see: https://github.com/kubernetes/kubernetes/issues/120348 185 func isInitialEventsEndBookmarkRequired(opts storage.ListOptions) bool { 186 if !utilfeature.DefaultFeatureGate.Enabled(features.WatchList) { 187 return false 188 } 189 return opts.SendInitialEvents != nil && *opts.SendInitialEvents && opts.Predicate.AllowWatchBookmarks 190 } 191 192 // areInitialEventsRequired returns true if all events from the etcd should be returned. 193 func areInitialEventsRequired(resourceVersion int64, opts storage.ListOptions) bool { 194 if opts.SendInitialEvents == nil && resourceVersion == 0 { 195 return true // legacy case 196 } 197 if !utilfeature.DefaultFeatureGate.Enabled(features.WatchList) { 198 return false 199 } 200 return opts.SendInitialEvents != nil && *opts.SendInitialEvents 201 } 202 203 type etcdError interface { 204 Code() grpccodes.Code 205 Error() string 206 } 207 208 type grpcError interface { 209 GRPCStatus() *grpcstatus.Status 210 } 211 212 func isCancelError(err error) bool { 213 if err == nil { 214 return false 215 } 216 if err == context.Canceled { 217 return true 218 } 219 if etcdErr, ok := err.(etcdError); ok && etcdErr.Code() == grpccodes.Canceled { 220 return true 221 } 222 if grpcErr, ok := err.(grpcError); ok && grpcErr.GRPCStatus().Code() == grpccodes.Canceled { 223 return true 224 } 225 return false 226 } 227 228 func (wc *watchChan) run(initialEventsEndBookmarkRequired, forceInitialEvents bool) { 229 watchClosedCh := make(chan struct{}) 230 go wc.startWatching(watchClosedCh, initialEventsEndBookmarkRequired, forceInitialEvents) 231 232 var resultChanWG sync.WaitGroup 233 resultChanWG.Add(1) 234 go wc.processEvent(&resultChanWG) 235 236 select { 237 case err := <-wc.errChan: 238 if isCancelError(err) { 239 break 240 } 241 errResult := transformErrorToEvent(err) 242 if errResult != nil { 243 // error result is guaranteed to be received by user before closing ResultChan. 244 select { 245 case wc.resultChan <- *errResult: 246 case <-wc.ctx.Done(): // user has given up all results 247 } 248 } 249 case <-watchClosedCh: 250 case <-wc.ctx.Done(): // user cancel 251 } 252 253 // We use wc.ctx to reap all goroutines. Under whatever condition, we should stop them all. 254 // It's fine to double cancel. 255 wc.cancel() 256 257 // we need to wait until resultChan wouldn't be used anymore 258 resultChanWG.Wait() 259 close(wc.resultChan) 260 } 261 262 func (wc *watchChan) Stop() { 263 wc.cancel() 264 } 265 266 func (wc *watchChan) ResultChan() <-chan watch.Event { 267 return wc.resultChan 268 } 269 270 func (wc *watchChan) RequestWatchProgress() error { 271 return wc.watcher.client.RequestProgress(wc.ctx) 272 } 273 274 // sync tries to retrieve existing data and send them to process. 275 // The revision to watch will be set to the revision in response. 276 // All events sent will have isCreated=true 277 func (wc *watchChan) sync() error { 278 opts := []clientv3.OpOption{} 279 if wc.recursive { 280 opts = append(opts, clientv3.WithLimit(defaultWatcherMaxLimit)) 281 rangeEnd := clientv3.GetPrefixRangeEnd(wc.key) 282 opts = append(opts, clientv3.WithRange(rangeEnd)) 283 } 284 285 var err error 286 var lastKey []byte 287 var withRev int64 288 var getResp *clientv3.GetResponse 289 290 metricsOp := "get" 291 if wc.recursive { 292 metricsOp = "list" 293 } 294 295 preparedKey := wc.key 296 297 for { 298 startTime := time.Now() 299 getResp, err = wc.watcher.client.KV.Get(wc.ctx, preparedKey, opts...) 300 metrics.RecordEtcdRequest(metricsOp, wc.watcher.groupResource.String(), err, startTime) 301 if err != nil { 302 return interpretListError(err, true, preparedKey, wc.key) 303 } 304 305 if len(getResp.Kvs) == 0 && getResp.More { 306 return fmt.Errorf("no results were found, but etcd indicated there were more values remaining") 307 } 308 309 // send items from the response until no more results 310 for i, kv := range getResp.Kvs { 311 lastKey = kv.Key 312 wc.sendEvent(parseKV(kv)) 313 // free kv early. Long lists can take O(seconds) to decode. 314 getResp.Kvs[i] = nil 315 } 316 317 if withRev == 0 { 318 wc.initialRev = getResp.Header.Revision 319 } 320 321 // no more results remain 322 if !getResp.More { 323 return nil 324 } 325 326 preparedKey = string(lastKey) + "\x00" 327 if withRev == 0 { 328 withRev = getResp.Header.Revision 329 opts = append(opts, clientv3.WithRev(withRev)) 330 } 331 } 332 } 333 334 func logWatchChannelErr(err error) { 335 switch { 336 case strings.Contains(err.Error(), "mvcc: required revision has been compacted"): 337 // mvcc revision compaction which is regarded as warning, not error 338 klog.Warningf("watch chan error: %v", err) 339 case isCancelError(err): 340 // expected when watches close, no need to log 341 default: 342 klog.Errorf("watch chan error: %v", err) 343 } 344 } 345 346 // startWatching does: 347 // - get current objects if initialRev=0; set initialRev to current rev 348 // - watch on given key and send events to process. 349 // 350 // initialEventsEndBookmarkSent helps us keep track 351 // of whether we have sent an annotated bookmark event. 352 // 353 // it's important to note that we don't 354 // need to track the actual RV because 355 // we only send the bookmark event 356 // after the initial list call. 357 // 358 // when this variable is set to false, 359 // it means we don't have any specific 360 // preferences for delivering bookmark events. 361 func (wc *watchChan) startWatching(watchClosedCh chan struct{}, initialEventsEndBookmarkRequired, forceInitialEvents bool) { 362 if wc.initialRev > 0 && forceInitialEvents { 363 currentStorageRV, err := wc.watcher.getCurrentStorageRV(wc.ctx) 364 if err != nil { 365 wc.sendError(err) 366 return 367 } 368 if uint64(wc.initialRev) > currentStorageRV { 369 wc.sendError(storage.NewTooLargeResourceVersionError(uint64(wc.initialRev), currentStorageRV, int(wait.Jitter(1*time.Second, 3).Seconds()))) 370 return 371 } 372 } 373 if forceInitialEvents { 374 if err := wc.sync(); err != nil { 375 klog.Errorf("failed to sync with latest state: %v", err) 376 wc.sendError(err) 377 return 378 } 379 } 380 if initialEventsEndBookmarkRequired { 381 wc.sendEvent(func() *event { 382 e := progressNotifyEvent(wc.initialRev) 383 e.isInitialEventsEndBookmark = true 384 return e 385 }()) 386 } 387 opts := []clientv3.OpOption{clientv3.WithRev(wc.initialRev + 1), clientv3.WithPrevKV()} 388 if wc.recursive { 389 opts = append(opts, clientv3.WithPrefix()) 390 } 391 if wc.progressNotify { 392 opts = append(opts, clientv3.WithProgressNotify()) 393 } 394 wch := wc.watcher.client.Watch(wc.ctx, wc.key, opts...) 395 for wres := range wch { 396 if wres.Err() != nil { 397 err := wres.Err() 398 // If there is an error on server (e.g. compaction), the channel will return it before closed. 399 logWatchChannelErr(err) 400 wc.sendError(err) 401 return 402 } 403 if wres.IsProgressNotify() { 404 wc.sendEvent(progressNotifyEvent(wres.Header.GetRevision())) 405 metrics.RecordEtcdBookmark(wc.watcher.groupResource.String()) 406 continue 407 } 408 409 for _, e := range wres.Events { 410 metrics.RecordEtcdEvent(wc.watcher.groupResource.String()) 411 parsedEvent, err := parseEvent(e) 412 if err != nil { 413 logWatchChannelErr(err) 414 wc.sendError(err) 415 return 416 } 417 wc.sendEvent(parsedEvent) 418 } 419 } 420 // When we come to this point, it's only possible that client side ends the watch. 421 // e.g. cancel the context, close the client. 422 // If this watch chan is broken and context isn't cancelled, other goroutines will still hang. 423 // We should notify the main thread that this goroutine has exited. 424 close(watchClosedCh) 425 } 426 427 // processEvent processes events from etcd watcher and sends results to resultChan. 428 func (wc *watchChan) processEvent(wg *sync.WaitGroup) { 429 defer wg.Done() 430 431 for { 432 select { 433 case e := <-wc.incomingEventChan: 434 res := wc.transform(e) 435 if res == nil { 436 continue 437 } 438 if len(wc.resultChan) == outgoingBufSize { 439 klog.V(3).InfoS("Fast watcher, slow processing. Probably caused by slow dispatching events to watchers", "outgoingEvents", outgoingBufSize, "objectType", wc.watcher.objectType, "groupResource", wc.watcher.groupResource) 440 } 441 // If user couldn't receive results fast enough, we also block incoming events from watcher. 442 // Because storing events in local will cause more memory usage. 443 // The worst case would be closing the fast watcher. 444 select { 445 case wc.resultChan <- *res: 446 case <-wc.ctx.Done(): 447 return 448 } 449 case <-wc.ctx.Done(): 450 return 451 } 452 } 453 } 454 455 func (wc *watchChan) filter(obj runtime.Object) bool { 456 if wc.internalPred.Empty() { 457 return true 458 } 459 matched, err := wc.internalPred.Matches(obj) 460 return err == nil && matched 461 } 462 463 func (wc *watchChan) acceptAll() bool { 464 return wc.internalPred.Empty() 465 } 466 467 // transform transforms an event into a result for user if not filtered. 468 func (wc *watchChan) transform(e *event) (res *watch.Event) { 469 curObj, oldObj, err := wc.prepareObjs(e) 470 if err != nil { 471 klog.Errorf("failed to prepare current and previous objects: %v", err) 472 wc.sendError(err) 473 return nil 474 } 475 476 switch { 477 case e.isProgressNotify: 478 object := wc.watcher.newFunc() 479 if err := wc.watcher.versioner.UpdateObject(object, uint64(e.rev)); err != nil { 480 klog.Errorf("failed to propagate object version: %v", err) 481 return nil 482 } 483 if e.isInitialEventsEndBookmark { 484 if err := storage.AnnotateInitialEventsEndBookmark(object); err != nil { 485 wc.sendError(fmt.Errorf("error while accessing object's metadata gr: %v, type: %v, obj: %#v, err: %v", wc.watcher.groupResource, wc.watcher.objectType, object, err)) 486 return nil 487 } 488 } 489 res = &watch.Event{ 490 Type: watch.Bookmark, 491 Object: object, 492 } 493 case e.isDeleted: 494 if !wc.filter(oldObj) { 495 return nil 496 } 497 res = &watch.Event{ 498 Type: watch.Deleted, 499 Object: oldObj, 500 } 501 case e.isCreated: 502 if !wc.filter(curObj) { 503 return nil 504 } 505 res = &watch.Event{ 506 Type: watch.Added, 507 Object: curObj, 508 } 509 default: 510 if wc.acceptAll() { 511 res = &watch.Event{ 512 Type: watch.Modified, 513 Object: curObj, 514 } 515 return res 516 } 517 curObjPasses := wc.filter(curObj) 518 oldObjPasses := wc.filter(oldObj) 519 switch { 520 case curObjPasses && oldObjPasses: 521 res = &watch.Event{ 522 Type: watch.Modified, 523 Object: curObj, 524 } 525 case curObjPasses && !oldObjPasses: 526 res = &watch.Event{ 527 Type: watch.Added, 528 Object: curObj, 529 } 530 case !curObjPasses && oldObjPasses: 531 res = &watch.Event{ 532 Type: watch.Deleted, 533 Object: oldObj, 534 } 535 } 536 } 537 return res 538 } 539 540 func transformErrorToEvent(err error) *watch.Event { 541 err = interpretWatchError(err) 542 if _, ok := err.(apierrors.APIStatus); !ok { 543 err = apierrors.NewInternalError(err) 544 } 545 status := err.(apierrors.APIStatus).Status() 546 return &watch.Event{ 547 Type: watch.Error, 548 Object: &status, 549 } 550 } 551 552 func (wc *watchChan) sendError(err error) { 553 select { 554 case wc.errChan <- err: 555 case <-wc.ctx.Done(): 556 } 557 } 558 559 func (wc *watchChan) sendEvent(e *event) { 560 if len(wc.incomingEventChan) == incomingBufSize { 561 klog.V(3).InfoS("Fast watcher, slow processing. Probably caused by slow decoding, user not receiving fast, or other processing logic", "incomingEvents", incomingBufSize, "objectType", wc.watcher.objectType, "groupResource", wc.watcher.groupResource) 562 } 563 select { 564 case wc.incomingEventChan <- e: 565 case <-wc.ctx.Done(): 566 } 567 } 568 569 func (wc *watchChan) prepareObjs(e *event) (curObj runtime.Object, oldObj runtime.Object, err error) { 570 if e.isProgressNotify { 571 // progressNotify events doesn't contain neither current nor previous object version, 572 return nil, nil, nil 573 } 574 575 if !e.isDeleted { 576 data, _, err := wc.watcher.transformer.TransformFromStorage(wc.ctx, e.value, authenticatedDataString(e.key)) 577 if err != nil { 578 return nil, nil, err 579 } 580 curObj, err = decodeObj(wc.watcher.codec, wc.watcher.versioner, data, e.rev) 581 if err != nil { 582 return nil, nil, err 583 } 584 } 585 // We need to decode prevValue, only if this is deletion event or 586 // the underlying filter doesn't accept all objects (otherwise we 587 // know that the filter for previous object will return true and 588 // we need the object only to compute whether it was filtered out 589 // before). 590 if len(e.prevValue) > 0 && (e.isDeleted || !wc.acceptAll()) { 591 data, _, err := wc.watcher.transformer.TransformFromStorage(wc.ctx, e.prevValue, authenticatedDataString(e.key)) 592 if err != nil { 593 return nil, nil, err 594 } 595 // Note that this sends the *old* object with the etcd revision for the time at 596 // which it gets deleted. 597 oldObj, err = decodeObj(wc.watcher.codec, wc.watcher.versioner, data, e.rev) 598 if err != nil { 599 return nil, nil, err 600 } 601 } 602 return curObj, oldObj, nil 603 } 604 605 func decodeObj(codec runtime.Codec, versioner storage.Versioner, data []byte, rev int64) (_ runtime.Object, err error) { 606 obj, err := runtime.Decode(codec, []byte(data)) 607 if err != nil { 608 if fatalOnDecodeError { 609 // we are running in a test environment and thus an 610 // error here is due to a coder mistake if the defer 611 // does not catch it 612 panic(err) 613 } 614 return nil, err 615 } 616 // ensure resource version is set on the object we load from etcd 617 if err := versioner.UpdateObject(obj, uint64(rev)); err != nil { 618 return nil, fmt.Errorf("failure to version api object (%d) %#v: %v", rev, obj, err) 619 } 620 return obj, nil 621 }