k8s.io/apiserver@v0.29.3/pkg/storage/etcd3/store.go (about) 1 /* 2 Copyright 2016 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package etcd3 18 19 import ( 20 "bytes" 21 "context" 22 "errors" 23 "fmt" 24 "path" 25 "reflect" 26 "strings" 27 "time" 28 29 clientv3 "go.etcd.io/etcd/client/v3" 30 "go.opentelemetry.io/otel/attribute" 31 32 apierrors "k8s.io/apimachinery/pkg/api/errors" 33 "k8s.io/apimachinery/pkg/api/meta" 34 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 35 "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" 36 "k8s.io/apimachinery/pkg/conversion" 37 "k8s.io/apimachinery/pkg/runtime" 38 "k8s.io/apimachinery/pkg/runtime/schema" 39 "k8s.io/apimachinery/pkg/watch" 40 "k8s.io/apiserver/pkg/audit" 41 "k8s.io/apiserver/pkg/storage" 42 "k8s.io/apiserver/pkg/storage/etcd3/metrics" 43 "k8s.io/apiserver/pkg/storage/value" 44 "k8s.io/component-base/tracing" 45 "k8s.io/klog/v2" 46 ) 47 48 const ( 49 // maxLimit is a maximum page limit increase used when fetching objects from etcd. 50 // This limit is used only for increasing page size by kube-apiserver. If request 51 // specifies larger limit initially, it won't be changed. 52 maxLimit = 10000 53 ) 54 55 // authenticatedDataString satisfies the value.Context interface. It uses the key to 56 // authenticate the stored data. 
This does not defend against reuse of previously 57 // encrypted values under the same key, but will prevent an attacker from using an 58 // encrypted value from a different key. A stronger authenticated data segment would 59 // include the etcd3 Version field (which is incremented on each write to a key and 60 // reset when the key is deleted), but an attacker with write access to etcd can 61 // force deletion and recreation of keys to weaken that angle. 62 type authenticatedDataString string 63 64 // AuthenticatedData implements the value.Context interface. 65 func (d authenticatedDataString) AuthenticatedData() []byte { 66 return []byte(string(d)) 67 } 68 69 var _ value.Context = authenticatedDataString("") 70 71 type store struct { 72 client *clientv3.Client 73 codec runtime.Codec 74 versioner storage.Versioner 75 transformer value.Transformer 76 pathPrefix string 77 groupResource schema.GroupResource 78 groupResourceString string 79 watcher *watcher 80 leaseManager *leaseManager 81 } 82 83 func (s *store) RequestWatchProgress(ctx context.Context) error { 84 // Use watchContext to match ctx metadata provided when creating the watch. 85 // In best case scenario we would use the same context that watch was created, but there is no way access it from watchCache. 86 return s.client.RequestProgress(s.watchContext(ctx)) 87 } 88 89 type objState struct { 90 obj runtime.Object 91 meta *storage.ResponseMeta 92 rev int64 93 data []byte 94 stale bool 95 } 96 97 // New returns an etcd3 implementation of storage.Interface. 
98 func New(c *clientv3.Client, codec runtime.Codec, newFunc, newListFunc func() runtime.Object, prefix, resourcePrefix string, groupResource schema.GroupResource, transformer value.Transformer, leaseManagerConfig LeaseManagerConfig) storage.Interface { 99 return newStore(c, codec, newFunc, newListFunc, prefix, resourcePrefix, groupResource, transformer, leaseManagerConfig) 100 } 101 102 func newStore(c *clientv3.Client, codec runtime.Codec, newFunc, newListFunc func() runtime.Object, prefix, resourcePrefix string, groupResource schema.GroupResource, transformer value.Transformer, leaseManagerConfig LeaseManagerConfig) *store { 103 versioner := storage.APIObjectVersioner{} 104 // for compatibility with etcd2 impl. 105 // no-op for default prefix of '/registry'. 106 // keeps compatibility with etcd2 impl for custom prefixes that don't start with '/' 107 pathPrefix := path.Join("/", prefix) 108 if !strings.HasSuffix(pathPrefix, "/") { 109 // Ensure the pathPrefix ends in "/" here to simplify key concatenation later. 110 pathPrefix += "/" 111 } 112 113 w := &watcher{ 114 client: c, 115 codec: codec, 116 newFunc: newFunc, 117 groupResource: groupResource, 118 versioner: versioner, 119 transformer: transformer, 120 } 121 if newFunc == nil { 122 w.objectType = "<unknown>" 123 } else { 124 w.objectType = reflect.TypeOf(newFunc()).String() 125 } 126 s := &store{ 127 client: c, 128 codec: codec, 129 versioner: versioner, 130 transformer: transformer, 131 pathPrefix: pathPrefix, 132 groupResource: groupResource, 133 groupResourceString: groupResource.String(), 134 watcher: w, 135 leaseManager: newDefaultLeaseManager(c, leaseManagerConfig), 136 } 137 138 w.getCurrentStorageRV = func(ctx context.Context) (uint64, error) { 139 return storage.GetCurrentResourceVersionFromStorage(ctx, s, newListFunc, resourcePrefix, w.objectType) 140 } 141 return s 142 } 143 144 // Versioner implements storage.Interface.Versioner. 
145 func (s *store) Versioner() storage.Versioner { 146 return s.versioner 147 } 148 149 // Get implements storage.Interface.Get. 150 func (s *store) Get(ctx context.Context, key string, opts storage.GetOptions, out runtime.Object) error { 151 preparedKey, err := s.prepareKey(key) 152 if err != nil { 153 return err 154 } 155 startTime := time.Now() 156 getResp, err := s.client.KV.Get(ctx, preparedKey) 157 metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime) 158 if err != nil { 159 return err 160 } 161 if err = s.validateMinimumResourceVersion(opts.ResourceVersion, uint64(getResp.Header.Revision)); err != nil { 162 return err 163 } 164 165 if len(getResp.Kvs) == 0 { 166 if opts.IgnoreNotFound { 167 return runtime.SetZeroValue(out) 168 } 169 return storage.NewKeyNotFoundError(preparedKey, 0) 170 } 171 kv := getResp.Kvs[0] 172 173 data, _, err := s.transformer.TransformFromStorage(ctx, kv.Value, authenticatedDataString(preparedKey)) 174 if err != nil { 175 return storage.NewInternalError(err.Error()) 176 } 177 178 err = decode(s.codec, s.versioner, data, out, kv.ModRevision) 179 if err != nil { 180 recordDecodeError(s.groupResourceString, preparedKey) 181 return err 182 } 183 return nil 184 } 185 186 // Create implements storage.Interface.Create. 
187 func (s *store) Create(ctx context.Context, key string, obj, out runtime.Object, ttl uint64) error { 188 preparedKey, err := s.prepareKey(key) 189 if err != nil { 190 return err 191 } 192 ctx, span := tracing.Start(ctx, "Create etcd3", 193 attribute.String("audit-id", audit.GetAuditIDTruncated(ctx)), 194 attribute.String("key", key), 195 attribute.String("type", getTypeName(obj)), 196 attribute.String("resource", s.groupResourceString), 197 ) 198 defer span.End(500 * time.Millisecond) 199 if version, err := s.versioner.ObjectResourceVersion(obj); err == nil && version != 0 { 200 return storage.ErrResourceVersionSetOnCreate 201 } 202 if err := s.versioner.PrepareObjectForStorage(obj); err != nil { 203 return fmt.Errorf("PrepareObjectForStorage failed: %v", err) 204 } 205 span.AddEvent("About to Encode") 206 data, err := runtime.Encode(s.codec, obj) 207 if err != nil { 208 span.AddEvent("Encode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error())) 209 return err 210 } 211 span.AddEvent("Encode succeeded", attribute.Int("len", len(data))) 212 213 opts, err := s.ttlOpts(ctx, int64(ttl)) 214 if err != nil { 215 return err 216 } 217 218 newData, err := s.transformer.TransformToStorage(ctx, data, authenticatedDataString(preparedKey)) 219 if err != nil { 220 span.AddEvent("TransformToStorage failed", attribute.String("err", err.Error())) 221 return storage.NewInternalError(err.Error()) 222 } 223 span.AddEvent("TransformToStorage succeeded") 224 225 startTime := time.Now() 226 txnResp, err := s.client.KV.Txn(ctx).If( 227 notFound(preparedKey), 228 ).Then( 229 clientv3.OpPut(preparedKey, string(newData), opts...), 230 ).Commit() 231 metrics.RecordEtcdRequest("create", s.groupResourceString, err, startTime) 232 if err != nil { 233 span.AddEvent("Txn call failed", attribute.String("err", err.Error())) 234 return err 235 } 236 span.AddEvent("Txn call succeeded") 237 238 if !txnResp.Succeeded { 239 return storage.NewKeyExistsError(preparedKey, 0) 
240 } 241 242 if out != nil { 243 putResp := txnResp.Responses[0].GetResponsePut() 244 err = decode(s.codec, s.versioner, data, out, putResp.Header.Revision) 245 if err != nil { 246 span.AddEvent("decode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error())) 247 recordDecodeError(s.groupResourceString, preparedKey) 248 return err 249 } 250 span.AddEvent("decode succeeded", attribute.Int("len", len(data))) 251 } 252 return nil 253 } 254 255 // Delete implements storage.Interface.Delete. 256 func (s *store) Delete( 257 ctx context.Context, key string, out runtime.Object, preconditions *storage.Preconditions, 258 validateDeletion storage.ValidateObjectFunc, cachedExistingObject runtime.Object) error { 259 preparedKey, err := s.prepareKey(key) 260 if err != nil { 261 return err 262 } 263 v, err := conversion.EnforcePtr(out) 264 if err != nil { 265 return fmt.Errorf("unable to convert output object to pointer: %v", err) 266 } 267 return s.conditionalDelete(ctx, preparedKey, out, v, preconditions, validateDeletion, cachedExistingObject) 268 } 269 270 func (s *store) conditionalDelete( 271 ctx context.Context, key string, out runtime.Object, v reflect.Value, preconditions *storage.Preconditions, 272 validateDeletion storage.ValidateObjectFunc, cachedExistingObject runtime.Object) error { 273 getCurrentState := s.getCurrentState(ctx, key, v, false) 274 275 var origState *objState 276 var err error 277 var origStateIsCurrent bool 278 if cachedExistingObject != nil { 279 origState, err = s.getStateFromObject(cachedExistingObject) 280 } else { 281 origState, err = getCurrentState() 282 origStateIsCurrent = true 283 } 284 if err != nil { 285 return err 286 } 287 288 for { 289 if preconditions != nil { 290 if err := preconditions.Check(key, origState.obj); err != nil { 291 if origStateIsCurrent { 292 return err 293 } 294 295 // It's possible we're working with stale data. 
296 // Remember the revision of the potentially stale data and the resulting update error 297 cachedRev := origState.rev 298 cachedUpdateErr := err 299 300 // Actually fetch 301 origState, err = getCurrentState() 302 if err != nil { 303 return err 304 } 305 origStateIsCurrent = true 306 307 // it turns out our cached data was not stale, return the error 308 if cachedRev == origState.rev { 309 return cachedUpdateErr 310 } 311 312 // Retry 313 continue 314 } 315 } 316 if err := validateDeletion(ctx, origState.obj); err != nil { 317 if origStateIsCurrent { 318 return err 319 } 320 321 // It's possible we're working with stale data. 322 // Remember the revision of the potentially stale data and the resulting update error 323 cachedRev := origState.rev 324 cachedUpdateErr := err 325 326 // Actually fetch 327 origState, err = getCurrentState() 328 if err != nil { 329 return err 330 } 331 origStateIsCurrent = true 332 333 // it turns out our cached data was not stale, return the error 334 if cachedRev == origState.rev { 335 return cachedUpdateErr 336 } 337 338 // Retry 339 continue 340 } 341 342 startTime := time.Now() 343 txnResp, err := s.client.KV.Txn(ctx).If( 344 clientv3.Compare(clientv3.ModRevision(key), "=", origState.rev), 345 ).Then( 346 clientv3.OpDelete(key), 347 ).Else( 348 clientv3.OpGet(key), 349 ).Commit() 350 metrics.RecordEtcdRequest("delete", s.groupResourceString, err, startTime) 351 if err != nil { 352 return err 353 } 354 if !txnResp.Succeeded { 355 getResp := (*clientv3.GetResponse)(txnResp.Responses[0].GetResponseRange()) 356 klog.V(4).Infof("deletion of %s failed because of a conflict, going to retry", key) 357 origState, err = s.getState(ctx, getResp, key, v, false) 358 if err != nil { 359 return err 360 } 361 origStateIsCurrent = true 362 continue 363 } 364 365 if len(txnResp.Responses) == 0 || txnResp.Responses[0].GetResponseDeleteRange() == nil { 366 return errors.New(fmt.Sprintf("invalid DeleteRange response: %v", txnResp.Responses)) 367 } 368 
deleteResp := txnResp.Responses[0].GetResponseDeleteRange() 369 if deleteResp.Header == nil { 370 return errors.New("invalid DeleteRange response - nil header") 371 } 372 err = decode(s.codec, s.versioner, origState.data, out, deleteResp.Header.Revision) 373 if err != nil { 374 recordDecodeError(s.groupResourceString, key) 375 return err 376 } 377 return nil 378 } 379 } 380 381 // GuaranteedUpdate implements storage.Interface.GuaranteedUpdate. 382 func (s *store) GuaranteedUpdate( 383 ctx context.Context, key string, destination runtime.Object, ignoreNotFound bool, 384 preconditions *storage.Preconditions, tryUpdate storage.UpdateFunc, cachedExistingObject runtime.Object) error { 385 preparedKey, err := s.prepareKey(key) 386 if err != nil { 387 return err 388 } 389 ctx, span := tracing.Start(ctx, "GuaranteedUpdate etcd3", 390 attribute.String("audit-id", audit.GetAuditIDTruncated(ctx)), 391 attribute.String("key", key), 392 attribute.String("type", getTypeName(destination)), 393 attribute.String("resource", s.groupResourceString)) 394 defer span.End(500 * time.Millisecond) 395 396 v, err := conversion.EnforcePtr(destination) 397 if err != nil { 398 return fmt.Errorf("unable to convert output object to pointer: %v", err) 399 } 400 401 getCurrentState := s.getCurrentState(ctx, preparedKey, v, ignoreNotFound) 402 403 var origState *objState 404 var origStateIsCurrent bool 405 if cachedExistingObject != nil { 406 origState, err = s.getStateFromObject(cachedExistingObject) 407 } else { 408 origState, err = getCurrentState() 409 origStateIsCurrent = true 410 } 411 if err != nil { 412 return err 413 } 414 span.AddEvent("initial value restored") 415 416 transformContext := authenticatedDataString(preparedKey) 417 for { 418 if err := preconditions.Check(preparedKey, origState.obj); err != nil { 419 // If our data is already up to date, return the error 420 if origStateIsCurrent { 421 return err 422 } 423 424 // It's possible we were working with stale data 425 // Actually 
fetch 426 origState, err = getCurrentState() 427 if err != nil { 428 return err 429 } 430 origStateIsCurrent = true 431 // Retry 432 continue 433 } 434 435 ret, ttl, err := s.updateState(origState, tryUpdate) 436 if err != nil { 437 // If our data is already up to date, return the error 438 if origStateIsCurrent { 439 return err 440 } 441 442 // It's possible we were working with stale data 443 // Remember the revision of the potentially stale data and the resulting update error 444 cachedRev := origState.rev 445 cachedUpdateErr := err 446 447 // Actually fetch 448 origState, err = getCurrentState() 449 if err != nil { 450 return err 451 } 452 origStateIsCurrent = true 453 454 // it turns out our cached data was not stale, return the error 455 if cachedRev == origState.rev { 456 return cachedUpdateErr 457 } 458 459 // Retry 460 continue 461 } 462 463 span.AddEvent("About to Encode") 464 data, err := runtime.Encode(s.codec, ret) 465 if err != nil { 466 span.AddEvent("Encode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error())) 467 return err 468 } 469 span.AddEvent("Encode succeeded", attribute.Int("len", len(data))) 470 if !origState.stale && bytes.Equal(data, origState.data) { 471 // if we skipped the original Get in this loop, we must refresh from 472 // etcd in order to be sure the data in the store is equivalent to 473 // our desired serialization 474 if !origStateIsCurrent { 475 origState, err = getCurrentState() 476 if err != nil { 477 return err 478 } 479 origStateIsCurrent = true 480 if !bytes.Equal(data, origState.data) { 481 // original data changed, restart loop 482 continue 483 } 484 } 485 // recheck that the data from etcd is not stale before short-circuiting a write 486 if !origState.stale { 487 err = decode(s.codec, s.versioner, origState.data, destination, origState.rev) 488 if err != nil { 489 recordDecodeError(s.groupResourceString, preparedKey) 490 return err 491 } 492 return nil 493 } 494 } 495 496 newData, err := 
s.transformer.TransformToStorage(ctx, data, transformContext) 497 if err != nil { 498 span.AddEvent("TransformToStorage failed", attribute.String("err", err.Error())) 499 return storage.NewInternalError(err.Error()) 500 } 501 span.AddEvent("TransformToStorage succeeded") 502 503 opts, err := s.ttlOpts(ctx, int64(ttl)) 504 if err != nil { 505 return err 506 } 507 span.AddEvent("Transaction prepared") 508 509 startTime := time.Now() 510 txnResp, err := s.client.KV.Txn(ctx).If( 511 clientv3.Compare(clientv3.ModRevision(preparedKey), "=", origState.rev), 512 ).Then( 513 clientv3.OpPut(preparedKey, string(newData), opts...), 514 ).Else( 515 clientv3.OpGet(preparedKey), 516 ).Commit() 517 metrics.RecordEtcdRequest("update", s.groupResourceString, err, startTime) 518 if err != nil { 519 span.AddEvent("Txn call failed", attribute.String("err", err.Error())) 520 return err 521 } 522 span.AddEvent("Txn call completed") 523 span.AddEvent("Transaction committed") 524 if !txnResp.Succeeded { 525 getResp := (*clientv3.GetResponse)(txnResp.Responses[0].GetResponseRange()) 526 klog.V(4).Infof("GuaranteedUpdate of %s failed because of a conflict, going to retry", preparedKey) 527 origState, err = s.getState(ctx, getResp, preparedKey, v, ignoreNotFound) 528 if err != nil { 529 return err 530 } 531 span.AddEvent("Retry value restored") 532 origStateIsCurrent = true 533 continue 534 } 535 putResp := txnResp.Responses[0].GetResponsePut() 536 537 err = decode(s.codec, s.versioner, data, destination, putResp.Header.Revision) 538 if err != nil { 539 span.AddEvent("decode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error())) 540 recordDecodeError(s.groupResourceString, preparedKey) 541 return err 542 } 543 span.AddEvent("decode succeeded", attribute.Int("len", len(data))) 544 return nil 545 } 546 } 547 548 func getNewItemFunc(listObj runtime.Object, v reflect.Value) func() runtime.Object { 549 // For unstructured lists with a target group/version, preserve the 
group/version in the instantiated list items 550 if unstructuredList, isUnstructured := listObj.(*unstructured.UnstructuredList); isUnstructured { 551 if apiVersion := unstructuredList.GetAPIVersion(); len(apiVersion) > 0 { 552 return func() runtime.Object { 553 return &unstructured.Unstructured{Object: map[string]interface{}{"apiVersion": apiVersion}} 554 } 555 } 556 } 557 558 // Otherwise just instantiate an empty item 559 elem := v.Type().Elem() 560 return func() runtime.Object { 561 return reflect.New(elem).Interface().(runtime.Object) 562 } 563 } 564 565 func (s *store) Count(key string) (int64, error) { 566 preparedKey, err := s.prepareKey(key) 567 if err != nil { 568 return 0, err 569 } 570 571 // We need to make sure the key ended with "/" so that we only get children "directories". 572 // e.g. if we have key "/a", "/a/b", "/ab", getting keys with prefix "/a" will return all three, 573 // while with prefix "/a/" will return only "/a/b" which is the correct answer. 574 if !strings.HasSuffix(preparedKey, "/") { 575 preparedKey += "/" 576 } 577 578 startTime := time.Now() 579 getResp, err := s.client.KV.Get(context.Background(), preparedKey, clientv3.WithRange(clientv3.GetPrefixRangeEnd(preparedKey)), clientv3.WithCountOnly()) 580 metrics.RecordEtcdRequest("listWithCount", preparedKey, err, startTime) 581 if err != nil { 582 return 0, err 583 } 584 return getResp.Count, nil 585 } 586 587 // GetList implements storage.Interface. 
// GetList reads the value(s) stored under key into the items slice of listObj.
// With opts.Recursive every key below the prefix is listed; otherwise only the
// exact key is read. Results are filtered through opts.Predicate. When the
// predicate carries a limit the list is paged: the returned list is stamped with
// a continue token (and, for an empty predicate, a remaining item count) whenever
// more results remain.
func (s *store) GetList(ctx context.Context, key string, opts storage.ListOptions, listObj runtime.Object) error {
	preparedKey, err := s.prepareKey(key)
	if err != nil {
		return err
	}
	ctx, span := tracing.Start(ctx, fmt.Sprintf("List(recursive=%v) etcd3", opts.Recursive),
		attribute.String("audit-id", audit.GetAuditIDTruncated(ctx)),
		attribute.String("key", key),
		attribute.String("resourceVersion", opts.ResourceVersion),
		attribute.String("resourceVersionMatch", string(opts.ResourceVersionMatch)),
		attribute.Int("limit", int(opts.Predicate.Limit)),
		attribute.String("continue", opts.Predicate.Continue))
	defer span.End(500 * time.Millisecond)
	listPtr, err := meta.GetItemsPtr(listObj)
	if err != nil {
		return err
	}
	// v is the items slice itself; decoded objects are appended to it below.
	v, err := conversion.EnforcePtr(listPtr)
	if err != nil || v.Kind() != reflect.Slice {
		return fmt.Errorf("need ptr to slice: %v", err)
	}

	// For recursive lists, we need to make sure the key ended with "/" so that we only
	// get children "directories". e.g. if we have key "/a", "/a/b", "/ab", getting keys
	// with prefix "/a" will return all three, while with prefix "/a/" will return only
	// "/a/b" which is the correct answer.
	if opts.Recursive && !strings.HasSuffix(preparedKey, "/") {
		preparedKey += "/"
	}
	// keyPrefix stays fixed for the whole request; preparedKey is advanced while paging.
	keyPrefix := preparedKey

	// set the appropriate clientv3 options to filter the returned data set
	// limitOption points at the WithLimit entry inside options so that the page size
	// can be rewritten in place when the loop below grows it.
	var limitOption *clientv3.OpOption
	limit := opts.Predicate.Limit
	var paging bool
	options := make([]clientv3.OpOption, 0, 4)
	if opts.Predicate.Limit > 0 {
		paging = true
		options = append(options, clientv3.WithLimit(limit))
		limitOption = &options[len(options)-1]
	}

	if opts.Recursive {
		rangeEnd := clientv3.GetPrefixRangeEnd(keyPrefix)
		options = append(options, clientv3.WithRange(rangeEnd))
	}

	newItemFunc := getNewItemFunc(listObj, v)

	// withRev is the etcd revision to read at; 0 means "not pinned yet".
	var continueRV, withRev int64
	var continueKey string
	switch {
	case opts.Recursive && len(opts.Predicate.Continue) > 0:
		continueKey, continueRV, err = storage.DecodeContinue(opts.Predicate.Continue, keyPrefix)
		if err != nil {
			return apierrors.NewBadRequest(fmt.Sprintf("invalid continue token: %v", err))
		}

		if len(opts.ResourceVersion) > 0 && opts.ResourceVersion != "0" {
			return apierrors.NewBadRequest("specifying resource version is not allowed when using continue")
		}
		preparedKey = continueKey
		// If continueRV > 0, the LIST request needs a specific resource version.
		// continueRV==0 is invalid.
		// If continueRV < 0, the request is for the latest resource version.
		if continueRV > 0 {
			withRev = continueRV
		}
	case len(opts.ResourceVersion) > 0:
		parsedRV, err := s.versioner.ParseResourceVersion(opts.ResourceVersion)
		if err != nil {
			return apierrors.NewBadRequest(fmt.Sprintf("invalid resource version: %v", err))
		}
		switch opts.ResourceVersionMatch {
		case metav1.ResourceVersionMatchNotOlderThan:
			// The not older than constraint is checked after we get a response from etcd,
			// and returnedRV is then set to the revision we get from the etcd response.
		case metav1.ResourceVersionMatchExact:
			withRev = int64(parsedRV)
		case "": // legacy case
			if opts.Recursive && opts.Predicate.Limit > 0 && parsedRV > 0 {
				withRev = int64(parsedRV)
			}
		default:
			return fmt.Errorf("unknown ResourceVersionMatch value: %v", opts.ResourceVersionMatch)
		}
	}

	if withRev != 0 {
		options = append(options, clientv3.WithRev(withRev))
	}

	// loop until we have filled the requested limit from etcd or there are no more results
	var lastKey []byte
	var hasMore bool
	var getResp *clientv3.GetResponse
	var numFetched int
	var numEvald int
	// Because these metrics are for understanding the costs of handling LIST requests,
	// get them recorded even in error cases.
	defer func() {
		numReturn := v.Len()
		metrics.RecordStorageListMetrics(s.groupResourceString, numFetched, numEvald, numReturn)
	}()

	metricsOp := "get"
	if opts.Recursive {
		metricsOp = "list"
	}

	for {
		startTime := time.Now()
		getResp, err = s.client.KV.Get(ctx, preparedKey, options...)
		metrics.RecordEtcdRequest(metricsOp, s.groupResourceString, err, startTime)
		if err != nil {
			return interpretListError(err, len(opts.Predicate.Continue) > 0, continueKey, keyPrefix)
		}
		numFetched += len(getResp.Kvs)
		if err = s.validateMinimumResourceVersion(opts.ResourceVersion, uint64(getResp.Header.Revision)); err != nil {
			return err
		}
		hasMore = getResp.More

		if len(getResp.Kvs) == 0 && getResp.More {
			return fmt.Errorf("no results were found, but etcd indicated there were more values remaining")
		}
		// indicate to the client which resource version was returned, and use the same resource version for subsequent requests.
		if withRev == 0 {
			withRev = getResp.Header.Revision
			options = append(options, clientv3.WithRev(withRev))
		}

		// avoid small allocations for the result slice, since this can be called in many
		// different contexts and we don't know how significantly the result will be filtered
		if opts.Predicate.Empty() {
			// Nothing is filtered out, so make room for the whole page.
			growSlice(v, len(getResp.Kvs))
		} else {
			// Filtering may drop items, so pre-allocation is capped at 2048.
			growSlice(v, 2048, len(getResp.Kvs))
		}

		// take items from the response until the bucket is full, filtering as we go
		for i, kv := range getResp.Kvs {
			if paging && int64(v.Len()) >= opts.Predicate.Limit {
				hasMore = true
				break
			}
			lastKey = kv.Key

			data, _, err := s.transformer.TransformFromStorage(ctx, kv.Value, authenticatedDataString(kv.Key))
			if err != nil {
				return storage.NewInternalErrorf("unable to transform key %q: %v", kv.Key, err)
			}

			if err := appendListItem(v, data, uint64(kv.ModRevision), opts.Predicate, s.codec, s.versioner, newItemFunc); err != nil {
				recordDecodeError(s.groupResourceString, string(kv.Key))
				return err
			}
			numEvald++

			// free kv early. Long lists can take O(seconds) to decode.
			getResp.Kvs[i] = nil
		}

		// no more results remain or we didn't request paging
		if !hasMore || !paging {
			break
		}
		// we're paging but we have filled our bucket
		if int64(v.Len()) >= opts.Predicate.Limit {
			break
		}

		if limit < maxLimit {
			// We got incomplete result due to field/label selector dropping the object.
			// Double page size to reduce total number of calls to etcd.
			limit *= 2
			if limit > maxLimit {
				limit = maxLimit
			}
			*limitOption = clientv3.WithLimit(limit)
		}
		// Resume immediately after the last key that was evaluated.
		preparedKey = string(lastKey) + "\x00"
	}

	if v.IsNil() {
		// Ensure that we never return a nil Items pointer in the result for consistency.
		v.Set(reflect.MakeSlice(v.Type(), 0, 0))
	}

	// instruct the client to begin querying from immediately after the last key we returned
	// we never return a key that the client wouldn't be allowed to see
	if hasMore {
		// we want to start immediately after the last key
		next, err := storage.EncodeContinue(string(lastKey)+"\x00", keyPrefix, withRev)
		if err != nil {
			return err
		}
		var remainingItemCount *int64
		// getResp.Count counts in objects that do not match the pred.
		// Instead of returning inaccurate count for non-empty selectors, we return nil.
		// Only set remainingItemCount if the predicate is empty.
		if opts.Predicate.Empty() {
			c := int64(getResp.Count - opts.Predicate.Limit)
			remainingItemCount = &c
		}
		return s.versioner.UpdateList(listObj, uint64(withRev), next, remainingItemCount)
	}

	// no continuation
	return s.versioner.UpdateList(listObj, uint64(withRev), "", nil)
}

// growSlice takes a slice value and grows its capacity up
// to the maximum of the passed sizes or maxCapacity, whichever
// is smaller. Above maxCapacity decisions about allocation are left
// to the Go runtime on append. This allows a caller to make an
// educated guess about the potential size of the total list while
// still avoiding overly aggressive initial allocation. If sizes
// is empty maxCapacity will be used as the size to grow.
807 func growSlice(v reflect.Value, maxCapacity int, sizes ...int) { 808 cap := v.Cap() 809 max := cap 810 for _, size := range sizes { 811 if size > max { 812 max = size 813 } 814 } 815 if len(sizes) == 0 || max > maxCapacity { 816 max = maxCapacity 817 } 818 if max <= cap { 819 return 820 } 821 if v.Len() > 0 { 822 extra := reflect.MakeSlice(v.Type(), v.Len(), max) 823 reflect.Copy(extra, v) 824 v.Set(extra) 825 } else { 826 extra := reflect.MakeSlice(v.Type(), 0, max) 827 v.Set(extra) 828 } 829 } 830 831 // Watch implements storage.Interface.Watch. 832 func (s *store) Watch(ctx context.Context, key string, opts storage.ListOptions) (watch.Interface, error) { 833 preparedKey, err := s.prepareKey(key) 834 if err != nil { 835 return nil, err 836 } 837 rev, err := s.versioner.ParseResourceVersion(opts.ResourceVersion) 838 if err != nil { 839 return nil, err 840 } 841 return s.watcher.Watch(s.watchContext(ctx), preparedKey, int64(rev), opts) 842 } 843 844 func (s *store) watchContext(ctx context.Context) context.Context { 845 // The etcd server waits until it cannot find a leader for 3 election 846 // timeouts to cancel existing streams. 3 is currently a hard coded 847 // constant. The election timeout defaults to 1000ms. If the cluster is 848 // healthy, when the leader is stopped, the leadership transfer should be 849 // smooth. (leader transfers its leadership before stopping). If leader is 850 // hard killed, other servers will take an election timeout to realize 851 // leader lost and start campaign. 
852 return clientv3.WithRequireLeader(ctx) 853 } 854 855 func (s *store) getCurrentState(ctx context.Context, key string, v reflect.Value, ignoreNotFound bool) func() (*objState, error) { 856 return func() (*objState, error) { 857 startTime := time.Now() 858 getResp, err := s.client.KV.Get(ctx, key) 859 metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime) 860 if err != nil { 861 return nil, err 862 } 863 return s.getState(ctx, getResp, key, v, ignoreNotFound) 864 } 865 } 866 867 func (s *store) getState(ctx context.Context, getResp *clientv3.GetResponse, key string, v reflect.Value, ignoreNotFound bool) (*objState, error) { 868 state := &objState{ 869 meta: &storage.ResponseMeta{}, 870 } 871 872 if u, ok := v.Addr().Interface().(runtime.Unstructured); ok { 873 state.obj = u.NewEmptyInstance() 874 } else { 875 state.obj = reflect.New(v.Type()).Interface().(runtime.Object) 876 } 877 878 if len(getResp.Kvs) == 0 { 879 if !ignoreNotFound { 880 return nil, storage.NewKeyNotFoundError(key, 0) 881 } 882 if err := runtime.SetZeroValue(state.obj); err != nil { 883 return nil, err 884 } 885 } else { 886 data, stale, err := s.transformer.TransformFromStorage(ctx, getResp.Kvs[0].Value, authenticatedDataString(key)) 887 if err != nil { 888 return nil, storage.NewInternalError(err.Error()) 889 } 890 state.rev = getResp.Kvs[0].ModRevision 891 state.meta.ResourceVersion = uint64(state.rev) 892 state.data = data 893 state.stale = stale 894 if err := decode(s.codec, s.versioner, state.data, state.obj, state.rev); err != nil { 895 recordDecodeError(s.groupResourceString, key) 896 return nil, err 897 } 898 } 899 return state, nil 900 } 901 902 func (s *store) getStateFromObject(obj runtime.Object) (*objState, error) { 903 state := &objState{ 904 obj: obj, 905 meta: &storage.ResponseMeta{}, 906 } 907 908 rv, err := s.versioner.ObjectResourceVersion(obj) 909 if err != nil { 910 return nil, fmt.Errorf("couldn't get resource version: %v", err) 911 } 912 state.rev = 
int64(rv) 913 state.meta.ResourceVersion = uint64(state.rev) 914 915 // Compute the serialized form - for that we need to temporarily clean 916 // its resource version field (those are not stored in etcd). 917 if err := s.versioner.PrepareObjectForStorage(obj); err != nil { 918 return nil, fmt.Errorf("PrepareObjectForStorage failed: %v", err) 919 } 920 state.data, err = runtime.Encode(s.codec, obj) 921 if err != nil { 922 return nil, err 923 } 924 if err := s.versioner.UpdateObject(state.obj, uint64(rv)); err != nil { 925 klog.Errorf("failed to update object version: %v", err) 926 } 927 return state, nil 928 } 929 930 func (s *store) updateState(st *objState, userUpdate storage.UpdateFunc) (runtime.Object, uint64, error) { 931 ret, ttlPtr, err := userUpdate(st.obj, *st.meta) 932 if err != nil { 933 return nil, 0, err 934 } 935 936 if err := s.versioner.PrepareObjectForStorage(ret); err != nil { 937 return nil, 0, fmt.Errorf("PrepareObjectForStorage failed: %v", err) 938 } 939 var ttl uint64 940 if ttlPtr != nil { 941 ttl = *ttlPtr 942 } 943 return ret, ttl, nil 944 } 945 946 // ttlOpts returns client options based on given ttl. 947 // ttl: if ttl is non-zero, it will attach the key to a lease with ttl of roughly the same length 948 func (s *store) ttlOpts(ctx context.Context, ttl int64) ([]clientv3.OpOption, error) { 949 if ttl == 0 { 950 return nil, nil 951 } 952 id, err := s.leaseManager.GetLease(ctx, ttl) 953 if err != nil { 954 return nil, err 955 } 956 return []clientv3.OpOption{clientv3.WithLease(id)}, nil 957 } 958 959 // validateMinimumResourceVersion returns a 'too large resource' version error when the provided minimumResourceVersion is 960 // greater than the most recent actualRevision available from storage. 
961 func (s *store) validateMinimumResourceVersion(minimumResourceVersion string, actualRevision uint64) error { 962 if minimumResourceVersion == "" { 963 return nil 964 } 965 minimumRV, err := s.versioner.ParseResourceVersion(minimumResourceVersion) 966 if err != nil { 967 return apierrors.NewBadRequest(fmt.Sprintf("invalid resource version: %v", err)) 968 } 969 // Enforce the storage.Interface guarantee that the resource version of the returned data 970 // "will be at least 'resourceVersion'". 971 if minimumRV > actualRevision { 972 return storage.NewTooLargeResourceVersionError(minimumRV, actualRevision, 0) 973 } 974 return nil 975 } 976 977 func (s *store) prepareKey(key string) (string, error) { 978 if key == ".." || 979 strings.HasPrefix(key, "../") || 980 strings.HasSuffix(key, "/..") || 981 strings.Contains(key, "/../") { 982 return "", fmt.Errorf("invalid key: %q", key) 983 } 984 if key == "." || 985 strings.HasPrefix(key, "./") || 986 strings.HasSuffix(key, "/.") || 987 strings.Contains(key, "/./") { 988 return "", fmt.Errorf("invalid key: %q", key) 989 } 990 if key == "" || key == "/" { 991 return "", fmt.Errorf("empty key: %q", key) 992 } 993 // We ensured that pathPrefix ends in '/' in construction, so skip any leading '/' in the key now. 994 startIndex := 0 995 if key[0] == '/' { 996 startIndex = 1 997 } 998 return s.pathPrefix + key[startIndex:], nil 999 } 1000 1001 // decode decodes value of bytes into object. It will also set the object resource version to rev. 1002 // On success, objPtr would be set to the object. 
1003 func decode(codec runtime.Codec, versioner storage.Versioner, value []byte, objPtr runtime.Object, rev int64) error { 1004 if _, err := conversion.EnforcePtr(objPtr); err != nil { 1005 return fmt.Errorf("unable to convert output object to pointer: %v", err) 1006 } 1007 _, _, err := codec.Decode(value, nil, objPtr) 1008 if err != nil { 1009 return err 1010 } 1011 // being unable to set the version does not prevent the object from being extracted 1012 if err := versioner.UpdateObject(objPtr, uint64(rev)); err != nil { 1013 klog.Errorf("failed to update object version: %v", err) 1014 } 1015 return nil 1016 } 1017 1018 // appendListItem decodes and appends the object (if it passes filter) to v, which must be a slice. 1019 func appendListItem(v reflect.Value, data []byte, rev uint64, pred storage.SelectionPredicate, codec runtime.Codec, versioner storage.Versioner, newItemFunc func() runtime.Object) error { 1020 obj, _, err := codec.Decode(data, nil, newItemFunc()) 1021 if err != nil { 1022 return err 1023 } 1024 // being unable to set the version does not prevent the object from being extracted 1025 if err := versioner.UpdateObject(obj, rev); err != nil { 1026 klog.Errorf("failed to update object version: %v", err) 1027 } 1028 if matched, err := pred.Matches(obj); err == nil && matched { 1029 v.Set(reflect.Append(v, reflect.ValueOf(obj).Elem())) 1030 } 1031 return nil 1032 } 1033 1034 // recordDecodeError record decode error split by object type. 1035 func recordDecodeError(resource string, key string) { 1036 metrics.RecordDecodeError(resource) 1037 klog.V(4).Infof("Decoding %s \"%s\" failed", resource, key) 1038 } 1039 1040 func notFound(key string) clientv3.Cmp { 1041 return clientv3.Compare(clientv3.ModRevision(key), "=", 0) 1042 } 1043 1044 // getTypeName returns type name of an object for reporting purposes. 1045 func getTypeName(obj interface{}) string { 1046 return reflect.TypeOf(obj).String() 1047 }