k8s.io/apiserver@v0.31.1/pkg/storage/etcd3/store.go (about) 1 /* 2 Copyright 2016 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package etcd3 18 19 import ( 20 "bytes" 21 "context" 22 "errors" 23 "fmt" 24 "path" 25 "reflect" 26 "strings" 27 "time" 28 29 clientv3 "go.etcd.io/etcd/client/v3" 30 "go.opentelemetry.io/otel/attribute" 31 32 apierrors "k8s.io/apimachinery/pkg/api/errors" 33 "k8s.io/apimachinery/pkg/api/meta" 34 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 35 "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" 36 "k8s.io/apimachinery/pkg/conversion" 37 "k8s.io/apimachinery/pkg/runtime" 38 "k8s.io/apimachinery/pkg/runtime/schema" 39 "k8s.io/apimachinery/pkg/watch" 40 "k8s.io/apiserver/pkg/audit" 41 endpointsrequest "k8s.io/apiserver/pkg/endpoints/request" 42 "k8s.io/apiserver/pkg/features" 43 "k8s.io/apiserver/pkg/storage" 44 "k8s.io/apiserver/pkg/storage/etcd3/metrics" 45 etcdfeature "k8s.io/apiserver/pkg/storage/feature" 46 "k8s.io/apiserver/pkg/storage/value" 47 utilfeature "k8s.io/apiserver/pkg/util/feature" 48 "k8s.io/component-base/tracing" 49 "k8s.io/klog/v2" 50 ) 51 52 const ( 53 // maxLimit is a maximum page limit increase used when fetching objects from etcd. 54 // This limit is used only for increasing page size by kube-apiserver. If request 55 // specifies larger limit initially, it won't be changed. 56 maxLimit = 10000 57 ) 58 59 // authenticatedDataString satisfies the value.Context interface. It uses the key to 60 // authenticate the stored data. This does not defend against reuse of previously 61 // encrypted values under the same key, but will prevent an attacker from using an 62 // encrypted value from a different key. A stronger authenticated data segment would 63 // include the etcd3 Version field (which is incremented on each write to a key and 64 // reset when the key is deleted), but an attacker with write access to etcd can 65 // force deletion and recreation of keys to weaken that angle. 66 type authenticatedDataString string 67 68 // AuthenticatedData implements the value.Context interface. 69 func (d authenticatedDataString) AuthenticatedData() []byte { 70 return []byte(string(d)) 71 } 72 73 var _ value.Context = authenticatedDataString("") 74 75 type store struct { 76 client *clientv3.Client 77 codec runtime.Codec 78 versioner storage.Versioner 79 transformer value.Transformer 80 pathPrefix string 81 groupResource schema.GroupResource 82 groupResourceString string 83 watcher *watcher 84 leaseManager *leaseManager 85 } 86 87 func (s *store) RequestWatchProgress(ctx context.Context) error { 88 // Use watchContext to match ctx metadata provided when creating the watch. 89 // In best case scenario we would use the same context that watch was created, but there is no way access it from watchCache. 90 return s.client.RequestProgress(s.watchContext(ctx)) 91 } 92 93 type objState struct { 94 obj runtime.Object 95 meta *storage.ResponseMeta 96 rev int64 97 data []byte 98 stale bool 99 } 100 101 // New returns an etcd3 implementation of storage.Interface. 102 func New(c *clientv3.Client, codec runtime.Codec, newFunc, newListFunc func() runtime.Object, prefix, resourcePrefix string, groupResource schema.GroupResource, transformer value.Transformer, leaseManagerConfig LeaseManagerConfig) storage.Interface { 103 return newStore(c, codec, newFunc, newListFunc, prefix, resourcePrefix, groupResource, transformer, leaseManagerConfig) 104 } 105 106 func newStore(c *clientv3.Client, codec runtime.Codec, newFunc, newListFunc func() runtime.Object, prefix, resourcePrefix string, groupResource schema.GroupResource, transformer value.Transformer, leaseManagerConfig LeaseManagerConfig) *store { 107 versioner := storage.APIObjectVersioner{} 108 // for compatibility with etcd2 impl. 109 // no-op for default prefix of '/registry'. 110 // keeps compatibility with etcd2 impl for custom prefixes that don't start with '/' 111 pathPrefix := path.Join("/", prefix) 112 if !strings.HasSuffix(pathPrefix, "/") { 113 // Ensure the pathPrefix ends in "/" here to simplify key concatenation later. 114 pathPrefix += "/" 115 } 116 117 w := &watcher{ 118 client: c, 119 codec: codec, 120 newFunc: newFunc, 121 groupResource: groupResource, 122 versioner: versioner, 123 transformer: transformer, 124 } 125 if newFunc == nil { 126 w.objectType = "<unknown>" 127 } else { 128 w.objectType = reflect.TypeOf(newFunc()).String() 129 } 130 s := &store{ 131 client: c, 132 codec: codec, 133 versioner: versioner, 134 transformer: transformer, 135 pathPrefix: pathPrefix, 136 groupResource: groupResource, 137 groupResourceString: groupResource.String(), 138 watcher: w, 139 leaseManager: newDefaultLeaseManager(c, leaseManagerConfig), 140 } 141 142 w.getCurrentStorageRV = func(ctx context.Context) (uint64, error) { 143 return storage.GetCurrentResourceVersionFromStorage(ctx, s, newListFunc, resourcePrefix, w.objectType) 144 } 145 if utilfeature.DefaultFeatureGate.Enabled(features.ConsistentListFromCache) || utilfeature.DefaultFeatureGate.Enabled(features.WatchList) { 146 etcdfeature.DefaultFeatureSupportChecker.CheckClient(c.Ctx(), c, storage.RequestWatchProgress) 147 } 148 return s 149 } 150 151 // Versioner implements storage.Interface.Versioner. 152 func (s *store) Versioner() storage.Versioner { 153 return s.versioner 154 } 155 156 // Get implements storage.Interface.Get. 157 func (s *store) Get(ctx context.Context, key string, opts storage.GetOptions, out runtime.Object) error { 158 preparedKey, err := s.prepareKey(key) 159 if err != nil { 160 return err 161 } 162 startTime := time.Now() 163 getResp, err := s.client.KV.Get(ctx, preparedKey) 164 metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime) 165 if err != nil { 166 return err 167 } 168 if err = s.validateMinimumResourceVersion(opts.ResourceVersion, uint64(getResp.Header.Revision)); err != nil { 169 return err 170 } 171 172 if len(getResp.Kvs) == 0 { 173 if opts.IgnoreNotFound { 174 return runtime.SetZeroValue(out) 175 } 176 return storage.NewKeyNotFoundError(preparedKey, 0) 177 } 178 kv := getResp.Kvs[0] 179 180 data, _, err := s.transformer.TransformFromStorage(ctx, kv.Value, authenticatedDataString(preparedKey)) 181 if err != nil { 182 return storage.NewInternalError(err.Error()) 183 } 184 185 err = decode(s.codec, s.versioner, data, out, kv.ModRevision) 186 if err != nil { 187 recordDecodeError(s.groupResourceString, preparedKey) 188 return err 189 } 190 return nil 191 } 192 193 // Create implements storage.Interface.Create. 194 func (s *store) Create(ctx context.Context, key string, obj, out runtime.Object, ttl uint64) error { 195 preparedKey, err := s.prepareKey(key) 196 if err != nil { 197 return err 198 } 199 ctx, span := tracing.Start(ctx, "Create etcd3", 200 attribute.String("audit-id", audit.GetAuditIDTruncated(ctx)), 201 attribute.String("key", key), 202 attribute.String("type", getTypeName(obj)), 203 attribute.String("resource", s.groupResourceString), 204 ) 205 defer span.End(500 * time.Millisecond) 206 if version, err := s.versioner.ObjectResourceVersion(obj); err == nil && version != 0 { 207 return storage.ErrResourceVersionSetOnCreate 208 } 209 if err := s.versioner.PrepareObjectForStorage(obj); err != nil { 210 return fmt.Errorf("PrepareObjectForStorage failed: %v", err) 211 } 212 span.AddEvent("About to Encode") 213 data, err := runtime.Encode(s.codec, obj) 214 if err != nil { 215 span.AddEvent("Encode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error())) 216 return err 217 } 218 span.AddEvent("Encode succeeded", attribute.Int("len", len(data))) 219 220 opts, err := s.ttlOpts(ctx, int64(ttl)) 221 if err != nil { 222 return err 223 } 224 225 newData, err := s.transformer.TransformToStorage(ctx, data, authenticatedDataString(preparedKey)) 226 if err != nil { 227 span.AddEvent("TransformToStorage failed", attribute.String("err", err.Error())) 228 return storage.NewInternalError(err.Error()) 229 } 230 span.AddEvent("TransformToStorage succeeded") 231 232 startTime := time.Now() 233 txnResp, err := s.client.KV.Txn(ctx).If( 234 notFound(preparedKey), 235 ).Then( 236 clientv3.OpPut(preparedKey, string(newData), opts...), 237 ).Commit() 238 metrics.RecordEtcdRequest("create", s.groupResourceString, err, startTime) 239 if err != nil { 240 span.AddEvent("Txn call failed", attribute.String("err", err.Error())) 241 return err 242 } 243 span.AddEvent("Txn call succeeded") 244 245 if !txnResp.Succeeded { 246 return storage.NewKeyExistsError(preparedKey, 0) 247 } 248 249 if out != nil { 250 putResp := txnResp.Responses[0].GetResponsePut() 251 err = decode(s.codec, s.versioner, data, out, putResp.Header.Revision) 252 if err != nil { 253 span.AddEvent("decode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error())) 254 recordDecodeError(s.groupResourceString, preparedKey) 255 return err 256 } 257 span.AddEvent("decode succeeded", attribute.Int("len", len(data))) 258 } 259 return nil 260 } 261 262 // Delete implements storage.Interface.Delete. 263 func (s *store) Delete( 264 ctx context.Context, key string, out runtime.Object, preconditions *storage.Preconditions, 265 validateDeletion storage.ValidateObjectFunc, cachedExistingObject runtime.Object) error { 266 preparedKey, err := s.prepareKey(key) 267 if err != nil { 268 return err 269 } 270 v, err := conversion.EnforcePtr(out) 271 if err != nil { 272 return fmt.Errorf("unable to convert output object to pointer: %v", err) 273 } 274 return s.conditionalDelete(ctx, preparedKey, out, v, preconditions, validateDeletion, cachedExistingObject) 275 } 276 277 func (s *store) conditionalDelete( 278 ctx context.Context, key string, out runtime.Object, v reflect.Value, preconditions *storage.Preconditions, 279 validateDeletion storage.ValidateObjectFunc, cachedExistingObject runtime.Object) error { 280 getCurrentState := s.getCurrentState(ctx, key, v, false) 281 282 var origState *objState 283 var err error 284 var origStateIsCurrent bool 285 if cachedExistingObject != nil { 286 origState, err = s.getStateFromObject(cachedExistingObject) 287 } else { 288 origState, err = getCurrentState() 289 origStateIsCurrent = true 290 } 291 if err != nil { 292 return err 293 } 294 295 for { 296 if preconditions != nil { 297 if err := preconditions.Check(key, origState.obj); err != nil { 298 if origStateIsCurrent { 299 return err 300 } 301 302 // It's possible we're working with stale data. 303 // Remember the revision of the potentially stale data and the resulting update error 304 cachedRev := origState.rev 305 cachedUpdateErr := err 306 307 // Actually fetch 308 origState, err = getCurrentState() 309 if err != nil { 310 return err 311 } 312 origStateIsCurrent = true 313 314 // it turns out our cached data was not stale, return the error 315 if cachedRev == origState.rev { 316 return cachedUpdateErr 317 } 318 319 // Retry 320 continue 321 } 322 } 323 if err := validateDeletion(ctx, origState.obj); err != nil { 324 if origStateIsCurrent { 325 return err 326 } 327 328 // It's possible we're working with stale data. 329 // Remember the revision of the potentially stale data and the resulting update error 330 cachedRev := origState.rev 331 cachedUpdateErr := err 332 333 // Actually fetch 334 origState, err = getCurrentState() 335 if err != nil { 336 return err 337 } 338 origStateIsCurrent = true 339 340 // it turns out our cached data was not stale, return the error 341 if cachedRev == origState.rev { 342 return cachedUpdateErr 343 } 344 345 // Retry 346 continue 347 } 348 349 startTime := time.Now() 350 txnResp, err := s.client.KV.Txn(ctx).If( 351 clientv3.Compare(clientv3.ModRevision(key), "=", origState.rev), 352 ).Then( 353 clientv3.OpDelete(key), 354 ).Else( 355 clientv3.OpGet(key), 356 ).Commit() 357 metrics.RecordEtcdRequest("delete", s.groupResourceString, err, startTime) 358 if err != nil { 359 return err 360 } 361 if !txnResp.Succeeded { 362 getResp := (*clientv3.GetResponse)(txnResp.Responses[0].GetResponseRange()) 363 klog.V(4).Infof("deletion of %s failed because of a conflict, going to retry", key) 364 origState, err = s.getState(ctx, getResp, key, v, false) 365 if err != nil { 366 return err 367 } 368 origStateIsCurrent = true 369 continue 370 } 371 372 if len(txnResp.Responses) == 0 || txnResp.Responses[0].GetResponseDeleteRange() == nil { 373 return errors.New(fmt.Sprintf("invalid DeleteRange response: %v", txnResp.Responses)) 374 } 375 deleteResp := txnResp.Responses[0].GetResponseDeleteRange() 376 if deleteResp.Header == nil { 377 return errors.New("invalid DeleteRange response - nil header") 378 } 379 err = decode(s.codec, s.versioner, origState.data, out, deleteResp.Header.Revision) 380 if err != nil { 381 recordDecodeError(s.groupResourceString, key) 382 return err 383 } 384 return nil 385 } 386 } 387 388 // GuaranteedUpdate implements storage.Interface.GuaranteedUpdate. 389 func (s *store) GuaranteedUpdate( 390 ctx context.Context, key string, destination runtime.Object, ignoreNotFound bool, 391 preconditions *storage.Preconditions, tryUpdate storage.UpdateFunc, cachedExistingObject runtime.Object) error { 392 preparedKey, err := s.prepareKey(key) 393 if err != nil { 394 return err 395 } 396 ctx, span := tracing.Start(ctx, "GuaranteedUpdate etcd3", 397 attribute.String("audit-id", audit.GetAuditIDTruncated(ctx)), 398 attribute.String("key", key), 399 attribute.String("type", getTypeName(destination)), 400 attribute.String("resource", s.groupResourceString)) 401 defer span.End(500 * time.Millisecond) 402 403 v, err := conversion.EnforcePtr(destination) 404 if err != nil { 405 return fmt.Errorf("unable to convert output object to pointer: %v", err) 406 } 407 408 getCurrentState := s.getCurrentState(ctx, preparedKey, v, ignoreNotFound) 409 410 var origState *objState 411 var origStateIsCurrent bool 412 if cachedExistingObject != nil { 413 origState, err = s.getStateFromObject(cachedExistingObject) 414 } else { 415 origState, err = getCurrentState() 416 origStateIsCurrent = true 417 } 418 if err != nil { 419 return err 420 } 421 span.AddEvent("initial value restored") 422 423 transformContext := authenticatedDataString(preparedKey) 424 for { 425 if err := preconditions.Check(preparedKey, origState.obj); err != nil { 426 // If our data is already up to date, return the error 427 if origStateIsCurrent { 428 return err 429 } 430 431 // It's possible we were working with stale data 432 // Actually fetch 433 origState, err = getCurrentState() 434 if err != nil { 435 return err 436 } 437 origStateIsCurrent = true 438 // Retry 439 continue 440 } 441 442 ret, ttl, err := s.updateState(origState, tryUpdate) 443 if err != nil { 444 // If our data is already up to date, return the error 445 if origStateIsCurrent { 446 return err 447 } 448 449 // It's possible we were working with stale data 450 // Remember the revision of the potentially stale data and the resulting update error 451 cachedRev := origState.rev 452 cachedUpdateErr := err 453 454 // Actually fetch 455 origState, err = getCurrentState() 456 if err != nil { 457 return err 458 } 459 origStateIsCurrent = true 460 461 // it turns out our cached data was not stale, return the error 462 if cachedRev == origState.rev { 463 return cachedUpdateErr 464 } 465 466 // Retry 467 continue 468 } 469 470 span.AddEvent("About to Encode") 471 data, err := runtime.Encode(s.codec, ret) 472 if err != nil { 473 span.AddEvent("Encode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error())) 474 return err 475 } 476 span.AddEvent("Encode succeeded", attribute.Int("len", len(data))) 477 if !origState.stale && bytes.Equal(data, origState.data) { 478 // if we skipped the original Get in this loop, we must refresh from 479 // etcd in order to be sure the data in the store is equivalent to 480 // our desired serialization 481 if !origStateIsCurrent { 482 origState, err = getCurrentState() 483 if err != nil { 484 return err 485 } 486 origStateIsCurrent = true 487 if !bytes.Equal(data, origState.data) { 488 // original data changed, restart loop 489 continue 490 } 491 } 492 // recheck that the data from etcd is not stale before short-circuiting a write 493 if !origState.stale { 494 err = decode(s.codec, s.versioner, origState.data, destination, origState.rev) 495 if err != nil { 496 recordDecodeError(s.groupResourceString, preparedKey) 497 return err 498 } 499 return nil 500 } 501 } 502 503 newData, err := s.transformer.TransformToStorage(ctx, data, transformContext) 504 if err != nil { 505 span.AddEvent("TransformToStorage failed", attribute.String("err", err.Error())) 506 return storage.NewInternalError(err.Error()) 507 } 508 span.AddEvent("TransformToStorage succeeded") 509 510 opts, err := s.ttlOpts(ctx, int64(ttl)) 511 if err != nil { 512 return err 513 } 514 span.AddEvent("Transaction prepared") 515 516 startTime := time.Now() 517 txnResp, err := s.client.KV.Txn(ctx).If( 518 clientv3.Compare(clientv3.ModRevision(preparedKey), "=", origState.rev), 519 ).Then( 520 clientv3.OpPut(preparedKey, string(newData), opts...), 521 ).Else( 522 clientv3.OpGet(preparedKey), 523 ).Commit() 524 metrics.RecordEtcdRequest("update", s.groupResourceString, err, startTime) 525 if err != nil { 526 span.AddEvent("Txn call failed", attribute.String("err", err.Error())) 527 return err 528 } 529 span.AddEvent("Txn call completed") 530 span.AddEvent("Transaction committed") 531 if !txnResp.Succeeded { 532 getResp := (*clientv3.GetResponse)(txnResp.Responses[0].GetResponseRange()) 533 klog.V(4).Infof("GuaranteedUpdate of %s failed because of a conflict, going to retry", preparedKey) 534 origState, err = s.getState(ctx, getResp, preparedKey, v, ignoreNotFound) 535 if err != nil { 536 return err 537 } 538 span.AddEvent("Retry value restored") 539 origStateIsCurrent = true 540 continue 541 } 542 putResp := txnResp.Responses[0].GetResponsePut() 543 544 err = decode(s.codec, s.versioner, data, destination, putResp.Header.Revision) 545 if err != nil { 546 span.AddEvent("decode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error())) 547 recordDecodeError(s.groupResourceString, preparedKey) 548 return err 549 } 550 span.AddEvent("decode succeeded", attribute.Int("len", len(data))) 551 return nil 552 } 553 } 554 555 func getNewItemFunc(listObj runtime.Object, v reflect.Value) func() runtime.Object { 556 // For unstructured lists with a target group/version, preserve the group/version in the instantiated list items 557 if unstructuredList, isUnstructured := listObj.(*unstructured.UnstructuredList); isUnstructured { 558 if apiVersion := unstructuredList.GetAPIVersion(); len(apiVersion) > 0 { 559 return func() runtime.Object { 560 return &unstructured.Unstructured{Object: map[string]interface{}{"apiVersion": apiVersion}} 561 } 562 } 563 } 564 565 // Otherwise just instantiate an empty item 566 elem := v.Type().Elem() 567 return func() runtime.Object { 568 return reflect.New(elem).Interface().(runtime.Object) 569 } 570 } 571 572 func (s *store) Count(key string) (int64, error) { 573 preparedKey, err := s.prepareKey(key) 574 if err != nil { 575 return 0, err 576 } 577 578 // We need to make sure the key ended with "/" so that we only get children "directories". 579 // e.g. if we have key "/a", "/a/b", "/ab", getting keys with prefix "/a" will return all three, 580 // while with prefix "/a/" will return only "/a/b" which is the correct answer. 581 if !strings.HasSuffix(preparedKey, "/") { 582 preparedKey += "/" 583 } 584 585 startTime := time.Now() 586 getResp, err := s.client.KV.Get(context.Background(), preparedKey, clientv3.WithRange(clientv3.GetPrefixRangeEnd(preparedKey)), clientv3.WithCountOnly()) 587 metrics.RecordEtcdRequest("listWithCount", preparedKey, err, startTime) 588 if err != nil { 589 return 0, err 590 } 591 return getResp.Count, nil 592 } 593 594 // ReadinessCheck implements storage.Interface. 595 func (s *store) ReadinessCheck() error { 596 return nil 597 } 598 599 // resolveGetListRev is used by GetList to resolve the rev to use in the client.KV.Get request. 600 func (s *store) resolveGetListRev(continueKey string, continueRV int64, opts storage.ListOptions) (int64, error) { 601 var withRev int64 602 // Uses continueRV if this is a continuation request. 603 if len(continueKey) > 0 { 604 if len(opts.ResourceVersion) > 0 && opts.ResourceVersion != "0" { 605 return withRev, apierrors.NewBadRequest("specifying resource version is not allowed when using continue") 606 } 607 // If continueRV > 0, the LIST request needs a specific resource version. 608 // continueRV==0 is invalid. 609 // If continueRV < 0, the request is for the latest resource version. 610 if continueRV > 0 { 611 withRev = continueRV 612 } 613 return withRev, nil 614 } 615 // Returns 0 if ResourceVersion is not specified. 616 if len(opts.ResourceVersion) == 0 { 617 return withRev, nil 618 } 619 parsedRV, err := s.versioner.ParseResourceVersion(opts.ResourceVersion) 620 if err != nil { 621 return withRev, apierrors.NewBadRequest(fmt.Sprintf("invalid resource version: %v", err)) 622 } 623 624 switch opts.ResourceVersionMatch { 625 case metav1.ResourceVersionMatchNotOlderThan: 626 // The not older than constraint is checked after we get a response from etcd, 627 // and returnedRV is then set to the revision we get from the etcd response. 628 case metav1.ResourceVersionMatchExact: 629 withRev = int64(parsedRV) 630 case "": // legacy case 631 if opts.Recursive && opts.Predicate.Limit > 0 && parsedRV > 0 { 632 withRev = int64(parsedRV) 633 } 634 default: 635 return withRev, fmt.Errorf("unknown ResourceVersionMatch value: %v", opts.ResourceVersionMatch) 636 } 637 return withRev, nil 638 } 639 640 // GetList implements storage.Interface. 641 func (s *store) GetList(ctx context.Context, key string, opts storage.ListOptions, listObj runtime.Object) error { 642 preparedKey, err := s.prepareKey(key) 643 if err != nil { 644 return err 645 } 646 ctx, span := tracing.Start(ctx, fmt.Sprintf("List(recursive=%v) etcd3", opts.Recursive), 647 attribute.String("audit-id", audit.GetAuditIDTruncated(ctx)), 648 attribute.String("key", key), 649 attribute.String("resourceVersion", opts.ResourceVersion), 650 attribute.String("resourceVersionMatch", string(opts.ResourceVersionMatch)), 651 attribute.Int("limit", int(opts.Predicate.Limit)), 652 attribute.String("continue", opts.Predicate.Continue)) 653 defer span.End(500 * time.Millisecond) 654 listPtr, err := meta.GetItemsPtr(listObj) 655 if err != nil { 656 return err 657 } 658 v, err := conversion.EnforcePtr(listPtr) 659 if err != nil || v.Kind() != reflect.Slice { 660 return fmt.Errorf("need ptr to slice: %v", err) 661 } 662 663 // For recursive lists, we need to make sure the key ended with "/" so that we only 664 // get children "directories". e.g. if we have key "/a", "/a/b", "/ab", getting keys 665 // with prefix "/a" will return all three, while with prefix "/a/" will return only 666 // "/a/b" which is the correct answer. 667 if opts.Recursive && !strings.HasSuffix(preparedKey, "/") { 668 preparedKey += "/" 669 } 670 keyPrefix := preparedKey 671 672 // set the appropriate clientv3 options to filter the returned data set 673 var limitOption *clientv3.OpOption 674 limit := opts.Predicate.Limit 675 var paging bool 676 options := make([]clientv3.OpOption, 0, 4) 677 if opts.Predicate.Limit > 0 { 678 paging = true 679 options = append(options, clientv3.WithLimit(limit)) 680 limitOption = &options[len(options)-1] 681 } 682 683 if opts.Recursive { 684 rangeEnd := clientv3.GetPrefixRangeEnd(keyPrefix) 685 options = append(options, clientv3.WithRange(rangeEnd)) 686 } 687 688 newItemFunc := getNewItemFunc(listObj, v) 689 690 var continueRV, withRev int64 691 var continueKey string 692 if opts.Recursive && len(opts.Predicate.Continue) > 0 { 693 continueKey, continueRV, err = storage.DecodeContinue(opts.Predicate.Continue, keyPrefix) 694 if err != nil { 695 return apierrors.NewBadRequest(fmt.Sprintf("invalid continue token: %v", err)) 696 } 697 preparedKey = continueKey 698 } 699 if withRev, err = s.resolveGetListRev(continueKey, continueRV, opts); err != nil { 700 return err 701 } 702 703 if withRev != 0 { 704 options = append(options, clientv3.WithRev(withRev)) 705 } 706 707 // loop until we have filled the requested limit from etcd or there are no more results 708 var lastKey []byte 709 var hasMore bool 710 var getResp *clientv3.GetResponse 711 var numFetched int 712 var numEvald int 713 // Because these metrics are for understanding the costs of handling LIST requests, 714 // get them recorded even in error cases. 715 defer func() { 716 numReturn := v.Len() 717 metrics.RecordStorageListMetrics(s.groupResourceString, numFetched, numEvald, numReturn) 718 }() 719 720 metricsOp := "get" 721 if opts.Recursive { 722 metricsOp = "list" 723 } 724 725 for { 726 startTime := time.Now() 727 getResp, err = s.client.KV.Get(ctx, preparedKey, options...) 728 metrics.RecordEtcdRequest(metricsOp, s.groupResourceString, err, startTime) 729 if err != nil { 730 return interpretListError(err, len(opts.Predicate.Continue) > 0, continueKey, keyPrefix) 731 } 732 numFetched += len(getResp.Kvs) 733 if err = s.validateMinimumResourceVersion(opts.ResourceVersion, uint64(getResp.Header.Revision)); err != nil { 734 return err 735 } 736 hasMore = getResp.More 737 738 if len(getResp.Kvs) == 0 && getResp.More { 739 return fmt.Errorf("no results were found, but etcd indicated there were more values remaining") 740 } 741 // indicate to the client which resource version was returned, and use the same resource version for subsequent requests. 742 if withRev == 0 { 743 withRev = getResp.Header.Revision 744 options = append(options, clientv3.WithRev(withRev)) 745 } 746 747 // avoid small allocations for the result slice, since this can be called in many 748 // different contexts and we don't know how significantly the result will be filtered 749 if opts.Predicate.Empty() { 750 growSlice(v, len(getResp.Kvs)) 751 } else { 752 growSlice(v, 2048, len(getResp.Kvs)) 753 } 754 755 // take items from the response until the bucket is full, filtering as we go 756 for i, kv := range getResp.Kvs { 757 if paging && int64(v.Len()) >= opts.Predicate.Limit { 758 hasMore = true 759 break 760 } 761 lastKey = kv.Key 762 763 data, _, err := s.transformer.TransformFromStorage(ctx, kv.Value, authenticatedDataString(kv.Key)) 764 if err != nil { 765 return storage.NewInternalErrorf("unable to transform key %q: %v", kv.Key, err) 766 } 767 768 // Check if the request has already timed out before decode object 769 select { 770 case <-ctx.Done(): 771 // parent context is canceled or timed out, no point in continuing 772 return storage.NewTimeoutError(string(kv.Key), "request did not complete within requested timeout") 773 default: 774 } 775 776 obj, err := decodeListItem(ctx, data, uint64(kv.ModRevision), s.codec, s.versioner, newItemFunc) 777 if err != nil { 778 recordDecodeError(s.groupResourceString, string(kv.Key)) 779 return err 780 } 781 782 // being unable to set the version does not prevent the object from being extracted 783 if matched, err := opts.Predicate.Matches(obj); err == nil && matched { 784 v.Set(reflect.Append(v, reflect.ValueOf(obj).Elem())) 785 } 786 787 numEvald++ 788 789 // free kv early. Long lists can take O(seconds) to decode. 790 getResp.Kvs[i] = nil 791 } 792 793 // no more results remain or we didn't request paging 794 if !hasMore || !paging { 795 break 796 } 797 // we're paging but we have filled our bucket 798 if int64(v.Len()) >= opts.Predicate.Limit { 799 break 800 } 801 802 if limit < maxLimit { 803 // We got incomplete result due to field/label selector dropping the object. 804 // Double page size to reduce total number of calls to etcd. 805 limit *= 2 806 if limit > maxLimit { 807 limit = maxLimit 808 } 809 *limitOption = clientv3.WithLimit(limit) 810 } 811 preparedKey = string(lastKey) + "\x00" 812 } 813 814 if v.IsNil() { 815 // Ensure that we never return a nil Items pointer in the result for consistency. 816 v.Set(reflect.MakeSlice(v.Type(), 0, 0)) 817 } 818 819 continueValue, remainingItemCount, err := storage.PrepareContinueToken(string(lastKey), keyPrefix, withRev, getResp.Count, hasMore, opts) 820 if err != nil { 821 return err 822 } 823 return s.versioner.UpdateList(listObj, uint64(withRev), continueValue, remainingItemCount) 824 } 825 826 // growSlice takes a slice value and grows its capacity up 827 // to the maximum of the passed sizes or maxCapacity, whichever 828 // is smaller. Above maxCapacity decisions about allocation are left 829 // to the Go runtime on append. This allows a caller to make an 830 // educated guess about the potential size of the total list while 831 // still avoiding overly aggressive initial allocation. If sizes 832 // is empty maxCapacity will be used as the size to grow. 833 func growSlice(v reflect.Value, maxCapacity int, sizes ...int) { 834 cap := v.Cap() 835 max := cap 836 for _, size := range sizes { 837 if size > max { 838 max = size 839 } 840 } 841 if len(sizes) == 0 || max > maxCapacity { 842 max = maxCapacity 843 } 844 if max <= cap { 845 return 846 } 847 if v.Len() > 0 { 848 extra := reflect.MakeSlice(v.Type(), v.Len(), max) 849 reflect.Copy(extra, v) 850 v.Set(extra) 851 } else { 852 extra := reflect.MakeSlice(v.Type(), 0, max) 853 v.Set(extra) 854 } 855 } 856 857 // Watch implements storage.Interface.Watch. 858 func (s *store) Watch(ctx context.Context, key string, opts storage.ListOptions) (watch.Interface, error) { 859 preparedKey, err := s.prepareKey(key) 860 if err != nil { 861 return nil, err 862 } 863 rev, err := s.versioner.ParseResourceVersion(opts.ResourceVersion) 864 if err != nil { 865 return nil, err 866 } 867 return s.watcher.Watch(s.watchContext(ctx), preparedKey, int64(rev), opts) 868 } 869 870 func (s *store) watchContext(ctx context.Context) context.Context { 871 // The etcd server waits until it cannot find a leader for 3 election 872 // timeouts to cancel existing streams. 3 is currently a hard coded 873 // constant. The election timeout defaults to 1000ms. If the cluster is 874 // healthy, when the leader is stopped, the leadership transfer should be 875 // smooth. (leader transfers its leadership before stopping). If leader is 876 // hard killed, other servers will take an election timeout to realize 877 // leader lost and start campaign. 878 return clientv3.WithRequireLeader(ctx) 879 } 880 881 func (s *store) getCurrentState(ctx context.Context, key string, v reflect.Value, ignoreNotFound bool) func() (*objState, error) { 882 return func() (*objState, error) { 883 startTime := time.Now() 884 getResp, err := s.client.KV.Get(ctx, key) 885 metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime) 886 if err != nil { 887 return nil, err 888 } 889 return s.getState(ctx, getResp, key, v, ignoreNotFound) 890 } 891 } 892 893 func (s *store) getState(ctx context.Context, getResp *clientv3.GetResponse, key string, v reflect.Value, ignoreNotFound bool) (*objState, error) { 894 state := &objState{ 895 meta: &storage.ResponseMeta{}, 896 } 897 898 if u, ok := v.Addr().Interface().(runtime.Unstructured); ok { 899 state.obj = u.NewEmptyInstance() 900 } else { 901 state.obj = reflect.New(v.Type()).Interface().(runtime.Object) 902 } 903 904 if len(getResp.Kvs) == 0 { 905 if !ignoreNotFound { 906 return nil, storage.NewKeyNotFoundError(key, 0) 907 } 908 if err := runtime.SetZeroValue(state.obj); err != nil { 909 return nil, err 910 } 911 } else { 912 data, stale, err := s.transformer.TransformFromStorage(ctx, getResp.Kvs[0].Value, authenticatedDataString(key)) 913 if err != nil { 914 return nil, storage.NewInternalError(err.Error()) 915 } 916 state.rev = getResp.Kvs[0].ModRevision 917 state.meta.ResourceVersion = uint64(state.rev) 918 state.data = data 919 state.stale = stale 920 if err := decode(s.codec, s.versioner, state.data, state.obj, state.rev); err != nil { 921 recordDecodeError(s.groupResourceString, key) 922 return nil, err 923 } 924 } 925 return state, nil 926 } 927 928 func (s *store) getStateFromObject(obj runtime.Object) (*objState, error) { 929 state := &objState{ 930 obj: obj, 931 meta: &storage.ResponseMeta{}, 932 } 933 934 rv, err := s.versioner.ObjectResourceVersion(obj) 935 if err != nil { 936 return nil, fmt.Errorf("couldn't get resource version: %v", err) 937 } 938 state.rev = int64(rv) 939 state.meta.ResourceVersion = uint64(state.rev) 940 941 // Compute the serialized form - for that we need to temporarily clean 942 // its resource version field (those are not stored in etcd). 943 if err := s.versioner.PrepareObjectForStorage(obj); err != nil { 944 return nil, fmt.Errorf("PrepareObjectForStorage failed: %v", err) 945 } 946 state.data, err = runtime.Encode(s.codec, obj) 947 if err != nil { 948 return nil, err 949 } 950 if err := s.versioner.UpdateObject(state.obj, uint64(rv)); err != nil { 951 klog.Errorf("failed to update object version: %v", err) 952 } 953 return state, nil 954 } 955 956 func (s *store) updateState(st *objState, userUpdate storage.UpdateFunc) (runtime.Object, uint64, error) { 957 ret, ttlPtr, err := userUpdate(st.obj, *st.meta) 958 if err != nil { 959 return nil, 0, err 960 } 961 962 if err := s.versioner.PrepareObjectForStorage(ret); err != nil { 963 return nil, 0, fmt.Errorf("PrepareObjectForStorage failed: %v", err) 964 } 965 var ttl uint64 966 if ttlPtr != nil { 967 ttl = *ttlPtr 968 } 969 return ret, ttl, nil 970 } 971 972 // ttlOpts returns client options based on given ttl. 973 // ttl: if ttl is non-zero, it will attach the key to a lease with ttl of roughly the same length 974 func (s *store) ttlOpts(ctx context.Context, ttl int64) ([]clientv3.OpOption, error) { 975 if ttl == 0 { 976 return nil, nil 977 } 978 id, err := s.leaseManager.GetLease(ctx, ttl) 979 if err != nil { 980 return nil, err 981 } 982 return []clientv3.OpOption{clientv3.WithLease(id)}, nil 983 } 984 985 // validateMinimumResourceVersion returns a 'too large resource' version error when the provided minimumResourceVersion is 986 // greater than the most recent actualRevision available from storage. 987 func (s *store) validateMinimumResourceVersion(minimumResourceVersion string, actualRevision uint64) error { 988 if minimumResourceVersion == "" { 989 return nil 990 } 991 minimumRV, err := s.versioner.ParseResourceVersion(minimumResourceVersion) 992 if err != nil { 993 return apierrors.NewBadRequest(fmt.Sprintf("invalid resource version: %v", err)) 994 } 995 // Enforce the storage.Interface guarantee that the resource version of the returned data 996 // "will be at least 'resourceVersion'". 997 if minimumRV > actualRevision { 998 return storage.NewTooLargeResourceVersionError(minimumRV, actualRevision, 0) 999 } 1000 return nil 1001 } 1002 1003 func (s *store) prepareKey(key string) (string, error) { 1004 if key == ".." || 1005 strings.HasPrefix(key, "../") || 1006 strings.HasSuffix(key, "/..") || 1007 strings.Contains(key, "/../") { 1008 return "", fmt.Errorf("invalid key: %q", key) 1009 } 1010 if key == "." || 1011 strings.HasPrefix(key, "./") || 1012 strings.HasSuffix(key, "/.") || 1013 strings.Contains(key, "/./") { 1014 return "", fmt.Errorf("invalid key: %q", key) 1015 } 1016 if key == "" || key == "/" { 1017 return "", fmt.Errorf("empty key: %q", key) 1018 } 1019 // We ensured that pathPrefix ends in '/' in construction, so skip any leading '/' in the key now. 1020 startIndex := 0 1021 if key[0] == '/' { 1022 startIndex = 1 1023 } 1024 return s.pathPrefix + key[startIndex:], nil 1025 } 1026 1027 // decode decodes value of bytes into object. It will also set the object resource version to rev. 1028 // On success, objPtr would be set to the object. 1029 func decode(codec runtime.Codec, versioner storage.Versioner, value []byte, objPtr runtime.Object, rev int64) error { 1030 if _, err := conversion.EnforcePtr(objPtr); err != nil { 1031 return fmt.Errorf("unable to convert output object to pointer: %v", err) 1032 } 1033 _, _, err := codec.Decode(value, nil, objPtr) 1034 if err != nil { 1035 return err 1036 } 1037 // being unable to set the version does not prevent the object from being extracted 1038 if err := versioner.UpdateObject(objPtr, uint64(rev)); err != nil { 1039 klog.Errorf("failed to update object version: %v", err) 1040 } 1041 return nil 1042 } 1043 1044 // decodeListItem decodes bytes value in array into object. 1045 func decodeListItem(ctx context.Context, data []byte, rev uint64, codec runtime.Codec, versioner storage.Versioner, newItemFunc func() runtime.Object) (runtime.Object, error) { 1046 startedAt := time.Now() 1047 defer func() { 1048 endpointsrequest.TrackDecodeLatency(ctx, time.Since(startedAt)) 1049 }() 1050 1051 obj, _, err := codec.Decode(data, nil, newItemFunc()) 1052 if err != nil { 1053 return nil, err 1054 } 1055 1056 if err := versioner.UpdateObject(obj, rev); err != nil { 1057 klog.Errorf("failed to update object version: %v", err) 1058 } 1059 1060 return obj, nil 1061 } 1062 1063 // recordDecodeError record decode error split by object type. 1064 func recordDecodeError(resource string, key string) { 1065 metrics.RecordDecodeError(resource) 1066 klog.V(4).Infof("Decoding %s \"%s\" failed", resource, key) 1067 } 1068 1069 func notFound(key string) clientv3.Cmp { 1070 return clientv3.Compare(clientv3.ModRevision(key), "=", 0) 1071 } 1072 1073 // getTypeName returns type name of an object for reporting purposes. 1074 func getTypeName(obj interface{}) string { 1075 return reflect.TypeOf(obj).String() 1076 }