k8s.io/apiserver@v0.29.3/pkg/storage/etcd3/store.go (about)

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package etcd3
    18  
    19  import (
    20  	"bytes"
    21  	"context"
    22  	"errors"
    23  	"fmt"
    24  	"path"
    25  	"reflect"
    26  	"strings"
    27  	"time"
    28  
    29  	clientv3 "go.etcd.io/etcd/client/v3"
    30  	"go.opentelemetry.io/otel/attribute"
    31  
    32  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    33  	"k8s.io/apimachinery/pkg/api/meta"
    34  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    35  	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
    36  	"k8s.io/apimachinery/pkg/conversion"
    37  	"k8s.io/apimachinery/pkg/runtime"
    38  	"k8s.io/apimachinery/pkg/runtime/schema"
    39  	"k8s.io/apimachinery/pkg/watch"
    40  	"k8s.io/apiserver/pkg/audit"
    41  	"k8s.io/apiserver/pkg/storage"
    42  	"k8s.io/apiserver/pkg/storage/etcd3/metrics"
    43  	"k8s.io/apiserver/pkg/storage/value"
    44  	"k8s.io/component-base/tracing"
    45  	"k8s.io/klog/v2"
    46  )
    47  
const (
	// maxLimit caps the page size that kube-apiserver itself grows to when it has
	// to re-fetch from etcd to fill a page (see the paging loop in GetList).
	// It only bounds those automatic increases: if a request starts out with a
	// larger limit, that limit is left unchanged.
	maxLimit = 10000
)
    54  
    55  // authenticatedDataString satisfies the value.Context interface. It uses the key to
    56  // authenticate the stored data. This does not defend against reuse of previously
    57  // encrypted values under the same key, but will prevent an attacker from using an
    58  // encrypted value from a different key. A stronger authenticated data segment would
    59  // include the etcd3 Version field (which is incremented on each write to a key and
    60  // reset when the key is deleted), but an attacker with write access to etcd can
    61  // force deletion and recreation of keys to weaken that angle.
    62  type authenticatedDataString string
    63  
    64  // AuthenticatedData implements the value.Context interface.
    65  func (d authenticatedDataString) AuthenticatedData() []byte {
    66  	return []byte(string(d))
    67  }
    68  
// Compile-time assertion that authenticatedDataString implements value.Context.
var _ value.Context = authenticatedDataString("")
    70  
// store is the etcd3-backed storage implementation constructed by newStore.
//
// Field roles, as used in this file:
//   - client: etcd v3 client used for KV reads, transactions and watch progress requests.
//   - codec: encodes objects before they are written and decodes them after they are read.
//   - versioner: reads and writes resource versions on objects and lists.
//   - transformer: transforms encoded data to and from its at-rest form.
//   - pathPrefix: key prefix computed in newStore; it always ends in "/".
//   - groupResource / groupResourceString: the resource served by this store; the
//     string form is used as the resource label for metrics and tracing.
//   - watcher: watcher sharing the client, codec, versioner and transformer.
//   - leaseManager: created by newDefaultLeaseManager in newStore.
type store struct {
	client              *clientv3.Client
	codec               runtime.Codec
	versioner           storage.Versioner
	transformer         value.Transformer
	pathPrefix          string
	groupResource       schema.GroupResource
	groupResourceString string
	watcher             *watcher
	leaseManager        *leaseManager
}
    82  
    83  func (s *store) RequestWatchProgress(ctx context.Context) error {
    84  	// Use watchContext to match ctx metadata provided when creating the watch.
    85  	// In best case scenario we would use the same context that watch was created, but there is no way access it from watchCache.
    86  	return s.client.RequestProgress(s.watchContext(ctx))
    87  }
    88  
// objState is a snapshot of a stored object as used by the delete and update
// retry loops in this file.
//
//   - obj: the decoded object; it is what preconditions and validation are checked against.
//   - meta: response metadata for the object (not read in the code visible here;
//     presumably consumed by updateState - TODO confirm).
//   - rev: the revision the snapshot corresponds to; transactions compare it
//     against the key's mod revision to detect concurrent writes.
//   - data: the encoded bytes of the object, used for decoding into the caller's
//     output and for detecting no-op updates.
//   - stale: when true, GuaranteedUpdate does not short-circuit a write even if
//     the newly encoded bytes equal data.
type objState struct {
	obj   runtime.Object
	meta  *storage.ResponseMeta
	rev   int64
	data  []byte
	stale bool
}
    96  
    97  // New returns an etcd3 implementation of storage.Interface.
    98  func New(c *clientv3.Client, codec runtime.Codec, newFunc, newListFunc func() runtime.Object, prefix, resourcePrefix string, groupResource schema.GroupResource, transformer value.Transformer, leaseManagerConfig LeaseManagerConfig) storage.Interface {
    99  	return newStore(c, codec, newFunc, newListFunc, prefix, resourcePrefix, groupResource, transformer, leaseManagerConfig)
   100  }
   101  
   102  func newStore(c *clientv3.Client, codec runtime.Codec, newFunc, newListFunc func() runtime.Object, prefix, resourcePrefix string, groupResource schema.GroupResource, transformer value.Transformer, leaseManagerConfig LeaseManagerConfig) *store {
   103  	versioner := storage.APIObjectVersioner{}
   104  	// for compatibility with etcd2 impl.
   105  	// no-op for default prefix of '/registry'.
   106  	// keeps compatibility with etcd2 impl for custom prefixes that don't start with '/'
   107  	pathPrefix := path.Join("/", prefix)
   108  	if !strings.HasSuffix(pathPrefix, "/") {
   109  		// Ensure the pathPrefix ends in "/" here to simplify key concatenation later.
   110  		pathPrefix += "/"
   111  	}
   112  
   113  	w := &watcher{
   114  		client:        c,
   115  		codec:         codec,
   116  		newFunc:       newFunc,
   117  		groupResource: groupResource,
   118  		versioner:     versioner,
   119  		transformer:   transformer,
   120  	}
   121  	if newFunc == nil {
   122  		w.objectType = "<unknown>"
   123  	} else {
   124  		w.objectType = reflect.TypeOf(newFunc()).String()
   125  	}
   126  	s := &store{
   127  		client:              c,
   128  		codec:               codec,
   129  		versioner:           versioner,
   130  		transformer:         transformer,
   131  		pathPrefix:          pathPrefix,
   132  		groupResource:       groupResource,
   133  		groupResourceString: groupResource.String(),
   134  		watcher:             w,
   135  		leaseManager:        newDefaultLeaseManager(c, leaseManagerConfig),
   136  	}
   137  
   138  	w.getCurrentStorageRV = func(ctx context.Context) (uint64, error) {
   139  		return storage.GetCurrentResourceVersionFromStorage(ctx, s, newListFunc, resourcePrefix, w.objectType)
   140  	}
   141  	return s
   142  }
   143  
// Versioner implements storage.Interface.Versioner.
// It returns the versioner the store was constructed with.
func (s *store) Versioner() storage.Versioner {
	return s.versioner
}
   148  
   149  // Get implements storage.Interface.Get.
   150  func (s *store) Get(ctx context.Context, key string, opts storage.GetOptions, out runtime.Object) error {
   151  	preparedKey, err := s.prepareKey(key)
   152  	if err != nil {
   153  		return err
   154  	}
   155  	startTime := time.Now()
   156  	getResp, err := s.client.KV.Get(ctx, preparedKey)
   157  	metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime)
   158  	if err != nil {
   159  		return err
   160  	}
   161  	if err = s.validateMinimumResourceVersion(opts.ResourceVersion, uint64(getResp.Header.Revision)); err != nil {
   162  		return err
   163  	}
   164  
   165  	if len(getResp.Kvs) == 0 {
   166  		if opts.IgnoreNotFound {
   167  			return runtime.SetZeroValue(out)
   168  		}
   169  		return storage.NewKeyNotFoundError(preparedKey, 0)
   170  	}
   171  	kv := getResp.Kvs[0]
   172  
   173  	data, _, err := s.transformer.TransformFromStorage(ctx, kv.Value, authenticatedDataString(preparedKey))
   174  	if err != nil {
   175  		return storage.NewInternalError(err.Error())
   176  	}
   177  
   178  	err = decode(s.codec, s.versioner, data, out, kv.ModRevision)
   179  	if err != nil {
   180  		recordDecodeError(s.groupResourceString, preparedKey)
   181  		return err
   182  	}
   183  	return nil
   184  }
   185  
   186  // Create implements storage.Interface.Create.
   187  func (s *store) Create(ctx context.Context, key string, obj, out runtime.Object, ttl uint64) error {
   188  	preparedKey, err := s.prepareKey(key)
   189  	if err != nil {
   190  		return err
   191  	}
   192  	ctx, span := tracing.Start(ctx, "Create etcd3",
   193  		attribute.String("audit-id", audit.GetAuditIDTruncated(ctx)),
   194  		attribute.String("key", key),
   195  		attribute.String("type", getTypeName(obj)),
   196  		attribute.String("resource", s.groupResourceString),
   197  	)
   198  	defer span.End(500 * time.Millisecond)
   199  	if version, err := s.versioner.ObjectResourceVersion(obj); err == nil && version != 0 {
   200  		return storage.ErrResourceVersionSetOnCreate
   201  	}
   202  	if err := s.versioner.PrepareObjectForStorage(obj); err != nil {
   203  		return fmt.Errorf("PrepareObjectForStorage failed: %v", err)
   204  	}
   205  	span.AddEvent("About to Encode")
   206  	data, err := runtime.Encode(s.codec, obj)
   207  	if err != nil {
   208  		span.AddEvent("Encode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error()))
   209  		return err
   210  	}
   211  	span.AddEvent("Encode succeeded", attribute.Int("len", len(data)))
   212  
   213  	opts, err := s.ttlOpts(ctx, int64(ttl))
   214  	if err != nil {
   215  		return err
   216  	}
   217  
   218  	newData, err := s.transformer.TransformToStorage(ctx, data, authenticatedDataString(preparedKey))
   219  	if err != nil {
   220  		span.AddEvent("TransformToStorage failed", attribute.String("err", err.Error()))
   221  		return storage.NewInternalError(err.Error())
   222  	}
   223  	span.AddEvent("TransformToStorage succeeded")
   224  
   225  	startTime := time.Now()
   226  	txnResp, err := s.client.KV.Txn(ctx).If(
   227  		notFound(preparedKey),
   228  	).Then(
   229  		clientv3.OpPut(preparedKey, string(newData), opts...),
   230  	).Commit()
   231  	metrics.RecordEtcdRequest("create", s.groupResourceString, err, startTime)
   232  	if err != nil {
   233  		span.AddEvent("Txn call failed", attribute.String("err", err.Error()))
   234  		return err
   235  	}
   236  	span.AddEvent("Txn call succeeded")
   237  
   238  	if !txnResp.Succeeded {
   239  		return storage.NewKeyExistsError(preparedKey, 0)
   240  	}
   241  
   242  	if out != nil {
   243  		putResp := txnResp.Responses[0].GetResponsePut()
   244  		err = decode(s.codec, s.versioner, data, out, putResp.Header.Revision)
   245  		if err != nil {
   246  			span.AddEvent("decode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error()))
   247  			recordDecodeError(s.groupResourceString, preparedKey)
   248  			return err
   249  		}
   250  		span.AddEvent("decode succeeded", attribute.Int("len", len(data)))
   251  	}
   252  	return nil
   253  }
   254  
   255  // Delete implements storage.Interface.Delete.
   256  func (s *store) Delete(
   257  	ctx context.Context, key string, out runtime.Object, preconditions *storage.Preconditions,
   258  	validateDeletion storage.ValidateObjectFunc, cachedExistingObject runtime.Object) error {
   259  	preparedKey, err := s.prepareKey(key)
   260  	if err != nil {
   261  		return err
   262  	}
   263  	v, err := conversion.EnforcePtr(out)
   264  	if err != nil {
   265  		return fmt.Errorf("unable to convert output object to pointer: %v", err)
   266  	}
   267  	return s.conditionalDelete(ctx, preparedKey, out, v, preconditions, validateDeletion, cachedExistingObject)
   268  }
   269  
   270  func (s *store) conditionalDelete(
   271  	ctx context.Context, key string, out runtime.Object, v reflect.Value, preconditions *storage.Preconditions,
   272  	validateDeletion storage.ValidateObjectFunc, cachedExistingObject runtime.Object) error {
   273  	getCurrentState := s.getCurrentState(ctx, key, v, false)
   274  
   275  	var origState *objState
   276  	var err error
   277  	var origStateIsCurrent bool
   278  	if cachedExistingObject != nil {
   279  		origState, err = s.getStateFromObject(cachedExistingObject)
   280  	} else {
   281  		origState, err = getCurrentState()
   282  		origStateIsCurrent = true
   283  	}
   284  	if err != nil {
   285  		return err
   286  	}
   287  
   288  	for {
   289  		if preconditions != nil {
   290  			if err := preconditions.Check(key, origState.obj); err != nil {
   291  				if origStateIsCurrent {
   292  					return err
   293  				}
   294  
   295  				// It's possible we're working with stale data.
   296  				// Remember the revision of the potentially stale data and the resulting update error
   297  				cachedRev := origState.rev
   298  				cachedUpdateErr := err
   299  
   300  				// Actually fetch
   301  				origState, err = getCurrentState()
   302  				if err != nil {
   303  					return err
   304  				}
   305  				origStateIsCurrent = true
   306  
   307  				// it turns out our cached data was not stale, return the error
   308  				if cachedRev == origState.rev {
   309  					return cachedUpdateErr
   310  				}
   311  
   312  				// Retry
   313  				continue
   314  			}
   315  		}
   316  		if err := validateDeletion(ctx, origState.obj); err != nil {
   317  			if origStateIsCurrent {
   318  				return err
   319  			}
   320  
   321  			// It's possible we're working with stale data.
   322  			// Remember the revision of the potentially stale data and the resulting update error
   323  			cachedRev := origState.rev
   324  			cachedUpdateErr := err
   325  
   326  			// Actually fetch
   327  			origState, err = getCurrentState()
   328  			if err != nil {
   329  				return err
   330  			}
   331  			origStateIsCurrent = true
   332  
   333  			// it turns out our cached data was not stale, return the error
   334  			if cachedRev == origState.rev {
   335  				return cachedUpdateErr
   336  			}
   337  
   338  			// Retry
   339  			continue
   340  		}
   341  
   342  		startTime := time.Now()
   343  		txnResp, err := s.client.KV.Txn(ctx).If(
   344  			clientv3.Compare(clientv3.ModRevision(key), "=", origState.rev),
   345  		).Then(
   346  			clientv3.OpDelete(key),
   347  		).Else(
   348  			clientv3.OpGet(key),
   349  		).Commit()
   350  		metrics.RecordEtcdRequest("delete", s.groupResourceString, err, startTime)
   351  		if err != nil {
   352  			return err
   353  		}
   354  		if !txnResp.Succeeded {
   355  			getResp := (*clientv3.GetResponse)(txnResp.Responses[0].GetResponseRange())
   356  			klog.V(4).Infof("deletion of %s failed because of a conflict, going to retry", key)
   357  			origState, err = s.getState(ctx, getResp, key, v, false)
   358  			if err != nil {
   359  				return err
   360  			}
   361  			origStateIsCurrent = true
   362  			continue
   363  		}
   364  
   365  		if len(txnResp.Responses) == 0 || txnResp.Responses[0].GetResponseDeleteRange() == nil {
   366  			return errors.New(fmt.Sprintf("invalid DeleteRange response: %v", txnResp.Responses))
   367  		}
   368  		deleteResp := txnResp.Responses[0].GetResponseDeleteRange()
   369  		if deleteResp.Header == nil {
   370  			return errors.New("invalid DeleteRange response - nil header")
   371  		}
   372  		err = decode(s.codec, s.versioner, origState.data, out, deleteResp.Header.Revision)
   373  		if err != nil {
   374  			recordDecodeError(s.groupResourceString, key)
   375  			return err
   376  		}
   377  		return nil
   378  	}
   379  }
   380  
// GuaranteedUpdate implements storage.Interface.GuaranteedUpdate.
//
// It loops until an update of key is either written or rejected: each pass
// checks preconditions against the current view of the object, obtains the
// updated object and ttl from updateState (which is given tryUpdate), encodes
// it, and writes it in a transaction guarded by the key's mod revision. If the
// transaction loses a race, the state returned by the transaction's read is
// used for the next pass.
//
// When cachedExistingObject is non-nil it is used as the initial view instead
// of reading from etcd. That view may be stale, so errors produced against it
// are only returned after a fresh read confirms them. ignoreNotFound is passed
// through to the state readers. On success the resulting object is decoded
// into destination, which must be a pointer.
func (s *store) GuaranteedUpdate(
	ctx context.Context, key string, destination runtime.Object, ignoreNotFound bool,
	preconditions *storage.Preconditions, tryUpdate storage.UpdateFunc, cachedExistingObject runtime.Object) error {
	preparedKey, err := s.prepareKey(key)
	if err != nil {
		return err
	}
	ctx, span := tracing.Start(ctx, "GuaranteedUpdate etcd3",
		attribute.String("audit-id", audit.GetAuditIDTruncated(ctx)),
		attribute.String("key", key),
		attribute.String("type", getTypeName(destination)),
		attribute.String("resource", s.groupResourceString))
	defer span.End(500 * time.Millisecond)

	v, err := conversion.EnforcePtr(destination)
	if err != nil {
		return fmt.Errorf("unable to convert output object to pointer: %v", err)
	}

	getCurrentState := s.getCurrentState(ctx, preparedKey, v, ignoreNotFound)

	var origState *objState
	// origStateIsCurrent is true once origState has been read from etcd rather
	// than taken from the caller-supplied cached object.
	var origStateIsCurrent bool
	if cachedExistingObject != nil {
		origState, err = s.getStateFromObject(cachedExistingObject)
	} else {
		origState, err = getCurrentState()
		origStateIsCurrent = true
	}
	if err != nil {
		return err
	}
	span.AddEvent("initial value restored")

	transformContext := authenticatedDataString(preparedKey)
	for {
		if err := preconditions.Check(preparedKey, origState.obj); err != nil {
			// If our data is already up to date, return the error
			if origStateIsCurrent {
				return err
			}

			// It's possible we were working with stale data
			// Actually fetch
			origState, err = getCurrentState()
			if err != nil {
				return err
			}
			origStateIsCurrent = true
			// Retry
			continue
		}

		ret, ttl, err := s.updateState(origState, tryUpdate)
		if err != nil {
			// If our data is already up to date, return the error
			if origStateIsCurrent {
				return err
			}

			// It's possible we were working with stale data
			// Remember the revision of the potentially stale data and the resulting update error
			cachedRev := origState.rev
			cachedUpdateErr := err

			// Actually fetch
			origState, err = getCurrentState()
			if err != nil {
				return err
			}
			origStateIsCurrent = true

			// it turns out our cached data was not stale, return the error
			if cachedRev == origState.rev {
				return cachedUpdateErr
			}

			// Retry
			continue
		}

		span.AddEvent("About to Encode")
		data, err := runtime.Encode(s.codec, ret)
		if err != nil {
			span.AddEvent("Encode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error()))
			return err
		}
		span.AddEvent("Encode succeeded", attribute.Int("len", len(data)))
		// No-op detection: if the update serializes to exactly the bytes already
		// stored (and those bytes are not marked stale), the write can be skipped.
		if !origState.stale && bytes.Equal(data, origState.data) {
			// if we skipped the original Get in this loop, we must refresh from
			// etcd in order to be sure the data in the store is equivalent to
			// our desired serialization
			if !origStateIsCurrent {
				origState, err = getCurrentState()
				if err != nil {
					return err
				}
				origStateIsCurrent = true
				if !bytes.Equal(data, origState.data) {
					// original data changed, restart loop
					continue
				}
			}
			// recheck that the data from etcd is not stale before short-circuiting a write
			if !origState.stale {
				err = decode(s.codec, s.versioner, origState.data, destination, origState.rev)
				if err != nil {
					recordDecodeError(s.groupResourceString, preparedKey)
					return err
				}
				return nil
			}
		}

		newData, err := s.transformer.TransformToStorage(ctx, data, transformContext)
		if err != nil {
			span.AddEvent("TransformToStorage failed", attribute.String("err", err.Error()))
			return storage.NewInternalError(err.Error())
		}
		span.AddEvent("TransformToStorage succeeded")

		opts, err := s.ttlOpts(ctx, int64(ttl))
		if err != nil {
			return err
		}
		span.AddEvent("Transaction prepared")

		// Write only if the key has not been modified since origState was read;
		// otherwise read the current value in the same round trip.
		startTime := time.Now()
		txnResp, err := s.client.KV.Txn(ctx).If(
			clientv3.Compare(clientv3.ModRevision(preparedKey), "=", origState.rev),
		).Then(
			clientv3.OpPut(preparedKey, string(newData), opts...),
		).Else(
			clientv3.OpGet(preparedKey),
		).Commit()
		metrics.RecordEtcdRequest("update", s.groupResourceString, err, startTime)
		if err != nil {
			span.AddEvent("Txn call failed", attribute.String("err", err.Error()))
			return err
		}
		span.AddEvent("Txn call completed")
		span.AddEvent("Transaction committed")
		if !txnResp.Succeeded {
			// Conflict: rebuild the state from the Else branch's read and retry.
			getResp := (*clientv3.GetResponse)(txnResp.Responses[0].GetResponseRange())
			klog.V(4).Infof("GuaranteedUpdate of %s failed because of a conflict, going to retry", preparedKey)
			origState, err = s.getState(ctx, getResp, preparedKey, v, ignoreNotFound)
			if err != nil {
				return err
			}
			span.AddEvent("Retry value restored")
			origStateIsCurrent = true
			continue
		}
		putResp := txnResp.Responses[0].GetResponsePut()

		// Decode the bytes that were just written, stamped with the revision of
		// the put, instead of reading the key back.
		err = decode(s.codec, s.versioner, data, destination, putResp.Header.Revision)
		if err != nil {
			span.AddEvent("decode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error()))
			recordDecodeError(s.groupResourceString, preparedKey)
			return err
		}
		span.AddEvent("decode succeeded", attribute.Int("len", len(data)))
		return nil
	}
}
   547  
   548  func getNewItemFunc(listObj runtime.Object, v reflect.Value) func() runtime.Object {
   549  	// For unstructured lists with a target group/version, preserve the group/version in the instantiated list items
   550  	if unstructuredList, isUnstructured := listObj.(*unstructured.UnstructuredList); isUnstructured {
   551  		if apiVersion := unstructuredList.GetAPIVersion(); len(apiVersion) > 0 {
   552  			return func() runtime.Object {
   553  				return &unstructured.Unstructured{Object: map[string]interface{}{"apiVersion": apiVersion}}
   554  			}
   555  		}
   556  	}
   557  
   558  	// Otherwise just instantiate an empty item
   559  	elem := v.Type().Elem()
   560  	return func() runtime.Object {
   561  		return reflect.New(elem).Interface().(runtime.Object)
   562  	}
   563  }
   564  
   565  func (s *store) Count(key string) (int64, error) {
   566  	preparedKey, err := s.prepareKey(key)
   567  	if err != nil {
   568  		return 0, err
   569  	}
   570  
   571  	// We need to make sure the key ended with "/" so that we only get children "directories".
   572  	// e.g. if we have key "/a", "/a/b", "/ab", getting keys with prefix "/a" will return all three,
   573  	// while with prefix "/a/" will return only "/a/b" which is the correct answer.
   574  	if !strings.HasSuffix(preparedKey, "/") {
   575  		preparedKey += "/"
   576  	}
   577  
   578  	startTime := time.Now()
   579  	getResp, err := s.client.KV.Get(context.Background(), preparedKey, clientv3.WithRange(clientv3.GetPrefixRangeEnd(preparedKey)), clientv3.WithCountOnly())
   580  	metrics.RecordEtcdRequest("listWithCount", preparedKey, err, startTime)
   581  	if err != nil {
   582  		return 0, err
   583  	}
   584  	return getResp.Count, nil
   585  }
   586  
// GetList implements storage.Interface.
//
// It reads key from etcd (the whole prefix when opts.Recursive is set),
// transforms and decodes each returned value, and appends the resulting items
// to the items slice of listObj, filtering with opts.Predicate as it goes.
// When opts.Predicate.Limit is positive the read is paged: pages are fetched
// until the limit is filled or etcd has no more results, all at the same etcd
// revision. Finally listObj is updated with the revision that was read and,
// when more results remain, a continue token plus (for empty predicates only)
// the remaining item count.
func (s *store) GetList(ctx context.Context, key string, opts storage.ListOptions, listObj runtime.Object) error {
	preparedKey, err := s.prepareKey(key)
	if err != nil {
		return err
	}
	ctx, span := tracing.Start(ctx, fmt.Sprintf("List(recursive=%v) etcd3", opts.Recursive),
		attribute.String("audit-id", audit.GetAuditIDTruncated(ctx)),
		attribute.String("key", key),
		attribute.String("resourceVersion", opts.ResourceVersion),
		attribute.String("resourceVersionMatch", string(opts.ResourceVersionMatch)),
		attribute.Int("limit", int(opts.Predicate.Limit)),
		attribute.String("continue", opts.Predicate.Continue))
	defer span.End(500 * time.Millisecond)
	listPtr, err := meta.GetItemsPtr(listObj)
	if err != nil {
		return err
	}
	// v is the settable items slice of listObj that results are appended to.
	v, err := conversion.EnforcePtr(listPtr)
	if err != nil || v.Kind() != reflect.Slice {
		return fmt.Errorf("need ptr to slice: %v", err)
	}

	// For recursive lists, we need to make sure the key ended with "/" so that we only
	// get children "directories". e.g. if we have key "/a", "/a/b", "/ab", getting keys
	// with prefix "/a" will return all three, while with prefix "/a/" will return only
	// "/a/b" which is the correct answer.
	if opts.Recursive && !strings.HasSuffix(preparedKey, "/") {
		preparedKey += "/"
	}
	keyPrefix := preparedKey

	// set the appropriate clientv3 options to filter the returned data set
	// limitOption points at the limit entry inside options so the page size can
	// be rewritten in place when it is grown in the paging loop below. At most
	// three options (limit, range, revision) are ever appended, so the capacity
	// of 4 is never exceeded and the pointer stays valid.
	var limitOption *clientv3.OpOption
	limit := opts.Predicate.Limit
	var paging bool
	options := make([]clientv3.OpOption, 0, 4)
	if opts.Predicate.Limit > 0 {
		paging = true
		options = append(options, clientv3.WithLimit(limit))
		limitOption = &options[len(options)-1]
	}

	if opts.Recursive {
		rangeEnd := clientv3.GetPrefixRangeEnd(keyPrefix)
		options = append(options, clientv3.WithRange(rangeEnd))
	}

	newItemFunc := getNewItemFunc(listObj, v)

	// withRev is the etcd revision to read at; 0 means "not pinned yet", in
	// which case the revision of the first response is adopted.
	var continueRV, withRev int64
	var continueKey string
	switch {
	case opts.Recursive && len(opts.Predicate.Continue) > 0:
		continueKey, continueRV, err = storage.DecodeContinue(opts.Predicate.Continue, keyPrefix)
		if err != nil {
			return apierrors.NewBadRequest(fmt.Sprintf("invalid continue token: %v", err))
		}

		if len(opts.ResourceVersion) > 0 && opts.ResourceVersion != "0" {
			return apierrors.NewBadRequest("specifying resource version is not allowed when using continue")
		}
		preparedKey = continueKey
		// If continueRV > 0, the LIST request needs a specific resource version.
		// continueRV==0 is invalid.
		// If continueRV < 0, the request is for the latest resource version.
		if continueRV > 0 {
			withRev = continueRV
		}
	case len(opts.ResourceVersion) > 0:
		parsedRV, err := s.versioner.ParseResourceVersion(opts.ResourceVersion)
		if err != nil {
			return apierrors.NewBadRequest(fmt.Sprintf("invalid resource version: %v", err))
		}
		switch opts.ResourceVersionMatch {
		case metav1.ResourceVersionMatchNotOlderThan:
			// The not older than constraint is checked after we get a response from etcd,
			// and returnedRV is then set to the revision we get from the etcd response.
		case metav1.ResourceVersionMatchExact:
			withRev = int64(parsedRV)
		case "": // legacy case
			if opts.Recursive && opts.Predicate.Limit > 0 && parsedRV > 0 {
				withRev = int64(parsedRV)
			}
		default:
			return fmt.Errorf("unknown ResourceVersionMatch value: %v", opts.ResourceVersionMatch)
		}
	}

	if withRev != 0 {
		options = append(options, clientv3.WithRev(withRev))
	}

	// loop until we have filled the requested limit from etcd or there are no more results
	var lastKey []byte
	var hasMore bool
	var getResp *clientv3.GetResponse
	var numFetched int
	var numEvald int
	// Because these metrics are for understanding the costs of handling LIST requests,
	// get them recorded even in error cases.
	defer func() {
		numReturn := v.Len()
		metrics.RecordStorageListMetrics(s.groupResourceString, numFetched, numEvald, numReturn)
	}()

	metricsOp := "get"
	if opts.Recursive {
		metricsOp = "list"
	}

	for {
		startTime := time.Now()
		getResp, err = s.client.KV.Get(ctx, preparedKey, options...)
		metrics.RecordEtcdRequest(metricsOp, s.groupResourceString, err, startTime)
		if err != nil {
			return interpretListError(err, len(opts.Predicate.Continue) > 0, continueKey, keyPrefix)
		}
		numFetched += len(getResp.Kvs)
		if err = s.validateMinimumResourceVersion(opts.ResourceVersion, uint64(getResp.Header.Revision)); err != nil {
			return err
		}
		hasMore = getResp.More

		if len(getResp.Kvs) == 0 && getResp.More {
			return fmt.Errorf("no results were found, but etcd indicated there were more values remaining")
		}
		// indicate to the client which resource version was returned, and use the same resource version for subsequent requests.
		if withRev == 0 {
			withRev = getResp.Header.Revision
			options = append(options, clientv3.WithRev(withRev))
		}

		// avoid small allocations for the result slice, since this can be called in many
		// different contexts and we don't know how significantly the result will be filtered
		if opts.Predicate.Empty() {
			growSlice(v, len(getResp.Kvs))
		} else {
			growSlice(v, 2048, len(getResp.Kvs))
		}

		// take items from the response until the bucket is full, filtering as we go
		for i, kv := range getResp.Kvs {
			if paging && int64(v.Len()) >= opts.Predicate.Limit {
				hasMore = true
				break
			}
			lastKey = kv.Key

			data, _, err := s.transformer.TransformFromStorage(ctx, kv.Value, authenticatedDataString(kv.Key))
			if err != nil {
				return storage.NewInternalErrorf("unable to transform key %q: %v", kv.Key, err)
			}

			if err := appendListItem(v, data, uint64(kv.ModRevision), opts.Predicate, s.codec, s.versioner, newItemFunc); err != nil {
				recordDecodeError(s.groupResourceString, string(kv.Key))
				return err
			}
			numEvald++

			// free kv early. Long lists can take O(seconds) to decode.
			getResp.Kvs[i] = nil
		}

		// no more results remain or we didn't request paging
		if !hasMore || !paging {
			break
		}
		// we're paging but we have filled our bucket
		if int64(v.Len()) >= opts.Predicate.Limit {
			break
		}

		if limit < maxLimit {
			// We got incomplete result due to field/label selector dropping the object.
			// Double page size to reduce total number of calls to etcd.
			limit *= 2
			if limit > maxLimit {
				limit = maxLimit
			}
			*limitOption = clientv3.WithLimit(limit)
		}
		// Start the next page at the smallest key that sorts after the last key
		// consumed from this page.
		preparedKey = string(lastKey) + "\x00"
	}

	if v.IsNil() {
		// Ensure that we never return a nil Items pointer in the result for consistency.
		v.Set(reflect.MakeSlice(v.Type(), 0, 0))
	}

	// instruct the client to begin querying from immediately after the last key we returned
	// we never return a key that the client wouldn't be allowed to see
	if hasMore {
		// we want to start immediately after the last key
		next, err := storage.EncodeContinue(string(lastKey)+"\x00", keyPrefix, withRev)
		if err != nil {
			return err
		}
		var remainingItemCount *int64
		// getResp.Count counts in objects that do not match the pred.
		// Instead of returning inaccurate count for non-empty selectors, we return nil.
		// Only set remainingItemCount if the predicate is empty.
		if opts.Predicate.Empty() {
			c := int64(getResp.Count - opts.Predicate.Limit)
			remainingItemCount = &c
		}
		return s.versioner.UpdateList(listObj, uint64(withRev), next, remainingItemCount)
	}

	// no continuation
	return s.versioner.UpdateList(listObj, uint64(withRev), "", nil)
}
   799  
   800  // growSlice takes a slice value and grows its capacity up
   801  // to the maximum of the passed sizes or maxCapacity, whichever
   802  // is smaller. Above maxCapacity decisions about allocation are left
   803  // to the Go runtime on append. This allows a caller to make an
   804  // educated guess about the potential size of the total list while
   805  // still avoiding overly aggressive initial allocation. If sizes
   806  // is empty maxCapacity will be used as the size to grow.
   807  func growSlice(v reflect.Value, maxCapacity int, sizes ...int) {
   808  	cap := v.Cap()
   809  	max := cap
   810  	for _, size := range sizes {
   811  		if size > max {
   812  			max = size
   813  		}
   814  	}
   815  	if len(sizes) == 0 || max > maxCapacity {
   816  		max = maxCapacity
   817  	}
   818  	if max <= cap {
   819  		return
   820  	}
   821  	if v.Len() > 0 {
   822  		extra := reflect.MakeSlice(v.Type(), v.Len(), max)
   823  		reflect.Copy(extra, v)
   824  		v.Set(extra)
   825  	} else {
   826  		extra := reflect.MakeSlice(v.Type(), 0, max)
   827  		v.Set(extra)
   828  	}
   829  }
   830  
   831  // Watch implements storage.Interface.Watch.
   832  func (s *store) Watch(ctx context.Context, key string, opts storage.ListOptions) (watch.Interface, error) {
   833  	preparedKey, err := s.prepareKey(key)
   834  	if err != nil {
   835  		return nil, err
   836  	}
   837  	rev, err := s.versioner.ParseResourceVersion(opts.ResourceVersion)
   838  	if err != nil {
   839  		return nil, err
   840  	}
   841  	return s.watcher.Watch(s.watchContext(ctx), preparedKey, int64(rev), opts)
   842  }
   843  
   844  func (s *store) watchContext(ctx context.Context) context.Context {
   845  	// The etcd server waits until it cannot find a leader for 3 election
   846  	// timeouts to cancel existing streams. 3 is currently a hard coded
   847  	// constant. The election timeout defaults to 1000ms. If the cluster is
   848  	// healthy, when the leader is stopped, the leadership transfer should be
   849  	// smooth. (leader transfers its leadership before stopping). If leader is
   850  	// hard killed, other servers will take an election timeout to realize
   851  	// leader lost and start campaign.
   852  	return clientv3.WithRequireLeader(ctx)
   853  }
   854  
   855  func (s *store) getCurrentState(ctx context.Context, key string, v reflect.Value, ignoreNotFound bool) func() (*objState, error) {
   856  	return func() (*objState, error) {
   857  		startTime := time.Now()
   858  		getResp, err := s.client.KV.Get(ctx, key)
   859  		metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime)
   860  		if err != nil {
   861  			return nil, err
   862  		}
   863  		return s.getState(ctx, getResp, key, v, ignoreNotFound)
   864  	}
   865  }
   866  
   867  func (s *store) getState(ctx context.Context, getResp *clientv3.GetResponse, key string, v reflect.Value, ignoreNotFound bool) (*objState, error) {
   868  	state := &objState{
   869  		meta: &storage.ResponseMeta{},
   870  	}
   871  
   872  	if u, ok := v.Addr().Interface().(runtime.Unstructured); ok {
   873  		state.obj = u.NewEmptyInstance()
   874  	} else {
   875  		state.obj = reflect.New(v.Type()).Interface().(runtime.Object)
   876  	}
   877  
   878  	if len(getResp.Kvs) == 0 {
   879  		if !ignoreNotFound {
   880  			return nil, storage.NewKeyNotFoundError(key, 0)
   881  		}
   882  		if err := runtime.SetZeroValue(state.obj); err != nil {
   883  			return nil, err
   884  		}
   885  	} else {
   886  		data, stale, err := s.transformer.TransformFromStorage(ctx, getResp.Kvs[0].Value, authenticatedDataString(key))
   887  		if err != nil {
   888  			return nil, storage.NewInternalError(err.Error())
   889  		}
   890  		state.rev = getResp.Kvs[0].ModRevision
   891  		state.meta.ResourceVersion = uint64(state.rev)
   892  		state.data = data
   893  		state.stale = stale
   894  		if err := decode(s.codec, s.versioner, state.data, state.obj, state.rev); err != nil {
   895  			recordDecodeError(s.groupResourceString, key)
   896  			return nil, err
   897  		}
   898  	}
   899  	return state, nil
   900  }
   901  
   902  func (s *store) getStateFromObject(obj runtime.Object) (*objState, error) {
   903  	state := &objState{
   904  		obj:  obj,
   905  		meta: &storage.ResponseMeta{},
   906  	}
   907  
   908  	rv, err := s.versioner.ObjectResourceVersion(obj)
   909  	if err != nil {
   910  		return nil, fmt.Errorf("couldn't get resource version: %v", err)
   911  	}
   912  	state.rev = int64(rv)
   913  	state.meta.ResourceVersion = uint64(state.rev)
   914  
   915  	// Compute the serialized form - for that we need to temporarily clean
   916  	// its resource version field (those are not stored in etcd).
   917  	if err := s.versioner.PrepareObjectForStorage(obj); err != nil {
   918  		return nil, fmt.Errorf("PrepareObjectForStorage failed: %v", err)
   919  	}
   920  	state.data, err = runtime.Encode(s.codec, obj)
   921  	if err != nil {
   922  		return nil, err
   923  	}
   924  	if err := s.versioner.UpdateObject(state.obj, uint64(rv)); err != nil {
   925  		klog.Errorf("failed to update object version: %v", err)
   926  	}
   927  	return state, nil
   928  }
   929  
   930  func (s *store) updateState(st *objState, userUpdate storage.UpdateFunc) (runtime.Object, uint64, error) {
   931  	ret, ttlPtr, err := userUpdate(st.obj, *st.meta)
   932  	if err != nil {
   933  		return nil, 0, err
   934  	}
   935  
   936  	if err := s.versioner.PrepareObjectForStorage(ret); err != nil {
   937  		return nil, 0, fmt.Errorf("PrepareObjectForStorage failed: %v", err)
   938  	}
   939  	var ttl uint64
   940  	if ttlPtr != nil {
   941  		ttl = *ttlPtr
   942  	}
   943  	return ret, ttl, nil
   944  }
   945  
   946  // ttlOpts returns client options based on given ttl.
   947  // ttl: if ttl is non-zero, it will attach the key to a lease with ttl of roughly the same length
   948  func (s *store) ttlOpts(ctx context.Context, ttl int64) ([]clientv3.OpOption, error) {
   949  	if ttl == 0 {
   950  		return nil, nil
   951  	}
   952  	id, err := s.leaseManager.GetLease(ctx, ttl)
   953  	if err != nil {
   954  		return nil, err
   955  	}
   956  	return []clientv3.OpOption{clientv3.WithLease(id)}, nil
   957  }
   958  
   959  // validateMinimumResourceVersion returns a 'too large resource' version error when the provided minimumResourceVersion is
   960  // greater than the most recent actualRevision available from storage.
   961  func (s *store) validateMinimumResourceVersion(minimumResourceVersion string, actualRevision uint64) error {
   962  	if minimumResourceVersion == "" {
   963  		return nil
   964  	}
   965  	minimumRV, err := s.versioner.ParseResourceVersion(minimumResourceVersion)
   966  	if err != nil {
   967  		return apierrors.NewBadRequest(fmt.Sprintf("invalid resource version: %v", err))
   968  	}
   969  	// Enforce the storage.Interface guarantee that the resource version of the returned data
   970  	// "will be at least 'resourceVersion'".
   971  	if minimumRV > actualRevision {
   972  		return storage.NewTooLargeResourceVersionError(minimumRV, actualRevision, 0)
   973  	}
   974  	return nil
   975  }
   976  
   977  func (s *store) prepareKey(key string) (string, error) {
   978  	if key == ".." ||
   979  		strings.HasPrefix(key, "../") ||
   980  		strings.HasSuffix(key, "/..") ||
   981  		strings.Contains(key, "/../") {
   982  		return "", fmt.Errorf("invalid key: %q", key)
   983  	}
   984  	if key == "." ||
   985  		strings.HasPrefix(key, "./") ||
   986  		strings.HasSuffix(key, "/.") ||
   987  		strings.Contains(key, "/./") {
   988  		return "", fmt.Errorf("invalid key: %q", key)
   989  	}
   990  	if key == "" || key == "/" {
   991  		return "", fmt.Errorf("empty key: %q", key)
   992  	}
   993  	// We ensured that pathPrefix ends in '/' in construction, so skip any leading '/' in the key now.
   994  	startIndex := 0
   995  	if key[0] == '/' {
   996  		startIndex = 1
   997  	}
   998  	return s.pathPrefix + key[startIndex:], nil
   999  }
  1000  
  1001  // decode decodes value of bytes into object. It will also set the object resource version to rev.
  1002  // On success, objPtr would be set to the object.
  1003  func decode(codec runtime.Codec, versioner storage.Versioner, value []byte, objPtr runtime.Object, rev int64) error {
  1004  	if _, err := conversion.EnforcePtr(objPtr); err != nil {
  1005  		return fmt.Errorf("unable to convert output object to pointer: %v", err)
  1006  	}
  1007  	_, _, err := codec.Decode(value, nil, objPtr)
  1008  	if err != nil {
  1009  		return err
  1010  	}
  1011  	// being unable to set the version does not prevent the object from being extracted
  1012  	if err := versioner.UpdateObject(objPtr, uint64(rev)); err != nil {
  1013  		klog.Errorf("failed to update object version: %v", err)
  1014  	}
  1015  	return nil
  1016  }
  1017  
  1018  // appendListItem decodes and appends the object (if it passes filter) to v, which must be a slice.
  1019  func appendListItem(v reflect.Value, data []byte, rev uint64, pred storage.SelectionPredicate, codec runtime.Codec, versioner storage.Versioner, newItemFunc func() runtime.Object) error {
  1020  	obj, _, err := codec.Decode(data, nil, newItemFunc())
  1021  	if err != nil {
  1022  		return err
  1023  	}
  1024  	// being unable to set the version does not prevent the object from being extracted
  1025  	if err := versioner.UpdateObject(obj, rev); err != nil {
  1026  		klog.Errorf("failed to update object version: %v", err)
  1027  	}
  1028  	if matched, err := pred.Matches(obj); err == nil && matched {
  1029  		v.Set(reflect.Append(v, reflect.ValueOf(obj).Elem()))
  1030  	}
  1031  	return nil
  1032  }
  1033  
  1034  // recordDecodeError record decode error split by object type.
  1035  func recordDecodeError(resource string, key string) {
  1036  	metrics.RecordDecodeError(resource)
  1037  	klog.V(4).Infof("Decoding %s \"%s\" failed", resource, key)
  1038  }
  1039  
  1040  func notFound(key string) clientv3.Cmp {
  1041  	return clientv3.Compare(clientv3.ModRevision(key), "=", 0)
  1042  }
  1043  
  1044  // getTypeName returns type name of an object for reporting purposes.
  1045  func getTypeName(obj interface{}) string {
  1046  	return reflect.TypeOf(obj).String()
  1047  }