k8s.io/apiserver@v0.31.1/pkg/storage/etcd3/store.go (about)

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package etcd3
    18  
    19  import (
    20  	"bytes"
    21  	"context"
    22  	"errors"
    23  	"fmt"
    24  	"path"
    25  	"reflect"
    26  	"strings"
    27  	"time"
    28  
    29  	clientv3 "go.etcd.io/etcd/client/v3"
    30  	"go.opentelemetry.io/otel/attribute"
    31  
    32  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    33  	"k8s.io/apimachinery/pkg/api/meta"
    34  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    35  	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
    36  	"k8s.io/apimachinery/pkg/conversion"
    37  	"k8s.io/apimachinery/pkg/runtime"
    38  	"k8s.io/apimachinery/pkg/runtime/schema"
    39  	"k8s.io/apimachinery/pkg/watch"
    40  	"k8s.io/apiserver/pkg/audit"
    41  	endpointsrequest "k8s.io/apiserver/pkg/endpoints/request"
    42  	"k8s.io/apiserver/pkg/features"
    43  	"k8s.io/apiserver/pkg/storage"
    44  	"k8s.io/apiserver/pkg/storage/etcd3/metrics"
    45  	etcdfeature "k8s.io/apiserver/pkg/storage/feature"
    46  	"k8s.io/apiserver/pkg/storage/value"
    47  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    48  	"k8s.io/component-base/tracing"
    49  	"k8s.io/klog/v2"
    50  )
    51  
    52  const (
    53  	// maxLimit is a maximum page limit increase used when fetching objects from etcd.
    54  	// This limit is used only for increasing page size by kube-apiserver. If request
    55  	// specifies larger limit initially, it won't be changed.
    56  	maxLimit = 10000
    57  )
    58  
    59  // authenticatedDataString satisfies the value.Context interface. It uses the key to
    60  // authenticate the stored data. This does not defend against reuse of previously
    61  // encrypted values under the same key, but will prevent an attacker from using an
    62  // encrypted value from a different key. A stronger authenticated data segment would
    63  // include the etcd3 Version field (which is incremented on each write to a key and
    64  // reset when the key is deleted), but an attacker with write access to etcd can
    65  // force deletion and recreation of keys to weaken that angle.
    66  type authenticatedDataString string
    67  
    68  // AuthenticatedData implements the value.Context interface.
    69  func (d authenticatedDataString) AuthenticatedData() []byte {
    70  	return []byte(string(d))
    71  }
    72  
    73  var _ value.Context = authenticatedDataString("")
    74  
    75  type store struct {
    76  	client              *clientv3.Client
    77  	codec               runtime.Codec
    78  	versioner           storage.Versioner
    79  	transformer         value.Transformer
    80  	pathPrefix          string
    81  	groupResource       schema.GroupResource
    82  	groupResourceString string
    83  	watcher             *watcher
    84  	leaseManager        *leaseManager
    85  }
    86  
    87  func (s *store) RequestWatchProgress(ctx context.Context) error {
    88  	// Use watchContext to match ctx metadata provided when creating the watch.
    89  	// In best case scenario we would use the same context that watch was created, but there is no way access it from watchCache.
    90  	return s.client.RequestProgress(s.watchContext(ctx))
    91  }
    92  
    93  type objState struct {
    94  	obj   runtime.Object
    95  	meta  *storage.ResponseMeta
    96  	rev   int64
    97  	data  []byte
    98  	stale bool
    99  }
   100  
   101  // New returns an etcd3 implementation of storage.Interface.
   102  func New(c *clientv3.Client, codec runtime.Codec, newFunc, newListFunc func() runtime.Object, prefix, resourcePrefix string, groupResource schema.GroupResource, transformer value.Transformer, leaseManagerConfig LeaseManagerConfig) storage.Interface {
   103  	return newStore(c, codec, newFunc, newListFunc, prefix, resourcePrefix, groupResource, transformer, leaseManagerConfig)
   104  }
   105  
   106  func newStore(c *clientv3.Client, codec runtime.Codec, newFunc, newListFunc func() runtime.Object, prefix, resourcePrefix string, groupResource schema.GroupResource, transformer value.Transformer, leaseManagerConfig LeaseManagerConfig) *store {
   107  	versioner := storage.APIObjectVersioner{}
   108  	// for compatibility with etcd2 impl.
   109  	// no-op for default prefix of '/registry'.
   110  	// keeps compatibility with etcd2 impl for custom prefixes that don't start with '/'
   111  	pathPrefix := path.Join("/", prefix)
   112  	if !strings.HasSuffix(pathPrefix, "/") {
   113  		// Ensure the pathPrefix ends in "/" here to simplify key concatenation later.
   114  		pathPrefix += "/"
   115  	}
   116  
   117  	w := &watcher{
   118  		client:        c,
   119  		codec:         codec,
   120  		newFunc:       newFunc,
   121  		groupResource: groupResource,
   122  		versioner:     versioner,
   123  		transformer:   transformer,
   124  	}
   125  	if newFunc == nil {
   126  		w.objectType = "<unknown>"
   127  	} else {
   128  		w.objectType = reflect.TypeOf(newFunc()).String()
   129  	}
   130  	s := &store{
   131  		client:              c,
   132  		codec:               codec,
   133  		versioner:           versioner,
   134  		transformer:         transformer,
   135  		pathPrefix:          pathPrefix,
   136  		groupResource:       groupResource,
   137  		groupResourceString: groupResource.String(),
   138  		watcher:             w,
   139  		leaseManager:        newDefaultLeaseManager(c, leaseManagerConfig),
   140  	}
   141  
   142  	w.getCurrentStorageRV = func(ctx context.Context) (uint64, error) {
   143  		return storage.GetCurrentResourceVersionFromStorage(ctx, s, newListFunc, resourcePrefix, w.objectType)
   144  	}
   145  	if utilfeature.DefaultFeatureGate.Enabled(features.ConsistentListFromCache) || utilfeature.DefaultFeatureGate.Enabled(features.WatchList) {
   146  		etcdfeature.DefaultFeatureSupportChecker.CheckClient(c.Ctx(), c, storage.RequestWatchProgress)
   147  	}
   148  	return s
   149  }
   150  
   151  // Versioner implements storage.Interface.Versioner.
   152  func (s *store) Versioner() storage.Versioner {
   153  	return s.versioner
   154  }
   155  
   156  // Get implements storage.Interface.Get.
   157  func (s *store) Get(ctx context.Context, key string, opts storage.GetOptions, out runtime.Object) error {
   158  	preparedKey, err := s.prepareKey(key)
   159  	if err != nil {
   160  		return err
   161  	}
   162  	startTime := time.Now()
   163  	getResp, err := s.client.KV.Get(ctx, preparedKey)
   164  	metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime)
   165  	if err != nil {
   166  		return err
   167  	}
   168  	if err = s.validateMinimumResourceVersion(opts.ResourceVersion, uint64(getResp.Header.Revision)); err != nil {
   169  		return err
   170  	}
   171  
   172  	if len(getResp.Kvs) == 0 {
   173  		if opts.IgnoreNotFound {
   174  			return runtime.SetZeroValue(out)
   175  		}
   176  		return storage.NewKeyNotFoundError(preparedKey, 0)
   177  	}
   178  	kv := getResp.Kvs[0]
   179  
   180  	data, _, err := s.transformer.TransformFromStorage(ctx, kv.Value, authenticatedDataString(preparedKey))
   181  	if err != nil {
   182  		return storage.NewInternalError(err.Error())
   183  	}
   184  
   185  	err = decode(s.codec, s.versioner, data, out, kv.ModRevision)
   186  	if err != nil {
   187  		recordDecodeError(s.groupResourceString, preparedKey)
   188  		return err
   189  	}
   190  	return nil
   191  }
   192  
   193  // Create implements storage.Interface.Create.
   194  func (s *store) Create(ctx context.Context, key string, obj, out runtime.Object, ttl uint64) error {
   195  	preparedKey, err := s.prepareKey(key)
   196  	if err != nil {
   197  		return err
   198  	}
   199  	ctx, span := tracing.Start(ctx, "Create etcd3",
   200  		attribute.String("audit-id", audit.GetAuditIDTruncated(ctx)),
   201  		attribute.String("key", key),
   202  		attribute.String("type", getTypeName(obj)),
   203  		attribute.String("resource", s.groupResourceString),
   204  	)
   205  	defer span.End(500 * time.Millisecond)
   206  	if version, err := s.versioner.ObjectResourceVersion(obj); err == nil && version != 0 {
   207  		return storage.ErrResourceVersionSetOnCreate
   208  	}
   209  	if err := s.versioner.PrepareObjectForStorage(obj); err != nil {
   210  		return fmt.Errorf("PrepareObjectForStorage failed: %v", err)
   211  	}
   212  	span.AddEvent("About to Encode")
   213  	data, err := runtime.Encode(s.codec, obj)
   214  	if err != nil {
   215  		span.AddEvent("Encode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error()))
   216  		return err
   217  	}
   218  	span.AddEvent("Encode succeeded", attribute.Int("len", len(data)))
   219  
   220  	opts, err := s.ttlOpts(ctx, int64(ttl))
   221  	if err != nil {
   222  		return err
   223  	}
   224  
   225  	newData, err := s.transformer.TransformToStorage(ctx, data, authenticatedDataString(preparedKey))
   226  	if err != nil {
   227  		span.AddEvent("TransformToStorage failed", attribute.String("err", err.Error()))
   228  		return storage.NewInternalError(err.Error())
   229  	}
   230  	span.AddEvent("TransformToStorage succeeded")
   231  
   232  	startTime := time.Now()
   233  	txnResp, err := s.client.KV.Txn(ctx).If(
   234  		notFound(preparedKey),
   235  	).Then(
   236  		clientv3.OpPut(preparedKey, string(newData), opts...),
   237  	).Commit()
   238  	metrics.RecordEtcdRequest("create", s.groupResourceString, err, startTime)
   239  	if err != nil {
   240  		span.AddEvent("Txn call failed", attribute.String("err", err.Error()))
   241  		return err
   242  	}
   243  	span.AddEvent("Txn call succeeded")
   244  
   245  	if !txnResp.Succeeded {
   246  		return storage.NewKeyExistsError(preparedKey, 0)
   247  	}
   248  
   249  	if out != nil {
   250  		putResp := txnResp.Responses[0].GetResponsePut()
   251  		err = decode(s.codec, s.versioner, data, out, putResp.Header.Revision)
   252  		if err != nil {
   253  			span.AddEvent("decode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error()))
   254  			recordDecodeError(s.groupResourceString, preparedKey)
   255  			return err
   256  		}
   257  		span.AddEvent("decode succeeded", attribute.Int("len", len(data)))
   258  	}
   259  	return nil
   260  }
   261  
   262  // Delete implements storage.Interface.Delete.
   263  func (s *store) Delete(
   264  	ctx context.Context, key string, out runtime.Object, preconditions *storage.Preconditions,
   265  	validateDeletion storage.ValidateObjectFunc, cachedExistingObject runtime.Object) error {
   266  	preparedKey, err := s.prepareKey(key)
   267  	if err != nil {
   268  		return err
   269  	}
   270  	v, err := conversion.EnforcePtr(out)
   271  	if err != nil {
   272  		return fmt.Errorf("unable to convert output object to pointer: %v", err)
   273  	}
   274  	return s.conditionalDelete(ctx, preparedKey, out, v, preconditions, validateDeletion, cachedExistingObject)
   275  }
   276  
   277  func (s *store) conditionalDelete(
   278  	ctx context.Context, key string, out runtime.Object, v reflect.Value, preconditions *storage.Preconditions,
   279  	validateDeletion storage.ValidateObjectFunc, cachedExistingObject runtime.Object) error {
   280  	getCurrentState := s.getCurrentState(ctx, key, v, false)
   281  
   282  	var origState *objState
   283  	var err error
   284  	var origStateIsCurrent bool
   285  	if cachedExistingObject != nil {
   286  		origState, err = s.getStateFromObject(cachedExistingObject)
   287  	} else {
   288  		origState, err = getCurrentState()
   289  		origStateIsCurrent = true
   290  	}
   291  	if err != nil {
   292  		return err
   293  	}
   294  
   295  	for {
   296  		if preconditions != nil {
   297  			if err := preconditions.Check(key, origState.obj); err != nil {
   298  				if origStateIsCurrent {
   299  					return err
   300  				}
   301  
   302  				// It's possible we're working with stale data.
   303  				// Remember the revision of the potentially stale data and the resulting update error
   304  				cachedRev := origState.rev
   305  				cachedUpdateErr := err
   306  
   307  				// Actually fetch
   308  				origState, err = getCurrentState()
   309  				if err != nil {
   310  					return err
   311  				}
   312  				origStateIsCurrent = true
   313  
   314  				// it turns out our cached data was not stale, return the error
   315  				if cachedRev == origState.rev {
   316  					return cachedUpdateErr
   317  				}
   318  
   319  				// Retry
   320  				continue
   321  			}
   322  		}
   323  		if err := validateDeletion(ctx, origState.obj); err != nil {
   324  			if origStateIsCurrent {
   325  				return err
   326  			}
   327  
   328  			// It's possible we're working with stale data.
   329  			// Remember the revision of the potentially stale data and the resulting update error
   330  			cachedRev := origState.rev
   331  			cachedUpdateErr := err
   332  
   333  			// Actually fetch
   334  			origState, err = getCurrentState()
   335  			if err != nil {
   336  				return err
   337  			}
   338  			origStateIsCurrent = true
   339  
   340  			// it turns out our cached data was not stale, return the error
   341  			if cachedRev == origState.rev {
   342  				return cachedUpdateErr
   343  			}
   344  
   345  			// Retry
   346  			continue
   347  		}
   348  
   349  		startTime := time.Now()
   350  		txnResp, err := s.client.KV.Txn(ctx).If(
   351  			clientv3.Compare(clientv3.ModRevision(key), "=", origState.rev),
   352  		).Then(
   353  			clientv3.OpDelete(key),
   354  		).Else(
   355  			clientv3.OpGet(key),
   356  		).Commit()
   357  		metrics.RecordEtcdRequest("delete", s.groupResourceString, err, startTime)
   358  		if err != nil {
   359  			return err
   360  		}
   361  		if !txnResp.Succeeded {
   362  			getResp := (*clientv3.GetResponse)(txnResp.Responses[0].GetResponseRange())
   363  			klog.V(4).Infof("deletion of %s failed because of a conflict, going to retry", key)
   364  			origState, err = s.getState(ctx, getResp, key, v, false)
   365  			if err != nil {
   366  				return err
   367  			}
   368  			origStateIsCurrent = true
   369  			continue
   370  		}
   371  
   372  		if len(txnResp.Responses) == 0 || txnResp.Responses[0].GetResponseDeleteRange() == nil {
   373  			return errors.New(fmt.Sprintf("invalid DeleteRange response: %v", txnResp.Responses))
   374  		}
   375  		deleteResp := txnResp.Responses[0].GetResponseDeleteRange()
   376  		if deleteResp.Header == nil {
   377  			return errors.New("invalid DeleteRange response - nil header")
   378  		}
   379  		err = decode(s.codec, s.versioner, origState.data, out, deleteResp.Header.Revision)
   380  		if err != nil {
   381  			recordDecodeError(s.groupResourceString, key)
   382  			return err
   383  		}
   384  		return nil
   385  	}
   386  }
   387  
   388  // GuaranteedUpdate implements storage.Interface.GuaranteedUpdate.
   389  func (s *store) GuaranteedUpdate(
   390  	ctx context.Context, key string, destination runtime.Object, ignoreNotFound bool,
   391  	preconditions *storage.Preconditions, tryUpdate storage.UpdateFunc, cachedExistingObject runtime.Object) error {
   392  	preparedKey, err := s.prepareKey(key)
   393  	if err != nil {
   394  		return err
   395  	}
   396  	ctx, span := tracing.Start(ctx, "GuaranteedUpdate etcd3",
   397  		attribute.String("audit-id", audit.GetAuditIDTruncated(ctx)),
   398  		attribute.String("key", key),
   399  		attribute.String("type", getTypeName(destination)),
   400  		attribute.String("resource", s.groupResourceString))
   401  	defer span.End(500 * time.Millisecond)
   402  
   403  	v, err := conversion.EnforcePtr(destination)
   404  	if err != nil {
   405  		return fmt.Errorf("unable to convert output object to pointer: %v", err)
   406  	}
   407  
   408  	getCurrentState := s.getCurrentState(ctx, preparedKey, v, ignoreNotFound)
   409  
   410  	var origState *objState
   411  	var origStateIsCurrent bool
   412  	if cachedExistingObject != nil {
   413  		origState, err = s.getStateFromObject(cachedExistingObject)
   414  	} else {
   415  		origState, err = getCurrentState()
   416  		origStateIsCurrent = true
   417  	}
   418  	if err != nil {
   419  		return err
   420  	}
   421  	span.AddEvent("initial value restored")
   422  
   423  	transformContext := authenticatedDataString(preparedKey)
   424  	for {
   425  		if err := preconditions.Check(preparedKey, origState.obj); err != nil {
   426  			// If our data is already up to date, return the error
   427  			if origStateIsCurrent {
   428  				return err
   429  			}
   430  
   431  			// It's possible we were working with stale data
   432  			// Actually fetch
   433  			origState, err = getCurrentState()
   434  			if err != nil {
   435  				return err
   436  			}
   437  			origStateIsCurrent = true
   438  			// Retry
   439  			continue
   440  		}
   441  
   442  		ret, ttl, err := s.updateState(origState, tryUpdate)
   443  		if err != nil {
   444  			// If our data is already up to date, return the error
   445  			if origStateIsCurrent {
   446  				return err
   447  			}
   448  
   449  			// It's possible we were working with stale data
   450  			// Remember the revision of the potentially stale data and the resulting update error
   451  			cachedRev := origState.rev
   452  			cachedUpdateErr := err
   453  
   454  			// Actually fetch
   455  			origState, err = getCurrentState()
   456  			if err != nil {
   457  				return err
   458  			}
   459  			origStateIsCurrent = true
   460  
   461  			// it turns out our cached data was not stale, return the error
   462  			if cachedRev == origState.rev {
   463  				return cachedUpdateErr
   464  			}
   465  
   466  			// Retry
   467  			continue
   468  		}
   469  
   470  		span.AddEvent("About to Encode")
   471  		data, err := runtime.Encode(s.codec, ret)
   472  		if err != nil {
   473  			span.AddEvent("Encode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error()))
   474  			return err
   475  		}
   476  		span.AddEvent("Encode succeeded", attribute.Int("len", len(data)))
   477  		if !origState.stale && bytes.Equal(data, origState.data) {
   478  			// if we skipped the original Get in this loop, we must refresh from
   479  			// etcd in order to be sure the data in the store is equivalent to
   480  			// our desired serialization
   481  			if !origStateIsCurrent {
   482  				origState, err = getCurrentState()
   483  				if err != nil {
   484  					return err
   485  				}
   486  				origStateIsCurrent = true
   487  				if !bytes.Equal(data, origState.data) {
   488  					// original data changed, restart loop
   489  					continue
   490  				}
   491  			}
   492  			// recheck that the data from etcd is not stale before short-circuiting a write
   493  			if !origState.stale {
   494  				err = decode(s.codec, s.versioner, origState.data, destination, origState.rev)
   495  				if err != nil {
   496  					recordDecodeError(s.groupResourceString, preparedKey)
   497  					return err
   498  				}
   499  				return nil
   500  			}
   501  		}
   502  
   503  		newData, err := s.transformer.TransformToStorage(ctx, data, transformContext)
   504  		if err != nil {
   505  			span.AddEvent("TransformToStorage failed", attribute.String("err", err.Error()))
   506  			return storage.NewInternalError(err.Error())
   507  		}
   508  		span.AddEvent("TransformToStorage succeeded")
   509  
   510  		opts, err := s.ttlOpts(ctx, int64(ttl))
   511  		if err != nil {
   512  			return err
   513  		}
   514  		span.AddEvent("Transaction prepared")
   515  
   516  		startTime := time.Now()
   517  		txnResp, err := s.client.KV.Txn(ctx).If(
   518  			clientv3.Compare(clientv3.ModRevision(preparedKey), "=", origState.rev),
   519  		).Then(
   520  			clientv3.OpPut(preparedKey, string(newData), opts...),
   521  		).Else(
   522  			clientv3.OpGet(preparedKey),
   523  		).Commit()
   524  		metrics.RecordEtcdRequest("update", s.groupResourceString, err, startTime)
   525  		if err != nil {
   526  			span.AddEvent("Txn call failed", attribute.String("err", err.Error()))
   527  			return err
   528  		}
   529  		span.AddEvent("Txn call completed")
   530  		span.AddEvent("Transaction committed")
   531  		if !txnResp.Succeeded {
   532  			getResp := (*clientv3.GetResponse)(txnResp.Responses[0].GetResponseRange())
   533  			klog.V(4).Infof("GuaranteedUpdate of %s failed because of a conflict, going to retry", preparedKey)
   534  			origState, err = s.getState(ctx, getResp, preparedKey, v, ignoreNotFound)
   535  			if err != nil {
   536  				return err
   537  			}
   538  			span.AddEvent("Retry value restored")
   539  			origStateIsCurrent = true
   540  			continue
   541  		}
   542  		putResp := txnResp.Responses[0].GetResponsePut()
   543  
   544  		err = decode(s.codec, s.versioner, data, destination, putResp.Header.Revision)
   545  		if err != nil {
   546  			span.AddEvent("decode failed", attribute.Int("len", len(data)), attribute.String("err", err.Error()))
   547  			recordDecodeError(s.groupResourceString, preparedKey)
   548  			return err
   549  		}
   550  		span.AddEvent("decode succeeded", attribute.Int("len", len(data)))
   551  		return nil
   552  	}
   553  }
   554  
   555  func getNewItemFunc(listObj runtime.Object, v reflect.Value) func() runtime.Object {
   556  	// For unstructured lists with a target group/version, preserve the group/version in the instantiated list items
   557  	if unstructuredList, isUnstructured := listObj.(*unstructured.UnstructuredList); isUnstructured {
   558  		if apiVersion := unstructuredList.GetAPIVersion(); len(apiVersion) > 0 {
   559  			return func() runtime.Object {
   560  				return &unstructured.Unstructured{Object: map[string]interface{}{"apiVersion": apiVersion}}
   561  			}
   562  		}
   563  	}
   564  
   565  	// Otherwise just instantiate an empty item
   566  	elem := v.Type().Elem()
   567  	return func() runtime.Object {
   568  		return reflect.New(elem).Interface().(runtime.Object)
   569  	}
   570  }
   571  
   572  func (s *store) Count(key string) (int64, error) {
   573  	preparedKey, err := s.prepareKey(key)
   574  	if err != nil {
   575  		return 0, err
   576  	}
   577  
   578  	// We need to make sure the key ended with "/" so that we only get children "directories".
   579  	// e.g. if we have key "/a", "/a/b", "/ab", getting keys with prefix "/a" will return all three,
   580  	// while with prefix "/a/" will return only "/a/b" which is the correct answer.
   581  	if !strings.HasSuffix(preparedKey, "/") {
   582  		preparedKey += "/"
   583  	}
   584  
   585  	startTime := time.Now()
   586  	getResp, err := s.client.KV.Get(context.Background(), preparedKey, clientv3.WithRange(clientv3.GetPrefixRangeEnd(preparedKey)), clientv3.WithCountOnly())
   587  	metrics.RecordEtcdRequest("listWithCount", preparedKey, err, startTime)
   588  	if err != nil {
   589  		return 0, err
   590  	}
   591  	return getResp.Count, nil
   592  }
   593  
   594  // ReadinessCheck implements storage.Interface.
   595  func (s *store) ReadinessCheck() error {
   596  	return nil
   597  }
   598  
   599  // resolveGetListRev is used by GetList to resolve the rev to use in the client.KV.Get request.
   600  func (s *store) resolveGetListRev(continueKey string, continueRV int64, opts storage.ListOptions) (int64, error) {
   601  	var withRev int64
   602  	// Uses continueRV if this is a continuation request.
   603  	if len(continueKey) > 0 {
   604  		if len(opts.ResourceVersion) > 0 && opts.ResourceVersion != "0" {
   605  			return withRev, apierrors.NewBadRequest("specifying resource version is not allowed when using continue")
   606  		}
   607  		// If continueRV > 0, the LIST request needs a specific resource version.
   608  		// continueRV==0 is invalid.
   609  		// If continueRV < 0, the request is for the latest resource version.
   610  		if continueRV > 0 {
   611  			withRev = continueRV
   612  		}
   613  		return withRev, nil
   614  	}
   615  	// Returns 0 if ResourceVersion is not specified.
   616  	if len(opts.ResourceVersion) == 0 {
   617  		return withRev, nil
   618  	}
   619  	parsedRV, err := s.versioner.ParseResourceVersion(opts.ResourceVersion)
   620  	if err != nil {
   621  		return withRev, apierrors.NewBadRequest(fmt.Sprintf("invalid resource version: %v", err))
   622  	}
   623  
   624  	switch opts.ResourceVersionMatch {
   625  	case metav1.ResourceVersionMatchNotOlderThan:
   626  		// The not older than constraint is checked after we get a response from etcd,
   627  		// and returnedRV is then set to the revision we get from the etcd response.
   628  	case metav1.ResourceVersionMatchExact:
   629  		withRev = int64(parsedRV)
   630  	case "": // legacy case
   631  		if opts.Recursive && opts.Predicate.Limit > 0 && parsedRV > 0 {
   632  			withRev = int64(parsedRV)
   633  		}
   634  	default:
   635  		return withRev, fmt.Errorf("unknown ResourceVersionMatch value: %v", opts.ResourceVersionMatch)
   636  	}
   637  	return withRev, nil
   638  }
   639  
   640  // GetList implements storage.Interface.
   641  func (s *store) GetList(ctx context.Context, key string, opts storage.ListOptions, listObj runtime.Object) error {
   642  	preparedKey, err := s.prepareKey(key)
   643  	if err != nil {
   644  		return err
   645  	}
   646  	ctx, span := tracing.Start(ctx, fmt.Sprintf("List(recursive=%v) etcd3", opts.Recursive),
   647  		attribute.String("audit-id", audit.GetAuditIDTruncated(ctx)),
   648  		attribute.String("key", key),
   649  		attribute.String("resourceVersion", opts.ResourceVersion),
   650  		attribute.String("resourceVersionMatch", string(opts.ResourceVersionMatch)),
   651  		attribute.Int("limit", int(opts.Predicate.Limit)),
   652  		attribute.String("continue", opts.Predicate.Continue))
   653  	defer span.End(500 * time.Millisecond)
   654  	listPtr, err := meta.GetItemsPtr(listObj)
   655  	if err != nil {
   656  		return err
   657  	}
   658  	v, err := conversion.EnforcePtr(listPtr)
   659  	if err != nil || v.Kind() != reflect.Slice {
   660  		return fmt.Errorf("need ptr to slice: %v", err)
   661  	}
   662  
   663  	// For recursive lists, we need to make sure the key ended with "/" so that we only
   664  	// get children "directories". e.g. if we have key "/a", "/a/b", "/ab", getting keys
   665  	// with prefix "/a" will return all three, while with prefix "/a/" will return only
   666  	// "/a/b" which is the correct answer.
   667  	if opts.Recursive && !strings.HasSuffix(preparedKey, "/") {
   668  		preparedKey += "/"
   669  	}
   670  	keyPrefix := preparedKey
   671  
   672  	// set the appropriate clientv3 options to filter the returned data set
   673  	var limitOption *clientv3.OpOption
   674  	limit := opts.Predicate.Limit
   675  	var paging bool
   676  	options := make([]clientv3.OpOption, 0, 4)
   677  	if opts.Predicate.Limit > 0 {
   678  		paging = true
   679  		options = append(options, clientv3.WithLimit(limit))
   680  		limitOption = &options[len(options)-1]
   681  	}
   682  
   683  	if opts.Recursive {
   684  		rangeEnd := clientv3.GetPrefixRangeEnd(keyPrefix)
   685  		options = append(options, clientv3.WithRange(rangeEnd))
   686  	}
   687  
   688  	newItemFunc := getNewItemFunc(listObj, v)
   689  
   690  	var continueRV, withRev int64
   691  	var continueKey string
   692  	if opts.Recursive && len(opts.Predicate.Continue) > 0 {
   693  		continueKey, continueRV, err = storage.DecodeContinue(opts.Predicate.Continue, keyPrefix)
   694  		if err != nil {
   695  			return apierrors.NewBadRequest(fmt.Sprintf("invalid continue token: %v", err))
   696  		}
   697  		preparedKey = continueKey
   698  	}
   699  	if withRev, err = s.resolveGetListRev(continueKey, continueRV, opts); err != nil {
   700  		return err
   701  	}
   702  
   703  	if withRev != 0 {
   704  		options = append(options, clientv3.WithRev(withRev))
   705  	}
   706  
   707  	// loop until we have filled the requested limit from etcd or there are no more results
   708  	var lastKey []byte
   709  	var hasMore bool
   710  	var getResp *clientv3.GetResponse
   711  	var numFetched int
   712  	var numEvald int
   713  	// Because these metrics are for understanding the costs of handling LIST requests,
   714  	// get them recorded even in error cases.
   715  	defer func() {
   716  		numReturn := v.Len()
   717  		metrics.RecordStorageListMetrics(s.groupResourceString, numFetched, numEvald, numReturn)
   718  	}()
   719  
   720  	metricsOp := "get"
   721  	if opts.Recursive {
   722  		metricsOp = "list"
   723  	}
   724  
   725  	for {
   726  		startTime := time.Now()
   727  		getResp, err = s.client.KV.Get(ctx, preparedKey, options...)
   728  		metrics.RecordEtcdRequest(metricsOp, s.groupResourceString, err, startTime)
   729  		if err != nil {
   730  			return interpretListError(err, len(opts.Predicate.Continue) > 0, continueKey, keyPrefix)
   731  		}
   732  		numFetched += len(getResp.Kvs)
   733  		if err = s.validateMinimumResourceVersion(opts.ResourceVersion, uint64(getResp.Header.Revision)); err != nil {
   734  			return err
   735  		}
   736  		hasMore = getResp.More
   737  
   738  		if len(getResp.Kvs) == 0 && getResp.More {
   739  			return fmt.Errorf("no results were found, but etcd indicated there were more values remaining")
   740  		}
   741  		// indicate to the client which resource version was returned, and use the same resource version for subsequent requests.
   742  		if withRev == 0 {
   743  			withRev = getResp.Header.Revision
   744  			options = append(options, clientv3.WithRev(withRev))
   745  		}
   746  
   747  		// avoid small allocations for the result slice, since this can be called in many
   748  		// different contexts and we don't know how significantly the result will be filtered
   749  		if opts.Predicate.Empty() {
   750  			growSlice(v, len(getResp.Kvs))
   751  		} else {
   752  			growSlice(v, 2048, len(getResp.Kvs))
   753  		}
   754  
   755  		// take items from the response until the bucket is full, filtering as we go
   756  		for i, kv := range getResp.Kvs {
   757  			if paging && int64(v.Len()) >= opts.Predicate.Limit {
   758  				hasMore = true
   759  				break
   760  			}
   761  			lastKey = kv.Key
   762  
   763  			data, _, err := s.transformer.TransformFromStorage(ctx, kv.Value, authenticatedDataString(kv.Key))
   764  			if err != nil {
   765  				return storage.NewInternalErrorf("unable to transform key %q: %v", kv.Key, err)
   766  			}
   767  
   768  			// Check if the request has already timed out before decode object
   769  			select {
   770  			case <-ctx.Done():
   771  				// parent context is canceled or timed out, no point in continuing
   772  				return storage.NewTimeoutError(string(kv.Key), "request did not complete within requested timeout")
   773  			default:
   774  			}
   775  
   776  			obj, err := decodeListItem(ctx, data, uint64(kv.ModRevision), s.codec, s.versioner, newItemFunc)
   777  			if err != nil {
   778  				recordDecodeError(s.groupResourceString, string(kv.Key))
   779  				return err
   780  			}
   781  
   782  			// being unable to set the version does not prevent the object from being extracted
   783  			if matched, err := opts.Predicate.Matches(obj); err == nil && matched {
   784  				v.Set(reflect.Append(v, reflect.ValueOf(obj).Elem()))
   785  			}
   786  
   787  			numEvald++
   788  
   789  			// free kv early. Long lists can take O(seconds) to decode.
   790  			getResp.Kvs[i] = nil
   791  		}
   792  
   793  		// no more results remain or we didn't request paging
   794  		if !hasMore || !paging {
   795  			break
   796  		}
   797  		// we're paging but we have filled our bucket
   798  		if int64(v.Len()) >= opts.Predicate.Limit {
   799  			break
   800  		}
   801  
   802  		if limit < maxLimit {
   803  			// We got incomplete result due to field/label selector dropping the object.
   804  			// Double page size to reduce total number of calls to etcd.
   805  			limit *= 2
   806  			if limit > maxLimit {
   807  				limit = maxLimit
   808  			}
   809  			*limitOption = clientv3.WithLimit(limit)
   810  		}
   811  		preparedKey = string(lastKey) + "\x00"
   812  	}
   813  
   814  	if v.IsNil() {
   815  		// Ensure that we never return a nil Items pointer in the result for consistency.
   816  		v.Set(reflect.MakeSlice(v.Type(), 0, 0))
   817  	}
   818  
   819  	continueValue, remainingItemCount, err := storage.PrepareContinueToken(string(lastKey), keyPrefix, withRev, getResp.Count, hasMore, opts)
   820  	if err != nil {
   821  		return err
   822  	}
   823  	return s.versioner.UpdateList(listObj, uint64(withRev), continueValue, remainingItemCount)
   824  }
   825  
   826  // growSlice takes a slice value and grows its capacity up
   827  // to the maximum of the passed sizes or maxCapacity, whichever
   828  // is smaller. Above maxCapacity decisions about allocation are left
   829  // to the Go runtime on append. This allows a caller to make an
   830  // educated guess about the potential size of the total list while
   831  // still avoiding overly aggressive initial allocation. If sizes
   832  // is empty maxCapacity will be used as the size to grow.
   833  func growSlice(v reflect.Value, maxCapacity int, sizes ...int) {
   834  	cap := v.Cap()
   835  	max := cap
   836  	for _, size := range sizes {
   837  		if size > max {
   838  			max = size
   839  		}
   840  	}
   841  	if len(sizes) == 0 || max > maxCapacity {
   842  		max = maxCapacity
   843  	}
   844  	if max <= cap {
   845  		return
   846  	}
   847  	if v.Len() > 0 {
   848  		extra := reflect.MakeSlice(v.Type(), v.Len(), max)
   849  		reflect.Copy(extra, v)
   850  		v.Set(extra)
   851  	} else {
   852  		extra := reflect.MakeSlice(v.Type(), 0, max)
   853  		v.Set(extra)
   854  	}
   855  }
   856  
   857  // Watch implements storage.Interface.Watch.
   858  func (s *store) Watch(ctx context.Context, key string, opts storage.ListOptions) (watch.Interface, error) {
   859  	preparedKey, err := s.prepareKey(key)
   860  	if err != nil {
   861  		return nil, err
   862  	}
   863  	rev, err := s.versioner.ParseResourceVersion(opts.ResourceVersion)
   864  	if err != nil {
   865  		return nil, err
   866  	}
   867  	return s.watcher.Watch(s.watchContext(ctx), preparedKey, int64(rev), opts)
   868  }
   869  
   870  func (s *store) watchContext(ctx context.Context) context.Context {
   871  	// The etcd server waits until it cannot find a leader for 3 election
   872  	// timeouts to cancel existing streams. 3 is currently a hard coded
   873  	// constant. The election timeout defaults to 1000ms. If the cluster is
   874  	// healthy, when the leader is stopped, the leadership transfer should be
   875  	// smooth. (leader transfers its leadership before stopping). If leader is
   876  	// hard killed, other servers will take an election timeout to realize
   877  	// leader lost and start campaign.
   878  	return clientv3.WithRequireLeader(ctx)
   879  }
   880  
   881  func (s *store) getCurrentState(ctx context.Context, key string, v reflect.Value, ignoreNotFound bool) func() (*objState, error) {
   882  	return func() (*objState, error) {
   883  		startTime := time.Now()
   884  		getResp, err := s.client.KV.Get(ctx, key)
   885  		metrics.RecordEtcdRequest("get", s.groupResourceString, err, startTime)
   886  		if err != nil {
   887  			return nil, err
   888  		}
   889  		return s.getState(ctx, getResp, key, v, ignoreNotFound)
   890  	}
   891  }
   892  
   893  func (s *store) getState(ctx context.Context, getResp *clientv3.GetResponse, key string, v reflect.Value, ignoreNotFound bool) (*objState, error) {
   894  	state := &objState{
   895  		meta: &storage.ResponseMeta{},
   896  	}
   897  
   898  	if u, ok := v.Addr().Interface().(runtime.Unstructured); ok {
   899  		state.obj = u.NewEmptyInstance()
   900  	} else {
   901  		state.obj = reflect.New(v.Type()).Interface().(runtime.Object)
   902  	}
   903  
   904  	if len(getResp.Kvs) == 0 {
   905  		if !ignoreNotFound {
   906  			return nil, storage.NewKeyNotFoundError(key, 0)
   907  		}
   908  		if err := runtime.SetZeroValue(state.obj); err != nil {
   909  			return nil, err
   910  		}
   911  	} else {
   912  		data, stale, err := s.transformer.TransformFromStorage(ctx, getResp.Kvs[0].Value, authenticatedDataString(key))
   913  		if err != nil {
   914  			return nil, storage.NewInternalError(err.Error())
   915  		}
   916  		state.rev = getResp.Kvs[0].ModRevision
   917  		state.meta.ResourceVersion = uint64(state.rev)
   918  		state.data = data
   919  		state.stale = stale
   920  		if err := decode(s.codec, s.versioner, state.data, state.obj, state.rev); err != nil {
   921  			recordDecodeError(s.groupResourceString, key)
   922  			return nil, err
   923  		}
   924  	}
   925  	return state, nil
   926  }
   927  
   928  func (s *store) getStateFromObject(obj runtime.Object) (*objState, error) {
   929  	state := &objState{
   930  		obj:  obj,
   931  		meta: &storage.ResponseMeta{},
   932  	}
   933  
   934  	rv, err := s.versioner.ObjectResourceVersion(obj)
   935  	if err != nil {
   936  		return nil, fmt.Errorf("couldn't get resource version: %v", err)
   937  	}
   938  	state.rev = int64(rv)
   939  	state.meta.ResourceVersion = uint64(state.rev)
   940  
   941  	// Compute the serialized form - for that we need to temporarily clean
   942  	// its resource version field (those are not stored in etcd).
   943  	if err := s.versioner.PrepareObjectForStorage(obj); err != nil {
   944  		return nil, fmt.Errorf("PrepareObjectForStorage failed: %v", err)
   945  	}
   946  	state.data, err = runtime.Encode(s.codec, obj)
   947  	if err != nil {
   948  		return nil, err
   949  	}
   950  	if err := s.versioner.UpdateObject(state.obj, uint64(rv)); err != nil {
   951  		klog.Errorf("failed to update object version: %v", err)
   952  	}
   953  	return state, nil
   954  }
   955  
   956  func (s *store) updateState(st *objState, userUpdate storage.UpdateFunc) (runtime.Object, uint64, error) {
   957  	ret, ttlPtr, err := userUpdate(st.obj, *st.meta)
   958  	if err != nil {
   959  		return nil, 0, err
   960  	}
   961  
   962  	if err := s.versioner.PrepareObjectForStorage(ret); err != nil {
   963  		return nil, 0, fmt.Errorf("PrepareObjectForStorage failed: %v", err)
   964  	}
   965  	var ttl uint64
   966  	if ttlPtr != nil {
   967  		ttl = *ttlPtr
   968  	}
   969  	return ret, ttl, nil
   970  }
   971  
   972  // ttlOpts returns client options based on given ttl.
   973  // ttl: if ttl is non-zero, it will attach the key to a lease with ttl of roughly the same length
   974  func (s *store) ttlOpts(ctx context.Context, ttl int64) ([]clientv3.OpOption, error) {
   975  	if ttl == 0 {
   976  		return nil, nil
   977  	}
   978  	id, err := s.leaseManager.GetLease(ctx, ttl)
   979  	if err != nil {
   980  		return nil, err
   981  	}
   982  	return []clientv3.OpOption{clientv3.WithLease(id)}, nil
   983  }
   984  
   985  // validateMinimumResourceVersion returns a 'too large resource' version error when the provided minimumResourceVersion is
   986  // greater than the most recent actualRevision available from storage.
   987  func (s *store) validateMinimumResourceVersion(minimumResourceVersion string, actualRevision uint64) error {
   988  	if minimumResourceVersion == "" {
   989  		return nil
   990  	}
   991  	minimumRV, err := s.versioner.ParseResourceVersion(minimumResourceVersion)
   992  	if err != nil {
   993  		return apierrors.NewBadRequest(fmt.Sprintf("invalid resource version: %v", err))
   994  	}
   995  	// Enforce the storage.Interface guarantee that the resource version of the returned data
   996  	// "will be at least 'resourceVersion'".
   997  	if minimumRV > actualRevision {
   998  		return storage.NewTooLargeResourceVersionError(minimumRV, actualRevision, 0)
   999  	}
  1000  	return nil
  1001  }
  1002  
  1003  func (s *store) prepareKey(key string) (string, error) {
  1004  	if key == ".." ||
  1005  		strings.HasPrefix(key, "../") ||
  1006  		strings.HasSuffix(key, "/..") ||
  1007  		strings.Contains(key, "/../") {
  1008  		return "", fmt.Errorf("invalid key: %q", key)
  1009  	}
  1010  	if key == "." ||
  1011  		strings.HasPrefix(key, "./") ||
  1012  		strings.HasSuffix(key, "/.") ||
  1013  		strings.Contains(key, "/./") {
  1014  		return "", fmt.Errorf("invalid key: %q", key)
  1015  	}
  1016  	if key == "" || key == "/" {
  1017  		return "", fmt.Errorf("empty key: %q", key)
  1018  	}
  1019  	// We ensured that pathPrefix ends in '/' in construction, so skip any leading '/' in the key now.
  1020  	startIndex := 0
  1021  	if key[0] == '/' {
  1022  		startIndex = 1
  1023  	}
  1024  	return s.pathPrefix + key[startIndex:], nil
  1025  }
  1026  
  1027  // decode decodes value of bytes into object. It will also set the object resource version to rev.
  1028  // On success, objPtr would be set to the object.
  1029  func decode(codec runtime.Codec, versioner storage.Versioner, value []byte, objPtr runtime.Object, rev int64) error {
  1030  	if _, err := conversion.EnforcePtr(objPtr); err != nil {
  1031  		return fmt.Errorf("unable to convert output object to pointer: %v", err)
  1032  	}
  1033  	_, _, err := codec.Decode(value, nil, objPtr)
  1034  	if err != nil {
  1035  		return err
  1036  	}
  1037  	// being unable to set the version does not prevent the object from being extracted
  1038  	if err := versioner.UpdateObject(objPtr, uint64(rev)); err != nil {
  1039  		klog.Errorf("failed to update object version: %v", err)
  1040  	}
  1041  	return nil
  1042  }
  1043  
  1044  // decodeListItem decodes bytes value in array into object.
  1045  func decodeListItem(ctx context.Context, data []byte, rev uint64, codec runtime.Codec, versioner storage.Versioner, newItemFunc func() runtime.Object) (runtime.Object, error) {
  1046  	startedAt := time.Now()
  1047  	defer func() {
  1048  		endpointsrequest.TrackDecodeLatency(ctx, time.Since(startedAt))
  1049  	}()
  1050  
  1051  	obj, _, err := codec.Decode(data, nil, newItemFunc())
  1052  	if err != nil {
  1053  		return nil, err
  1054  	}
  1055  
  1056  	if err := versioner.UpdateObject(obj, rev); err != nil {
  1057  		klog.Errorf("failed to update object version: %v", err)
  1058  	}
  1059  
  1060  	return obj, nil
  1061  }
  1062  
  1063  // recordDecodeError record decode error split by object type.
  1064  func recordDecodeError(resource string, key string) {
  1065  	metrics.RecordDecodeError(resource)
  1066  	klog.V(4).Infof("Decoding %s \"%s\" failed", resource, key)
  1067  }
  1068  
  1069  func notFound(key string) clientv3.Cmp {
  1070  	return clientv3.Compare(clientv3.ModRevision(key), "=", 0)
  1071  }
  1072  
  1073  // getTypeName returns type name of an object for reporting purposes.
  1074  func getTypeName(obj interface{}) string {
  1075  	return reflect.TypeOf(obj).String()
  1076  }