github.com/m3db/m3@v1.5.0/src/cluster/services/heartbeat/etcd/store.go (about)

     1  // Copyright (c) 2016 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package etcd
    22  
    23  import (
    24  	"errors"
    25  	"fmt"
    26  	"strings"
    27  	"sync"
    28  	"time"
    29  
    30  	"github.com/m3db/m3/src/cluster/etcd/watchmanager"
    31  	"github.com/m3db/m3/src/cluster/generated/proto/placementpb"
    32  	"github.com/m3db/m3/src/cluster/kv"
    33  	"github.com/m3db/m3/src/cluster/placement"
    34  	"github.com/m3db/m3/src/cluster/services"
    35  	"github.com/m3db/m3/src/x/retry"
    36  	"github.com/m3db/m3/src/x/watch"
    37  
    38  	"github.com/golang/protobuf/proto"
    39  	"github.com/uber-go/tally"
    40  	clientv3 "go.etcd.io/etcd/client/v3"
    41  	"go.uber.org/zap"
    42  	"golang.org/x/net/context"
    43  )
    44  
    45  const (
    46  	heartbeatKeyPrefix = "_hb"
    47  	keySeparator       = "/"
    48  	keyFormat          = "%s/%s"
    49  )
    50  
    51  var (
    52  	noopCancel     func()
    53  	errNoServiceID = errors.New("ServiceID cannot be empty")
    54  )
    55  
    56  // NewStore creates a heartbeat store based on etcd
    57  func NewStore(c *clientv3.Client, opts Options) (services.HeartbeatService, error) {
    58  	if opts.ServiceID() == nil {
    59  		return nil, errNoServiceID
    60  	}
    61  
    62  	scope := opts.InstrumentsOptions().MetricsScope()
    63  
    64  	store := &client{
    65  		cache:      newLeaseCache(),
    66  		watchables: make(map[string]watch.Watchable),
    67  		opts:       opts,
    68  		sid:        opts.ServiceID(),
    69  		logger:     opts.InstrumentsOptions().Logger(),
    70  		retrier:    retry.NewRetrier(opts.RetryOptions()),
    71  		m: clientMetrics{
    72  			etcdGetError:   scope.Counter("etcd-get-error"),
    73  			etcdPutError:   scope.Counter("etcd-put-error"),
    74  			etcdLeaseError: scope.Counter("etcd-lease-error"),
    75  		},
    76  
    77  		l:       c.Lease,
    78  		kv:      c.KV,
    79  		watcher: c.Watcher,
    80  	}
    81  
    82  	wOpts := watchmanager.NewOptions().
    83  		SetClient(c).
    84  		SetUpdateFn(store.update).
    85  		SetTickAndStopFn(store.tickAndStop).
    86  		SetWatchOptions([]clientv3.OpOption{
    87  			// WithPrefix so that the watch will receive any changes
    88  			// from the instances under the service
    89  			clientv3.WithPrefix(),
    90  			// periodically (appx every 10 mins) checks for the latest data
    91  			// with or without any update notification
    92  			clientv3.WithProgressNotify(),
    93  			// receive initial notification once the watch channel is created
    94  			clientv3.WithCreatedNotify(),
    95  		}).
    96  		SetWatchChanCheckInterval(opts.WatchChanCheckInterval()).
    97  		SetWatchChanInitTimeout(opts.WatchChanInitTimeout()).
    98  		SetWatchChanResetInterval(opts.WatchChanResetInterval()).
    99  		SetInstrumentsOptions(opts.InstrumentsOptions())
   100  
   101  	wm, err := watchmanager.NewWatchManager(wOpts)
   102  	if err != nil {
   103  		return nil, err
   104  	}
   105  
   106  	store.wm = wm
   107  
   108  	return store, nil
   109  }
   110  
   111  type client struct {
   112  	sync.RWMutex
   113  
   114  	cache      *leaseCache
   115  	watchables map[string]watch.Watchable
   116  	opts       Options
   117  	sid        services.ServiceID
   118  	logger     *zap.Logger
   119  	retrier    retry.Retrier
   120  	m          clientMetrics
   121  
   122  	l       clientv3.Lease
   123  	kv      clientv3.KV
   124  	watcher clientv3.Watcher
   125  
   126  	wm watchmanager.WatchManager
   127  }
   128  
   129  type clientMetrics struct {
   130  	etcdGetError   tally.Counter
   131  	etcdPutError   tally.Counter
   132  	etcdLeaseError tally.Counter
   133  }
   134  
   135  func (c *client) Heartbeat(instance placement.Instance, ttl time.Duration) error {
   136  	leaseID, ok := c.cache.get(c.sid, instance.ID(), ttl)
   137  	if ok {
   138  		ctx, cancel := c.context()
   139  		defer cancel()
   140  
   141  		_, err := c.l.KeepAliveOnce(ctx, leaseID)
   142  		// if err != nil, it could because the old lease has already timedout
   143  		// on the server side, we need to try a new lease.
   144  		if err == nil {
   145  			return nil
   146  		}
   147  	}
   148  
   149  	ctx, cancel := c.context()
   150  	defer cancel()
   151  
   152  	resp, err := c.l.Grant(ctx, int64(ttl/time.Second))
   153  	if err != nil {
   154  		c.m.etcdLeaseError.Inc(1)
   155  		return err
   156  	}
   157  
   158  	ctx, cancel = c.context()
   159  	defer cancel()
   160  
   161  	instanceProto, err := instance.Proto()
   162  	if err != nil {
   163  		return err
   164  	}
   165  
   166  	instanceBytes, err := proto.Marshal(instanceProto)
   167  	if err != nil {
   168  		return err
   169  	}
   170  
   171  	_, err = c.kv.Put(
   172  		ctx,
   173  		heartbeatKey(c.sid, instance.ID()),
   174  		string(instanceBytes),
   175  		clientv3.WithLease(resp.ID),
   176  	)
   177  	if err != nil {
   178  		c.m.etcdPutError.Inc(1)
   179  		return err
   180  	}
   181  
   182  	c.cache.put(c.sid, instance.ID(), ttl, resp.ID)
   183  
   184  	return nil
   185  }
   186  
   187  func (c *client) Get() ([]string, error) {
   188  	return c.get(servicePrefix(c.sid))
   189  }
   190  
   191  func (c *client) get(key string) ([]string, error) {
   192  	ctx, cancel := c.context()
   193  	defer cancel()
   194  
   195  	resp, err := c.kv.Get(
   196  		ctx,
   197  		key,
   198  		clientv3.WithPrefix(),
   199  		clientv3.WithKeysOnly(),
   200  	)
   201  
   202  	if err != nil {
   203  		c.m.etcdGetError.Inc(1)
   204  		return nil, err
   205  	}
   206  
   207  	r := make([]string, len(resp.Kvs))
   208  	for i, kv := range resp.Kvs {
   209  		r[i] = instanceFromKey(string(kv.Key), key)
   210  	}
   211  
   212  	return r, nil
   213  }
   214  
   215  func (c *client) GetInstances() ([]placement.Instance, error) {
   216  	return c.getInstances(servicePrefix(c.sid))
   217  }
   218  
   219  func (c *client) getInstances(key string) ([]placement.Instance, error) {
   220  	ctx, cancel := c.context()
   221  	defer cancel()
   222  
   223  	gr, err := c.kv.Get(ctx, key, clientv3.WithPrefix())
   224  	if err != nil {
   225  		c.m.etcdGetError.Inc(1)
   226  		return nil, err
   227  	}
   228  
   229  	r := make([]placement.Instance, len(gr.Kvs))
   230  	for i, kv := range gr.Kvs {
   231  		var p placementpb.Instance
   232  		if err := proto.Unmarshal(kv.Value, &p); err != nil {
   233  			return nil, err
   234  		}
   235  
   236  		pi, err := placement.NewInstanceFromProto(&p)
   237  		if err != nil {
   238  			return nil, err
   239  		}
   240  
   241  		r[i] = pi
   242  	}
   243  	return r, nil
   244  }
   245  
   246  func (c *client) Delete(instance string) error {
   247  	ctx, cancel := c.context()
   248  	defer cancel()
   249  
   250  	r, err := c.kv.Delete(ctx, heartbeatKey(c.sid, instance))
   251  	if err != nil {
   252  		return err
   253  	}
   254  
   255  	if r.Deleted == 0 {
   256  		return fmt.Errorf("could not find heartbeat for service: %s, env: %s, instance: %s", c.sid.Name(), c.sid.Environment(), instance)
   257  	}
   258  
   259  	// NB(cw) we need to clean up cached lease ID, if not the next heartbeat might reuse the cached lease
   260  	// and keep alive on existing lease wont work since the key is deleted
   261  	c.cache.delete(c.sid, instance)
   262  	return nil
   263  }
   264  
   265  func (c *client) Watch() (watch.Watch, error) {
   266  	serviceKey := servicePrefix(c.sid)
   267  
   268  	c.Lock()
   269  	watchable, ok := c.watchables[serviceKey]
   270  	if !ok {
   271  		watchable = watch.NewWatchable()
   272  		c.watchables[serviceKey] = watchable
   273  
   274  		go c.wm.Watch(serviceKey)
   275  	}
   276  	c.Unlock()
   277  
   278  	_, w, err := watchable.Watch()
   279  	return w, err
   280  }
   281  
   282  func (c *client) update(key string, _ []*clientv3.Event) error {
   283  	var (
   284  		newValue []string
   285  		err      error
   286  	)
   287  	// we need retry here because if Get() failed on an watch update,
   288  	// it has to wait 10 mins to be notified to try again
   289  	if execErr := c.retrier.Attempt(func() error {
   290  		newValue, err = c.get(key)
   291  		if err == kv.ErrNotFound {
   292  			// do not retry on ErrNotFound
   293  			return retry.NonRetryableError(err)
   294  		}
   295  		return err
   296  	}); execErr != nil {
   297  		return execErr
   298  	}
   299  
   300  	c.RLock()
   301  	w, ok := c.watchables[key]
   302  	c.RUnlock()
   303  	if !ok {
   304  		return fmt.Errorf("unexpected: no watchable found for key: %s", key)
   305  	}
   306  	w.Update(newValue)
   307  
   308  	return nil
   309  }
   310  
   311  func (c *client) tickAndStop(key string) bool {
   312  	// fast path
   313  	c.RLock()
   314  	watchable, ok := c.watchables[key]
   315  	c.RUnlock()
   316  	if !ok {
   317  		c.logger.Warn("unexpected: key is already cleaned up", zap.String("key", key))
   318  		return true
   319  	}
   320  
   321  	if watchable.NumWatches() != 0 {
   322  		return false
   323  	}
   324  
   325  	// slow path
   326  	c.Lock()
   327  	defer c.Unlock()
   328  	watchable, ok = c.watchables[key]
   329  	if !ok {
   330  		// not expect this to happen
   331  		c.logger.Warn("unexpected: key is already cleaned up", zap.String("key", key))
   332  		return true
   333  	}
   334  
   335  	if watchable.NumWatches() != 0 {
   336  		// a new watch has subscribed to the watchable, do not clean up
   337  		return false
   338  	}
   339  
   340  	watchable.Close()
   341  	delete(c.watchables, key)
   342  	return true
   343  }
   344  
   345  func (c *client) context() (context.Context, context.CancelFunc) {
   346  	ctx := context.Background()
   347  	cancel := noopCancel
   348  	if c.opts.RequestTimeout() > 0 {
   349  		ctx, cancel = context.WithTimeout(ctx, c.opts.RequestTimeout())
   350  	}
   351  
   352  	return ctx, cancel
   353  }
   354  
   355  func heartbeatKey(sid services.ServiceID, instance string) string {
   356  	return fmt.Sprintf(keyFormat, servicePrefix(sid), instance)
   357  }
   358  
   359  func instanceFromKey(key, servicePrefix string) string {
   360  	return strings.TrimPrefix(
   361  		strings.TrimPrefix(key, servicePrefix),
   362  		keySeparator,
   363  	)
   364  }
   365  
   366  // heartbeats for a service "svc" in env "test" should be stored under
   367  // "_hb/test/svc". A service "svc" with no environment will be stored under
   368  // "_hb/svc".
   369  func servicePrefix(sid services.ServiceID) string {
   370  	env := sid.Environment()
   371  	if env == "" {
   372  		return fmt.Sprintf(keyFormat, heartbeatKeyPrefix, sid.Name())
   373  	}
   374  
   375  	return fmt.Sprintf(
   376  		keyFormat,
   377  		heartbeatKeyPrefix,
   378  		fmt.Sprintf(keyFormat, env, sid.Name()))
   379  }
   380  
   381  func newLeaseCache() *leaseCache {
   382  	return &leaseCache{
   383  		leases: make(map[string]map[time.Duration]clientv3.LeaseID),
   384  	}
   385  }
   386  
   387  type leaseCache struct {
   388  	sync.RWMutex
   389  
   390  	leases map[string]map[time.Duration]clientv3.LeaseID
   391  }
   392  
   393  func (c *leaseCache) get(sid services.ServiceID, instance string, ttl time.Duration) (clientv3.LeaseID, bool) {
   394  	c.RLock()
   395  	defer c.RUnlock()
   396  
   397  	leases, ok := c.leases[heartbeatKey(sid, instance)]
   398  	if !ok {
   399  		return clientv3.LeaseID(0), false
   400  	}
   401  
   402  	id, ok := leases[ttl]
   403  	return id, ok
   404  }
   405  
   406  func (c *leaseCache) put(sid services.ServiceID, instance string, ttl time.Duration, id clientv3.LeaseID) {
   407  	key := heartbeatKey(sid, instance)
   408  
   409  	c.Lock()
   410  	defer c.Unlock()
   411  
   412  	leases, ok := c.leases[key]
   413  	if !ok {
   414  		leases = make(map[time.Duration]clientv3.LeaseID)
   415  		c.leases[key] = leases
   416  	}
   417  	leases[ttl] = id
   418  }
   419  
   420  func (c *leaseCache) delete(sid services.ServiceID, instance string) {
   421  	c.Lock()
   422  	delete(c.leases, heartbeatKey(sid, instance))
   423  	c.Unlock()
   424  }