github.com/m3db/m3@v1.5.0/src/cluster/etcd/watchmanager/manager.go (about)

     1  // Copyright (c) 2016 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package watchmanager
    22  
    23  import (
    24  	"context"
    25  	"fmt"
    26  	"math/rand"
    27  	"time"
    28  
    29  	"github.com/uber-go/tally"
    30  	"go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
    31  	clientv3 "go.etcd.io/etcd/client/v3"
    32  	"go.uber.org/zap"
    33  )
    34  
    35  // NewWatchManager creates a new watch manager
    36  func NewWatchManager(opts Options) (WatchManager, error) {
    37  	if err := opts.Validate(); err != nil {
    38  		return nil, err
    39  	}
    40  
    41  	scope := opts.InstrumentsOptions().MetricsScope()
    42  	return &manager{
    43  		opts:   opts,
    44  		logger: opts.InstrumentsOptions().Logger(),
    45  		m: metrics{
    46  			etcdWatchCreate: scope.Counter("etcd-watch-create"),
    47  			etcdWatchError:  scope.Counter("etcd-watch-error"),
    48  			etcdWatchReset:  scope.Counter("etcd-watch-reset"),
    49  		},
    50  		updateFn:      opts.UpdateFn(),
    51  		tickAndStopFn: opts.TickAndStopFn(),
    52  	}, nil
    53  }
    54  
    55  type manager struct {
    56  	opts   Options
    57  	logger *zap.Logger
    58  	m      metrics
    59  
    60  	updateFn      UpdateFn
    61  	tickAndStopFn TickAndStopFn
    62  }
    63  
    64  type metrics struct {
    65  	etcdWatchCreate tally.Counter
    66  	etcdWatchError  tally.Counter
    67  	etcdWatchReset  tally.Counter
    68  }
    69  
    70  func (w *manager) watchChanWithTimeout(key string, rev int64) (clientv3.WatchChan, context.CancelFunc, error) {
    71  	doneCh := make(chan struct{})
    72  
    73  	ctx, cancelFn := context.WithCancel(clientv3.WithRequireLeader(context.Background()))
    74  
    75  	var (
    76  		watcher   = clientv3.NewWatcher(w.opts.Client())
    77  		watchChan clientv3.WatchChan
    78  	)
    79  	go func() {
    80  		wOpts := w.opts.WatchOptions()
    81  		if rev > 0 {
    82  			wOpts = append(wOpts, clientv3.WithRev(rev))
    83  		}
    84  
    85  		watchChan = watcher.Watch(
    86  			ctx,
    87  			key,
    88  			wOpts...,
    89  		)
    90  		close(doneCh)
    91  	}()
    92  
    93  	var (
    94  		timeout       = w.opts.WatchChanInitTimeout()
    95  		cancelWatchFn = func() {
    96  			// we *must* both cancel the context and call .Close() on watch to
    97  			// properly free resources, and not end up with weird issues due to stale
    98  			// grpc streams or bad internal etcd watch state.
    99  			cancelFn()
   100  			if err := watcher.Close(); err != nil {
   101  				// however, there's nothing we can do about an error on watch close,
   102  				// and it shouldn't happen in practice - unless we end up
   103  				// closing an already closed grpc stream or smth.
   104  				w.logger.Info("error closing watcher", zap.Error(err))
   105  			}
   106  		}
   107  	)
   108  
   109  	select {
   110  	case <-doneCh:
   111  		return watchChan, cancelWatchFn, nil
   112  	case <-time.After(timeout):
   113  		cancelWatchFn()
   114  		err := fmt.Errorf("etcd watch create timed out after %s for key: %s", timeout.String(), key)
   115  		return nil, cancelWatchFn, err
   116  	}
   117  }
   118  
   119  func (w *manager) Watch(key string) {
   120  	var (
   121  		ticker = time.NewTicker(w.opts.WatchChanCheckInterval())
   122  		logger = w.logger.With(zap.String("watch_key", key))
   123  		rnd    = rand.New(rand.NewSource(time.Now().UnixNano())) //nolint:gosec
   124  
   125  		revOverride          int64
   126  		firstUpdateSucceeded bool
   127  		watchChan            clientv3.WatchChan
   128  		cancelFn             context.CancelFunc
   129  		err                  error
   130  	)
   131  
   132  	defer ticker.Stop()
   133  
   134  	resetWatchWithSleep := func() {
   135  		w.m.etcdWatchReset.Inc(1)
   136  
   137  		cancelFn()
   138  		// set it to nil so it will be recreated
   139  		watchChan = nil
   140  		// avoid recreating watch channel too frequently
   141  		dur := w.opts.WatchChanResetInterval()
   142  		dur += time.Duration(rnd.Int63n(int64(dur)))
   143  		time.Sleep(dur)
   144  	}
   145  
   146  	for {
   147  		if watchChan == nil {
   148  			w.m.etcdWatchCreate.Inc(1)
   149  			logger.Info("creating etcd watch at revision", zap.Int64("revision", revOverride))
   150  			watchChan, cancelFn, err = w.watchChanWithTimeout(key, revOverride)
   151  			if err != nil {
   152  				logger.Error("could not create etcd watch", zap.Error(err))
   153  
   154  				// NB(cw) when we failed to create a etcd watch channel
   155  				// we do a get for now and will try to recreate the watch chan later
   156  				if !firstUpdateSucceeded {
   157  					if err = w.updateFn(key, nil); err != nil {
   158  						logger.Error("failed to get value for key", zap.Error(err))
   159  					} else {
   160  						// NB(vytenis): only try initializing once, otherwise there's
   161  						// get request amplification, especially for non-existent keys.
   162  						firstUpdateSucceeded = true
   163  					}
   164  				}
   165  				resetWatchWithSleep()
   166  				continue
   167  			}
   168  		}
   169  
   170  		select {
   171  		case r, ok := <-watchChan:
   172  			if !ok {
   173  				resetWatchWithSleep()
   174  				logger.Warn("etcd watch channel closed on key, recreating a watch channel")
   175  				continue
   176  			}
   177  
   178  			// handle the update
   179  			if err = r.Err(); err != nil {
   180  				logger.Error(
   181  					"received error on watch channel",
   182  					zap.Uint64("etcd_cluster_id", r.Header.ClusterId),
   183  					zap.Uint64("etcd_member_id", r.Header.MemberId),
   184  					zap.Bool("etcd_watch_is_canceled", r.Canceled),
   185  					zap.Error(err),
   186  				)
   187  				w.m.etcdWatchError.Inc(1)
   188  				if err == rpctypes.ErrCompacted {
   189  					revOverride = r.CompactRevision
   190  					logger.Warn("compacted; recreating watch at revision",
   191  						zap.Int64("revision", revOverride))
   192  				} else {
   193  					logger.Warn("recreating watch due to an error", zap.Error(err))
   194  				}
   195  
   196  				resetWatchWithSleep()
   197  				continue
   198  			} else if r.IsProgressNotify() {
   199  				if r.CompactRevision > revOverride {
   200  					// we only care about last event as this watchmanager implementation does not support
   201  					// watching key ranges, only single keys.
   202  					// set revOverride to minimum non-compacted revision if watch was
   203  					// initialized with an older rev., since we really don't care about history.
   204  					// this may help recover faster (one less retry) on connection loss/leader change
   205  					// around compaction, if we were watching on a revision that's already compacted.
   206  					revOverride = r.CompactRevision
   207  				}
   208  				// Do not call updateFn on ProgressNotify as it happens periodically with no update events
   209  				continue
   210  			}
   211  
   212  			if err = w.updateFn(key, r.Events); err != nil {
   213  				logger.Error("received notification for key, but failed to get value", zap.Error(err))
   214  			}
   215  		case <-ticker.C:
   216  			if w.tickAndStopFn(key) {
   217  				logger.Info("watch on key ended")
   218  				return
   219  			}
   220  		}
   221  	}
   222  }