go.etcd.io/etcd@v3.3.27+incompatible/mvcc/watchable_store.go

     1  // Copyright 2015 The etcd Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package mvcc
    16  
    17  import (
    18  	"sync"
    19  	"time"
    20  
    21  	"github.com/coreos/etcd/auth"
    22  	"github.com/coreos/etcd/lease"
    23  	"github.com/coreos/etcd/mvcc/backend"
    24  	"github.com/coreos/etcd/mvcc/mvccpb"
    25  )
    26  
    27  // non-const so modifiable by tests
    28  var (
    29  	// chanBufLen is the length of the buffered chan
    30  	// for sending out watched events.
    31  	// See https://github.com/etcd-io/etcd/issues/11906 for more detail.
    32  	chanBufLen = 128
    33  
    34  	// maxWatchersPerSync is the number of watchers to sync in a single batch
    35  	maxWatchersPerSync = 512
    36  )
    37  
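        // watchable is the backend interface a watchStream drives: registering a
        // watcher on a key or key range, requesting a progress notification for a
        // watcher, and reading the store's current revision.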
    38  type watchable interface {
    39  	watch(key, end []byte, startRev int64, id WatchID, ch chan<- WatchResponse, fcs ...FilterFunc) (*watcher, cancelFunc)
    40  	progress(w *watcher)
    41  	rev() int64
    42  }
    43  
    44  type watchableStore struct {
    45  	*store
    46  
    47  	// mu protects watcher groups and batches. It should never be locked
    48  	// before locking store.mu to avoid deadlock.
    49  	mu sync.RWMutex
    50  
    51  	// victims are watcher batches that were blocked on the watch channel
    52  	victims []watcherBatch
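        	// victimc is signaled whenever a new victim batch is added, waking
        	// syncVictimsLoop to retry delivery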
    53  	victimc chan struct{}
    54  
    55  	// contains all unsynced watchers that need to sync with events that have happened
    56  	unsynced watcherGroup
    57  
    58  	// contains all synced watchers that are in sync with the progress of the store.
    59  	// The key of the map is the key that the watcher watches on.
    60  	synced watcherGroup
    61  
    62  	stopc chan struct{}
    63  	wg    sync.WaitGroup
    64  }
    65  
    66  // cancelFunc updates unsynced and synced maps when running
    67  // cancel operations.
    68  type cancelFunc func()
    69  
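        // A rough usage sketch (hypothetical path and TTL; the backend and lessor
        // constructor signatures here are from the v3.3 tree and differ in other
        // etcd versions):
        //
        //	b := backend.NewDefaultBackend("/var/lib/etcd/member/snap/db")
        //	kv := New(b, lease.NewLessor(b, 60), nil, nil)
        //	defer kv.Close()
        //
        //	w := kv.NewWatchStream()
        //	defer w.Close()
        //	w.Watch([]byte("foo"), nil, 0)        // single key, from "now"
        //	w.Watch([]byte("a"), []byte("b"), 10) // range [a, b), from revision 10
        //	for resp := range w.Chan() {
        //		// resp.Events holds the watched mvccpb.Events
        //	}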
    70  func New(b backend.Backend, le lease.Lessor, as auth.AuthStore, ig ConsistentIndexGetter) ConsistentWatchableKV {
    71  	return newWatchableStore(b, le, as, ig)
    72  }
    73  
    74  func newWatchableStore(b backend.Backend, le lease.Lessor, as auth.AuthStore, ig ConsistentIndexGetter) *watchableStore {
    75  	s := &watchableStore{
    76  		store:    NewStore(b, le, ig),
    77  		victimc:  make(chan struct{}, 1),
    78  		unsynced: newWatcherGroup(),
    79  		synced:   newWatcherGroup(),
    80  		stopc:    make(chan struct{}),
    81  	}
    82  	s.store.ReadView = &readView{s}
    83  	s.store.WriteView = &writeView{s}
    84  	if s.le != nil {
    85  		// use this store as the deleter so revokes trigger watch events
    86  		s.le.SetRangeDeleter(func() lease.TxnDelete { return s.Write() })
    87  	}
    88  	if as != nil {
    89  		// TODO: encapsulate consistentindex into a separate package
    90  		as.SetConsistentIndexSyncer(s.store.saveIndex)
    91  	}
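        	// start the two background goroutines: one gradually moves unsynced
        	// watchers toward synced, the other retries delivery to victim
        	// (blocked) watchers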
    92  	s.wg.Add(2)
    93  	go s.syncWatchersLoop()
    94  	go s.syncVictimsLoop()
    95  	return s
    96  }
    97  
    98  func (s *watchableStore) Close() error {
    99  	close(s.stopc)
   100  	s.wg.Wait()
   101  	return s.store.Close()
   102  }
   103  
   104  func (s *watchableStore) NewWatchStream() WatchStream {
   105  	watchStreamGauge.Inc()
   106  	return &watchStream{
   107  		watchable: s,
   108  		ch:        make(chan WatchResponse, chanBufLen),
   109  		cancels:   make(map[WatchID]cancelFunc),
   110  		watchers:  make(map[WatchID]*watcher),
   111  	}
   112  }
   113  
   114  func (s *watchableStore) watch(key, end []byte, startRev int64, id WatchID, ch chan<- WatchResponse, fcs ...FilterFunc) (*watcher, cancelFunc) {
   115  	wa := &watcher{
   116  		key:    key,
   117  		end:    end,
   118  		minRev: startRev,
   119  		id:     id,
   120  		ch:     ch,
   121  		fcs:    fcs,
   122  	}
   123  
   124  	s.mu.Lock()
   125  	s.revMu.RLock()
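        	// a watcher only needs future events (and is therefore synced) when it
        	// asks for a revision beyond the current one or for "now" (startRev == 0);
        	// otherwise it must first catch up on past events and starts out unsynced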
   126  	synced := startRev > s.store.currentRev || startRev == 0
   127  	if synced {
   128  		wa.minRev = s.store.currentRev + 1
   129  		if startRev > wa.minRev {
   130  			wa.minRev = startRev
   131  		}
   132  	}
   133  	if synced {
   134  		s.synced.add(wa)
   135  	} else {
   136  		slowWatcherGauge.Inc()
   137  		s.unsynced.add(wa)
   138  	}
   139  	s.revMu.RUnlock()
   140  	s.mu.Unlock()
   141  
   142  	watcherGauge.Inc()
   143  
   144  	return wa, func() { s.cancelWatcher(wa) }
   145  }
   146  
   147  // cancelWatcher removes references to the watcher from the watchableStore.
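        // The watcher may be in the unsynced group, the synced group, or a victim
        // batch; if moveVictims is processing its batch at this moment, the watcher
        // is temporarily in none of them, so removal is retried until it reappears.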
   148  func (s *watchableStore) cancelWatcher(wa *watcher) {
   149  	for {
   150  		s.mu.Lock()
   151  		if s.unsynced.delete(wa) {
   152  			slowWatcherGauge.Dec()
   153  			break
   154  		} else if s.synced.delete(wa) {
   155  			break
   156  		} else if wa.compacted {
   157  			break
   158  		} else if wa.ch == nil {
   159  			// already canceled (e.g., cancel/close race)
   160  			break
   161  		}
   162  
   163  		if !wa.victim {
   164  			panic("watcher not victim but not in watch groups")
   165  		}
   166  
   167  		var victimBatch watcherBatch
   168  		for _, wb := range s.victims {
   169  			if wb[wa] != nil {
   170  				victimBatch = wb
   171  				break
   172  			}
   173  		}
   174  		if victimBatch != nil {
   175  			slowWatcherGauge.Dec()
   176  			delete(victimBatch, wa)
   177  			break
   178  		}
   179  
   180  		// victim being processed so not accessible; retry
   181  		s.mu.Unlock()
   182  		time.Sleep(time.Millisecond)
   183  	}
   184  
   185  	watcherGauge.Dec()
   186  	wa.ch = nil
   187  	s.mu.Unlock()
   188  }
   189  
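        // Restore rebuilds the underlying store from the given backend and moves
        // every synced watcher into the unsynced group, so each watcher re-syncs
        // against the restored data before being considered synced again.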
   190  func (s *watchableStore) Restore(b backend.Backend) error {
   191  	s.mu.Lock()
   192  	defer s.mu.Unlock()
   193  	err := s.store.Restore(b)
   194  	if err != nil {
   195  		return err
   196  	}
   197  
   198  	for wa := range s.synced.watchers {
   199  		wa.restore = true
   200  		s.unsynced.add(wa)
   201  	}
   202  	s.synced = newWatcherGroup()
   203  	return nil
   204  }
   205  
   206  // syncWatchersLoop syncs the watchers in the unsynced map every 100ms.
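        // If a pass made progress but left watchers unsynced, the next pass waits
        // roughly as long as the previous one took instead of the fixed 100ms,
        // yielding a comparable share of time to other store operations.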
   207  func (s *watchableStore) syncWatchersLoop() {
   208  	defer s.wg.Done()
   209  
   210  	for {
   211  		s.mu.RLock()
   212  		st := time.Now()
   213  		lastUnsyncedWatchers := s.unsynced.size()
   214  		s.mu.RUnlock()
   215  
   216  		unsyncedWatchers := 0
   217  		if lastUnsyncedWatchers > 0 {
   218  			unsyncedWatchers = s.syncWatchers()
   219  		}
   220  		syncDuration := time.Since(st)
   221  
   222  		waitDuration := 100 * time.Millisecond
   223  		// more work pending?
   224  		if unsyncedWatchers != 0 && lastUnsyncedWatchers > unsyncedWatchers {
   225  			// be fair to other store operations by yielding time taken
   226  			waitDuration = syncDuration
   227  		}
   228  
   229  		select {
   230  		case <-time.After(waitDuration):
   231  		case <-s.stopc:
   232  			return
   233  		}
   234  	}
   235  }
   236  
   237  // syncVictimsLoop tries to write precomputed watcher responses to
   238  // watchers that had a blocked watcher channel
   239  func (s *watchableStore) syncVictimsLoop() {
   240  	defer s.wg.Done()
   241  
   242  	for {
   243  		for s.moveVictims() != 0 {
   244  			// try to update all victim watchers
   245  		}
   246  		s.mu.RLock()
   247  		isEmpty := len(s.victims) == 0
   248  		s.mu.RUnlock()
   249  
   250  		var tickc <-chan time.Time
   251  		if !isEmpty {
   252  			tickc = time.After(10 * time.Millisecond)
   253  		}
   254  
   255  		select {
   256  		case <-tickc:
   257  		case <-s.victimc:
   258  		case <-s.stopc:
   259  			return
   260  		}
   261  	}
   262  }
   263  
   264  // moveVictims tries to update watches with already pending event data
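        // Victim batches are detached from s.victims, each pending response is
        // retried, and watchers whose responses were delivered move back into the
        // unsynced or synced group depending on how far they have caught up; the
        // rest are re-queued as a fresh victim batch.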
   265  func (s *watchableStore) moveVictims() (moved int) {
   266  	s.mu.Lock()
   267  	victims := s.victims
   268  	s.victims = nil
   269  	s.mu.Unlock()
   270  
   271  	var newVictim watcherBatch
   272  	for _, wb := range victims {
   273  		// try to send responses again
   274  		for w, eb := range wb {
   275  			// watcher has observed the store up to, but not including, w.minRev
   276  			rev := w.minRev - 1
   277  			if w.send(WatchResponse{WatchID: w.id, Events: eb.evs, Revision: rev}) {
   278  				pendingEventsGauge.Add(float64(len(eb.evs)))
   279  			} else {
   280  				if newVictim == nil {
   281  					newVictim = make(watcherBatch)
   282  				}
   283  				newVictim[w] = eb
   284  				continue
   285  			}
   286  			moved++
   287  		}
   288  
   289  		// assign completed victim watchers to unsynced/synced
   290  		s.mu.Lock()
   291  		s.store.revMu.RLock()
   292  		curRev := s.store.currentRev
   293  		for w, eb := range wb {
   294  			if newVictim != nil && newVictim[w] != nil {
   295  				// couldn't send watch response; stays victim
   296  				continue
   297  			}
   298  			w.victim = false
   299  			if eb.moreRev != 0 {
   300  				w.minRev = eb.moreRev
   301  			}
   302  			if w.minRev <= curRev {
   303  				s.unsynced.add(w)
   304  			} else {
   305  				slowWatcherGauge.Dec()
   306  				s.synced.add(w)
   307  			}
   308  		}
   309  		s.store.revMu.RUnlock()
   310  		s.mu.Unlock()
   311  	}
   312  
   313  	if len(newVictim) > 0 {
   314  		s.mu.Lock()
   315  		s.victims = append(s.victims, newVictim)
   316  		s.mu.Unlock()
   317  	}
   318  
   319  	return moved
   320  }
   321  
   322  // syncWatchers syncs unsynced watchers by:
   323  //	1. choose a set of watchers from the unsynced watcher group
   324  //	2. iterate over the set to get the minimum revision and remove compacted watchers
   325  //	3. use minimum revision to get all key-value pairs and send those events to watchers
   326  //	4. remove synced watchers in set from unsynced group and move to synced group
   327  func (s *watchableStore) syncWatchers() int {
   328  	s.mu.Lock()
   329  	defer s.mu.Unlock()
   330  
   331  	if s.unsynced.size() == 0 {
   332  		return 0
   333  	}
   334  
   335  	s.store.revMu.RLock()
   336  	defer s.store.revMu.RUnlock()
   337  
   338  	// to find the key-value pairs the unsynced watchers still need, we first
   339  	// find the minimum revision among them; that revision and the current
   340  	// revision bound the range used to query the backend store
   341  	curRev := s.store.currentRev
   342  	compactionRev := s.store.compactMainRev
   343  
   344  	wg, minRev := s.unsynced.choose(maxWatchersPerSync, curRev, compactionRev)
   345  	minBytes, maxBytes := newRevBytes(), newRevBytes()
   346  	revToBytes(revision{main: minRev}, minBytes)
   347  	revToBytes(revision{main: curRev + 1}, maxBytes)
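        	// the backend range below is end-exclusive, so it covers the revision
        	// keys for [minRev, curRev]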
   348  
   349  	// UnsafeRange returns keys and values. In boltdb, the keys are revisions
   350  	// and the values are the actual key-value pairs stored in the backend.
   351  	tx := s.store.b.ReadTx()
   352  	tx.Lock()
   353  	revs, vs := tx.UnsafeRange(keyBucketName, minBytes, maxBytes, 0)
   354  	evs := kvsToEvents(wg, revs, vs)
   355  	tx.Unlock()
   356  
   357  	var victims watcherBatch
   358  	wb := newWatcherBatch(wg, evs)
   359  	for w := range wg.watchers {
   360  		w.minRev = curRev + 1
   361  
   362  		eb, ok := wb[w]
   363  		if !ok {
   364  			// bring un-notified watcher to synced
   365  			s.synced.add(w)
   366  			s.unsynced.delete(w)
   367  			continue
   368  		}
   369  
   370  		if eb.moreRev != 0 {
   371  			w.minRev = eb.moreRev
   372  		}
   373  
   374  		if w.send(WatchResponse{WatchID: w.id, Events: eb.evs, Revision: curRev}) {
   375  			pendingEventsGauge.Add(float64(len(eb.evs)))
   376  		} else {
   377  			if victims == nil {
   378  				victims = make(watcherBatch)
   379  			}
   380  			w.victim = true
   381  		}
   382  
   383  		if w.victim {
   384  			victims[w] = eb
   385  		} else {
   386  			if eb.moreRev != 0 {
   387  				// stay unsynced; more to read
   388  				continue
   389  			}
   390  			s.synced.add(w)
   391  		}
   392  		s.unsynced.delete(w)
   393  	}
   394  	s.addVictim(victims)
   395  
   396  	vsz := 0
   397  	for _, v := range s.victims {
   398  		vsz += len(v)
   399  	}
   400  	slowWatcherGauge.Set(float64(s.unsynced.size() + vsz))
   401  
   402  	return s.unsynced.size()
   403  }
   404  
   405  // kvsToEvents gets all events for the watchers from all key-value pairs
   406  func kvsToEvents(wg *watcherGroup, revs, vals [][]byte) (evs []mvccpb.Event) {
   407  	for i, v := range vals {
   408  		var kv mvccpb.KeyValue
   409  		if err := kv.Unmarshal(v); err != nil {
   410  			plog.Panicf("cannot unmarshal event: %v", err)
   411  		}
   412  
   413  		if !wg.contains(string(kv.Key)) {
   414  			continue
   415  		}
   416  
   417  		ty := mvccpb.PUT
   418  		if isTombstone(revs[i]) {
   419  			ty = mvccpb.DELETE
   420  			// patch in mod revision so watchers won't skip
   421  			kv.ModRevision = bytesToRev(revs[i]).main
   422  		}
   423  		evs = append(evs, mvccpb.Event{Kv: &kv, Type: ty})
   424  	}
   425  	return evs
   426  }
   427  
   428  // notify delivers the given events, which happened at the given rev, to the
   429  // synced watchers that watch on the keys of those events.
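        // notify never blocks: a watcher whose channel is already full is marked a
        // victim and handed to syncVictimsLoop for later delivery.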
   430  func (s *watchableStore) notify(rev int64, evs []mvccpb.Event) {
   431  	var victim watcherBatch
   432  	for w, eb := range newWatcherBatch(&s.synced, evs) {
   433  		if eb.revs != 1 {
   434  			plog.Panicf("unexpected multiple revisions in notification")
   435  		}
   436  		if w.send(WatchResponse{WatchID: w.id, Events: eb.evs, Revision: rev}) {
   437  			pendingEventsGauge.Add(float64(len(eb.evs)))
   438  		} else {
   439  			// move slow watcher to victims
   440  			w.minRev = rev + 1
   441  			if victim == nil {
   442  				victim = make(watcherBatch)
   443  			}
   444  			w.victim = true
   445  			victim[w] = eb
   446  			s.synced.delete(w)
   447  			slowWatcherGauge.Inc()
   448  		}
   449  	}
   450  	s.addVictim(victim)
   451  }
   452  
   453  func (s *watchableStore) addVictim(victim watcherBatch) {
   454  	if victim == nil {
   455  		return
   456  	}
   457  	s.victims = append(s.victims, victim)
   458  	select {
   459  	case s.victimc <- struct{}{}:
   460  	default:
   461  	}
   462  }
   463  
   464  func (s *watchableStore) rev() int64 { return s.store.Rev() }
   465  
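        // progress sends an empty WatchResponse carrying the current revision, but
        // only to a watcher that is synced; an unsynced watcher has not yet been
        // sent all events up to that revision, so no progress is reported for it.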
   466  func (s *watchableStore) progress(w *watcher) {
   467  	s.mu.RLock()
   468  	defer s.mu.RUnlock()
   469  
   470  	if _, ok := s.synced.watchers[w]; ok {
   471  		w.send(WatchResponse{WatchID: w.id, Revision: s.rev()})
   472  		// If the ch is full, this watcher is receiving events.
   473  		// We do not need to send progress at all.
   474  	}
   475  }
   476  
   477  type watcher struct {
   478  	// the watcher key
   479  	key []byte
   480  	// end indicates the end of the range to watch.
   481  	// If end is set, the watcher is on a range.
   482  	end []byte
   483  
   484  	// victim is set when ch is blocked and undergoing victim processing
   485  	victim bool
   486  
   487  	// compacted is set when the watcher is removed because of compaction
   488  	compacted bool
   489  
   490  	// restore is true when the watcher is being restored from a leader snapshot,
   491  	// which means that this watcher has just been moved from the "synced" to the
   492  	// "unsynced" watcher group, possibly with a future revision assigned when it
   493  	// was first added to the synced watcher group.
   494  	// An "unsynced" watcher's revision must always be <= the current revision,
   495  	// except when the watcher was just moved from the "synced" watcher group.
   496  	restore bool
   497  
   498  	// minRev is the minimum revision update the watcher will accept
   499  	minRev int64
   500  	id     WatchID
   501  
   502  	fcs []FilterFunc
   503  	// a chan to send out the watch response.
   504  	// The chan might be shared with other watchers.
   505  	ch chan<- WatchResponse
   506  }
   507  
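        // send filters wr.Events through the watcher's FilterFuncs and then does a
        // non-blocking send on the watch channel; it reports false when the channel
        // is full, and true if the response was sent or fully filtered out.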
   508  func (w *watcher) send(wr WatchResponse) bool {
   509  	progressEvent := len(wr.Events) == 0
   510  
   511  	if len(w.fcs) != 0 {
   512  		ne := make([]mvccpb.Event, 0, len(wr.Events))
   513  		for i := range wr.Events {
   514  			filtered := false
   515  			for _, filter := range w.fcs {
   516  				if filter(wr.Events[i]) {
   517  					filtered = true
   518  					break
   519  				}
   520  			}
   521  			if !filtered {
   522  				ne = append(ne, wr.Events[i])
   523  			}
   524  		}
   525  		wr.Events = ne
   526  	}
   527  
   528  	// if all events are filtered out, we should send nothing.
   529  	if !progressEvent && len(wr.Events) == 0 {
   530  		return true
   531  	}
   532  	select {
   533  	case w.ch <- wr:
   534  		return true
   535  	default:
   536  		return false
   537  	}
   538  }