vitess.io/vitess@v0.16.2/go/vt/discovery/keyspace_events.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package discovery
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sync"
    23  
    24  	"google.golang.org/protobuf/proto"
    25  
    26  	"vitess.io/vitess/go/vt/log"
    27  	"vitess.io/vitess/go/vt/proto/query"
    28  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    29  	"vitess.io/vitess/go/vt/srvtopo"
    30  	"vitess.io/vitess/go/vt/topo"
    31  	"vitess.io/vitess/go/vt/topo/topoproto"
    32  )
    33  
// KeyspaceEventWatcher is an auxiliary watcher that watches all availability incidents
// for all keyspaces in a Vitess cell and notifies listeners when the events have been resolved.
// Right now this is capable of detecting the end of failovers, both planned and unplanned,
// and the end of resharding operations.
//
// The KeyspaceEventWatcher works by consolidating TabletHealth events from a HealthCheck stream,
// which is a peer-to-peer check between nodes via GRPC, with events from a Topology Server, which
// are global to the cluster and stored in an external system like etcd.
type KeyspaceEventWatcher struct {
	// ts is the topology server used to list the keyspaces of the local cell and to
	// watch their SrvKeyspace records
	ts        srvtopo.Server
	// hc is the HealthCheck whose TabletHealth stream feeds this watcher
	hc        HealthCheck
	// localCell is the name of the cell whose keyspaces are being watched
	localCell string

	// mu protects keyspaces
	mu        sync.Mutex
	// keyspaces maps a keyspace name to the state tracked for that keyspace
	keyspaces map[string]*keyspaceState

	// subsMu protects subs
	subsMu sync.Mutex
	// subs is the set of channels handed out by Subscribe and not yet removed by Unsubscribe
	subs   map[chan *KeyspaceEvent]struct{}
}
    53  
// KeyspaceEvent is yielded to all watchers when an availability event for a keyspace has been resolved.
// Events are delivered through the channels returned by KeyspaceEventWatcher.Subscribe.
type KeyspaceEvent struct {
	// Cell is the cell where the keyspace lives; it is the local cell of the watcher
	// that emitted the event
	Cell string

	// Keyspace is the name of the keyspace which was (partially) unavailable and is now fully healthy
	Keyspace string

	// Shards is a list of all the shards in the keyspace, including their state after the event is resolved
	Shards []ShardEvent
}
    65  
// ShardEvent describes the state of a single shard as recorded by the watcher at the
// moment a KeyspaceEvent is emitted for the keyspace the shard belongs to.
type ShardEvent struct {
	// Tablet is the alias of the tablet last recorded as the primary of the shard;
	// it is nil if no health event with a primary term start time was seen for the shard
	Tablet  *topodatapb.TabletAlias
	// Target is the target taken from the health event that first made the shard known
	Target  *query.Target
	// Serving is the last serving state reported for the primary of the shard
	Serving bool
}
    71  
    72  // NewKeyspaceEventWatcher returns a new watcher for all keyspace events in the given cell.
    73  // It requires access to a topology server, and an existing HealthCheck implementation which
    74  // will be used to detect unhealthy nodes.
    75  func NewKeyspaceEventWatcher(ctx context.Context, topoServer srvtopo.Server, hc HealthCheck, localCell string) *KeyspaceEventWatcher {
    76  	kew := &KeyspaceEventWatcher{
    77  		hc:        hc,
    78  		ts:        topoServer,
    79  		localCell: localCell,
    80  		keyspaces: make(map[string]*keyspaceState),
    81  		subs:      make(map[chan *KeyspaceEvent]struct{}),
    82  	}
    83  	kew.run(ctx)
    84  	log.Infof("started watching keyspace events in %q", localCell)
    85  	return kew
    86  }
    87  
// keyspaceState is the internal state for all the keyspaces that the KEW is
// currently watching
type keyspaceState struct {
	// kew is the watcher that owns this state; resolution events are broadcast through it
	kew      *KeyspaceEventWatcher
	// keyspace is the name of the keyspace being tracked
	keyspace string

	// mu is held by the methods of keyspaceState while they read or update the fields below
	mu         sync.Mutex
	// deleted is set once the topology watcher reports NoNode for this keyspace
	deleted    bool
	// consistent is false while an availability event is ongoing; it is set back to true
	// by ensureConsistentLocked once health check and topology data agree again
	consistent bool

	// lastError is the last error, other than NoNode, reported by the topology watcher
	lastError    error
	// lastKeyspace is the most recent SrvKeyspace received from the topology watcher
	lastKeyspace *topodatapb.SrvKeyspace
	// shards holds the state of the primary of each known shard, keyed by shard name
	shards       map[string]*shardState
}
   102  
   103  // Format prints the internal state for this keyspace for debug purposes
   104  func (kss *keyspaceState) Format(f fmt.State, verb rune) {
   105  	kss.mu.Lock()
   106  	defer kss.mu.Unlock()
   107  
   108  	fmt.Fprintf(f, "Keyspace(%s) = deleted: %v, consistent: %v, shards: [\n", kss.keyspace, kss.deleted, kss.consistent)
   109  	for shard, ss := range kss.shards {
   110  		fmt.Fprintf(f, "  Shard(%s) = target: [%s/%s %v], serving: %v, externally_reparented: %d, current_primary: %s\n",
   111  			shard,
   112  			ss.target.Keyspace, ss.target.Shard, ss.target.TabletType,
   113  			ss.serving, ss.externallyReparented,
   114  			ss.currentPrimary.String(),
   115  		)
   116  	}
   117  	fmt.Fprintf(f, "]\n")
   118  }
   119  
   120  // beingResharded returns whether this keyspace is thought to be in the middle of a resharding
   121  // operation. currentShard is the name of the shard that belongs to this keyspace and which
   122  // we are trying to access. currentShard can _only_ be a primary shard.
   123  func (kss *keyspaceState) beingResharded(currentShard string) bool {
   124  	kss.mu.Lock()
   125  	defer kss.mu.Unlock()
   126  
   127  	// if the keyspace is gone, or if it has no known availability events, the keyspace
   128  	// cannot be in the middle of a resharding operation
   129  	if kss.deleted || kss.consistent {
   130  		return false
   131  	}
   132  
   133  	// for all the known shards, try to find a primary shard besides the one we're trying to access
   134  	// and which is currently healthy. if there are other healthy primaries in the keyspace, it means
   135  	// we're in the middle of a resharding operation
   136  	for shard, sstate := range kss.shards {
   137  		if shard != currentShard && sstate.serving {
   138  			return true
   139  		}
   140  	}
   141  
   142  	return false
   143  }
   144  
// shardState is the state tracked for the primary of a single shard, built from the
// TabletHealth events received for that shard
type shardState struct {
	// target is the target of the health event that created this entry
	target               *query.Target
	// serving is the last serving state reported for the primary
	serving              bool
	// externallyReparented is the highest PrimaryTermStartTime seen so far for this shard;
	// zero means no health event with a primary term start time has been seen yet
	externallyReparented int64
	// currentPrimary is the alias of the tablet that reported externallyReparented
	currentPrimary       *topodatapb.TabletAlias
}
   151  
   152  // Subscribe returns a channel that will receive any KeyspaceEvents for all keyspaces in the current cell
   153  func (kew *KeyspaceEventWatcher) Subscribe() chan *KeyspaceEvent {
   154  	kew.subsMu.Lock()
   155  	defer kew.subsMu.Unlock()
   156  	c := make(chan *KeyspaceEvent, 2)
   157  	kew.subs[c] = struct{}{}
   158  	return c
   159  }
   160  
   161  // Unsubscribe removes a listener previously returned from Subscribe
   162  func (kew *KeyspaceEventWatcher) Unsubscribe(c chan *KeyspaceEvent) {
   163  	kew.subsMu.Lock()
   164  	defer kew.subsMu.Unlock()
   165  	delete(kew.subs, c)
   166  }
   167  
   168  func (kew *KeyspaceEventWatcher) broadcast(th *KeyspaceEvent) {
   169  	kew.subsMu.Lock()
   170  	defer kew.subsMu.Unlock()
   171  	for c := range kew.subs {
   172  		select {
   173  		case c <- th:
   174  		default:
   175  		}
   176  	}
   177  }
   178  
   179  func (kew *KeyspaceEventWatcher) run(ctx context.Context) {
   180  	hcChan := kew.hc.Subscribe()
   181  	bufferCtx, bufferCancel := context.WithCancel(ctx)
   182  
   183  	go func() {
   184  		defer bufferCancel()
   185  
   186  		for {
   187  			select {
   188  			case <-bufferCtx.Done():
   189  				return
   190  			case result := <-hcChan:
   191  				if result == nil {
   192  					return
   193  				}
   194  				kew.processHealthCheck(result)
   195  			}
   196  		}
   197  	}()
   198  
   199  	go func() {
   200  		// Seed the keyspace statuses once at startup
   201  		keyspaces, err := kew.ts.GetSrvKeyspaceNames(ctx, kew.localCell, true)
   202  		if err != nil {
   203  			log.Errorf("CEM: initialize failed for cell %q: %v", kew.localCell, err)
   204  			return
   205  		}
   206  		for _, ks := range keyspaces {
   207  			kew.getKeyspaceStatus(ks)
   208  		}
   209  	}()
   210  }
   211  
   212  // ensureConsistentLocked checks if the current keyspace has recovered from an availability
   213  // event, and if so, returns information about the availability event to all subscribers
   214  func (kss *keyspaceState) ensureConsistentLocked() {
   215  	// if this keyspace is consistent, there's no ongoing availability event
   216  	if kss.consistent {
   217  		return
   218  	}
   219  
   220  	// get the topology metadata for our primary from `lastKeyspace`; this value is refreshed
   221  	// from our topology watcher whenever a change is detected, so it should always be up to date
   222  	primary := topoproto.SrvKeyspaceGetPartition(kss.lastKeyspace, topodatapb.TabletType_PRIMARY)
   223  
   224  	// if there's no primary, the keyspace is unhealthy;
   225  	// if there are ShardTabletControls active, the keyspace is undergoing a topology change;
   226  	// either way, the availability event is still ongoing
   227  	if primary == nil || len(primary.ShardTabletControls) > 0 {
   228  		return
   229  	}
   230  
   231  	activeShardsInPartition := make(map[string]bool)
   232  
   233  	// iterate through all the primary shards that the topology server knows about;
   234  	// for each shard, if our HealthCheck stream hasn't found the shard yet, or
   235  	// if the HealthCheck stream still thinks the shard is unhealthy, this
   236  	// means the availability event is still ongoing
   237  	for _, shard := range primary.ShardReferences {
   238  		sstate := kss.shards[shard.Name]
   239  		if sstate == nil || !sstate.serving {
   240  			return
   241  		}
   242  		activeShardsInPartition[shard.Name] = true
   243  	}
   244  
   245  	// iterate through all the shards as seen by our HealthCheck stream. if there are any
   246  	// shards that HealthCheck thinks are healthy, and they haven't been seen by the topology
   247  	// watcher, it means the keyspace is not fully consistent yet
   248  	for shard, sstate := range kss.shards {
   249  		if sstate.serving && !activeShardsInPartition[shard] {
   250  			return
   251  		}
   252  	}
   253  
   254  	// we haven't found any inconsistencies between the HealthCheck stream and the topology
   255  	// watcher. this means the ongoing availability event has been resolved, so we can broadcast
   256  	// a resolution event to all listeners
   257  	kss.consistent = true
   258  
   259  	ksevent := &KeyspaceEvent{
   260  		Cell:     kss.kew.localCell,
   261  		Keyspace: kss.keyspace,
   262  		Shards:   make([]ShardEvent, 0, len(kss.shards)),
   263  	}
   264  
   265  	for shard, sstate := range kss.shards {
   266  		ksevent.Shards = append(ksevent.Shards, ShardEvent{
   267  			Tablet:  sstate.currentPrimary,
   268  			Target:  sstate.target,
   269  			Serving: sstate.serving,
   270  		})
   271  
   272  		log.Infof("keyspace event resolved: %s/%s is now consistent (serving: %v)",
   273  			sstate.target.Keyspace, sstate.target.Keyspace,
   274  			sstate.serving,
   275  		)
   276  
   277  		if !sstate.serving {
   278  			delete(kss.shards, shard)
   279  		}
   280  	}
   281  
   282  	kss.kew.broadcast(ksevent)
   283  }
   284  
   285  // onHealthCheck is the callback that updates this keyspace with event data from the HealthCheck stream.
   286  // the HealthCheck stream applies to all the keyspaces in the cluster and emits TabletHealth events to our
   287  // parent KeyspaceWatcher, which will mux them into their corresponding keyspaceState
   288  func (kss *keyspaceState) onHealthCheck(th *TabletHealth) {
   289  	// we only care about health events on the primary
   290  	if th.Target.TabletType != topodatapb.TabletType_PRIMARY {
   291  		return
   292  	}
   293  
   294  	kss.mu.Lock()
   295  	defer kss.mu.Unlock()
   296  
   297  	sstate := kss.shards[th.Target.Shard]
   298  
   299  	// if we've never seen this shard before, we need to allocate a shardState for it, unless
   300  	// we've received a _not serving_ shard event for a shard which we don't know about yet,
   301  	// in which case we don't need to keep track of it. we'll start tracking it if/when the
   302  	// shard becomes healthy again
   303  	if sstate == nil {
   304  		if !th.Serving {
   305  			return
   306  		}
   307  
   308  		sstate = &shardState{target: th.Target}
   309  		kss.shards[th.Target.Shard] = sstate
   310  	}
   311  
   312  	// if the shard went from serving to not serving, or the other way around, the keyspace
   313  	// is undergoing an availability event
   314  	if sstate.serving != th.Serving {
   315  		sstate.serving = th.Serving
   316  		kss.consistent = false
   317  	}
   318  
   319  	// if the primary for this shard has been externally reparented, we're undergoing a failover,
   320  	// which is considered an availability event. update this shard to point it to the new tablet
   321  	// that acts as primary now
   322  	if th.PrimaryTermStartTime != 0 && th.PrimaryTermStartTime > sstate.externallyReparented {
   323  		sstate.externallyReparented = th.PrimaryTermStartTime
   324  		sstate.currentPrimary = th.Tablet.Alias
   325  		kss.consistent = false
   326  	}
   327  
   328  	kss.ensureConsistentLocked()
   329  }
   330  
   331  // onSrvKeyspace is the callback that updates this keyspace with fresh topology data from our topology server.
   332  // this callback is called from a Watcher in the topo server whenever a change to the topology for this keyspace
   333  // occurs. this watcher is dedicated to this keyspace, and will only yield topology metadata changes for as
   334  // long as we're interested on this keyspace.
   335  func (kss *keyspaceState) onSrvKeyspace(newKeyspace *topodatapb.SrvKeyspace, newError error) bool {
   336  	kss.mu.Lock()
   337  	defer kss.mu.Unlock()
   338  
   339  	// if the topology watcher has seen a NoNode while watching this keyspace, it means the keyspace
   340  	// has been deleted from the cluster. we mark it for eventual cleanup here, as we no longer need
   341  	// to keep watching for events in this keyspace.
   342  	if topo.IsErrType(newError, topo.NoNode) {
   343  		kss.deleted = true
   344  		log.Infof("keyspace %q deleted", kss.keyspace)
   345  		return false
   346  	}
   347  
   348  	// if there's another kind of error while watching this keyspace, we assume it's temporary and related
   349  	// to the topology server, not to the keyspace itself. we'll keep waiting for more topology events.
   350  	if newError != nil {
   351  		kss.lastError = newError
   352  		log.Errorf("error while watching keyspace %q: %v", kss.keyspace, newError)
   353  		return true
   354  	}
   355  
   356  	// if the topology metadata for our keyspace is identical to the last one we saw there's nothing to do
   357  	// here. this is a side-effect of the way ETCD watchers work.
   358  	if proto.Equal(kss.lastKeyspace, newKeyspace) {
   359  		// no changes
   360  		return true
   361  	}
   362  
   363  	// we only mark this keyspace as inconsistent if there has been a topology change in the PRIMARY for
   364  	// this keyspace, but we store the topology metadata for both primary and replicas for future-proofing.
   365  	var oldPrimary, newPrimary *topodatapb.SrvKeyspace_KeyspacePartition
   366  	if kss.lastKeyspace != nil {
   367  		oldPrimary = topoproto.SrvKeyspaceGetPartition(kss.lastKeyspace, topodatapb.TabletType_PRIMARY)
   368  	}
   369  	if newKeyspace != nil {
   370  		newPrimary = topoproto.SrvKeyspaceGetPartition(newKeyspace, topodatapb.TabletType_PRIMARY)
   371  	}
   372  	if !proto.Equal(oldPrimary, newPrimary) {
   373  		kss.consistent = false
   374  	}
   375  
   376  	kss.lastKeyspace = newKeyspace
   377  	kss.ensureConsistentLocked()
   378  	return true
   379  }
   380  
   381  // newKeyspaceState allocates the internal state required to keep track of availability incidents
   382  // in this keyspace, and starts up a SrvKeyspace watcher on our topology server which will update
   383  // our keyspaceState with any topology changes in real time.
   384  func newKeyspaceState(kew *KeyspaceEventWatcher, cell, keyspace string) *keyspaceState {
   385  	log.Infof("created dedicated watcher for keyspace %s/%s", cell, keyspace)
   386  	kss := &keyspaceState{
   387  		kew:      kew,
   388  		keyspace: keyspace,
   389  		shards:   make(map[string]*shardState),
   390  	}
   391  	kew.ts.WatchSrvKeyspace(context.Background(), cell, keyspace, kss.onSrvKeyspace)
   392  	return kss
   393  }
   394  
   395  // processHealthCheck is the callback that is called by the global HealthCheck stream that was initiated
   396  // by this KeyspaceEventWatcher. it redirects the TabletHealth event to the corresponding keyspaceState
   397  func (kew *KeyspaceEventWatcher) processHealthCheck(th *TabletHealth) {
   398  	kss := kew.getKeyspaceStatus(th.Target.Keyspace)
   399  	if kss == nil {
   400  		return
   401  	}
   402  
   403  	kss.onHealthCheck(th)
   404  }
   405  
   406  // getKeyspaceStatus returns the keyspaceState object for the corresponding keyspace, allocating it
   407  // if we've never seen the keyspace before.
   408  func (kew *KeyspaceEventWatcher) getKeyspaceStatus(keyspace string) *keyspaceState {
   409  	kew.mu.Lock()
   410  	defer kew.mu.Unlock()
   411  
   412  	kss := kew.keyspaces[keyspace]
   413  	if kss == nil {
   414  		kss = newKeyspaceState(kew, kew.localCell, keyspace)
   415  		kew.keyspaces[keyspace] = kss
   416  	}
   417  	if kss.deleted {
   418  		kss = nil
   419  		delete(kew.keyspaces, keyspace)
   420  	}
   421  	return kss
   422  }
   423  
   424  // TargetIsBeingResharded checks if the reason why the given target is not accessible right now
   425  // is because the keyspace where it resides is (potentially) undergoing a resharding operation.
   426  // This is not a fully accurate heuristic, but it's good enough that we'd want to buffer the
   427  // request for the given target under the assumption that the reason why it cannot be completed
   428  // right now is transitory.
   429  func (kew *KeyspaceEventWatcher) TargetIsBeingResharded(target *query.Target) bool {
   430  	if target.TabletType != topodatapb.TabletType_PRIMARY {
   431  		return false
   432  	}
   433  	ks := kew.getKeyspaceStatus(target.Keyspace)
   434  	if ks == nil {
   435  		return false
   436  	}
   437  	return ks.beingResharded(target.Shard)
   438  }
   439  
   440  // PrimaryIsNotServing checks if the reason why the given target is not accessible right now is
   441  // that the primary tablet for that shard is not serving. This is possible during a Planned Reparent Shard
   442  // operation. Just as the operation completes, a new primary will be elected, and it will send its own healthcheck
   443  // stating that it is serving. We should buffer requests until that point.
   444  // There are use cases where people do not run with a Primary server at all, so we must verify that
   445  // we only start buffering when a primary was present, and it went not serving.
   446  // The shard state keeps track of the current primary and the last externally reparented time, which we can use
   447  // to determine that there was a serving primary which now became non serving. This is only possible in a DemotePrimary
   448  // RPC which are only called from ERS and PRS. So buffering will stop when these operations succeed.
   449  func (kew *KeyspaceEventWatcher) PrimaryIsNotServing(target *query.Target) bool {
   450  	if target.TabletType != topodatapb.TabletType_PRIMARY {
   451  		return false
   452  	}
   453  	ks := kew.getKeyspaceStatus(target.Keyspace)
   454  	if ks == nil {
   455  		return false
   456  	}
   457  	ks.mu.Lock()
   458  	defer ks.mu.Unlock()
   459  	if state, ok := ks.shards[target.Shard]; ok {
   460  		// If the primary tablet was present then externallyReparented will be non-zero and currentPrimary will be not nil
   461  		return !state.serving && !ks.consistent && state.externallyReparented != 0 && state.currentPrimary != nil
   462  	}
   463  	return false
   464  }