vitess.io/vitess@v0.16.2/go/vt/discovery/keyspace_events.go

/*
Copyright 2021 The Vitess Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package discovery

import (
	"context"
	"fmt"
	"sync"

	"google.golang.org/protobuf/proto"

	"vitess.io/vitess/go/vt/log"
	"vitess.io/vitess/go/vt/proto/query"
	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
	"vitess.io/vitess/go/vt/srvtopo"
	"vitess.io/vitess/go/vt/topo"
	"vitess.io/vitess/go/vt/topo/topoproto"
)

// KeyspaceEventWatcher is an auxiliary watcher that watches all availability incidents
// for all keyspaces in a Vitess cell and notifies listeners when the events have been resolved.
// Right now this is capable of detecting the end of failovers, both planned and unplanned,
// and the end of resharding operations.
//
// The KeyspaceEventWatcher works by consolidating TabletHealth events from a HealthCheck stream,
// which is a peer-to-peer check between nodes via GRPC, with events from a Topology Server, which
// are global to the cluster and stored in an external system like etcd.
type KeyspaceEventWatcher struct {
	ts        srvtopo.Server
	hc        HealthCheck
	localCell string

	mu        sync.Mutex
	keyspaces map[string]*keyspaceState

	subsMu sync.Mutex
	subs   map[chan *KeyspaceEvent]struct{}
}

// KeyspaceEvent is yielded to all watchers when an availability event for a keyspace has been resolved
type KeyspaceEvent struct {
	// Cell is the cell where the keyspace lives
	Cell string

	// Keyspace is the name of the keyspace which was (partially) unavailable and is now fully healthy
	Keyspace string

	// Shards is a list of all the shards in the keyspace, including their state after the event is resolved
	Shards []ShardEvent
}

// ShardEvent describes the state of a single shard at the time a KeyspaceEvent is resolved
type ShardEvent struct {
	Tablet  *topodatapb.TabletAlias
	Target  *query.Target
	Serving bool
}
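
// As an illustration of the payload above: a subscriber (see Subscribe further down) receives one
// KeyspaceEvent per resolved availability event and can inspect the per-shard outcome. A minimal
// sketch, assuming ev is a *KeyspaceEvent received from a subscription channel:
//
//	for _, se := range ev.Shards {
//		log.Infof("shard %s/%s serving=%v primary=%s",
//			se.Target.Keyspace, se.Target.Shard, se.Serving,
//			topoproto.TabletAliasString(se.Tablet))
//	}
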
// NewKeyspaceEventWatcher returns a new watcher for all keyspace events in the given cell.
// It requires access to a topology server, and an existing HealthCheck implementation which
// will be used to detect unhealthy nodes.
func NewKeyspaceEventWatcher(ctx context.Context, topoServer srvtopo.Server, hc HealthCheck, localCell string) *KeyspaceEventWatcher {
	kew := &KeyspaceEventWatcher{
		hc:        hc,
		ts:        topoServer,
		localCell: localCell,
		keyspaces: make(map[string]*keyspaceState),
		subs:      make(map[chan *KeyspaceEvent]struct{}),
	}
	kew.run(ctx)
	log.Infof("started watching keyspace events in %q", localCell)
	return kew
}

// keyspaceState is the internal state that the KEW keeps for each individual
// keyspace it is currently watching
type keyspaceState struct {
	kew      *KeyspaceEventWatcher
	keyspace string

	mu         sync.Mutex
	deleted    bool
	consistent bool

	lastError    error
	lastKeyspace *topodatapb.SrvKeyspace
	shards       map[string]*shardState
}

// Format prints the internal state for this keyspace for debug purposes
func (kss *keyspaceState) Format(f fmt.State, verb rune) {
	kss.mu.Lock()
	defer kss.mu.Unlock()

	fmt.Fprintf(f, "Keyspace(%s) = deleted: %v, consistent: %v, shards: [\n", kss.keyspace, kss.deleted, kss.consistent)
	for shard, ss := range kss.shards {
		fmt.Fprintf(f, "  Shard(%s) = target: [%s/%s %v], serving: %v, externally_reparented: %d, current_primary: %s\n",
			shard,
			ss.target.Keyspace, ss.target.Shard, ss.target.TabletType,
			ss.serving, ss.externallyReparented,
			ss.currentPrimary.String(),
		)
	}
	fmt.Fprintf(f, "]\n")
}
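
// As a rough illustration with hypothetical values, formatting a keyspaceState during an ongoing
// failover would print something along the lines of:
//
//	Keyspace(commerce) = deleted: false, consistent: false, shards: [
//	  Shard(-80) = target: [commerce/-80 PRIMARY], serving: true, externally_reparented: 1676000100, current_primary: cell:"zone1" uid:101
//	  Shard(80-) = target: [commerce/80- PRIMARY], serving: false, externally_reparented: 1676000200, current_primary: cell:"zone1" uid:202
//	]
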
// beingResharded returns whether this keyspace is thought to be in the middle of a resharding
// operation. currentShard is the name of the shard that belongs to this keyspace and which
// we are trying to access. currentShard can _only_ be a primary shard.
func (kss *keyspaceState) beingResharded(currentShard string) bool {
	kss.mu.Lock()
	defer kss.mu.Unlock()

	// if the keyspace is gone, or if it has no known availability events, the keyspace
	// cannot be in the middle of a resharding operation
	if kss.deleted || kss.consistent {
		return false
	}

	// for all the known shards, try to find a primary shard besides the one we're trying to access
	// and which is currently healthy. if there are other healthy primaries in the keyspace, it means
	// we're in the middle of a resharding operation
	for shard, sstate := range kss.shards {
		if shard != currentShard && sstate.serving {
			return true
		}
	}

	return false
}

type shardState struct {
	target               *query.Target
	serving              bool
	externallyReparented int64
	currentPrimary       *topodatapb.TabletAlias
}

// Subscribe returns a channel that will receive any KeyspaceEvents for all keyspaces in the current cell
func (kew *KeyspaceEventWatcher) Subscribe() chan *KeyspaceEvent {
	kew.subsMu.Lock()
	defer kew.subsMu.Unlock()
	c := make(chan *KeyspaceEvent, 2)
	kew.subs[c] = struct{}{}
	return c
}

// Unsubscribe removes a listener previously returned from Subscribe
func (kew *KeyspaceEventWatcher) Unsubscribe(c chan *KeyspaceEvent) {
	kew.subsMu.Lock()
	defer kew.subsMu.Unlock()
	delete(kew.subs, c)
}

func (kew *KeyspaceEventWatcher) broadcast(th *KeyspaceEvent) {
	kew.subsMu.Lock()
	defer kew.subsMu.Unlock()
	for c := range kew.subs {
		select {
		case c <- th:
		default:
		}
	}
}

func (kew *KeyspaceEventWatcher) run(ctx context.Context) {
	hcChan := kew.hc.Subscribe()
	bufferCtx, bufferCancel := context.WithCancel(ctx)

	go func() {
		defer bufferCancel()

		for {
			select {
			case <-bufferCtx.Done():
				return
			case result := <-hcChan:
				if result == nil {
					return
				}
				kew.processHealthCheck(result)
			}
		}
	}()

	go func() {
		// Seed the keyspace statuses once at startup
		keyspaces, err := kew.ts.GetSrvKeyspaceNames(ctx, kew.localCell, true)
		if err != nil {
			log.Errorf("CEM: initialize failed for cell %q: %v", kew.localCell, err)
			return
		}
		for _, ks := range keyspaces {
			kew.getKeyspaceStatus(ks)
		}
	}()
}
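
// A minimal usage sketch for the exported surface above; the topology server ts, the HealthCheck hc
// and the cell name are assumed to come from the surrounding vtgate setup. Note that broadcast
// performs a non-blocking send into each subscriber's channel (capacity 2, see Subscribe), so a
// subscriber that does not drain its channel promptly may miss events:
//
//	kew := NewKeyspaceEventWatcher(ctx, ts, hc, "zone1")
//	events := kew.Subscribe()
//	defer kew.Unsubscribe(events)
//	for ev := range events {
//		log.Infof("keyspace %s in cell %s is fully available again", ev.Keyspace, ev.Cell)
//	}
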
// ensureConsistentLocked checks if the current keyspace has recovered from an availability
// event, and if so, returns information about the availability event to all subscribers
func (kss *keyspaceState) ensureConsistentLocked() {
	// if this keyspace is consistent, there's no ongoing availability event
	if kss.consistent {
		return
	}

	// get the topology metadata for our primary from `lastKeyspace`; this value is refreshed
	// from our topology watcher whenever a change is detected, so it should always be up to date
	primary := topoproto.SrvKeyspaceGetPartition(kss.lastKeyspace, topodatapb.TabletType_PRIMARY)

	// if there's no primary, the keyspace is unhealthy;
	// if there are ShardTabletControls active, the keyspace is undergoing a topology change;
	// either way, the availability event is still ongoing
	if primary == nil || len(primary.ShardTabletControls) > 0 {
		return
	}

	activeShardsInPartition := make(map[string]bool)

	// iterate through all the primary shards that the topology server knows about;
	// for each shard, if our HealthCheck stream hasn't found the shard yet, or
	// if the HealthCheck stream still thinks the shard is unhealthy, this
	// means the availability event is still ongoing
	for _, shard := range primary.ShardReferences {
		sstate := kss.shards[shard.Name]
		if sstate == nil || !sstate.serving {
			return
		}
		activeShardsInPartition[shard.Name] = true
	}

	// iterate through all the shards as seen by our HealthCheck stream. if there are any
	// shards that HealthCheck thinks are healthy, and they haven't been seen by the topology
	// watcher, it means the keyspace is not fully consistent yet
	for shard, sstate := range kss.shards {
		if sstate.serving && !activeShardsInPartition[shard] {
			return
		}
	}

	// we haven't found any inconsistencies between the HealthCheck stream and the topology
	// watcher. this means the ongoing availability event has been resolved, so we can broadcast
	// a resolution event to all listeners
	kss.consistent = true

	ksevent := &KeyspaceEvent{
		Cell:     kss.kew.localCell,
		Keyspace: kss.keyspace,
		Shards:   make([]ShardEvent, 0, len(kss.shards)),
	}

	for shard, sstate := range kss.shards {
		ksevent.Shards = append(ksevent.Shards, ShardEvent{
			Tablet:  sstate.currentPrimary,
			Target:  sstate.target,
			Serving: sstate.serving,
		})

		log.Infof("keyspace event resolved: %s/%s is now consistent (serving: %v)",
			sstate.target.Keyspace, sstate.target.Shard,
			sstate.serving,
		)

		if !sstate.serving {
			delete(kss.shards, shard)
		}
	}

	kss.kew.broadcast(ksevent)
}

// onHealthCheck is the callback that updates this keyspace with event data from the HealthCheck stream.
// the HealthCheck stream applies to all the keyspaces in the cluster and emits TabletHealth events to our
// parent KeyspaceEventWatcher, which will mux them into their corresponding keyspaceState
func (kss *keyspaceState) onHealthCheck(th *TabletHealth) {
	// we only care about health events on the primary
	if th.Target.TabletType != topodatapb.TabletType_PRIMARY {
		return
	}

	kss.mu.Lock()
	defer kss.mu.Unlock()

	sstate := kss.shards[th.Target.Shard]

	// if we've never seen this shard before, we need to allocate a shardState for it, unless
	// we've received a _not serving_ shard event for a shard which we don't know about yet,
	// in which case we don't need to keep track of it. we'll start tracking it if/when the
	// shard becomes healthy again
	if sstate == nil {
		if !th.Serving {
			return
		}

		sstate = &shardState{target: th.Target}
		kss.shards[th.Target.Shard] = sstate
	}

	// if the shard went from serving to not serving, or the other way around, the keyspace
	// is undergoing an availability event
	if sstate.serving != th.Serving {
		sstate.serving = th.Serving
		kss.consistent = false
	}

	// if the primary for this shard has been externally reparented, we're undergoing a failover,
	// which is considered an availability event. update this shard to point to the new tablet
	// that acts as primary now
	if th.PrimaryTermStartTime != 0 && th.PrimaryTermStartTime > sstate.externallyReparented {
		sstate.externallyReparented = th.PrimaryTermStartTime
		sstate.currentPrimary = th.Tablet.Alias
		kss.consistent = false
	}

	kss.ensureConsistentLocked()
}
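
// To illustrate the failover branch above, a health update for a newly promoted primary would look
// roughly like the following (hypothetical values; in practice these updates arrive through the
// HealthCheck subscription wired up in run):
//
//	kss.onHealthCheck(&TabletHealth{
//		Tablet:               promotedTablet, // hypothetical *topodatapb.Tablet for the new primary
//		Target:               &query.Target{Keyspace: "commerce", Shard: "-80", TabletType: topodatapb.TabletType_PRIMARY},
//		Serving:              true,
//		PrimaryTermStartTime: 1676000200, // newer than the previous primary's term start
//	})
//
// Because PrimaryTermStartTime advanced, currentPrimary is updated, the keyspace is marked
// inconsistent, and ensureConsistentLocked then decides whether the event is already resolved.
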
// onSrvKeyspace is the callback that updates this keyspace with fresh topology data from our topology server.
// this callback is called from a Watcher in the topo server whenever a change to the topology for this keyspace
// occurs. this watcher is dedicated to this keyspace, and will only yield topology metadata changes for as
// long as we're interested in this keyspace.
func (kss *keyspaceState) onSrvKeyspace(newKeyspace *topodatapb.SrvKeyspace, newError error) bool {
	kss.mu.Lock()
	defer kss.mu.Unlock()

	// if the topology watcher has seen a NoNode while watching this keyspace, it means the keyspace
	// has been deleted from the cluster. we mark it for eventual cleanup here, as we no longer need
	// to keep watching for events in this keyspace.
	if topo.IsErrType(newError, topo.NoNode) {
		kss.deleted = true
		log.Infof("keyspace %q deleted", kss.keyspace)
		return false
	}

	// if there's another kind of error while watching this keyspace, we assume it's temporary and related
	// to the topology server, not to the keyspace itself. we'll keep waiting for more topology events.
	if newError != nil {
		kss.lastError = newError
		log.Errorf("error while watching keyspace %q: %v", kss.keyspace, newError)
		return true
	}

	// if the topology metadata for our keyspace is identical to the last one we saw there's nothing to do
	// here. this is a side-effect of the way etcd watchers work.
	if proto.Equal(kss.lastKeyspace, newKeyspace) {
		// no changes
		return true
	}

	// we only mark this keyspace as inconsistent if there has been a topology change in the PRIMARY for
	// this keyspace, but we store the topology metadata for both primary and replicas for future-proofing.
	var oldPrimary, newPrimary *topodatapb.SrvKeyspace_KeyspacePartition
	if kss.lastKeyspace != nil {
		oldPrimary = topoproto.SrvKeyspaceGetPartition(kss.lastKeyspace, topodatapb.TabletType_PRIMARY)
	}
	if newKeyspace != nil {
		newPrimary = topoproto.SrvKeyspaceGetPartition(newKeyspace, topodatapb.TabletType_PRIMARY)
	}
	if !proto.Equal(oldPrimary, newPrimary) {
		kss.consistent = false
	}

	kss.lastKeyspace = newKeyspace
	kss.ensureConsistentLocked()
	return true
}

// newKeyspaceState allocates the internal state required to keep track of availability incidents
// in this keyspace, and starts up a SrvKeyspace watcher on our topology server which will update
// our keyspaceState with any topology changes in real time.
func newKeyspaceState(kew *KeyspaceEventWatcher, cell, keyspace string) *keyspaceState {
	log.Infof("created dedicated watcher for keyspace %s/%s", cell, keyspace)
	kss := &keyspaceState{
		kew:      kew,
		keyspace: keyspace,
		shards:   make(map[string]*shardState),
	}
	kew.ts.WatchSrvKeyspace(context.Background(), cell, keyspace, kss.onSrvKeyspace)
	return kss
}
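
// For reference, the SrvKeyspace records delivered to onSrvKeyspace by the watcher registered above
// carry one partition per served tablet type. A sketch, with hypothetical values, of the PRIMARY
// partition for a two-shard keyspace, which is what topoproto.SrvKeyspaceGetPartition extracts in
// onSrvKeyspace and ensureConsistentLocked:
//
//	&topodatapb.SrvKeyspace{
//		Partitions: []*topodatapb.SrvKeyspace_KeyspacePartition{{
//			ServedType:      topodatapb.TabletType_PRIMARY,
//			ShardReferences: []*topodatapb.ShardReference{{Name: "-80"}, {Name: "80-"}},
//			// ShardTabletControls stays non-empty while traffic is being migrated,
//			// which keeps the keyspace counted as inconsistent
//		}},
//	}
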
// processHealthCheck is the callback that is called by the global HealthCheck stream that was initiated
// by this KeyspaceEventWatcher. it redirects the TabletHealth event to the corresponding keyspaceState
func (kew *KeyspaceEventWatcher) processHealthCheck(th *TabletHealth) {
	kss := kew.getKeyspaceStatus(th.Target.Keyspace)
	if kss == nil {
		return
	}

	kss.onHealthCheck(th)
}

// getKeyspaceStatus returns the keyspaceState object for the corresponding keyspace, allocating it
// if we've never seen the keyspace before.
func (kew *KeyspaceEventWatcher) getKeyspaceStatus(keyspace string) *keyspaceState {
	kew.mu.Lock()
	defer kew.mu.Unlock()

	kss := kew.keyspaces[keyspace]
	if kss == nil {
		kss = newKeyspaceState(kew, kew.localCell, keyspace)
		kew.keyspaces[keyspace] = kss
	}
	if kss.deleted {
		kss = nil
		delete(kew.keyspaces, keyspace)
	}
	return kss
}

// TargetIsBeingResharded checks if the reason why the given target is not accessible right now
// is because the keyspace where it resides is (potentially) undergoing a resharding operation.
// This is not a fully accurate heuristic, but it's good enough that we'd want to buffer the
// request for the given target under the assumption that the reason why it cannot be completed
// right now is transitory.
func (kew *KeyspaceEventWatcher) TargetIsBeingResharded(target *query.Target) bool {
	if target.TabletType != topodatapb.TabletType_PRIMARY {
		return false
	}
	ks := kew.getKeyspaceStatus(target.Keyspace)
	if ks == nil {
		return false
	}
	return ks.beingResharded(target.Shard)
}

// PrimaryIsNotServing checks if the reason why the given target is not accessible right now is
// that the primary tablet for that shard is not serving. This is possible during a Planned Reparent Shard
// operation. Just as the operation completes, a new primary will be elected, and it will send its own healthcheck
// stating that it is serving. We should buffer requests until that point.
// There are use cases where people do not run with a primary tablet at all, so we must verify that
// we only start buffering when a primary was present and it stopped serving.
// The shard state keeps track of the current primary and the last externally reparented time, which we can use
// to determine that there was a serving primary which has now become non-serving. This is only possible via a
// DemotePrimary RPC, which is only called from ERS and PRS, so buffering will stop when those operations succeed.
func (kew *KeyspaceEventWatcher) PrimaryIsNotServing(target *query.Target) bool {
	if target.TabletType != topodatapb.TabletType_PRIMARY {
		return false
	}
	ks := kew.getKeyspaceStatus(target.Keyspace)
	if ks == nil {
		return false
	}
	ks.mu.Lock()
	defer ks.mu.Unlock()
	if state, ok := ks.shards[target.Shard]; ok {
		// If the primary tablet was present then externallyReparented will be non-zero and currentPrimary will not be nil
		return !state.serving && !ks.consistent && state.externallyReparented != 0 && state.currentPrimary != nil
	}
	return false
}
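
// A minimal sketch of how a caller, for example a request-buffering layer in front of primary
// tablets, might combine the two checks above to decide whether a failed query should be held back
// and retried once a KeyspaceEvent resolves; shouldBuffer is a hypothetical helper and target is
// the *query.Target of the failed query:
//
//	func shouldBuffer(kew *KeyspaceEventWatcher, target *query.Target) bool {
//		return kew.TargetIsBeingResharded(target) || kew.PrimaryIsNotServing(target)
//	}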