github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/gossip/gossip.go

// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

/*
Each node attempts to contact peer nodes to gather all Infos in
the system with minimal total hops. The algorithm is as follows:

 0 Node starts up gossip server to accept incoming gossip requests.
   Continue to step #1 to join the gossip network.

 1 Node selects random peer from bootstrap list, excluding its own
   address for its first outgoing connection. Node starts client and
   continues to step #2.

 2 Node requests gossip from peer. Gossip requests (and responses)
   contain a map from node ID to info about other nodes in the
   network. Each node maintains its own map as well as the maps of
   each of its peers. The info for each node includes the most recent
   timestamp of any Info originating at that node, as well as the min
   number of hops to reach that node. Requesting node times out at
   checkInterval. On timeout, client is closed and GC'd. If node has
   no outgoing connections, goto #1.

   a. When gossip is received, infostore is augmented. If new Info was
      received, the client in question is credited. If node has no
      outgoing connections, goto #1.

   b. If any gossip was received at > maxHops and num connected peers
      < maxPeers(), choose random peer from those originating Info >
      maxHops, start it, and goto #2.

   c. If sentinel gossip keyed by KeySentinel is missing or expired,
      node is considered partitioned; goto #1.

 3 On connect, if node has too many connected clients, gossip requests
   are returned immediately with an alternate address set to a random
   selection from amongst already-connected clients.
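
As a minimal illustrative sketch (not part of the algorithm above), a
caller typically joins a node to the network as follows; ambientCtx,
clusterIDC, nodeIDC, rpcContext, grpcServer, stopper, registry, locality,
advertAddr, and resolvers are placeholders supplied by the caller, and
zonepb.DefaultZoneConfigRef() stands in for the cluster's default zone
config:

	g := gossip.New(ambientCtx, clusterIDC, nodeIDC, rpcContext, grpcServer,
		stopper, registry, locality, zonepb.DefaultZoneConfigRef())
	g.Start(advertAddr, resolvers)
	<-g.Connected // closed once the cluster ID has been received via gossip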
*/

package gossip

import (
	"bytes"
	"context"
	"fmt"
	"math"
	"math/rand"
	"net"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	circuit "github.com/cockroachdb/circuitbreaker"
	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/config"
	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
	"github.com/cockroachdb/cockroach/pkg/gossip/resolver"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/rpc"
	"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/errorutil"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/logtags"
	"github.com/gogo/protobuf/proto"
	"google.golang.org/grpc"
)

const (
	// maxHops is the maximum number of hops which any gossip info
	// should require to transit between any two nodes in a gossip
	// network.
	maxHops = 5

	// minPeers is the minimum number of peers which the maxPeers()
	// function will return. This is set higher than one to prevent
	// excessive tightening of the network.
	minPeers = 3

	// defaultStallInterval is the default interval for checking whether
	// the incoming and outgoing connections to the gossip network are
	// insufficient to keep the network connected.
	defaultStallInterval = 2 * time.Second

	// defaultBootstrapInterval is the minimum time between successive
	// bootstrapping attempts to avoid busy-looping trying to find the
	// sentinel gossip info.
	defaultBootstrapInterval = 1 * time.Second

	// defaultCullInterval is the default interval for culling the least
	// "useful" outgoing gossip connection to free up space for a more
	// efficiently targeted connection to the most distant node.
	defaultCullInterval = 60 * time.Second

	// defaultClientsInterval is the default interval for updating the gossip
	// clients key which allows every node in the cluster to create a map of
	// gossip connectivity. This value is intentionally small as we want to
	// detect gossip partitions faster than the node liveness timeout (9s).
	defaultClientsInterval = 2 * time.Second

	// NodeDescriptorInterval is the interval for gossiping the node descriptor.
	// Note that increasing this duration may increase the likelihood of gossip
	// thrashing, since node descriptors are used to determine the number of gossip
	// hops between nodes (see #9819 for context).
	NodeDescriptorInterval = 1 * time.Hour

	// NodeDescriptorTTL is the time-to-live for node ID -> descriptor infos.
	NodeDescriptorTTL = 2 * NodeDescriptorInterval

	// StoresInterval is the default interval for gossiping store descriptors.
	StoresInterval = 60 * time.Second

	// StoreTTL is the time-to-live for store-related info.
	StoreTTL = 2 * StoresInterval

	unknownNodeID roachpb.NodeID = 0
)

// Gossip metrics counter names.
var (
	MetaConnectionsIncomingGauge = metric.Metadata{
		Name:        "gossip.connections.incoming",
		Help:        "Number of active incoming gossip connections",
		Measurement: "Connections",
		Unit:        metric.Unit_COUNT,
	}
	MetaConnectionsOutgoingGauge = metric.Metadata{
		Name:        "gossip.connections.outgoing",
		Help:        "Number of active outgoing gossip connections",
		Measurement: "Connections",
		Unit:        metric.Unit_COUNT,
	}
	MetaConnectionsRefused = metric.Metadata{
		Name:        "gossip.connections.refused",
		Help:        "Number of refused incoming gossip connections",
		Measurement: "Connections",
		Unit:        metric.Unit_COUNT,
	}
	MetaInfosSent = metric.Metadata{
		Name:        "gossip.infos.sent",
		Help:        "Number of sent gossip Info objects",
		Measurement: "Infos",
		Unit:        metric.Unit_COUNT,
	}
	MetaInfosReceived = metric.Metadata{
		Name:        "gossip.infos.received",
		Help:        "Number of received gossip Info objects",
		Measurement: "Infos",
		Unit:        metric.Unit_COUNT,
	}
	MetaBytesSent = metric.Metadata{
		Name:        "gossip.bytes.sent",
		Help:        "Number of sent gossip bytes",
		Measurement: "Gossip Bytes",
		Unit:        metric.Unit_BYTES,
	}
	MetaBytesReceived = metric.Metadata{
		Name:        "gossip.bytes.received",
		Help:        "Number of received gossip bytes",
		Measurement: "Gossip Bytes",
		Unit:        metric.Unit_BYTES,
	}
)

// KeyNotPresentError is returned by gossip when queried for a key that doesn't
// exist or has expired.
type KeyNotPresentError struct {
	key string
}

// Error implements the error interface.
func (err KeyNotPresentError) Error() string {
	return fmt.Sprintf("KeyNotPresentError: gossip key %q does not exist or has expired", err.key)
}

// NewKeyNotPresentError creates a new KeyNotPresentError.
func NewKeyNotPresentError(key string) error {
	return KeyNotPresentError{key: key}
}

// AddressResolver is a thin wrapper around gossip's GetNodeIDAddress
// that allows it to be used as a nodedialer.AddressResolver.
func AddressResolver(gossip *Gossip) nodedialer.AddressResolver {
	return func(nodeID roachpb.NodeID) (net.Addr, error) {
		return gossip.GetNodeIDAddress(nodeID)
	}
}

// Storage is an interface which allows the gossip instance
// to read and write bootstrapping data to persistent storage
// between instantiations.
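//
// A minimal in-memory implementation might look like the following sketch
// (hypothetical, e.g. for tests; note that it shallow-copies the contained
// address slice):
//
//	type memStorage struct {
//		bi BootstrapInfo
//	}
//
//	func (m *memStorage) ReadBootstrapInfo(bi *BootstrapInfo) error {
//		*bi = m.bi
//		return nil
//	}
//
//	func (m *memStorage) WriteBootstrapInfo(bi *BootstrapInfo) error {
//		m.bi = *bi
//		return nil
//	}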
type Storage interface {
	// ReadBootstrapInfo fetches the bootstrap data from the persistent
	// store into the provided bootstrap protobuf. Returns nil or an
	// error on failure.
	ReadBootstrapInfo(*BootstrapInfo) error
	// WriteBootstrapInfo stores the provided bootstrap data to the
	// persistent store. Returns nil or an error on failure.
	WriteBootstrapInfo(*BootstrapInfo) error
}

// Gossip is an instance of a gossip node. It embeds a gossip server.
// During bootstrapping, the bootstrap list contains candidates for
// entry to the gossip network.
type Gossip struct {
	started bool // for assertions

	*server // Embedded gossip RPC server

	Connected     chan struct{}       // Closed upon initial connection
	hasConnected  bool                // Set first time network is connected
	rpcContext    *rpc.Context        // The context required for RPC
	outgoing      nodeSet             // Set of outgoing client node IDs
	storage       Storage             // Persistent storage interface
	bootstrapInfo BootstrapInfo       // BootstrapInfo proto for persistent storage
	bootstrapping map[string]struct{} // Set of active bootstrap clients
	hasCleanedBS  bool

	// Note that access to each client's internal state is serialized by the
	// embedded server's mutex. This is surprising!
	clientsMu struct {
		syncutil.Mutex
		clients []*client
		// One breaker per client for the life of the process.
		breakers map[string]*circuit.Breaker
	}

	disconnected chan *client  // Channel of disconnected clients
	stalled      bool          // True if gossip is stalled (i.e. host doesn't have sentinel)
	stalledCh    chan struct{} // Channel to wake up stalled bootstrap

	stallInterval     time.Duration
	bootstrapInterval time.Duration
	cullInterval      time.Duration

	// The system config is treated unlike other info objects.
	// It is used so often that we keep an unmarshaled version of it
	// here and its own set of callbacks.
	// We do not use the infostore to avoid unmarshalling under the
	// main gossip lock.
	systemConfig         *config.SystemConfig
	systemConfigMu       syncutil.RWMutex
	systemConfigChannels []chan<- struct{}

	// resolvers is a list of resolvers used to determine
	// bootstrap hosts for connecting to the gossip network.
	resolverIdx    int
	resolvers      []resolver.Resolver
	resolversTried map[int]struct{} // Set of attempted resolver indexes
	nodeDescs      map[roachpb.NodeID]*roachpb.NodeDescriptor
	// storeMap maps store IDs to node IDs.
	storeMap map[roachpb.StoreID]roachpb.NodeID

	// Membership sets for resolvers and bootstrap addresses.
	// bootstrapAddrs also tracks which address is associated with which
	// node ID to enable faster node lookup by address.
	resolverAddrs  map[util.UnresolvedAddr]resolver.Resolver
	bootstrapAddrs map[util.UnresolvedAddr]roachpb.NodeID

	localityTierMap map[string]struct{}

	lastConnectivity string

	defaultZoneConfig *zonepb.ZoneConfig
}

// New creates an instance of a gossip node.
// The higher level manages the ClusterIDContainer and NodeIDContainer instances
// (which can be shared by various server components). The ambient context is
// expected to already contain the node ID.
//
// grpcServer: The server on which the new Gossip instance will register its RPC
//   service. Can be nil, in which case the Gossip will not register the
//   service.
// rpcContext: The context used to connect to other nodes. Can be nil for tests
//   that also specify a nil grpcServer and that plan on using the Gossip in a
//   restricted way by populating it with data manually.
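//
// To illustrate the nil-parameter cases above: a test that only populates
// gossip manually might be built roughly as follows (a sketch; stopper and
// desc are assumed to be the caller's *stop.Stopper and a valid
// roachpb.NodeDescriptor):
//
//	g := gossip.NewTest(1 /* nodeID */, nil /* rpcContext */, nil /* grpcServer */,
//		stopper, metric.NewRegistry(), zonepb.DefaultZoneConfigRef())
//	if err := g.AddInfoProto(gossip.MakeNodeIDKey(1), &desc, gossip.NodeDescriptorTTL); err != nil {
//		panic(err) // test-only sketch
//	}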
func New(
	ambient log.AmbientContext,
	clusterID *base.ClusterIDContainer,
	nodeID *base.NodeIDContainer,
	rpcContext *rpc.Context,
	grpcServer *grpc.Server,
	stopper *stop.Stopper,
	registry *metric.Registry,
	locality roachpb.Locality,
	defaultZoneConfig *zonepb.ZoneConfig,
) *Gossip {
	ambient.SetEventLog("gossip", "gossip")
	g := &Gossip{
		server:            newServer(ambient, clusterID, nodeID, stopper, registry),
		Connected:         make(chan struct{}),
		rpcContext:        rpcContext,
		outgoing:          makeNodeSet(minPeers, metric.NewGauge(MetaConnectionsOutgoingGauge)),
		bootstrapping:     map[string]struct{}{},
		disconnected:      make(chan *client, 10),
		stalledCh:         make(chan struct{}, 1),
		stallInterval:     defaultStallInterval,
		bootstrapInterval: defaultBootstrapInterval,
		cullInterval:      defaultCullInterval,
		resolversTried:    map[int]struct{}{},
		nodeDescs:         map[roachpb.NodeID]*roachpb.NodeDescriptor{},
		storeMap:          make(map[roachpb.StoreID]roachpb.NodeID),
		resolverAddrs:     map[util.UnresolvedAddr]resolver.Resolver{},
		bootstrapAddrs:    map[util.UnresolvedAddr]roachpb.NodeID{},
		localityTierMap:   map[string]struct{}{},
		defaultZoneConfig: defaultZoneConfig,
	}

	for _, loc := range locality.Tiers {
		g.localityTierMap[loc.String()] = struct{}{}
	}
	stopper.AddCloser(stop.CloserFn(g.server.AmbientContext.FinishEventLog))

	registry.AddMetric(g.outgoing.gauge)
	g.clientsMu.breakers = map[string]*circuit.Breaker{}

	g.mu.Lock()
	// Add ourselves as a SystemConfig watcher.
	g.mu.is.registerCallback(KeySystemConfig, g.updateSystemConfig)
	// Add ourselves as a node descriptor watcher.
	g.mu.is.registerCallback(MakePrefixPattern(KeyNodeIDPrefix), g.updateNodeAddress)
	g.mu.is.registerCallback(MakePrefixPattern(KeyStorePrefix), g.updateStoreMap)
	g.mu.Unlock()

	if grpcServer != nil {
		RegisterGossipServer(grpcServer, g.server)
	}
	return g
}

// NewTest is a simplified wrapper around New that creates the
// ClusterIDContainer and NodeIDContainer internally. Used for testing.
//
// grpcServer: The server on which the new Gossip instance will register its RPC
//   service. Can be nil, in which case the Gossip will not register the
//   service.
// rpcContext: The context used to connect to other nodes. Can be nil for tests
//   that also specify a nil grpcServer and that plan on using the Gossip in a
//   restricted way by populating it with data manually.
func NewTest(
	nodeID roachpb.NodeID,
	rpcContext *rpc.Context,
	grpcServer *grpc.Server,
	stopper *stop.Stopper,
	registry *metric.Registry,
	defaultZoneConfig *zonepb.ZoneConfig,
) *Gossip {
	return NewTestWithLocality(nodeID, rpcContext, grpcServer, stopper, registry, roachpb.Locality{}, defaultZoneConfig)
}

// NewTestWithLocality calls NewTest with an explicit locality value.
func NewTestWithLocality(
	nodeID roachpb.NodeID,
	rpcContext *rpc.Context,
	grpcServer *grpc.Server,
	stopper *stop.Stopper,
	registry *metric.Registry,
	locality roachpb.Locality,
	defaultZoneConfig *zonepb.ZoneConfig,
) *Gossip {
	c := &base.ClusterIDContainer{}
	n := &base.NodeIDContainer{}
	var ac log.AmbientContext
	ac.AddLogTag("n", n)
	gossip := New(ac, c, n, rpcContext, grpcServer, stopper, registry, locality, defaultZoneConfig)
	if nodeID != 0 {
		n.Set(context.TODO(), nodeID)
	}
	return gossip
}

// AssertNotStarted fatals if the Gossip instance was already started.
func (g *Gossip) AssertNotStarted(ctx context.Context) {
	if g.started {
		log.Fatalf(ctx, "Gossip instance was already started")
	}
}

// GetNodeMetrics returns the gossip node metrics.
func (g *Gossip) GetNodeMetrics() *Metrics {
	return g.server.GetNodeMetrics()
}

// SetNodeDescriptor adds the node descriptor to the gossip network.
func (g *Gossip) SetNodeDescriptor(desc *roachpb.NodeDescriptor) error {
	ctx := g.AnnotateCtx(context.TODO())
	log.Infof(ctx, "NodeDescriptor set to %+v", desc)
	if desc.Address.IsEmpty() {
		log.Fatalf(ctx, "n%d address is empty", desc.NodeID)
	}
	if err := g.AddInfoProto(MakeNodeIDKey(desc.NodeID), desc, NodeDescriptorTTL); err != nil {
		return errors.Errorf("n%d: couldn't gossip descriptor: %v", desc.NodeID, err)
	}
	g.updateClients()
	return nil
}

// SetStallInterval sets the interval between successive checks
// to determine whether this host is not connected to the gossip
// network, or else is connected to a partition which doesn't
// include the host which gossips the sentinel info.
func (g *Gossip) SetStallInterval(interval time.Duration) {
	g.mu.Lock()
	defer g.mu.Unlock()
	g.stallInterval = interval
}

// SetBootstrapInterval sets a minimum interval between successive
// attempts to connect to new hosts in order to join the gossip
// network.
func (g *Gossip) SetBootstrapInterval(interval time.Duration) {
	g.mu.Lock()
	defer g.mu.Unlock()
	g.bootstrapInterval = interval
}

// SetCullInterval sets the interval between periodic shutdown of
// outgoing gossip client connections in an effort to improve the
// fitness of the network.
func (g *Gossip) SetCullInterval(interval time.Duration) {
	g.mu.Lock()
	defer g.mu.Unlock()
	g.cullInterval = interval
}

// SetStorage provides an instance of the Storage interface
// for reading and writing gossip bootstrap data from persistent
// storage. This should be invoked as early in the lifecycle of a
// gossip instance as possible, but can be called at any time.
func (g *Gossip) SetStorage(storage Storage) error {
	ctx := g.AnnotateCtx(context.TODO())
	// Maintain lock ordering.
	var storedBI BootstrapInfo
	if err := storage.ReadBootstrapInfo(&storedBI); err != nil {
		log.Warningf(ctx, "failed to read gossip bootstrap info: %s", err)
	}

	g.mu.Lock()
	defer g.mu.Unlock()
	g.storage = storage

	// Merge the stored bootstrap info addresses with any we've become
	// aware of through gossip.
	existing := map[string]struct{}{}
	makeKey := func(a util.UnresolvedAddr) string { return fmt.Sprintf("%s,%s", a.Network(), a.String()) }
	for _, addr := range g.bootstrapInfo.Addresses {
		existing[makeKey(addr)] = struct{}{}
	}
	for _, addr := range storedBI.Addresses {
		// If the address is new, and isn't our own address, add it.
		if _, ok := existing[makeKey(addr)]; !ok && addr != g.mu.is.NodeAddr {
			g.maybeAddBootstrapAddressLocked(addr, unknownNodeID)
		}
	}
	// Persist merged addresses.
	if numAddrs := len(g.bootstrapInfo.Addresses); numAddrs > len(storedBI.Addresses) {
		if err := g.storage.WriteBootstrapInfo(&g.bootstrapInfo); err != nil {
			log.Errorf(ctx, "%v", err)
		}
	}

	// Cycle through all persisted bootstrap hosts and add resolvers for
	// any which haven't already been added.
	newResolverFound := false
	for _, addr := range g.bootstrapInfo.Addresses {
		if !g.maybeAddResolverLocked(addr) {
			continue
		}
		// If we find a new resolver, reset the resolver index so that the
		// next resolver we try is the first of the new resolvers.
		if !newResolverFound {
			newResolverFound = true
			g.resolverIdx = len(g.resolvers) - 1
		}
	}

	// If a new resolver was found, immediately signal bootstrap.
	if newResolverFound {
		if log.V(1) {
			log.Infof(ctx, "found new resolvers from storage; signaling bootstrap")
		}
		g.signalStalledLocked()
	}
	return nil
}

// setResolvers initializes the set of gossip resolvers used to find
// nodes to bootstrap the gossip network.
func (g *Gossip) setResolvers(resolvers []resolver.Resolver) {
	if resolvers == nil {
		return
	}

	g.mu.Lock()
	defer g.mu.Unlock()

	// Start the index at the end because the get-next-address loop logic
	// increments it as its first step.
	g.resolverIdx = len(resolvers) - 1
	g.resolvers = resolvers
	g.resolversTried = map[int]struct{}{}

	// Start new bootstrapping immediately instead of waiting for next bootstrap interval.
	g.maybeSignalStatusChangeLocked()
}

// GetResolvers returns a copy of the resolvers slice.
func (g *Gossip) GetResolvers() []resolver.Resolver {
	g.mu.RLock()
	defer g.mu.RUnlock()
	return append([]resolver.Resolver(nil), g.resolvers...)
}

// GetNodeIDAddress looks up the RPC address of the node by ID.
func (g *Gossip) GetNodeIDAddress(nodeID roachpb.NodeID) (*util.UnresolvedAddr, error) {
	g.mu.RLock()
	defer g.mu.RUnlock()
	return g.getNodeIDAddressLocked(nodeID)
}

// GetNodeIDSQLAddress looks up the SQL address of the node by ID.
func (g *Gossip) GetNodeIDSQLAddress(nodeID roachpb.NodeID) (*util.UnresolvedAddr, error) {
	g.mu.RLock()
	defer g.mu.RUnlock()
	return g.getNodeIDSQLAddressLocked(nodeID)
}

// GetNodeIDForStoreID looks up the NodeID by StoreID.
func (g *Gossip) GetNodeIDForStoreID(storeID roachpb.StoreID) (roachpb.NodeID, error) {
	g.mu.RLock()
	defer g.mu.RUnlock()
	return g.getNodeIDForStoreIDLocked(storeID)
}

// GetNodeDescriptor looks up the descriptor of the node by ID.
func (g *Gossip) GetNodeDescriptor(nodeID roachpb.NodeID) (*roachpb.NodeDescriptor, error) {
	g.mu.RLock()
	defer g.mu.RUnlock()
	return g.getNodeDescriptorLocked(nodeID)
}

// LogStatus logs the current status of gossip such as the incoming and
// outgoing connections.
func (g *Gossip) LogStatus() {
	g.mu.RLock()
	n := len(g.nodeDescs)
	status := "ok"
	if g.mu.is.getInfo(KeySentinel) == nil {
		status = "stalled"
	}
	g.mu.RUnlock()

	var connectivity string
	if s := g.Connectivity().String(); s != g.lastConnectivity {
		g.lastConnectivity = s
		connectivity = s
	}

	ctx := g.AnnotateCtx(context.TODO())
	log.Infof(ctx, "gossip status (%s, %d node%s)\n%s%s%s",
		status, n, util.Pluralize(int64(n)), g.clientStatus(), g.server.status(), connectivity)
}

func (g *Gossip) clientStatus() ClientStatus {
	g.mu.RLock()
	defer g.mu.RUnlock()
	g.clientsMu.Lock()
	defer g.clientsMu.Unlock()

	var status ClientStatus

	status.MaxConns = int32(g.outgoing.maxSize)
	status.ConnStatus = make([]OutgoingConnStatus, 0, len(g.clientsMu.clients))
	for _, c := range g.clientsMu.clients {
		status.ConnStatus = append(status.ConnStatus, OutgoingConnStatus{
			ConnStatus: ConnStatus{
				NodeID:   c.peerID,
				Address:  c.addr.String(),
				AgeNanos: timeutil.Since(c.createdAt).Nanoseconds(),
			},
			MetricSnap: c.clientMetrics.Snapshot(),
		})
	}
	return status
}

// Connectivity returns the current view of the gossip network as seen by this
// node.
func (g *Gossip) Connectivity() Connectivity {
	ctx := g.AnnotateCtx(context.TODO())
	var c Connectivity

	g.mu.RLock()

	if i := g.mu.is.getInfo(KeySentinel); i != nil {
		c.SentinelNodeID = i.NodeID
	}

	for nodeID := range g.nodeDescs {
		i := g.mu.is.getInfo(MakeGossipClientsKey(nodeID))
		if i == nil {
			continue
		}

		v, err := i.Value.GetBytes()
		if err != nil {
			log.Errorf(ctx, "unable to retrieve gossip value for %s: %v",
				MakeGossipClientsKey(nodeID), err)
			continue
		}
		if len(v) == 0 {
			continue
		}

		for _, part := range strings.Split(string(v), ",") {
			id, err := strconv.ParseInt(part, 10 /* base */, 64 /* bitSize */)
			if err != nil {
				log.Errorf(ctx, "unable to parse node ID: %v", err)
				continue
			}
			c.ClientConns = append(c.ClientConns, Connectivity_Conn{
				SourceID: nodeID,
				TargetID: roachpb.NodeID(id),
			})
		}
	}

	g.mu.RUnlock()

	sort.Slice(c.ClientConns, func(i, j int) bool {
		a, b := &c.ClientConns[i], &c.ClientConns[j]
		if a.SourceID < b.SourceID {
			return true
		}
		if a.SourceID > b.SourceID {
			return false
		}
		return a.TargetID < b.TargetID
	})

	return c
}

// EnableSimulationCycler is for TESTING PURPOSES ONLY. It sets a
// condition variable which is signaled at each cycle of the
// simulation via SimulationCycle(). The gossip server makes each
// connecting client wait for the cycler to signal before responding.
func (g *Gossip) EnableSimulationCycler(enable bool) {
	g.mu.Lock()
	defer g.mu.Unlock()
	if enable {
		g.simulationCycler = sync.NewCond(&g.mu)
	} else {
		// TODO(spencer): remove this nil check when gossip/simulation is no
		// longer used in kv tests.
		if g.simulationCycler != nil {
			g.simulationCycler.Broadcast()
			g.simulationCycler = nil
		}
	}
}

// SimulationCycle cycles this gossip node's server by allowing all
// connected clients to proceed one step.
func (g *Gossip) SimulationCycle() {
	g.mu.Lock()
	defer g.mu.Unlock()
	if g.simulationCycler != nil {
		g.simulationCycler.Broadcast()
	}
}

// maybeAddResolverLocked creates and adds a resolver for the specified
// address if one does not already exist. Returns whether a new
// resolver was added. The caller must hold the gossip mutex.
func (g *Gossip) maybeAddResolverLocked(addr util.UnresolvedAddr) bool {
	if _, ok := g.resolverAddrs[addr]; ok {
		return false
	}
	ctx := g.AnnotateCtx(context.TODO())
	r, err := resolver.NewResolverFromUnresolvedAddr(addr)
	if err != nil {
		log.Warningf(ctx, "bad address %s: %s", addr, err)
		return false
	}
	g.resolvers = append(g.resolvers, r)
	g.resolverAddrs[addr] = r
	log.Eventf(ctx, "add resolver %s", r)
	return true
}

// maybeAddBootstrapAddressLocked adds the specified address to the list
// of bootstrap addresses if not already present. Returns whether a new
// bootstrap address was added. The caller must hold the gossip mutex.
func (g *Gossip) maybeAddBootstrapAddressLocked(
	addr util.UnresolvedAddr, nodeID roachpb.NodeID,
) bool {
	if existingNodeID, ok := g.bootstrapAddrs[addr]; ok {
		if existingNodeID == unknownNodeID || existingNodeID != nodeID {
			g.bootstrapAddrs[addr] = nodeID
		}
		return false
	}
	g.bootstrapInfo.Addresses = append(g.bootstrapInfo.Addresses, addr)
	g.bootstrapAddrs[addr] = nodeID
	ctx := g.AnnotateCtx(context.TODO())
	log.Eventf(ctx, "add bootstrap %s", addr)
	return true
}

// maybeCleanupBootstrapAddressesLocked cleans up the stored bootstrap
// addresses to include only those currently available via gossip. The
// gossip mutex must be held by the caller.
func (g *Gossip) maybeCleanupBootstrapAddressesLocked() {
	if g.storage == nil || g.hasCleanedBS {
		return
	}
	defer func() { g.hasCleanedBS = true }()
	ctx := g.AnnotateCtx(context.TODO())
	log.Event(ctx, "cleaning up bootstrap addresses")

	g.resolvers = g.resolvers[:0]
	g.resolverIdx = 0
	g.bootstrapInfo.Addresses = g.bootstrapInfo.Addresses[:0]
	g.bootstrapAddrs = map[util.UnresolvedAddr]roachpb.NodeID{}
	g.resolverAddrs = map[util.UnresolvedAddr]resolver.Resolver{}
	g.resolversTried = map[int]struct{}{}

	var desc roachpb.NodeDescriptor
	if err := g.mu.is.visitInfos(func(key string, i *Info) error {
		if strings.HasPrefix(key, KeyNodeIDPrefix) {
			if err := i.Value.GetProto(&desc); err != nil {
				return err
			}
			if desc.Address.IsEmpty() || desc.Address == g.mu.is.NodeAddr {
				return nil
			}
			g.maybeAddResolverLocked(desc.Address)
			g.maybeAddBootstrapAddressLocked(desc.Address, desc.NodeID)
		}
		return nil
	}, true /* deleteExpired */); err != nil {
		log.Errorf(ctx, "%v", err)
		return
	}

	if err := g.storage.WriteBootstrapInfo(&g.bootstrapInfo); err != nil {
		log.Errorf(ctx, "%v", err)
	}
}

// maxPeers returns the maximum number of peers each gossip node
// may connect to. This is based on maxHops, which is a preset
// maximum for number of hops allowed before the gossip network
// will seek to "tighten" by creating new connections to distant
// nodes.
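//
// For example, with maxHops = 5 the effective exponent below is
// maxHops-2 = 3, so maxPeers(1000) = ceil(1000^(1/3)) = 10, while
// maxPeers(8) = ceil(8^(1/3)) = 2, which is then raised to minPeers (3).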
func maxPeers(nodeCount int) int {
	// This formula uses maxHops-2, instead of maxHops, to provide a
	// "fudge" factor for max connected peers, to account for the
	// arbitrary, decentralized way in which gossip networks are created.
	// This will return the following maxPeers for the given number of nodes:
	// <= 27 nodes -> 3 peers
	// <= 64 nodes -> 4 peers
	// <= 125 nodes -> 5 peers
	// <= n^3 nodes -> n peers
	//
	// Quick derivation of the formula for posterity (without the fudge factor):
	// maxPeers^maxHops > nodeCount
	// maxHops * log(maxPeers) > log(nodeCount)
	// log(maxPeers) > log(nodeCount) / maxHops
	// maxPeers > e^(log(nodeCount) / maxHops)
	// hence maxPeers = ceil(e^(log(nodeCount) / maxHops)) should work
	maxPeers := int(math.Ceil(math.Exp(math.Log(float64(nodeCount)) / float64(maxHops-2))))
	if maxPeers < minPeers {
		return minPeers
	}
	return maxPeers
}

// updateNodeAddress is a gossip callback which fires with each
// update to a node descriptor. This allows us to compute the
// total size of the gossip network (for determining max peers
// each gossip node is allowed to have), as well as to create
// new resolvers for each encountered host and to write the
// set of gossip node addresses to persistent storage when it
// changes.
func (g *Gossip) updateNodeAddress(key string, content roachpb.Value) {
	ctx := g.AnnotateCtx(context.TODO())
	var desc roachpb.NodeDescriptor
	if err := content.GetProto(&desc); err != nil {
		log.Errorf(ctx, "%v", err)
		return
	}
	if log.V(1) {
		log.Infof(ctx, "updateNodeAddress called on %q with desc %+v", key, desc)
	}

	g.mu.Lock()
	defer g.mu.Unlock()

	// If desc is the empty descriptor, that indicates that the node has been
	// removed from the cluster. If that's the case, remove it from our map of
	// nodes to prevent other parts of the system from trying to talk to it.
	// We can't directly compare the node against the empty descriptor because
	// the proto has a repeated field and thus isn't comparable.
	if desc.NodeID == 0 || desc.Address.IsEmpty() {
		nodeID, err := NodeIDFromKey(key, KeyNodeIDPrefix)
		if err != nil {
			log.Errorf(ctx, "unable to update node address for removed node: %s", err)
			return
		}
		log.Infof(ctx, "removed n%d from gossip", nodeID)
		g.removeNodeDescriptorLocked(nodeID)
		return
	}

	existingDesc, ok := g.nodeDescs[desc.NodeID]
	if !ok || !proto.Equal(existingDesc, &desc) {
		g.nodeDescs[desc.NodeID] = &desc
	}
	// Skip all remaining logic if the address hasn't changed, since that's all
	// the remaining logic cares about.
	if ok && existingDesc.Address == desc.Address {
		return
	}
	g.recomputeMaxPeersLocked()

	// Skip if it's our own address.
	if desc.Address == g.mu.is.NodeAddr {
		return
	}

	// Add this new node address (if it's not already there) to our list
	// of resolvers so we can keep connecting to gossip if the original
	// resolvers go offline.
	g.maybeAddResolverLocked(desc.Address)

	// Add new address (if it's not already there) to bootstrap info and
	// persist if possible.
	added := g.maybeAddBootstrapAddressLocked(desc.Address, desc.NodeID)
	if added && g.storage != nil {
		if err := g.storage.WriteBootstrapInfo(&g.bootstrapInfo); err != nil {
			log.Errorf(ctx, "%v", err)
		}
	}
}

func (g *Gossip) removeNodeDescriptorLocked(nodeID roachpb.NodeID) {
	delete(g.nodeDescs, nodeID)
	g.recomputeMaxPeersLocked()
}

// updateStoreMap is a gossip callback which is used to update storeMap.
func (g *Gossip) updateStoreMap(key string, content roachpb.Value) {
	ctx := g.AnnotateCtx(context.TODO())
	var desc roachpb.StoreDescriptor
	if err := content.GetProto(&desc); err != nil {
		log.Errorf(ctx, "%v", err)
		return
	}

	if log.V(1) {
		log.Infof(ctx, "updateStoreMap called on %q with desc %+v", key, desc)
	}

	g.mu.Lock()
	defer g.mu.Unlock()
	g.storeMap[desc.StoreID] = desc.Node.NodeID
}

func (g *Gossip) getNodeIDForStoreIDLocked(storeID roachpb.StoreID) (roachpb.NodeID, error) {
	if nodeID, ok := g.storeMap[storeID]; ok {
		return nodeID, nil
	}
	return 0, errors.Errorf("unable to look up Node ID for store %d", storeID)
}
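// updateClients gossips this node's set of outgoing client peer IDs under
// its gossip-clients key as a comma-separated list of node IDs (e.g.
// "2,5,7") with a TTL of twice defaultClientsInterval. Connectivity
// collects these entries from every node to assemble a cluster-wide view
// of gossip connectivity.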
func (g *Gossip) updateClients() {
	nodeID := g.NodeID.Get()
	if nodeID == 0 {
		return
	}

	var buf bytes.Buffer
	var sep string

	g.mu.RLock()
	g.clientsMu.Lock()
	for _, c := range g.clientsMu.clients {
		if c.peerID != 0 {
			fmt.Fprintf(&buf, "%s%d", sep, c.peerID)
			sep = ","
		}
	}
	g.clientsMu.Unlock()
	g.mu.RUnlock()

	if err := g.AddInfo(MakeGossipClientsKey(nodeID), buf.Bytes(), 2*defaultClientsInterval); err != nil {
		log.Errorf(g.AnnotateCtx(context.Background()), "%v", err)
	}
}

// recomputeMaxPeersLocked recomputes max peers based on the size of the
// network and sets the max sizes for the incoming and outgoing node sets.
//
// Note: if we notice issues with never-ending connection refused errors
// in real deployments, consider allowing more incoming connections than
// outgoing connections. As of now, the cluster's steady state is to have
// all nodes fill up, which can make rebalancing of connections tough.
// I'm not making this change now since it tends to lead to less balanced
// networks and I'm not sure what all the consequences of that might be.
func (g *Gossip) recomputeMaxPeersLocked() {
	maxPeers := maxPeers(len(g.nodeDescs))
	g.mu.incoming.setMaxSize(maxPeers)
	g.outgoing.setMaxSize(maxPeers)
}

// getNodeDescriptorLocked looks up the descriptor of the node by ID. The mutex
// is assumed held by the caller. This method is called externally via
// GetNodeDescriptor and internally by getNodeIDAddressLocked.
func (g *Gossip) getNodeDescriptorLocked(nodeID roachpb.NodeID) (*roachpb.NodeDescriptor, error) {
	if desc, ok := g.nodeDescs[nodeID]; ok {
		if desc.Address.IsEmpty() {
			log.Fatalf(g.AnnotateCtx(context.Background()), "n%d has an empty address", nodeID)
		}
		return desc, nil
	}

	// Fall back to retrieving the node info and unmarshalling the node
	// descriptor. This path occurs in tests which add a node descriptor to
	// gossip and then immediately try to retrieve it.
	nodeIDKey := MakeNodeIDKey(nodeID)

	// We can't use GetInfoProto here because that method grabs the lock.
	if i := g.mu.is.getInfo(nodeIDKey); i != nil {
		if err := i.Value.Verify([]byte(nodeIDKey)); err != nil {
			return nil, err
		}
		nodeDescriptor := &roachpb.NodeDescriptor{}
		if err := i.Value.GetProto(nodeDescriptor); err != nil {
			return nil, err
		}
		// Don't return node descriptors that are empty, because that's meant to
		// indicate that the node has been removed from the cluster.
		if nodeDescriptor.NodeID == 0 || nodeDescriptor.Address.IsEmpty() {
			return nil, errors.Errorf("n%d has been removed from the cluster", nodeID)
		}

		return nodeDescriptor, nil
	}

	return nil, errors.Errorf("unable to look up descriptor for n%d", nodeID)
}

// getNodeIDAddressLocked looks up the address of the node by ID. The mutex is
// assumed held by the caller. This method is called externally via
// GetNodeIDAddress or internally when looking up a "distant" node address to
// connect directly to.
func (g *Gossip) getNodeIDAddressLocked(nodeID roachpb.NodeID) (*util.UnresolvedAddr, error) {
	nd, err := g.getNodeDescriptorLocked(nodeID)
	if err != nil {
		return nil, err
	}
	for i := range nd.LocalityAddress {
		locality := &nd.LocalityAddress[i]
		if _, ok := g.localityTierMap[locality.LocalityTier.String()]; ok {
			return &locality.Address, nil
		}
	}
	return &nd.Address, nil
}

// getNodeIDSQLAddressLocked looks up the SQL address of the node by ID. The
// mutex is assumed held by the caller. This method is called externally via
// GetNodeIDSQLAddress.
func (g *Gossip) getNodeIDSQLAddressLocked(nodeID roachpb.NodeID) (*util.UnresolvedAddr, error) {
	nd, err := g.getNodeDescriptorLocked(nodeID)
	if err != nil {
		return nil, err
	}
	return &nd.SQLAddress, nil
}

// AddInfo adds or updates an info object. Returns an error if info
// couldn't be added.
func (g *Gossip) AddInfo(key string, val []byte, ttl time.Duration) error {
	g.mu.Lock()
	defer g.mu.Unlock()
	return g.addInfoLocked(key, val, ttl)
}

// addInfoLocked adds or updates an info object. The mutex is assumed held by
// the caller. Returns an error if info couldn't be added.
func (g *Gossip) addInfoLocked(key string, val []byte, ttl time.Duration) error {
	err := g.mu.is.addInfo(key, g.mu.is.newInfo(val, ttl))
	if err == nil {
		g.signalConnectedLocked()
	}
	return err
}

// AddInfoProto adds or updates an info object. Returns an error if info
// couldn't be added.
func (g *Gossip) AddInfoProto(key string, msg protoutil.Message, ttl time.Duration) error {
	bytes, err := protoutil.Marshal(msg)
	if err != nil {
		return err
	}
	return g.AddInfo(key, bytes, ttl)
}

// AddClusterID is a convenience method for gossiping the cluster ID. There's
// no TTL; the record lives forever.
func (g *Gossip) AddClusterID(val uuid.UUID) error {
	return g.AddInfo(KeyClusterID, val.GetBytes(), 0 /* ttl */)
}

// GetClusterID returns the cluster ID if it has been gossiped. If it hasn't
// (i.e. this gossip instance is not yet "connected"), an error is returned.
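//
// A typical caller waits for connectivity before asking (illustrative):
//
//	<-g.Connected
//	clusterID, err := g.GetClusterID()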
func (g *Gossip) GetClusterID() (uuid.UUID, error) {
	uuidBytes, err := g.GetInfo(KeyClusterID)
	if err != nil {
		return uuid.Nil, errors.Wrap(err, "unable to ascertain cluster ID from gossip network")
	}
	clusterID, err := uuid.FromBytes(uuidBytes)
	if err != nil {
		return uuid.Nil, errors.Wrap(err, "unable to parse cluster ID from gossip network")
	}
	return clusterID, nil
}

// GetInfo returns an info value by key or a KeyNotPresentError if the
// specified key does not exist or has expired.
func (g *Gossip) GetInfo(key string) ([]byte, error) {
	g.mu.RLock()
	i := g.mu.is.getInfo(key)
	g.mu.RUnlock()

	if i != nil {
		if err := i.Value.Verify([]byte(key)); err != nil {
			return nil, err
		}
		return i.Value.GetBytes()
	}
	return nil, NewKeyNotPresentError(key)
}

// GetInfoProto returns an info value by key or a KeyNotPresentError if the
// specified key does not exist or has expired.
func (g *Gossip) GetInfoProto(key string, msg protoutil.Message) error {
	bytes, err := g.GetInfo(key)
	if err != nil {
		return err
	}
	return protoutil.Unmarshal(bytes, msg)
}

// InfoOriginatedHere returns true iff the latest info for the provided key
// originated on this node. This is useful for ensuring that the system config
// is regossiped as soon as possible when its lease changes hands.
func (g *Gossip) InfoOriginatedHere(key string) bool {
	g.mu.RLock()
	info := g.mu.is.getInfo(key)
	g.mu.RUnlock()
	return info != nil && info.NodeID == g.NodeID.Get()
}

// GetInfoStatus returns a copy of the contents of the infostore.
func (g *Gossip) GetInfoStatus() InfoStatus {
	clientStatus := g.clientStatus()
	serverStatus := g.server.status()
	connectivity := g.Connectivity()

	g.mu.RLock()
	defer g.mu.RUnlock()
	is := InfoStatus{
		Infos:        make(map[string]Info),
		Client:       clientStatus,
		Server:       serverStatus,
		Connectivity: connectivity,
	}
	for k, v := range g.mu.is.Infos {
		is.Infos[k] = *protoutil.Clone(v).(*Info)
	}
	return is
}

// IterateInfos visits all infos matching the given prefix.
func (g *Gossip) IterateInfos(prefix string, visit func(k string, info Info) error) error {
	g.mu.RLock()
	defer g.mu.RUnlock()
	for k, v := range g.mu.is.Infos {
		if strings.HasPrefix(k, prefix+separator) {
			if err := visit(k, *(protoutil.Clone(v).(*Info))); err != nil {
				return err
			}
		}
	}
	return nil
}

// Callback is a callback method to be invoked on gossip update
// of info denoted by key.
type Callback func(string, roachpb.Value)

// CallbackOption is a marker interface that callback options must implement.
type CallbackOption interface {
	apply(cb *callback)
}

type redundantCallbacks struct {
}

func (redundantCallbacks) apply(cb *callback) {
	cb.redundant = true
}

// Redundant is a callback option that specifies that the callback should be
// invoked even if the gossip value has not changed.
var Redundant redundantCallbacks

// RegisterCallback registers a callback for a key pattern to be
// invoked whenever new info for a gossip key matching pattern is
// received. The callback method is invoked with the info key which
// matched pattern. Returns a function to unregister the callback.
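//
// For example, to watch node descriptor updates (an illustrative sketch):
//
//	unregister := g.RegisterCallback(
//		gossip.MakePrefixPattern(gossip.KeyNodeIDPrefix),
//		func(key string, content roachpb.Value) {
//			// React to the updated descriptor.
//		},
//	)
//	defer unregister()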
func (g *Gossip) RegisterCallback(pattern string, method Callback, opts ...CallbackOption) func() {
	if pattern == KeySystemConfig {
		ctx := g.AnnotateCtx(context.TODO())
		log.Warningf(
			ctx,
			"raw gossip callback registered on %s, consider using RegisterSystemConfigChannel",
			KeySystemConfig,
		)
	}

	g.mu.Lock()
	unregister := g.mu.is.registerCallback(pattern, method, opts...)
	g.mu.Unlock()
	return func() {
		g.mu.Lock()
		unregister()
		g.mu.Unlock()
	}
}

// GetSystemConfig returns the local unmarshaled version of the system config.
// Returns nil if the system config hasn't been set yet.
func (g *Gossip) GetSystemConfig() *config.SystemConfig {
	g.systemConfigMu.RLock()
	defer g.systemConfigMu.RUnlock()
	return g.systemConfig
}

// RegisterSystemConfigChannel registers a channel to signify updates for the
// system config. It is notified after registration (if a system config is
// already set), and whenever a new system config is successfully unmarshaled.
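//
// Typical usage is a select loop (an illustrative sketch; stopper stands in
// for the caller's *stop.Stopper):
//
//	ch := g.RegisterSystemConfigChannel()
//	for {
//		select {
//		case <-ch:
//			cfg := g.GetSystemConfig() // non-nil once a config has arrived
//			_ = cfg                    // react to the new config
//		case <-stopper.ShouldStop():
//			return
//		}
//	}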
func (g *Gossip) RegisterSystemConfigChannel() <-chan struct{} {
	g.systemConfigMu.Lock()
	defer g.systemConfigMu.Unlock()

	// Create channel that receives new system config notifications.
	// The channel has a size of 1 to prevent gossip from blocking on it.
	c := make(chan struct{}, 1)
	g.systemConfigChannels = append(g.systemConfigChannels, c)

	// Notify the channel right away if we have a config.
	if g.systemConfig != nil {
		c <- struct{}{}
	}

	return c
}

// updateSystemConfig is the raw gossip info callback. Unmarshal the
// system config, and if successful, send on each system config
// channel.
func (g *Gossip) updateSystemConfig(key string, content roachpb.Value) {
	ctx := g.AnnotateCtx(context.TODO())
	if key != KeySystemConfig {
		log.Fatalf(ctx, "wrong key received on SystemConfig callback: %s", key)
	}
	cfg := config.NewSystemConfig(g.defaultZoneConfig)
	if err := content.GetProto(&cfg.SystemConfigEntries); err != nil {
		log.Errorf(ctx, "could not unmarshal system config on callback: %s", err)
		return
	}

	g.systemConfigMu.Lock()
	defer g.systemConfigMu.Unlock()
	g.systemConfig = cfg
	for _, c := range g.systemConfigChannels {
		select {
		case c <- struct{}{}:
		default:
		}
	}
}

// Incoming returns a slice of incoming gossip client connection
// node IDs.
func (g *Gossip) Incoming() []roachpb.NodeID {
	g.mu.RLock()
	defer g.mu.RUnlock()
	return g.mu.incoming.asSlice()
}

// Outgoing returns a slice of outgoing gossip client connection
// node IDs. Note that these outgoing client connections may not
// actually be legitimately connected. They may be in the process
// of trying, or may already have failed, but haven't yet been
// processed by the gossip instance.
func (g *Gossip) Outgoing() []roachpb.NodeID {
	g.mu.RLock()
	defer g.mu.RUnlock()
	return g.outgoing.asSlice()
}

// MaxHops returns the maximum number of hops to reach any other
// node in the system, according to the infos which have reached
// this node via the gossip network.
func (g *Gossip) MaxHops() uint32 {
	g.mu.Lock()
	defer g.mu.Unlock()
	_, maxHops := g.mu.is.mostDistant(func(_ roachpb.NodeID) bool { return false })
	return maxHops
}

// Start launches the gossip instance, which commences joining the
// gossip network using the supplied rpc server and previously known
// peer addresses in addition to any bootstrap addresses specified via
// --join and passed to this method via the resolvers parameter.
//
// The supplied advertised address is used to identify the gossip
// instance in the gossip network; it will be used by other instances
// to connect to this instance.
//
// This method starts the bootstrap loop, gossip server, and client
// management in separate goroutines and returns.
func (g *Gossip) Start(advertAddr net.Addr, resolvers []resolver.Resolver) {
	g.AssertNotStarted(context.Background())
	g.started = true
	g.setResolvers(resolvers)
	g.server.start(advertAddr) // serve gossip protocol
	g.bootstrap()              // bootstrap gossip client
	g.manage()                 // manage gossip clients
}

// hasIncomingLocked returns whether the server has an incoming gossip
// client matching the provided node ID. Mutex should be held by
// caller.
func (g *Gossip) hasIncomingLocked(nodeID roachpb.NodeID) bool {
	return g.mu.incoming.hasNode(nodeID)
}

// hasOutgoingLocked returns whether the server has an outgoing gossip
// client matching the provided node ID. Mutex should be held by
// caller.
func (g *Gossip) hasOutgoingLocked(nodeID roachpb.NodeID) bool {
	// We have to use findClient and compare node addresses rather than using the
	// outgoing nodeSet due to the way that outgoing clients' node IDs are only
	// resolved once the connection has been established (rather than as soon as
	// we've created it).
	nodeAddr, err := g.getNodeIDAddressLocked(nodeID)
	if err != nil {
		// If we don't have the address, fall back to using the outgoing nodeSet
		// since at least it's better than nothing.
		ctx := g.AnnotateCtx(context.TODO())
		log.Errorf(ctx, "unable to get address for n%d: %s", nodeID, err)
		return g.outgoing.hasNode(nodeID)
	}
	c := g.findClient(func(c *client) bool {
		return c.addr.String() == nodeAddr.String()
	})
	return c != nil
}

// getNextBootstrapAddressLocked returns the next available bootstrap
// address by consulting the first non-exhausted resolver from the
// slice supplied to the constructor or set using setResolvers().
// The lock is assumed held.
func (g *Gossip) getNextBootstrapAddressLocked() net.Addr {
	// Run through resolvers round robin starting at last resolved index.
	for i := 0; i < len(g.resolvers); i++ {
		g.resolverIdx++
		g.resolverIdx %= len(g.resolvers)
		defer func(idx int) { g.resolversTried[idx] = struct{}{} }(g.resolverIdx)
		resolver := g.resolvers[g.resolverIdx]
		if addr, err := resolver.GetAddress(); err != nil {
			if _, ok := g.resolversTried[g.resolverIdx]; !ok {
				ctx := g.AnnotateCtx(context.TODO())
				log.Warningf(ctx, "invalid bootstrap address: %+v, %v", resolver, err)
			}
			continue
		} else {
			addrStr := addr.String()
			if _, addrActive := g.bootstrapping[addrStr]; !addrActive {
				g.bootstrapping[addrStr] = struct{}{}
				return addr
			}
		}
	}

	return nil
}

// bootstrap connects the node to the gossip network. Bootstrapping
// commences in the event there are no connected clients or the
// sentinel gossip info is not available. After a successful bootstrap
// connection, this method will block on the stalled condvar, which
// receives notifications that gossip network connectivity has been
// lost and requires re-bootstrapping.
func (g *Gossip) bootstrap() {
	ctx := g.AnnotateCtx(context.Background())
	g.server.stopper.RunWorker(ctx, func(ctx context.Context) {
		ctx = logtags.AddTag(ctx, "bootstrap", nil)
		var bootstrapTimer timeutil.Timer
		defer bootstrapTimer.Stop()
		for {
			if g.server.stopper.RunTask(ctx, "gossip.Gossip: bootstrap ", func(ctx context.Context) {
				g.mu.Lock()
				defer g.mu.Unlock()
				haveClients := g.outgoing.len() > 0
				haveSentinel := g.mu.is.getInfo(KeySentinel) != nil
				log.Eventf(ctx, "have clients: %t, have sentinel: %t", haveClients, haveSentinel)
				if !haveClients || !haveSentinel {
					// Try to get another bootstrap address from the resolvers.
					if addr := g.getNextBootstrapAddressLocked(); addr != nil {
						g.startClientLocked(addr)
					} else {
						bootstrapAddrs := make([]string, 0, len(g.bootstrapping))
						for addr := range g.bootstrapping {
							bootstrapAddrs = append(bootstrapAddrs, addr)
						}
						log.Eventf(ctx, "no next bootstrap address; currently bootstrapping: %v", bootstrapAddrs)
						// We couldn't start a client, signal that we're stalled so that
						// we'll retry.
						g.maybeSignalStatusChangeLocked()
					}
				}
			}) != nil {
				return
			}

			// Pause an interval before next possible bootstrap.
			bootstrapTimer.Reset(g.bootstrapInterval)
			log.Eventf(ctx, "sleeping %s until bootstrap", g.bootstrapInterval)
			select {
			case <-bootstrapTimer.C:
				bootstrapTimer.Read = true
				// continue
			case <-g.server.stopper.ShouldStop():
				return
			}
			log.Eventf(ctx, "idling until bootstrap required")
			// Block until we need bootstrapping again.
			select {
			case <-g.stalledCh:
				log.Eventf(ctx, "detected stall; commencing bootstrap")
				// continue
			case <-g.server.stopper.ShouldStop():
				return
			}
		}
	})
}

// manage manages outgoing clients. Periodically, the infostore is
// scanned for infos with hop count exceeding the maxHops
// threshold. If the number of outgoing clients doesn't exceed
// maxPeers(), a new gossip client is connected to a randomly selected
// peer beyond the maxHops threshold. Otherwise, the least useful peer
// node is cut off to make room for a replacement. Disconnected
// clients are processed via the disconnected channel and taken out of
// the outgoing address set. If there are no longer any outgoing
// connections or the sentinel gossip is unavailable, the bootstrapper
// is notified via the stalled condition variable.
func (g *Gossip) manage() {
	ctx := g.AnnotateCtx(context.Background())
	g.server.stopper.RunWorker(ctx, func(ctx context.Context) {
		clientsTimer := timeutil.NewTimer()
		cullTimer := timeutil.NewTimer()
		stallTimer := timeutil.NewTimer()
		defer clientsTimer.Stop()
		defer cullTimer.Stop()
		defer stallTimer.Stop()

		clientsTimer.Reset(defaultClientsInterval)
		cullTimer.Reset(jitteredInterval(g.cullInterval))
		stallTimer.Reset(jitteredInterval(g.stallInterval))
		for {
			select {
			case <-g.server.stopper.ShouldStop():
				return
			case c := <-g.disconnected:
				g.doDisconnected(c)
			case <-g.tighten:
				g.tightenNetwork(ctx)
			case <-clientsTimer.C:
				clientsTimer.Read = true
				g.updateClients()
				clientsTimer.Reset(defaultClientsInterval)
			case <-cullTimer.C:
				cullTimer.Read = true
				cullTimer.Reset(jitteredInterval(g.cullInterval))
				func() {
					g.mu.Lock()
					if !g.outgoing.hasSpace() {
						leastUsefulID := g.mu.is.leastUseful(g.outgoing)

						if c := g.findClient(func(c *client) bool {
							return c.peerID == leastUsefulID
						}); c != nil {
							if log.V(1) {
								log.Infof(ctx, "closing least useful client %+v to tighten network graph", c)
							}
							log.Eventf(ctx, "culling %s", c.addr)
							c.close()

							// After releasing the lock, block until the client disconnects.
							defer func() {
								g.doDisconnected(<-g.disconnected)
							}()
						} else {
							if log.V(1) {
								g.clientsMu.Lock()
								log.Infof(ctx, "couldn't find least useful client among %+v", g.clientsMu.clients)
								g.clientsMu.Unlock()
							}
						}
					}
					g.mu.Unlock()
				}()
			case <-stallTimer.C:
				stallTimer.Read = true
				stallTimer.Reset(jitteredInterval(g.stallInterval))

				g.mu.Lock()
				g.maybeSignalStatusChangeLocked()
				g.mu.Unlock()
			}
		}
	})
}

// jitteredInterval returns a randomly jittered (+/-25%) duration
// from the given interval.
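//
// For example, jitteredInterval(60*time.Second) returns a duration drawn
// uniformly from the interval [45s, 75s).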
1472 func (g *Gossip) tightenNetwork(ctx context.Context) { 1473 g.mu.Lock() 1474 defer g.mu.Unlock() 1475 if g.outgoing.hasSpace() { 1476 distantNodeID, distantHops := g.mu.is.mostDistant(g.hasOutgoingLocked) 1477 log.VEventf(ctx, 2, "distantHops: %d from %d", distantHops, distantNodeID) 1478 if distantHops <= maxHops { 1479 return 1480 } 1481 if nodeAddr, err := g.getNodeIDAddressLocked(distantNodeID); err != nil { 1482 log.Errorf(ctx, "unable to get address for n%d: %s", distantNodeID, err) 1483 } else { 1484 log.Infof(ctx, "starting client to n%d (%d > %d) to tighten network graph", 1485 distantNodeID, distantHops, maxHops) 1486 log.Eventf(ctx, "tightening network with new client to %s", nodeAddr) 1487 g.startClientLocked(nodeAddr) 1488 } 1489 } 1490 } 1491 1492 func (g *Gossip) doDisconnected(c *client) { 1493 defer g.updateClients() 1494 1495 g.mu.Lock() 1496 defer g.mu.Unlock() 1497 g.removeClientLocked(c) 1498 1499 // If the client was disconnected with a forwarding address, connect now. 1500 if c.forwardAddr != nil { 1501 g.startClientLocked(c.forwardAddr) 1502 } 1503 g.maybeSignalStatusChangeLocked() 1504 } 1505 1506 // maybeSignalStatusChangeLocked checks whether gossip should transition its 1507 // internal state from connected to stalled or vice versa. 1508 func (g *Gossip) maybeSignalStatusChangeLocked() { 1509 ctx := g.AnnotateCtx(context.TODO()) 1510 orphaned := g.outgoing.len()+g.mu.incoming.len() == 0 1511 multiNode := len(g.bootstrapInfo.Addresses) > 0 1512 // We're stalled if we don't have the sentinel key, or if we're a multi node 1513 // cluster and have no gossip connections. 1514 stalled := (orphaned && multiNode) || g.mu.is.getInfo(KeySentinel) == nil 1515 if stalled { 1516 // We employ the stalled boolean to avoid filling logs with warnings. 1517 if !g.stalled { 1518 log.Eventf(ctx, "now stalled") 1519 if orphaned { 1520 if len(g.resolvers) == 0 { 1521 if log.V(1) { 1522 log.Warningf(ctx, "no resolvers found; use --join to specify a connected node") 1523 } 1524 } else { 1525 log.Warningf(ctx, "no incoming or outgoing connections") 1526 } 1527 } else if len(g.resolversTried) == len(g.resolvers) { 1528 log.Warningf(ctx, "first range unavailable; resolvers exhausted") 1529 } else { 1530 log.Warningf(ctx, "first range unavailable; trying remaining resolvers") 1531 } 1532 } 1533 if len(g.resolvers) > 0 { 1534 g.signalStalledLocked() 1535 } 1536 } else { 1537 if g.stalled { 1538 log.Eventf(ctx, "connected") 1539 log.Infof(ctx, "node has connected to cluster via gossip") 1540 g.signalConnectedLocked() 1541 } 1542 g.maybeCleanupBootstrapAddressesLocked() 1543 } 1544 g.stalled = stalled 1545 } 1546 1547 func (g *Gossip) signalStalledLocked() { 1548 select { 1549 case g.stalledCh <- struct{}{}: 1550 default: 1551 } 1552 } 1553 1554 // signalConnectedLocked checks whether this gossip instance is connected to 1555 // enough of the gossip network that it has received the cluster ID gossip 1556 // info. Once connected, the "Connected" channel is closed to signal to any 1557 // waiters that the gossip instance is ready. The gossip mutex should be held 1558 // by caller. 1559 // 1560 // TODO(tschottdorf): this is called from various locations which seem ad-hoc 1561 // (with the exception of the call bootstrap loop) yet necessary. Consolidate 1562 // and add commentary at each callsite. 1563 func (g *Gossip) signalConnectedLocked() { 1564 // Check if we have the cluster ID gossip to start. 1565 // If so, then mark ourselves as trivially connected to the gossip network. 

// signalConnectedLocked checks whether this gossip instance is connected to
// enough of the gossip network that it has received the cluster ID gossip
// info. Once connected, the "Connected" channel is closed to signal to any
// waiters that the gossip instance is ready. The gossip mutex should be held
// by the caller.
//
// TODO(tschottdorf): this is called from various locations which seem ad-hoc
// (with the exception of the call from the bootstrap loop) yet necessary.
// Consolidate and add commentary at each callsite.
func (g *Gossip) signalConnectedLocked() {
	// Check if we have the cluster ID gossip to start.
	// If so, then mark ourselves as trivially connected to the gossip network.
	if !g.hasConnected && g.mu.is.getInfo(KeyClusterID) != nil {
		g.hasConnected = true
		close(g.Connected)
	}
}

// startClientLocked launches a new client connected to the remote address.
// The client is added to the outgoing address set and launched in
// a goroutine.
func (g *Gossip) startClientLocked(addr net.Addr) {
	g.clientsMu.Lock()
	defer g.clientsMu.Unlock()
	breaker, ok := g.clientsMu.breakers[addr.String()]
	if !ok {
		name := fmt.Sprintf("gossip %v->%v", g.rpcContext.Addr, addr)
		breaker = g.rpcContext.NewBreaker(name)
		g.clientsMu.breakers[addr.String()] = breaker
	}
	ctx := g.AnnotateCtx(context.TODO())
	log.Eventf(ctx, "starting new client to %s", addr)
	c := newClient(g.server.AmbientContext, addr, g.serverMetrics)
	g.clientsMu.clients = append(g.clientsMu.clients, c)
	c.startLocked(g, g.disconnected, g.rpcContext, g.server.stopper, breaker)
}

// removeClientLocked removes the specified client. Called when a client
// disconnects.
func (g *Gossip) removeClientLocked(target *client) {
	g.clientsMu.Lock()
	defer g.clientsMu.Unlock()
	for i, candidate := range g.clientsMu.clients {
		if candidate == target {
			ctx := g.AnnotateCtx(context.TODO())
			log.Eventf(ctx, "client %s disconnected", candidate.addr)
			g.clientsMu.clients = append(g.clientsMu.clients[:i], g.clientsMu.clients[i+1:]...)
			delete(g.bootstrapping, candidate.addr.String())
			g.outgoing.removeNode(candidate.peerID)
			break
		}
	}
}

func (g *Gossip) findClient(match func(*client) bool) *client {
	g.clientsMu.Lock()
	defer g.clientsMu.Unlock()
	for _, c := range g.clientsMu.clients {
		if match(c) {
			return c
		}
	}
	return nil
}

// MakeExposedGossip initializes a DeprecatedGossip instance which exposes a
// wrapped Gossip instance via Optional(). This is used on SQL servers running
// inside of a KV server (i.e. single-tenant deployments).
//
// Use of Gossip from within the SQL layer is **deprecated**. Please do not
// introduce new uses of it.
//
// See TenantSQLDeprecatedWrapper for details.
func MakeExposedGossip(g *Gossip) DeprecatedGossip {
	const exposed = true
	return DeprecatedGossip{
		w: errorutil.MakeTenantSQLDeprecatedWrapper(g, exposed),
	}
}

// MakeUnexposedGossip initializes a DeprecatedGossip instance for which
// Optional() does not return the wrapped Gossip instance. This is used on
// SQL servers not running as part of a KV server, i.e. with multi-tenancy.
//
// Use of Gossip from within the SQL layer is **deprecated**. Please do not
// introduce new uses of it.
//
// See TenantSQLDeprecatedWrapper for details.
//
// TODO(tbg): once we can start a SQL tenant without gossip, remove this method
// and rename DeprecatedGossip to OptionalGossip.
func MakeUnexposedGossip(g *Gossip) DeprecatedGossip {
	const exposed = false
	return DeprecatedGossip{
		w: errorutil.MakeTenantSQLDeprecatedWrapper(g, exposed),
	}
}
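
// A hedged sketch of how the two constructors are chosen (the caller below
// is hypothetical; real call sites live in the server packages): a SQL
// server embedded in a KV node exposes gossip, while a standalone SQL tenant
// wraps it unexposed so that Optional() reports its absence.
func exampleWrapGossip(g *Gossip, insideKVServer bool) DeprecatedGossip {
	if insideKVServer {
		return MakeExposedGossip(g) // Optional() will return g
	}
	return MakeUnexposedGossip(g) // Optional() will return (nil, false)
}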

// DeprecatedGossip is a Gossip instance in a SQL tenant server.
//
// Use of Gossip from within the SQL layer is **deprecated**. Please do not
// introduce new uses of it.
//
// See TenantSQLDeprecatedWrapper for details.
type DeprecatedGossip struct {
	w errorutil.TenantSQLDeprecatedWrapper
}

// Start calls .Start() on the underlying Gossip instance, which is assumed to
// be non-nil.
func (dg DeprecatedGossip) Start(advertAddr net.Addr, resolvers []resolver.Resolver) {
	dg.w.Deprecated(0).(*Gossip).Start(advertAddr, resolvers)
}

// deprecated trades a GitHub issue tracking the removal of the call for the
// wrapped Gossip instance.
func (dg DeprecatedGossip) deprecated(issueNo int) *Gossip {
	// NB: some tests use a nil Gossip.
	g, _ := dg.w.Deprecated(issueNo).(*Gossip)
	return g
}

// DeprecatedSystemConfig calls GetSystemConfig on the wrapped Gossip instance.
//
// Use of Gossip from within the SQL layer is **deprecated**. Please do not
// introduce new uses of it.
func (dg DeprecatedGossip) DeprecatedSystemConfig(issueNo int) *config.SystemConfig {
	g := dg.deprecated(issueNo)
	if g == nil {
		return nil // a few unit tests
	}
	return g.GetSystemConfig()
}

// DeprecatedOracleGossip trims down *gossip.Gossip for use in the Oracle.
//
// NB: we're trying to get rid of this dep altogether, see:
// https://github.com/cockroachdb/cockroach/issues/48432
type DeprecatedOracleGossip interface {
	// GetNodeDescriptor is used by oracles to order replicas by distance
	// from the current locality.
	GetNodeDescriptor(roachpb.NodeID) (*roachpb.NodeDescriptor, error)
	GetNodeIDForStoreID(roachpb.StoreID) (roachpb.NodeID, error)
}

// DeprecatedOracleGossip returns a DeprecatedOracleGossip (a Gossip for use
// with the replicaoracle package).
//
// Use of Gossip from within the SQL layer is **deprecated**. Please do not
// introduce new uses of it.
func (dg DeprecatedGossip) DeprecatedOracleGossip(issueNo int) DeprecatedOracleGossip {
	return dg.deprecated(issueNo)
}

// DeprecatedRegisterSystemConfigChannel calls RegisterSystemConfigChannel on
// the wrapped Gossip instance.
//
// Use of Gossip from within the SQL layer is **deprecated**. Please do not
// introduce new uses of it.
func (dg DeprecatedGossip) DeprecatedRegisterSystemConfigChannel(issueNo int) <-chan struct{} {
	g := dg.deprecated(issueNo)
	return g.RegisterSystemConfigChannel()
}

// OptionalErr returns the Gossip instance if the wrapper was set up to allow
// it. Otherwise, it returns an error referring to the optionally passed-in
// issues.
//
// Use of Gossip from within the SQL layer is **deprecated**. Please do not
// introduce new uses of it.
func (dg DeprecatedGossip) OptionalErr(issueNos ...int) (*Gossip, error) {
	v, err := dg.w.OptionalErr(issueNos...)
	if err != nil {
		return nil, err
	}
	// NB: some tests use a nil Gossip.
	g, _ := v.(*Gossip)
	return g, nil
}

// Optional is like OptionalErr, but returns false if Gossip is not exposed.
//
// Use of Gossip from within the SQL layer is **deprecated**. Please do not
// introduce new uses of it.
func (dg DeprecatedGossip) Optional(issueNos ...int) (*Gossip, bool) {
	v, ok := dg.w.Optional()
	if !ok {
		return nil, false
	}
	// NB: some tests use a nil Gossip.
	g, _ := v.(*Gossip)
	return g, true
}
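
// A hedged usage sketch (this caller is hypothetical and not part of the
// original file): SQL-level code must tolerate gossip being absent under
// multi-tenancy, and a nil Gossip in some tests, before dereferencing it.
func exampleUseOptionalGossip(dg DeprecatedGossip) *config.SystemConfig {
	g, ok := dg.Optional(48432 /* tracking issue */)
	if !ok || g == nil {
		return nil
	}
	return g.GetSystemConfig()
}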