github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/gossip/server.go

// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package gossip

import (
	"context"
	"math/rand"
	"net"
	"sync"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
)

type serverInfo struct {
	createdAt time.Time
	peerID    roachpb.NodeID
}

// server maintains an array of connected peers to which it gossips
// newly arrived information on a periodic basis.
type server struct {
	log.AmbientContext

	clusterID *base.ClusterIDContainer
	NodeID    *base.NodeIDContainer

	stopper *stop.Stopper

	mu struct {
		syncutil.RWMutex
		is       *infoStore                         // The backing infostore
		incoming nodeSet                            // Incoming client node IDs
		nodeMap  map[util.UnresolvedAddr]serverInfo // Incoming client's local address -> serverInfo
		// ready broadcasts a wakeup to waiting gossip requests. This is done
		// by closing the current ready channel and opening a new one. This
		// is required because condition variables are not composable.
		// There's an open proposal to add them:
		// https://github.com/golang/go/issues/16620
		// (A standalone sketch of this idiom follows this struct.)
		ready chan struct{}
	}
	tighten chan struct{} // Sent on when we may want to tighten the network

	nodeMetrics   Metrics
	serverMetrics Metrics

	simulationCycler *sync.Cond // Used when simulating the network to signal next cycle
}
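
// The close-and-reopen idiom behind mu.ready stands in for a broadcastable
// condition variable. Below is a minimal, self-contained sketch of the
// pattern; it is illustrative only, and the names broadcast and waiter are
// not part of this package:
//
//	var mu sync.Mutex
//	ready := make(chan struct{})
//
//	// broadcast wakes every waiter by closing the current channel and
//	// installing a fresh one for the next round.
//	broadcast := func() {
//		mu.Lock()
//		defer mu.Unlock()
//		close(ready)
//		ready = make(chan struct{})
//	}
//
//	// waiter snapshots the channel under the lock, then blocks outside it.
//	// A snapshot taken before a broadcast still unblocks, because closing
//	// a channel wakes all of its receivers.
//	waiter := func() {
//		mu.Lock()
//		ch := ready
//		mu.Unlock()
//		<-ch // returns once broadcast() closes ch
//	}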

// newServer creates and returns a server struct.
func newServer(
	ambient log.AmbientContext,
	clusterID *base.ClusterIDContainer,
	nodeID *base.NodeIDContainer,
	stopper *stop.Stopper,
	registry *metric.Registry,
) *server {
	s := &server{
		AmbientContext: ambient,
		clusterID:      clusterID,
		NodeID:         nodeID,
		stopper:        stopper,
		tighten:        make(chan struct{}, 1),
		nodeMetrics:    makeMetrics(),
		serverMetrics:  makeMetrics(),
	}

	s.mu.is = newInfoStore(s.AmbientContext, nodeID, util.UnresolvedAddr{}, stopper)
	s.mu.incoming = makeNodeSet(minPeers, metric.NewGauge(MetaConnectionsIncomingGauge))
	s.mu.nodeMap = make(map[util.UnresolvedAddr]serverInfo)
	s.mu.ready = make(chan struct{})

	registry.AddMetric(s.mu.incoming.gauge)
	registry.AddMetricStruct(s.nodeMetrics)

	return s
}
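
// A hedged construction sketch, using test-style plumbing. The zero-value ID
// containers and the empty AmbientContext are assumptions for illustration,
// not how production code wires this up:
//
//	stopper := stop.NewStopper()
//	registry := metric.NewRegistry()
//	s := newServer(
//		log.AmbientContext{},
//		&base.ClusterIDContainer{},
//		&base.NodeIDContainer{},
//		stopper,
//		registry,
//	)
//	// The Gossip instance would normally own this server and call
//	// s.start(addr) once the RPC listener address is known.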

// GetNodeMetrics returns this server's node metrics struct.
func (s *server) GetNodeMetrics() *Metrics {
	return &s.nodeMetrics
}

// Gossip receives gossiped information from a peer node.
// The received delta is combined with the infostore, and this
// node's own gossip is returned to the requesting client.
func (s *server) Gossip(stream Gossip_GossipServer) error {
	args, err := stream.Recv()
	if err != nil {
		return err
	}
	if (args.ClusterID != uuid.UUID{}) && args.ClusterID != s.clusterID.Get() {
		return errors.Errorf("gossip connection refused from different cluster %s", args.ClusterID)
	}

	ctx, cancel := context.WithCancel(s.AnnotateCtx(stream.Context()))
	defer cancel()
	// syncChan is a single-slot semaphore that serializes sends on the
	// stream between this handler and the receiver goroutine below.
	syncChan := make(chan struct{}, 1)
	send := func(reply *Response) error {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case syncChan <- struct{}{}:
			defer func() { <-syncChan }()

			bytesSent := int64(reply.Size())
			infoCount := int64(len(reply.Delta))
			s.nodeMetrics.BytesSent.Inc(bytesSent)
			s.nodeMetrics.InfosSent.Inc(infoCount)
			s.serverMetrics.BytesSent.Inc(bytesSent)
			s.serverMetrics.InfosSent.Inc(infoCount)

			return stream.Send(reply)
		}
	}

	// On return, wait out any in-flight send and then permanently occupy the
	// slot: gRPC forbids sending on the stream once the handler returns. The
	// deferred cancel above (which runs after this defer) unblocks any later
	// sender via the ctx.Done branch in send.
	defer func() { syncChan <- struct{}{} }()

	errCh := make(chan error, 1)

	// Starting workers in a task prevents data races during shutdown.
	if err := s.stopper.RunTask(ctx, "gossip.server: receiver", func(ctx context.Context) {
		s.stopper.RunWorker(ctx, func(ctx context.Context) {
			errCh <- s.gossipReceiver(ctx, &args, send, stream.Recv)
		})
	}); err != nil {
		return err
	}

	reply := new(Response)

	for init := true; ; init = false {
		s.mu.Lock()
		// Store the old ready so that if it gets replaced with a new one
		// (once the lock is released) and is closed, we still trigger the
		// select below.
		ready := s.mu.ready
		delta := s.mu.is.delta(args.HighWaterStamps)
		if init {
			s.mu.is.populateMostDistantMarkers(delta)
		}
		if args.HighWaterStamps == nil {
			args.HighWaterStamps = make(map[roachpb.NodeID]int64)
		}

		// Send a response if this is the first response on the connection, or if
		// there are deltas to send. The first condition is necessary to make sure
		// the remote node receives our high water stamps in a timely fashion.
		if infoCount := len(delta); init || infoCount > 0 {
			if log.V(1) {
				log.Infof(ctx, "returning %d info(s) to n%d: %s",
					infoCount, args.NodeID, extractKeys(delta))
			}
			// Ensure that the high water stamps for the remote client are kept up to
			// date so that we avoid resending the same gossip infos as infos are
			// updated locally.
			for _, i := range delta {
				ratchetHighWaterStamp(args.HighWaterStamps, i.NodeID, i.OrigStamp)
			}

			*reply = Response{
				NodeID:          s.NodeID.Get(),
				HighWaterStamps: s.mu.is.getHighWaterStamps(),
				Delta:           delta,
			}

			s.mu.Unlock()
			if err := send(reply); err != nil {
				return err
			}
			s.mu.Lock()
		}

		s.mu.Unlock()

		select {
		case <-s.stopper.ShouldQuiesce():
			return nil
		case err := <-errCh:
			return err
		case <-ready:
		}
	}
}
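
// A minimal, self-contained sketch of the single-slot-semaphore send pattern
// used above. It is illustrative only; trySend, shutdown, and the fmt output
// are assumptions standing in for send, the deferred teardown, and
// stream.Send respectively:
//
//	sem := make(chan struct{}, 1)
//	done := make(chan struct{})
//
//	trySend := func(msg string) error {
//		select {
//		case <-done: // stands in for ctx.Done()
//			return errors.New("closed")
//		case sem <- struct{}{}: // acquire the only slot
//			defer func() { <-sem }() // release it when done
//			fmt.Println("sending", msg) // stands in for stream.Send
//			return nil
//		}
//	}
//
//	// To shut down: fill the slot for good (waiting out an in-flight
//	// send), then signal done so any sender blocked on acquisition gives
//	// up instead of sending.
//	shutdown := func() {
//		sem <- struct{}{}
//		close(done)
//	}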

func (s *server) gossipReceiver(
	ctx context.Context,
	argsPtr **Request,
	senderFn func(*Response) error,
	receiverFn func() (*Request, error),
) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	reply := new(Response)

	// Track whether we've decided whether or not to admit the gossip connection
	// from this node. We only want to do this once, so that the duplicate
	// connection check based on node ID below runs at most once per connection.
	nodeIdentified := false

	// This loop receives gossip from the client. It does not attempt to send the
	// server's gossip to the client.
	for {
		args := *argsPtr
		if args.NodeID == 0 {
			// Let the connection through so that the client can get a node ID. Once it
			// has one, we'll run the logic below to decide whether to keep the
			// connection to it or to forward it elsewhere.
			log.Infof(ctx, "received initial cluster-verification connection from %s", args.Addr)
		} else if !nodeIdentified {
			nodeIdentified = true

			// Decide whether or not we can accept the incoming connection
			// as a permanent peer.
			if args.NodeID == s.NodeID.Get() {
				// This is an incoming loopback connection which should be closed by
				// the client.
				if log.V(2) {
					log.Infof(ctx, "ignoring gossip from n%d (loopback)", args.NodeID)
				}
			} else if _, ok := s.mu.nodeMap[args.Addr]; ok {
				// This is a duplicate incoming connection from the same node as an existing
				// connection. This can happen when bootstrap connections are initiated
				// through a load balancer.
				if log.V(2) {
					log.Infof(ctx, "duplicate connection received from n%d at %s", args.NodeID, args.Addr)
				}
				return errors.Errorf("duplicate connection from node at %s", args.Addr)
			} else if s.mu.incoming.hasSpace() {
				log.VEventf(ctx, 2, "adding n%d to incoming set", args.NodeID)

				s.mu.incoming.addNode(args.NodeID)
				s.mu.nodeMap[args.Addr] = serverInfo{
					peerID:    args.NodeID,
					createdAt: timeutil.Now(),
				}

				defer func(nodeID roachpb.NodeID, addr util.UnresolvedAddr) {
					log.VEventf(ctx, 2, "removing n%d from incoming set", args.NodeID)
					s.mu.incoming.removeNode(nodeID)
					delete(s.mu.nodeMap, addr)
				}(args.NodeID, args.Addr)
			} else {
				// If we don't have any space left, forward the client along to a peer.
				var alternateAddr util.UnresolvedAddr
				var alternateNodeID roachpb.NodeID
				// Choose a random peer for forwarding (see the sketch after
				// this function for the idiom).
				altIdx := rand.Intn(len(s.mu.nodeMap))
				for addr, info := range s.mu.nodeMap {
					if altIdx == 0 {
						alternateAddr = addr
						alternateNodeID = info.peerID
						break
					}
					altIdx--
				}

				s.nodeMetrics.ConnectionsRefused.Inc(1)
				log.Infof(ctx, "refusing gossip from n%d (max %d conns); forwarding to n%d (%s)",
					args.NodeID, s.mu.incoming.maxSize, alternateNodeID, alternateAddr)

				*reply = Response{
					NodeID:          s.NodeID.Get(),
					AlternateAddr:   &alternateAddr,
					AlternateNodeID: alternateNodeID,
				}

				s.mu.Unlock()
				err := senderFn(reply)
				s.mu.Lock()
				// Naively, we would return err here unconditionally, but that
				// introduces a race. Specifically, the client may observe the
				// end of the connection before it has a chance to receive and
				// process this message, which instructs it to hang up anyway.
				// Instead, we send the message and proceed to gossip
				// normally, depending on the client to end the connection.
				if err != nil {
					return err
				}
			}
		}

		bytesReceived := int64(args.Size())
		infosReceived := int64(len(args.Delta))
		s.nodeMetrics.BytesReceived.Inc(bytesReceived)
		s.nodeMetrics.InfosReceived.Inc(infosReceived)
		s.serverMetrics.BytesReceived.Inc(bytesReceived)
		s.serverMetrics.InfosReceived.Inc(infosReceived)

		freshCount, err := s.mu.is.combine(args.Delta, args.NodeID)
		if err != nil {
			log.Warningf(ctx, "failed to fully combine gossip delta from n%d: %s", args.NodeID, err)
		}
		if log.V(1) {
			log.Infof(ctx, "received %s from n%d (%d fresh)", extractKeys(args.Delta), args.NodeID, freshCount)
		}
		s.maybeTightenLocked()

		*reply = Response{
			NodeID:          s.NodeID.Get(),
			HighWaterStamps: s.mu.is.getHighWaterStamps(),
		}

		s.mu.Unlock()
		err = senderFn(reply)
		s.mu.Lock()
		if err != nil {
			return err
		}

		if cycler := s.simulationCycler; cycler != nil {
			cycler.Wait()
		}

		s.mu.Unlock()
		recvArgs, err := receiverFn()
		s.mu.Lock()
		if err != nil {
			return err
		}

		// *argsPtr holds the remote peer state; we need to update it whenever we
		// receive a new non-nil request. We avoid assigning a nil request to
		// *argsPtr because the gossip sender above has closed over *argsPtr and
		// would panic on a nil-pointer dereference if *argsPtr were set to nil.
		mergeHighWaterStamps(&recvArgs.HighWaterStamps, (*argsPtr).HighWaterStamps)
		*argsPtr = recvArgs
	}
}
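
// A hedged sketch of the random-map-entry idiom used for forwarding above
// (illustrative only; pickRandom is not part of this package). Go map
// iteration order is unspecified, but choosing a uniform index in
// [0, len(m)) and stopping at that many steps into the iteration still
// yields a uniformly random entry:
//
//	func pickRandom(m map[util.UnresolvedAddr]serverInfo) (util.UnresolvedAddr, roachpb.NodeID) {
//		i := rand.Intn(len(m)) // panics if m is empty; callers must check
//		for addr, info := range m {
//			if i == 0 {
//				return addr, info.peerID
//			}
//			i--
//		}
//		panic("unreachable")
//	}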

// maybeTightenLocked offers a non-blocking signal on s.tighten; the
// single-slot buffer coalesces repeated signals into one pending wakeup.
func (s *server) maybeTightenLocked() {
	select {
	case s.tighten <- struct{}{}:
	default:
	}
}
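
// A minimal sketch of this coalescing-signal pattern in isolation
// (illustrative; the names signal and worker are assumptions):
//
//	tighten := make(chan struct{}, 1)
//
//	// signal never blocks: if a wakeup is already pending, dropping the
//	// extra send is fine because the consumer will run regardless.
//	signal := func() {
//		select {
//		case tighten <- struct{}{}:
//		default:
//		}
//	}
//
//	// worker wakes once per batch of signals rather than once per signal.
//	worker := func() {
//		for range tighten {
//			// ... tighten the network ...
//		}
//	}
//	_, _ = signal, worker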

// start initializes the infostore with the rpc server address and
// then begins processing connecting clients in an infinite select
// loop via goroutine. Periodically, clients connected and awaiting
// the next round of gossip are awoken by closing and replacing the
// shared ready channel.
func (s *server) start(addr net.Addr) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.mu.is.NodeAddr = util.MakeUnresolvedAddr(addr.Network(), addr.String())

	broadcast := func() {
		// Close the old ready channel and open a new one. This broadcasts
		// to all receivers and sets up a fresh channel to replace the
		// closed one.
		s.mu.Lock()
		defer s.mu.Unlock()
		ready := make(chan struct{})
		close(s.mu.ready)
		s.mu.ready = ready
	}

	// We require redundant callbacks here as the broadcast callback is
	// propagating gossip infos to other nodes and needs to propagate the new
	// expiration info.
	unregister := s.mu.is.registerCallback(".*", func(_ string, _ roachpb.Value) {
		broadcast()
	}, Redundant)

	s.stopper.RunWorker(context.TODO(), func(context.Context) {
		<-s.stopper.ShouldQuiesce()

		s.mu.Lock()
		unregister()
		s.mu.Unlock()

		broadcast()
	})
}

func (s *server) status() ServerStatus {
	s.mu.RLock()
	defer s.mu.RUnlock()

	var status ServerStatus
	status.ConnStatus = make([]ConnStatus, 0, len(s.mu.nodeMap))
	status.MaxConns = int32(s.mu.incoming.maxSize)
	status.MetricSnap = s.serverMetrics.Snapshot()

	for addr, info := range s.mu.nodeMap {
		status.ConnStatus = append(status.ConnStatus, ConnStatus{
			NodeID:   info.peerID,
			Address:  addr.String(),
			AgeNanos: timeutil.Since(info.createdAt).Nanoseconds(),
		})
	}
	return status
}

// roundSecs rounds a non-negative duration to the nearest whole second.
func roundSecs(d time.Duration) time.Duration {
	return time.Duration(d.Seconds()+0.5) * time.Second
}
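
// Worked examples of the rounding above (the values are arithmetic, not
// taken from this package's tests): 1499ms gives 1.499+0.5 = 1.999, which
// the float-to-Duration conversion truncates to 1, yielding 1s; 1500ms
// gives 1.5+0.5 = 2.0, which converts exactly to 2, yielding 2s.
//
//	roundSecs(1499 * time.Millisecond) // 1s
//	roundSecs(1500 * time.Millisecond) // 2s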

// GetNodeAddr returns the node's address stored in the infostore.
func (s *server) GetNodeAddr() *util.UnresolvedAddr {
	s.mu.RLock()
	defer s.mu.RUnlock()
	return &s.mu.is.NodeAddr
}