github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/conn/node.go

     1  /*
     2   * Copyright 2017-2018 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package conn
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/binary"
    22  	"fmt"
    23  	"math/rand"
    24  	"strings"
    25  	"sync"
    26  	"sync/atomic"
    27  	"time"
    28  
    29  	"github.com/dgraph-io/badger/y"
    30  	"github.com/dgraph-io/dgo/protos/api"
    31  	"github.com/dgraph-io/dgraph/protos/pb"
    32  	"github.com/dgraph-io/dgraph/raftwal"
    33  	"github.com/dgraph-io/dgraph/x"
    34  	"github.com/golang/glog"
    35  	"github.com/pkg/errors"
    36  	"go.etcd.io/etcd/raft"
    37  	"go.etcd.io/etcd/raft/raftpb"
    38  	otrace "go.opencensus.io/trace"
    39  	"golang.org/x/net/context"
    40  )
    41  
    42  var (
    43  	// ErrNoNode is returned when no node has been set up.
    44  	ErrNoNode = errors.Errorf("No node has been set up yet")
    45  )
    46  
    47  // Node represents a node participating in the RAFT protocol.
    48  type Node struct {
    49  	x.SafeMutex
    50  
    51  	joinLock sync.Mutex
    52  
     53  	// Used to keep track of linearizable (lin) read requests.
    54  	requestCh chan linReadReq
    55  
    56  	// SafeMutex is for fields which can be changed after init.
    57  	_confState *raftpb.ConfState
    58  	_raft      raft.Node
    59  
    60  	// Fields which are never changed after init.
    61  	Cfg         *raft.Config
    62  	MyAddr      string
    63  	Id          uint64
    64  	peers       map[uint64]string
    65  	confChanges map[uint64]chan error
    66  	messages    chan sendmsg
    67  	RaftContext *pb.RaftContext
    68  	Store       *raftwal.DiskStorage
    69  	Rand        *rand.Rand
    70  
    71  	Proposals proposals
    72  	// applied is used to keep track of the applied RAFT proposals.
    73  	// The stages are proposed -> committed (accepted by cluster) ->
    74  	// applied (to PL) -> synced (to BadgerDB).
    75  	Applied y.WaterMark
    76  
    77  	heartbeatsOut int64
    78  	heartbeatsIn  int64
    79  }
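
// applyEntrySketch is a hypothetical helper (its name and shape are illustrative, not
// code from Dgraph's applier) showing how the Applied watermark above is meant to be
// driven: Begin when an entry is picked up from a raft Ready, Done once it has been
// applied, so that Applied.WaitForMark can unblock waiting readers.
func applyEntrySketch(n *Node, index uint64, apply func() error) error {
	n.Applied.Begin(index)
	// The watermark only advances once all lower indices are also marked done.
	defer n.Applied.Done(index)
	return apply()
}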
    80  
    81  // NewNode returns a new Node instance.
    82  func NewNode(rc *pb.RaftContext, store *raftwal.DiskStorage) *Node {
    83  	snap, err := store.Snapshot()
    84  	x.Check(err)
    85  
    86  	n := &Node{
    87  		Id:     rc.Id,
    88  		MyAddr: rc.Addr,
    89  		Store:  store,
    90  		Cfg: &raft.Config{
    91  			ID:                       rc.Id,
    92  			ElectionTick:             20, // 2s if we call Tick() every 100 ms.
    93  			HeartbeatTick:            1,  // 100ms if we call Tick() every 100 ms.
    94  			Storage:                  store,
    95  			MaxInflightMsgs:          256,
    96  			MaxSizePerMsg:            256 << 10, // 256 KB should allow more batching.
    97  			MaxCommittedSizePerReady: 64 << 20,  // Avoid loading entire Raft log into memory.
    98  			// We don't need lease based reads. They cause issues because they
    99  			// require CheckQuorum to be true, and that causes a lot of issues
   100  			// for us during cluster bootstrapping and later. A seemingly
   101  			// healthy cluster would just cause leader to step down due to
   102  			// "inactive" quorum, and then disallow anyone from becoming leader.
    103  			// So, let's stick to the default options. Let's achieve correctness
    104  			// first; performance can follow. Plus, for the Dgraph Alphas, we'll
    105  			// soon be relying only on timestamps for blocking reads and achieving
    106  			// linearizability, rather than checking quorums (Zero would still
    107  			// check quorums).
   108  			ReadOnlyOption: raft.ReadOnlySafe,
   109  			// When a disconnected node joins back, it forces a leader change,
   110  			// as it starts with a higher term, as described in Raft thesis (not
   111  			// the paper) in section 9.6. This setting can avoid that by only
    112  			// increasing the term if the node has a good chance of becoming
   113  			// the leader.
   114  			PreVote: true,
   115  
   116  			// We can explicitly set Applied to the first index in the Raft log,
    117  			// so it does not derive it separately, thus avoiding a crash when
    118  			// Applied is set below the snapshot index by Raft.
   119  			// In case this is a new Raft log, first would be 1, and therefore
   120  			// Applied would be zero, hence meeting the condition by the library
   121  			// that Applied should only be set during a restart.
   122  			//
   123  			// Update: Set the Applied to the latest snapshot, because it seems
   124  			// like somehow the first index can be out of sync with the latest
   125  			// snapshot.
   126  			Applied: snap.Metadata.Index,
   127  
   128  			Logger: &x.ToGlog{},
   129  		},
    130  		// processConfChange etc. are not throttled, so allow some extra delta so that
    131  		// we don't block Tick() when applyCh is full.
    132  		Applied:     y.WaterMark{Name: "Applied watermark"},
   133  		RaftContext: rc,
   134  		Rand:        rand.New(&lockedSource{src: rand.NewSource(time.Now().UnixNano())}),
   135  		confChanges: make(map[uint64]chan error),
   136  		messages:    make(chan sendmsg, 100),
   137  		peers:       make(map[uint64]string),
   138  		requestCh:   make(chan linReadReq, 100),
   139  	}
   140  	n.Applied.Init(nil)
   141  	// This should match up to the Applied index set above.
   142  	n.Applied.SetDoneUntil(n.Cfg.Applied)
   143  	glog.Infof("Setting raft.Config to: %+v\n", n.Cfg)
   144  	return n
   145  }
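
// runTicksSketch is an illustrative sketch (the function name and wiring are assumptions
// about this package's callers, not code from this repository) of the 100ms tick loop
// that the ElectionTick and HeartbeatTick comments in NewNode assume. In Dgraph, the
// real loop lives in the raft run loops that drive this Node.
func runTicksSketch(n *Node, closer *y.Closer) {
	defer closer.Done()
	ticker := time.NewTicker(100 * time.Millisecond)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			// Each Tick advances raft's logical clock: with HeartbeatTick=1 the leader
			// heartbeats every ~100ms, and with ElectionTick=20 followers start an
			// election after ~2s of silence.
			n.Raft().Tick()
		case <-closer.HasBeenClosed():
			return
		}
	}
}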
   146  
   147  // ReportRaftComms periodically prints the state of the node (heartbeats in and out).
   148  func (n *Node) ReportRaftComms() {
   149  	if !glog.V(3) {
   150  		return
   151  	}
   152  	ticker := time.NewTicker(time.Second)
   153  	defer ticker.Stop()
   154  
   155  	for range ticker.C {
   156  		out := atomic.SwapInt64(&n.heartbeatsOut, 0)
   157  		in := atomic.SwapInt64(&n.heartbeatsIn, 0)
   158  		glog.Infof("RaftComm: [%#x] Heartbeats out: %d, in: %d", n.Id, out, in)
   159  	}
   160  }
   161  
    162  // SetRaft sets the provided raft.Node on this node.
    163  // It asserts (and crashes) if the raft.Node has already been set.
   164  func (n *Node) SetRaft(r raft.Node) {
   165  	n.Lock()
   166  	defer n.Unlock()
   167  	x.AssertTrue(n._raft == nil)
   168  	n._raft = r
   169  }
   170  
    171  // Raft returns the raft.Node stored in the node.
   172  func (n *Node) Raft() raft.Node {
   173  	n.RLock()
   174  	defer n.RUnlock()
   175  	return n._raft
   176  }
   177  
    178  // SetConfState stores the latest ConfState generated by ApplyConfChange.
   179  func (n *Node) SetConfState(cs *raftpb.ConfState) {
   180  	glog.Infof("Setting conf state to %+v\n", cs)
   181  	n.Lock()
   182  	defer n.Unlock()
   183  	n._confState = cs
   184  }
   185  
   186  // DoneConfChange marks a configuration change as done and sends the given error to the
   187  // config channel.
   188  func (n *Node) DoneConfChange(id uint64, err error) {
   189  	n.Lock()
   190  	defer n.Unlock()
   191  	ch, has := n.confChanges[id]
   192  	if !has {
   193  		return
   194  	}
   195  	delete(n.confChanges, id)
   196  	ch <- err
   197  }
   198  
   199  func (n *Node) storeConfChange(che chan error) uint64 {
   200  	n.Lock()
   201  	defer n.Unlock()
   202  	id := rand.Uint64()
   203  	_, has := n.confChanges[id]
   204  	for has {
   205  		id = rand.Uint64()
   206  		_, has = n.confChanges[id]
   207  	}
   208  	n.confChanges[id] = che
   209  	return id
   210  }
   211  
    212  // ConfState returns the latest ConfState stored in the node.
   213  func (n *Node) ConfState() *raftpb.ConfState {
   214  	n.RLock()
   215  	defer n.RUnlock()
   216  	return n._confState
   217  }
   218  
   219  // Peer returns the address of the peer with the given id.
   220  func (n *Node) Peer(pid uint64) (string, bool) {
   221  	n.RLock()
   222  	defer n.RUnlock()
   223  	addr, ok := n.peers[pid]
   224  	return addr, ok
   225  }
   226  
   227  // SetPeer sets the address of the peer with the given id. The address must not be empty.
   228  func (n *Node) SetPeer(pid uint64, addr string) {
   229  	x.AssertTruef(addr != "", "SetPeer for peer %d has empty addr.", pid)
   230  	n.Lock()
   231  	defer n.Unlock()
   232  	n.peers[pid] = addr
   233  }
   234  
   235  // Send sends the given RAFT message from this node.
   236  func (n *Node) Send(msg raftpb.Message) {
   237  	x.AssertTruef(n.Id != msg.To, "Sending message to itself")
   238  	data, err := msg.Marshal()
   239  	x.Check(err)
   240  
   241  	if glog.V(2) {
   242  		switch msg.Type {
   243  		case raftpb.MsgHeartbeat, raftpb.MsgHeartbeatResp:
   244  			atomic.AddInt64(&n.heartbeatsOut, 1)
   245  		case raftpb.MsgReadIndex, raftpb.MsgReadIndexResp:
   246  		case raftpb.MsgApp, raftpb.MsgAppResp:
   247  		case raftpb.MsgProp:
   248  		default:
   249  			glog.Infof("RaftComm: [%#x] Sending message of type %s to %#x", msg.From, msg.Type, msg.To)
   250  		}
   251  	}
   252  	// As long as leadership is stable, any attempted Propose() calls should be reflected in the
   253  	// next raft.Ready.Messages. Leaders will send MsgApps to the followers; followers will send
   254  	// MsgProp to the leader. It is up to the transport layer to get those messages to their
   255  	// destination. If a MsgApp gets dropped by the transport layer, it will get retried by raft
   256  	// (i.e. it will appear in a future Ready.Messages), but MsgProp will only be sent once. During
   257  	// leadership transitions, proposals may get dropped even if the network is reliable.
   258  	//
   259  	// We can't do a select default here. The messages must be sent to the channel, otherwise we
   260  	// should block until the channel can accept these messages. BatchAndSendMessages would take
   261  	// care of dropping messages which can't be sent due to network issues to the corresponding
   262  	// node. But, we shouldn't take the liberty to do that here. It would take us more time to
   263  	// repropose these dropped messages anyway, than to block here a bit waiting for the messages
   264  	// channel to clear out.
   265  	n.messages <- sendmsg{to: msg.To, data: data}
   266  }
   267  
   268  // Snapshot returns the current snapshot.
   269  func (n *Node) Snapshot() (raftpb.Snapshot, error) {
   270  	if n == nil || n.Store == nil {
   271  		return raftpb.Snapshot{}, errors.New("Uninitialized node or raft store")
   272  	}
   273  	return n.Store.Snapshot()
   274  }
   275  
   276  // SaveToStorage saves the hard state, entries, and snapshot to persistent storage, in that order.
   277  func (n *Node) SaveToStorage(h raftpb.HardState, es []raftpb.Entry, s raftpb.Snapshot) {
   278  	for {
   279  		if err := n.Store.Save(h, es, s); err != nil {
   280  			glog.Errorf("While trying to save Raft update: %v. Retrying...", err)
   281  		} else {
   282  			return
   283  		}
   284  	}
   285  }
   286  
   287  // PastLife returns the index of the snapshot before the restart (if any) and whether there was
   288  // a previous state that should be recovered after a restart.
   289  func (n *Node) PastLife() (uint64, bool, error) {
   290  	var (
   291  		sp      raftpb.Snapshot
   292  		idx     uint64
   293  		restart bool
   294  		rerr    error
   295  	)
   296  	sp, rerr = n.Store.Snapshot()
   297  	if rerr != nil {
   298  		return 0, false, rerr
   299  	}
   300  	if !raft.IsEmptySnap(sp) {
   301  		glog.Infof("Found Snapshot.Metadata: %+v\n", sp.Metadata)
   302  		restart = true
   303  		idx = sp.Metadata.Index
   304  	}
   305  
   306  	var hd raftpb.HardState
   307  	hd, rerr = n.Store.HardState()
   308  	if rerr != nil {
   309  		return 0, false, rerr
   310  	}
   311  	if !raft.IsEmptyHardState(hd) {
   312  		glog.Infof("Found hardstate: %+v\n", hd)
   313  		restart = true
   314  	}
   315  
   316  	var num int
   317  	num, rerr = n.Store.NumEntries()
   318  	if rerr != nil {
   319  		return 0, false, rerr
   320  	}
   321  	glog.Infof("Group %d found %d entries\n", n.RaftContext.Group, num)
   322  	// We'll always have at least one entry.
   323  	if num > 1 {
   324  		restart = true
   325  	}
   326  	return idx, restart, nil
   327  }
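
// initFromPastLifeSketch is a hypothetical caller of PastLife (Dgraph's real
// initialization is more involved and also handles joining an existing cluster):
// restart raft from persisted state when PastLife reports one, otherwise start a
// fresh single-member group.
func initFromPastLifeSketch(n *Node) {
	_, restart, err := n.PastLife()
	x.Check(err)
	if restart {
		// A snapshot, hard state, or log entries were found: resume the previous life.
		n.SetRaft(raft.RestartNode(n.Cfg))
		return
	}
	// Brand new raft log: start with ourselves as the only peer.
	n.SetRaft(raft.StartNode(n.Cfg, []raft.Peer{{ID: n.Id}}))
}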
   328  
   329  const (
   330  	messageBatchSoftLimit = 10e6
   331  )
   332  
   333  type stream struct {
   334  	msgCh chan []byte
   335  	alive int32
   336  }
   337  
   338  // BatchAndSendMessages sends messages in batches.
   339  func (n *Node) BatchAndSendMessages() {
   340  	batches := make(map[uint64]*bytes.Buffer)
   341  	streams := make(map[uint64]*stream)
   342  
   343  	for {
   344  		totalSize := 0
   345  		sm := <-n.messages
    346  	slurpLoop:
   347  		for {
   348  			var buf *bytes.Buffer
   349  			if b, ok := batches[sm.to]; !ok {
   350  				buf = new(bytes.Buffer)
   351  				batches[sm.to] = buf
   352  			} else {
   353  				buf = b
   354  			}
   355  			totalSize += 4 + len(sm.data)
   356  			x.Check(binary.Write(buf, binary.LittleEndian, uint32(len(sm.data))))
   357  			x.Check2(buf.Write(sm.data))
   358  
   359  			if totalSize > messageBatchSoftLimit {
   360  				// We limit the batch size, but we aren't pushing back on
   361  				// n.messages, because the loop below spawns a goroutine
    362  				// to do its dirty work. This matters because (*Node).Send
    363  				// blocks if the channel is full.
   364  				break
   365  			}
   366  
   367  			select {
   368  			case sm = <-n.messages:
   369  			default:
    370  				break slurpLoop
   371  			}
   372  		}
   373  
   374  		for to, buf := range batches {
   375  			if buf.Len() == 0 {
   376  				continue
   377  			}
   378  			s, ok := streams[to]
   379  			if !ok || atomic.LoadInt32(&s.alive) <= 0 {
   380  				s = &stream{
   381  					msgCh: make(chan []byte, 100),
   382  					alive: 1,
   383  				}
   384  				go n.streamMessages(to, s)
   385  				streams[to] = s
   386  			}
   387  			data := make([]byte, buf.Len())
   388  			copy(data, buf.Bytes())
   389  			buf.Reset()
   390  
   391  			select {
   392  			case s.msgCh <- data:
   393  			default:
   394  			}
   395  		}
   396  	}
   397  }
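
// decodeBatchSketch illustrates the framing produced above: each message in a batch is a
// little-endian uint32 length followed by that many bytes of a marshalled raftpb.Message.
// The function itself is only a sketch; in Dgraph the unpacking happens on the receiving
// side of the RaftMessage stream, not in this file.
func decodeBatchSketch(data []byte) ([]raftpb.Message, error) {
	var msgs []raftpb.Message
	for len(data) > 0 {
		if len(data) < 4 {
			return nil, errors.Errorf("batch truncated: %d trailing bytes", len(data))
		}
		sz := int(binary.LittleEndian.Uint32(data[:4]))
		data = data[4:]
		if len(data) < sz {
			return nil, errors.Errorf("batch truncated: need %d bytes, have %d", sz, len(data))
		}
		var msg raftpb.Message
		if err := msg.Unmarshal(data[:sz]); err != nil {
			return nil, err
		}
		msgs = append(msgs, msg)
		data = data[sz:]
	}
	return msgs, nil
}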
   398  
   399  func (n *Node) streamMessages(to uint64, s *stream) {
   400  	defer atomic.StoreInt32(&s.alive, 0)
   401  
   402  	// Exit after this deadline. Let BatchAndSendMessages create another goroutine, if needed.
   403  	// Let's set the deadline to 10s because if we increase it, then it takes longer to recover from
   404  	// a partition and get a new leader.
   405  	deadline := time.Now().Add(10 * time.Second)
   406  	ticker := time.NewTicker(time.Second)
   407  	defer ticker.Stop()
   408  
   409  	var logged int
    410  	for range ticker.C { // Don't do this in a busy-wait loop; use a ticker.
   411  		if err := n.doSendMessage(to, s.msgCh); err != nil {
    412  			// Track logged so we print the error only a few times if we are not able to connect.
   413  			// Otherwise, the log is polluted with repeated errors.
   414  			if logged == 0 {
   415  				glog.Warningf("Unable to send message to peer: %#x. Error: %v", to, err)
   416  				logged++
   417  			}
   418  		}
   419  		if time.Now().After(deadline) {
   420  			return
   421  		}
   422  	}
   423  }
   424  
   425  func (n *Node) doSendMessage(to uint64, msgCh chan []byte) error {
   426  	addr, has := n.Peer(to)
   427  	if !has {
   428  		return errors.Errorf("Do not have address of peer %#x", to)
   429  	}
   430  	pool, err := GetPools().Get(addr)
   431  	if err != nil {
   432  		return err
   433  	}
   434  
   435  	c := pb.NewRaftClient(pool.Get())
   436  	ctx, span := otrace.StartSpan(context.Background(),
   437  		fmt.Sprintf("RaftMessage-%d-to-%d", n.Id, to))
   438  	defer span.End()
   439  
   440  	mc, err := c.RaftMessage(ctx)
   441  	if err != nil {
   442  		return err
   443  	}
   444  
   445  	var packets, lastPackets uint64
   446  	slurp := func(batch *pb.RaftBatch) {
   447  		for {
   448  			if len(batch.Payload.Data) > messageBatchSoftLimit {
   449  				return
   450  			}
   451  			select {
   452  			case data := <-msgCh:
   453  				batch.Payload.Data = append(batch.Payload.Data, data...)
   454  				packets++
   455  			default:
   456  				return
   457  			}
   458  		}
   459  	}
   460  
   461  	ctx = mc.Context()
   462  	ticker := time.NewTicker(3 * time.Minute)
   463  	defer ticker.Stop()
   464  
   465  	for {
   466  		select {
   467  		case data := <-msgCh:
   468  			batch := &pb.RaftBatch{
   469  				Context: n.RaftContext,
   470  				Payload: &api.Payload{Data: data},
   471  			}
   472  			packets++
   473  			slurp(batch) // Pick up more entries from msgCh, if present.
   474  			span.Annotatef(nil, "[Packets: %d] Sending data of length: %d.",
   475  				packets, len(batch.Payload.Data))
   476  			if err := mc.Send(batch); err != nil {
   477  				span.Annotatef(nil, "Error while mc.Send: %v", err)
   478  				switch {
   479  				case strings.Contains(err.Error(), "TransientFailure"):
   480  					glog.Warningf("Reporting node: %d addr: %s as unreachable.", to, pool.Addr)
   481  					n.Raft().ReportUnreachable(to)
   482  					pool.SetUnhealthy()
   483  				default:
   484  				}
    485  				// We don't need to do anything else if we receive an error while sending a
    486  				// message. RAFT will automatically retry.
   487  				return err
   488  			}
   489  		case <-ticker.C:
   490  			if lastPackets == packets {
   491  				span.Annotatef(nil,
   492  					"No activity for a while [Packets == %d]. Closing connection.", packets)
   493  				return mc.CloseSend()
   494  			}
   495  			lastPackets = packets
   496  		case <-ctx.Done():
   497  			return ctx.Err()
   498  		}
   499  	}
   500  }
   501  
    502  // Connect records the address of the peer with the given id and makes sure a connection
    503  // pool exists for it (possibly updating the stored address if it has changed). If pid
    504  // refers to this node itself, it does nothing.
   505  func (n *Node) Connect(pid uint64, addr string) {
   506  	if pid == n.Id {
   507  		return
   508  	}
   509  	if paddr, ok := n.Peer(pid); ok && paddr == addr {
   510  		// Already connected.
   511  		return
   512  	}
    513  	// Here's what we do. The peers map tracks peer node ids to addr values. The actual
    514  	// connection pool is managed by GetPools(); even if a pool can't be established right
    515  	// away, we still record the peer's address.
   516  	if addr == n.MyAddr {
   517  		// TODO: Note this fact in more general peer health info somehow.
   518  		glog.Infof("Peer %d claims same host as me\n", pid)
   519  		n.SetPeer(pid, addr)
   520  		return
   521  	}
   522  	GetPools().Connect(addr)
   523  	n.SetPeer(pid, addr)
   524  }
   525  
   526  // DeletePeer deletes the record of the peer with the given id.
   527  func (n *Node) DeletePeer(pid uint64) {
   528  	if pid == n.Id {
   529  		return
   530  	}
   531  	n.Lock()
   532  	defer n.Unlock()
   533  	delete(n.peers, pid)
   534  }
   535  
   536  var errInternalRetry = errors.New("Retry proposal again")
   537  
   538  func (n *Node) proposeConfChange(ctx context.Context, pb raftpb.ConfChange) error {
   539  	cctx, cancel := context.WithTimeout(ctx, 3*time.Second)
   540  	defer cancel()
   541  
   542  	ch := make(chan error, 1)
   543  	id := n.storeConfChange(ch)
   544  	// TODO: Delete id from the map.
   545  	pb.ID = id
   546  	if err := n.Raft().ProposeConfChange(cctx, pb); err != nil {
   547  		if cctx.Err() != nil {
   548  			return errInternalRetry
   549  		}
   550  		glog.Warningf("Error while proposing conf change: %v", err)
   551  		return err
   552  	}
   553  	select {
   554  	case err := <-ch:
   555  		return err
   556  	case <-ctx.Done():
   557  		return ctx.Err()
   558  	case <-cctx.Done():
   559  		return errInternalRetry
   560  	}
   561  }
   562  
   563  func (n *Node) addToCluster(ctx context.Context, pid uint64) error {
   564  	addr, ok := n.Peer(pid)
   565  	x.AssertTruef(ok, "Unable to find conn pool for peer: %#x", pid)
   566  	rc := &pb.RaftContext{
   567  		Addr:  addr,
   568  		Group: n.RaftContext.Group,
   569  		Id:    pid,
   570  	}
   571  	rcBytes, err := rc.Marshal()
   572  	x.Check(err)
   573  
   574  	cc := raftpb.ConfChange{
   575  		Type:    raftpb.ConfChangeAddNode,
   576  		NodeID:  pid,
   577  		Context: rcBytes,
   578  	}
   579  	err = errInternalRetry
   580  	for err == errInternalRetry {
   581  		glog.Infof("Trying to add %#x to cluster. Addr: %v\n", pid, addr)
   582  		glog.Infof("Current confstate at %#x: %+v\n", n.Id, n.ConfState())
   583  		err = n.proposeConfChange(ctx, cc)
   584  	}
   585  	return err
   586  }
   587  
   588  // ProposePeerRemoval proposes a new configuration with the peer with the given id removed.
   589  func (n *Node) ProposePeerRemoval(ctx context.Context, id uint64) error {
   590  	if n.Raft() == nil {
   591  		return ErrNoNode
   592  	}
   593  	if _, ok := n.Peer(id); !ok && id != n.RaftContext.Id {
   594  		return errors.Errorf("Node %#x not part of group", id)
   595  	}
   596  	cc := raftpb.ConfChange{
   597  		Type:   raftpb.ConfChangeRemoveNode,
   598  		NodeID: id,
   599  	}
   600  	err := errInternalRetry
   601  	for err == errInternalRetry {
   602  		err = n.proposeConfChange(ctx, cc)
   603  	}
   604  	return err
   605  }
   606  
   607  type linReadReq struct {
    608  	// A one-shot chan on which we send a raft index.
   609  	indexCh chan<- uint64
   610  }
   611  
   612  var errReadIndex = errors.Errorf(
   613  	"Cannot get linearized read (time expired or no configured leader)")
   614  
   615  // WaitLinearizableRead waits until a linearizable read can be performed.
   616  func (n *Node) WaitLinearizableRead(ctx context.Context) error {
   617  	span := otrace.FromContext(ctx)
   618  	span.Annotate(nil, "WaitLinearizableRead")
   619  
   620  	indexCh := make(chan uint64, 1)
   621  	select {
   622  	case n.requestCh <- linReadReq{indexCh: indexCh}:
   623  		span.Annotate(nil, "Pushed to requestCh")
   624  	case <-ctx.Done():
   625  		span.Annotate(nil, "Context expired")
   626  		return ctx.Err()
   627  	}
   628  
   629  	select {
   630  	case index := <-indexCh:
   631  		span.Annotatef(nil, "Received index: %d", index)
   632  		if index == 0 {
   633  			return errReadIndex
   634  		}
   635  		err := n.Applied.WaitForMark(ctx, index)
   636  		span.Annotatef(nil, "Error from Applied.WaitForMark: %v", err)
   637  		return err
   638  	case <-ctx.Done():
   639  		span.Annotate(nil, "Context expired")
   640  		return ctx.Err()
   641  	}
   642  }
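
// serveLinearizableReadSketch shows how a read path might use WaitLinearizableRead (the
// function and its callback are illustrative; Dgraph's real read paths live outside this
// package): block until this node has applied everything committed before the read began,
// then serve the read from local state.
func serveLinearizableReadSketch(ctx context.Context, n *Node, read func() error) error {
	if err := n.WaitLinearizableRead(ctx); err != nil {
		// Either the context expired, or no read index could be obtained from the leader.
		return err
	}
	// Local state is now at least as fresh as the leader's commit index at request time.
	return read()
}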
   643  
    644  // RunReadIndexLoop issues RAFT ReadIndex requests in a loop and replies to pending linearizable reads.
   645  func (n *Node) RunReadIndexLoop(closer *y.Closer, readStateCh <-chan raft.ReadState) {
   646  	defer closer.Done()
   647  	readIndex := func(activeRctx []byte) (uint64, error) {
    648  		// A read request can get rejected, in which case we would wait indefinitely on the
    649  		// channel, so use a timeout.
   650  		ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
   651  		defer cancel()
   652  
   653  		if err := n.Raft().ReadIndex(ctx, activeRctx); err != nil {
   654  			glog.Errorf("Error while trying to call ReadIndex: %v\n", err)
   655  			return 0, err
   656  		}
   657  
   658  	again:
   659  		select {
   660  		case <-closer.HasBeenClosed():
   661  			return 0, errors.New("Closer has been called")
   662  		case rs := <-readStateCh:
   663  			if !bytes.Equal(activeRctx, rs.RequestCtx) {
   664  				glog.V(3).Infof("Read state: %x != requested %x", rs.RequestCtx, activeRctx[:])
   665  				goto again
   666  			}
   667  			return rs.Index, nil
   668  		case <-ctx.Done():
   669  			glog.Warningf("[%#x] Read index context timed out\n", n.Id)
   670  			return 0, errInternalRetry
   671  		}
   672  	} // end of readIndex func
   673  
   674  	// We maintain one linearizable ReadIndex request at a time.  Others wait queued behind
   675  	// requestCh.
   676  	requests := []linReadReq{}
   677  	for {
   678  		select {
   679  		case <-closer.HasBeenClosed():
   680  			return
   681  		case <-readStateCh:
   682  			// Do nothing, discard ReadState as we don't have any pending ReadIndex requests.
   683  		case req := <-n.requestCh:
   684  		slurpLoop:
   685  			for {
   686  				requests = append(requests, req)
   687  				select {
   688  				case req = <-n.requestCh:
   689  				default:
   690  					break slurpLoop
   691  				}
   692  			}
   693  			// Create one activeRctx slice for the read index, even if we have to call readIndex
   694  			// repeatedly. That way, we can process the requests as soon as we encounter the first
   695  			// activeRctx. This is better than flooding readIndex with a new activeRctx on each
   696  			// call, causing more unique traffic and further delays in request processing.
   697  			activeRctx := make([]byte, 8)
   698  			x.Check2(n.Rand.Read(activeRctx))
   699  			glog.V(3).Infof("Request readctx: %#x", activeRctx)
   700  			for {
   701  				index, err := readIndex(activeRctx)
   702  				if err == errInternalRetry {
   703  					continue
   704  				}
   705  				if err != nil {
   706  					index = 0
   707  					glog.Errorf("[%#x] While trying to do lin read index: %v", n.Id, err)
   708  				}
   709  				for _, req := range requests {
   710  					req.indexCh <- index
   711  				}
   712  				break
   713  			}
   714  			requests = requests[:0]
   715  		}
   716  	}
   717  }
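
// forwardReadStatesSketch illustrates where readStateCh comes from (the name and wiring
// are assumptions about the caller, not code in this file): the raft run loop is expected
// to forward the ReadStates carried by each raft.Ready to the channel consumed by
// RunReadIndexLoop above.
func forwardReadStatesSketch(rd raft.Ready, readStateCh chan<- raft.ReadState) {
	for _, rs := range rd.ReadStates {
		readStateCh <- rs
	}
}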
   718  
   719  func (n *Node) joinCluster(ctx context.Context, rc *pb.RaftContext) (*api.Payload, error) {
   720  	// Only process one JoinCluster request at a time.
   721  	n.joinLock.Lock()
   722  	defer n.joinLock.Unlock()
   723  
   724  	// Check that the new node is from the same group as me.
   725  	if rc.Group != n.RaftContext.Group {
   726  		return nil, errors.Errorf("Raft group mismatch")
   727  	}
   728  	// Also check that the new node is not me.
   729  	if rc.Id == n.RaftContext.Id {
   730  		return nil, errors.Errorf("REUSE_RAFTID: Raft ID duplicates mine: %+v", rc)
   731  	}
   732  
   733  	// Check that the new node is not already part of the group.
   734  	if addr, ok := n.Peer(rc.Id); ok && rc.Addr != addr {
    735  		// There exists a healthy connection to a server with the same id.
   736  		if _, err := GetPools().Get(addr); err == nil {
   737  			return &api.Payload{}, errors.Errorf(
   738  				"REUSE_ADDR: IP Address same as existing peer: %s", addr)
   739  		}
   740  	}
   741  	n.Connect(rc.Id, rc.Addr)
   742  
   743  	err := n.addToCluster(context.Background(), rc.Id)
   744  	glog.Infof("[%#x] Done joining cluster with err: %v", rc.Id, err)
   745  	return &api.Payload{}, err
   746  }