github.com/unigraph-dev/dgraph@v1.1.1-0.20200923154953-8b52b426f765/dgraph/cmd/zero/raft.go

     1  /*
     2   * Copyright 2017-2018 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package zero
    18  
    19  import (
    20  	"fmt"
    21  	"log"
    22  	"math"
    23  	"sort"
    24  	"strings"
    25  	"sync"
    26  	"time"
    27  
    28  	otrace "go.opencensus.io/trace"
    29  
    30  	"github.com/dgraph-io/badger/y"
    31  	"github.com/dgraph-io/dgraph/conn"
    32  	"github.com/dgraph-io/dgraph/protos/pb"
    33  	"github.com/dgraph-io/dgraph/x"
    34  	farm "github.com/dgryski/go-farm"
    35  	"github.com/golang/glog"
    36  	"github.com/google/uuid"
    37  	"github.com/pkg/errors"
    38  	"go.etcd.io/etcd/raft"
    39  	"go.etcd.io/etcd/raft/raftpb"
    40  	"golang.org/x/net/context"
    41  )
    42  
    43  type node struct {
    44  	*conn.Node
    45  	server *Server
    46  	ctx    context.Context
    47  	closer *y.Closer // to stop Run.
    48  
    49  	// The last timestamp when this Zero was able to reach quorum.
    50  	mu         sync.RWMutex
    51  	lastQuorum time.Time
    52  }
    53  
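        // AmLeader reports whether this node is the Raft leader for Group Zero and was able to
        // reach quorum within the last five seconds (see checkQuorum).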
    54  func (n *node) AmLeader() bool {
    55  	if n.Raft() == nil {
    56  		return false
    57  	}
    58  	r := n.Raft()
    59  	if r.Status().Lead != r.Status().ID {
    60  		return false
    61  	}
    62  
    63  	// This node must be the leader, but must also be an active member of the cluster, and not
    64  	// hidden behind a partition. Basically, if this node were the leader and then went behind a
    65  	// partition, it would still think that it is the leader for the duration mentioned
    66  	// below.
    67  	n.mu.RLock()
    68  	defer n.mu.RUnlock()
    69  	return time.Since(n.lastQuorum) <= 5*time.Second
    70  }
    71  
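        // uniqueKey returns a key for tracking a proposal, combining this node's ID with a
        // random 64-bit number.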
    72  func (n *node) uniqueKey() string {
    73  	return fmt.Sprintf("z%x-%d", n.Id, n.Rand.Uint64())
    74  }
    75  
    76  var errInternalRetry = errors.New("Retry Raft proposal internally")
    77  
    78  // proposeAndWait makes a proposal to the quorum for Group Zero and waits for it to be accepted by
    79  // the group before returning. It is safe to call concurrently.
    80  func (n *node) proposeAndWait(ctx context.Context, proposal *pb.ZeroProposal) error {
    81  	switch {
    82  	case n.Raft() == nil:
    83  		return errors.Errorf("Raft isn't initialized yet.")
    84  	case ctx.Err() != nil:
    85  		return ctx.Err()
    86  	case !n.AmLeader():
    87  		// Do this check upfront. Don't do this inside propose for reasons explained below.
    88  		return errors.Errorf("Not Zero leader. Aborting proposal: %+v", proposal)
    89  	}
    90  
    91  	// We could consider adding a wrapper around the user proposal, so we can access any key-values.
    92  	// Something like this:
    93  	// https://github.com/golang/go/commit/5d39260079b5170e6b4263adb4022cc4b54153c4
    94  	span := otrace.FromContext(ctx)
    95  	// Overwrite ctx, so we no longer enforce the timeouts or cancels from ctx.
    96  	ctx = otrace.NewContext(context.Background(), span)
    97  
    98  	stop := x.SpanTimer(span, "n.proposeAndWait")
    99  	defer stop()
   100  
   101  	// propose runs in a loop. So, we should not do any checks inside, including n.AmLeader. This is
   102  	// to avoid the scenario where the first proposal times out and the second one gets returned
   103  	// due to the node no longer being the leader. In this scenario, the first proposal can still get
   104  	// accepted by Raft, causing a txn violation later for us, because we assumed that the proposal
   105  	// did not go through.
   106  	propose := func(timeout time.Duration) error {
   107  		cctx, cancel := context.WithTimeout(ctx, timeout)
   108  		defer cancel()
   109  
   110  		errCh := make(chan error, 1)
   111  		pctx := &conn.ProposalCtx{
   112  			ErrCh: errCh,
   113  			// Don't use the original context, because that's not what we're passing to Raft.
   114  			Ctx: cctx,
   115  		}
   116  		key := n.uniqueKey()
   117  		x.AssertTruef(n.Proposals.Store(key, pctx), "Found existing proposal with key: [%v]", key)
   118  		defer n.Proposals.Delete(key)
   119  		proposal.Key = key
   120  		span.Annotatef(nil, "Proposing with key: %s. Timeout: %v", key, timeout)
   121  
   122  		data, err := proposal.Marshal()
   123  		if err != nil {
   124  			return err
   125  		}
   126  		// Propose the change.
   127  		if err := n.Raft().Propose(cctx, data); err != nil {
   128  			span.Annotatef(nil, "Error while proposing via Raft: %v", err)
   129  			return errors.Wrapf(err, "While proposing")
   130  		}
   131  
   132  		// Wait for proposal to be applied or timeout.
   133  		select {
   134  		case err := <-errCh:
   135  			// We arrived here by a call to n.props.Done().
   136  			return err
   137  		case <-cctx.Done():
   138  			span.Annotatef(nil, "Internal context timeout %s. Will retry...", timeout)
   139  			return errInternalRetry
   140  		}
   141  	}
   142  
   143  	// Some proposals can get stuck if a leader change happens. For example, a MsgProp message from a
   144  	// follower to the leader can be dropped, or end up appearing with empty Data in CommittedEntries.
   145  	// Having a timeout here prevents a mutation from being stuck forever in case the caller doesn't
   146  	// have a timeout. We should always try with a timeout and optionally retry.
   147  	err := errInternalRetry
   148  	timeout := 4 * time.Second
   149  	for err == errInternalRetry {
   150  		err = propose(timeout)
   151  		timeout *= 2 // Exponential backoff
   152  		if timeout > 32*time.Second {
   153  			timeout = 32 * time.Second
   154  		}
   155  	}
   156  	return err
   157  }
   158  
   159  var (
   160  	errInvalidProposal     = errors.New("Invalid group proposal")
   161  	errTabletAlreadyServed = errors.New("Tablet is already being served")
   162  )
   163  
   164  func newGroup() *pb.Group {
   165  	return &pb.Group{
   166  		Members: make(map[uint64]*pb.Member),
   167  		Tablets: make(map[string]*pb.Tablet),
   168  	}
   169  }
   170  
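        // handleMemberProposal applies a membership update: it registers Zero members, adds or
        // removes Alpha group members, rejects duplicate addresses and over-replicated groups,
        // and keeps nextGroup in sync.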
   171  func (n *node) handleMemberProposal(member *pb.Member) error {
   172  	n.server.AssertLock()
   173  	state := n.server.state
   174  
   175  	m := n.server.member(member.Addr)
   176  	// Ensures that different nodes don't have the same address.
   177  	if m != nil && (m.Id != member.Id || m.GroupId != member.GroupId) {
   178  		return errors.Errorf("Found another member %d with same address: %v", m.Id, m.Addr)
   179  	}
   180  	if member.GroupId == 0 {
   181  		state.Zeros[member.Id] = member
   182  		if member.Leader {
   183  			// Unset the leader flag for other nodes; there can be only one
   184  			// leader at a time.
   185  			for _, m := range state.Zeros {
   186  				if m.Id != member.Id {
   187  					m.Leader = false
   188  				}
   189  			}
   190  		}
   191  		return nil
   192  	}
   193  	group := state.Groups[member.GroupId]
   194  	if group == nil {
   195  		group = newGroup()
   196  		state.Groups[member.GroupId] = group
   197  	}
   198  	m, has := group.Members[member.Id]
   199  	if member.AmDead {
   200  		if has {
   201  			delete(group.Members, member.Id)
   202  			state.Removed = append(state.Removed, m)
   203  		}
   204  		// else already removed.
   205  		if len(group.Members) == 0 {
   206  			glog.V(3).Infof("Deleting group Id %d (no members) ...", member.GroupId)
   207  			delete(state.Groups, member.GroupId)
   208  		}
   209  		return nil
   210  	}
   211  	if !has && len(group.Members) >= n.server.NumReplicas {
   212  		// We shouldn't allow more members than the number of replicas.
   213  		return errors.Errorf("Group reached replication level. Can't add another member: %+v", member)
   214  	}
   215  
   216  	// Create a connection to this server.
   217  	go conn.GetPools().Connect(member.Addr)
   218  
   219  	group.Members[member.Id] = member
   220  	// Increment nextGroup when we have enough replicas
   221  	if member.GroupId == n.server.nextGroup &&
   222  		len(group.Members) >= n.server.NumReplicas {
   223  		n.server.nextGroup++
   224  	}
   225  	if member.Leader {
   226  		// Unset the leader flag for other nodes; there can be only one
   227  		// leader at a time.
   228  		for _, m := range group.Members {
   229  			if m.Id != member.Id {
   230  				m.Leader = false
   231  			}
   232  		}
   233  	}
   234  	// On replay of logs after a restart, we need to set nextGroup.
   235  	if n.server.nextGroup <= member.GroupId {
   236  		n.server.nextGroup = member.GroupId + 1
   237  	}
   238  	return nil
   239  }
   240  
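        // handleTabletProposal records which group serves the given tablet (predicate). It
        // handles tablet removal, rejects a tablet already served by a different group unless
        // Force is set, and regenerates the group checksums before returning.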
   241  func (n *node) handleTabletProposal(tablet *pb.Tablet) error {
   242  	n.server.AssertLock()
   243  	state := n.server.state
   244  	defer func() {
   245  		// Regenerate group checksums. These checksums are solely based on which tablets are being
   246  		// served by the group. If the tablets that a group is serving change, and the Alpha does
   247  		// not know about these changes, then the read request must fail.
   248  		for _, g := range state.GetGroups() {
   249  			preds := make([]string, 0, len(g.GetTablets()))
   250  			for pred := range g.GetTablets() {
   251  				preds = append(preds, pred)
   252  			}
   253  			sort.Strings(preds)
   254  			g.Checksum = farm.Fingerprint64([]byte(strings.Join(preds, "")))
   255  		}
   256  		if n.AmLeader() {
   257  			// It is important to push something to the Oracle updates channel, so that subscribers
   258  			// get the latest checksum calculated above. Otherwise, if all the queries are
   259  			// best-effort queries which don't create any transactions, the OracleDelta never
   260  			// gets sent to the Alphas, causing their group checksums to mismatch and never converge.
   261  			n.server.orc.updates <- &pb.OracleDelta{}
   262  		}
   263  	}()
   264  
   265  	if tablet.GroupId == 0 {
   266  		return errors.Errorf("Tablet group id is zero: %+v", tablet)
   267  	}
   268  	group := state.Groups[tablet.GroupId]
   269  	if tablet.Remove {
   270  		glog.Infof("Removing tablet for attr: [%v], gid: [%v]\n", tablet.Predicate, tablet.GroupId)
   271  		if group != nil {
   272  			delete(group.Tablets, tablet.Predicate)
   273  		}
   274  		return nil
   275  	}
   276  	if group == nil {
   277  		group = newGroup()
   278  		state.Groups[tablet.GroupId] = group
   279  	}
   280  
   281  	// There's an edge case that we're handling here:
   282  	// if two servers ask to serve the same tablet, we need to ensure that
   283  	// only the first one succeeds.
   284  	if prev := n.server.servingTablet(tablet.Predicate); prev != nil {
   285  		if tablet.Force {
   286  			originalGroup := state.Groups[prev.GroupId]
   287  			delete(originalGroup.Tablets, tablet.Predicate)
   288  		} else {
   289  			if prev.GroupId != tablet.GroupId {
   290  				glog.Infof(
   291  					"Tablet for attr: [%s], gid: [%d] already served by group: [%d]\n",
   292  					prev.Predicate, tablet.GroupId, prev.GroupId)
   293  				return errTabletAlreadyServed
   294  			}
   295  		}
   296  	}
   297  	tablet.Force = false
   298  	group.Tablets[tablet.Predicate] = tablet
   299  	return nil
   300  }
   301  
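        // applyProposal applies a committed Raft entry containing a ZeroProposal to the
        // membership state (cluster ID, max IDs and timestamps, members, tablets, license and
        // transaction status) and returns the proposal's key.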
   302  func (n *node) applyProposal(e raftpb.Entry) (string, error) {
   303  	var p pb.ZeroProposal
   304  	// Raft commits an empty entry when a node becomes the leader.
   305  	if len(e.Data) == 0 {
   306  		return p.Key, nil
   307  	}
   308  	if err := p.Unmarshal(e.Data); err != nil {
   309  		return p.Key, err
   310  	}
   311  	if len(p.Key) == 0 {
   312  		return p.Key, errInvalidProposal
   313  	}
   314  	span := otrace.FromContext(n.Proposals.Ctx(p.Key))
   315  
   316  	n.server.Lock()
   317  	defer n.server.Unlock()
   318  
   319  	state := n.server.state
   320  	state.Counter = e.Index
   321  	if len(p.Cid) > 0 {
   322  		if len(state.Cid) > 0 {
   323  			return p.Key, errInvalidProposal
   324  		}
   325  		state.Cid = p.Cid
   326  	}
   327  	if p.MaxRaftId > 0 {
   328  		if p.MaxRaftId <= state.MaxRaftId {
   329  			return p.Key, errInvalidProposal
   330  		}
   331  		state.MaxRaftId = p.MaxRaftId
   332  	}
   333  	if p.SnapshotTs != nil {
   334  		for gid, ts := range p.SnapshotTs {
   335  			if group, ok := state.Groups[gid]; ok {
   336  				group.SnapshotTs = x.Max(group.SnapshotTs, ts)
   337  			}
   338  		}
   339  		purgeTs := uint64(math.MaxUint64)
   340  		for _, group := range state.Groups {
   341  			purgeTs = x.Min(purgeTs, group.SnapshotTs)
   342  		}
   343  		if purgeTs < math.MaxUint64 {
   344  			n.server.orc.purgeBelow(purgeTs)
   345  		}
   346  	}
   347  	if p.Member != nil {
   348  		if err := n.handleMemberProposal(p.Member); err != nil {
   349  			span.Annotatef(nil, "While applying membership proposal: %+v", err)
   350  			glog.Errorf("While applying membership proposal: %+v", err)
   351  			return p.Key, err
   352  		}
   353  	}
   354  	if p.Tablet != nil {
   355  		if err := n.handleTabletProposal(p.Tablet); err != nil {
   356  			span.Annotatef(nil, "While applying tablet proposal: %+v", err)
   357  			glog.Errorf("While applying tablet proposal: %+v", err)
   358  			return p.Key, err
   359  		}
   360  	}
   361  	if p.License != nil {
   362  		// Check that the number of nodes in the cluster does not exceed MaxNodes; otherwise,
   363  		// reject the proposal.
   364  		numNodes := len(state.GetZeros())
   365  		for _, group := range state.GetGroups() {
   366  			numNodes += len(group.GetMembers())
   367  		}
   368  		if uint64(numNodes) > p.GetLicense().GetMaxNodes() {
   369  			return p.Key, errInvalidProposal
   370  		}
   371  		state.License = p.License
   372  		// Check expiry and set enabled accordingly.
   373  		expiry := time.Unix(state.License.ExpiryTs, 0).UTC()
   374  		state.License.Enabled = time.Now().UTC().Before(expiry)
   375  	}
   376  
   377  	if p.MaxLeaseId > state.MaxLeaseId {
   378  		state.MaxLeaseId = p.MaxLeaseId
   379  	} else if p.MaxTxnTs > state.MaxTxnTs {
   380  		state.MaxTxnTs = p.MaxTxnTs
   381  	} else if p.MaxLeaseId != 0 || p.MaxTxnTs != 0 {
   382  		// This could happen after a restart, when some entries were still in the WAL and did not
   383  		// get snapshotted.
   384  		glog.Infof("Could not apply proposal, ignoring: p.MaxLeaseId=%v, p.MaxTxnTs=%v maxLeaseId=%d"+
   385  			" maxTxnTs=%d\n", p.MaxLeaseId, p.MaxTxnTs, state.MaxLeaseId, state.MaxTxnTs)
   386  	}
   387  	if p.Txn != nil {
   388  		n.server.orc.updateCommitStatus(e.Index, p.Txn)
   389  	}
   390  
   391  	return p.Key, nil
   392  }
   393  
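        // applyConfChange applies a Raft configuration change: it removes deleted peers (exiting
        // if this node itself was removed), connects to and stores newly added Zero members while
        // rejecting reused Raft IDs, updates the conf state, and triggers a leader-change check.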
   394  func (n *node) applyConfChange(e raftpb.Entry) {
   395  	var cc raftpb.ConfChange
   396  	if err := cc.Unmarshal(e.Data); err != nil {
   397  		glog.Errorf("While unmarshalling confchange: %+v", err)
   398  	}
   399  
   400  	if cc.Type == raftpb.ConfChangeRemoveNode {
   401  		if cc.NodeID == n.Id {
   402  			glog.Fatalf("I [id:%#x group:0] have been removed. Goodbye!", n.Id)
   403  		}
   404  		n.DeletePeer(cc.NodeID)
   405  		n.server.removeZero(cc.NodeID)
   406  
   407  	} else if len(cc.Context) > 0 {
   408  		var rc pb.RaftContext
   409  		x.Check(rc.Unmarshal(cc.Context))
   410  		go n.Connect(rc.Id, rc.Addr)
   411  
   412  		m := &pb.Member{Id: rc.Id, Addr: rc.Addr, GroupId: 0}
   413  		for _, member := range n.server.membershipState().Removed {
   414  			// It is not recommended to reuse RAFT ids.
   415  			if member.GroupId == 0 && m.Id == member.Id {
   416  				err := errors.Errorf("REUSE_RAFTID: Reusing removed id: %d.\n", m.Id)
   417  				n.DoneConfChange(cc.ID, err)
   418  				// Cancel configuration change.
   419  				cc.NodeID = raft.None
   420  				n.Raft().ApplyConfChange(cc)
   421  				return
   422  			}
   423  		}
   424  
   425  		n.server.storeZero(m)
   426  	}
   427  
   428  	cs := n.Raft().ApplyConfChange(cc)
   429  	n.SetConfState(cs)
   430  	n.DoneConfChange(cc.ID, nil)
   431  
   432  	// The following doesn't really trigger leader change. It's just capturing a leader change
   433  	// event. The naming is poor. TODO: Fix naming, and see if we can simplify this leader change
   434  	// logic.
   435  	n.triggerLeaderChange()
   436  }
   437  
   438  func (n *node) triggerLeaderChange() {
   439  	n.server.triggerLeaderChange()
   440  	// We update leader information on each node without going through a proposal. This
   441  	// function is called on all nodes on a leader change.
   442  	n.server.updateZeroLeader()
   443  }
   444  
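        // initAndStartNode starts the Raft node: it restarts from an existing snapshot and WAL if
        // present, joins an existing cluster through the configured peer (opts.peer), or
        // bootstraps a new cluster and proposes a cluster ID and a trial license.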
   445  func (n *node) initAndStartNode() error {
   446  	_, restart, err := n.PastLife()
   447  	x.Check(err)
   448  
   449  	if restart {
   450  		glog.Infoln("Restarting node for dgraphzero")
   451  		sp, err := n.Store.Snapshot()
   452  		x.Checkf(err, "Unable to get existing snapshot")
   453  		if !raft.IsEmptySnap(sp) {
   454  			// It is important that we pick up the conf state here.
   455  			n.SetConfState(&sp.Metadata.ConfState)
   456  
   457  			var state pb.MembershipState
   458  			x.Check(state.Unmarshal(sp.Data))
   459  			n.server.SetMembershipState(&state)
   460  			for _, id := range sp.Metadata.ConfState.Nodes {
   461  				n.Connect(id, state.Zeros[id].Addr)
   462  			}
   463  		}
   464  
   465  		n.SetRaft(raft.RestartNode(n.Cfg))
   466  
   467  	} else if len(opts.peer) > 0 {
   468  		p := conn.GetPools().Connect(opts.peer)
   469  		if p == nil {
   470  			return errors.Errorf("Unhealthy connection to %v", opts.peer)
   471  		}
   472  
   473  		gconn := p.Get()
   474  		c := pb.NewRaftClient(gconn)
   475  		timeout := 8 * time.Second
   476  		for {
   477  			ctx, cancel := context.WithTimeout(n.ctx, timeout)
   478  			defer cancel()
   479  			// JoinCluster can block indefinitely; raft ignores a conf change proposal
   480  			// if it has a pending configuration change.
   481  			_, err := c.JoinCluster(ctx, n.RaftContext)
   482  			if err == nil {
   483  				break
   484  			}
   485  			if x.ShouldCrash(err) {
   486  				log.Fatalf("Error while joining cluster: %v", err)
   487  			}
   488  			glog.Errorf("Error while joining cluster: %v\n", err)
   489  			timeout *= 2
   490  			if timeout > 32*time.Second {
   491  				timeout = 32 * time.Second
   492  			}
   493  			time.Sleep(timeout) // This is useful because JoinCluster can exit immediately.
   494  		}
   495  		glog.Infof("[%#x] Starting node\n", n.Id)
   496  		n.SetRaft(raft.StartNode(n.Cfg, nil))
   497  
   498  	} else {
   499  		data, err := n.RaftContext.Marshal()
   500  		x.Check(err)
   501  		peers := []raft.Peer{{ID: n.Id, Context: data}}
   502  		n.SetRaft(raft.StartNode(n.Cfg, peers))
   503  
   504  		go func() {
   505  			// This is a new cluster. So, propose a new ID for the cluster.
   506  			for {
   507  				id := uuid.New().String()
   508  				err := n.proposeAndWait(context.Background(), &pb.ZeroProposal{Cid: id})
   509  				if err == nil {
   510  					glog.Infof("CID set for cluster: %v", id)
   511  					break
   512  				}
   513  				if err == errInvalidProposal {
   514  					glog.Errorf("invalid proposal error while proposing cluster id")
   515  					return
   516  				}
   517  				glog.Errorf("While proposing CID: %v. Retrying...", err)
   518  				time.Sleep(3 * time.Second)
   519  			}
   520  
   521  			if err := n.proposeTrialLicense(); err != nil {
   522  				glog.Errorf("while proposing trial license to cluster: %v", err)
   523  			}
   524  		}()
   525  	}
   526  
   527  	go n.Run()
   528  	go n.BatchAndSendMessages()
   529  	return nil
   530  }
   531  
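        // updateZeroMembershipPeriodically refreshes the Zero leader information every ten
        // seconds until the closer is signalled.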
   532  func (n *node) updateZeroMembershipPeriodically(closer *y.Closer) {
   533  	defer closer.Done()
   534  	ticker := time.NewTicker(10 * time.Second)
   535  	defer ticker.Stop()
   536  
   537  	for {
   538  		select {
   539  		case <-ticker.C:
   540  			n.server.updateZeroLeader()
   541  		case <-closer.HasBeenClosed():
   542  			return
   543  		}
   544  	}
   545  }
   546  
   547  var startOption = otrace.WithSampler(otrace.ProbabilitySampler(0.01))
   548  
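        // checkQuorum runs every second and tries to read the latest membership state, which
        // requires reaching a quorum. On success it updates lastQuorum (used by AmLeader) and
        // cleans up connections that are no longer part of the membership state.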
   549  func (n *node) checkQuorum(closer *y.Closer) {
   550  	defer closer.Done()
   551  	ticker := time.NewTicker(time.Second)
   552  	defer ticker.Stop()
   553  
   554  	quorum := func() {
   555  		// Make this timeout 1.5x the timeout on RunReadIndexLoop.
   556  		ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
   557  		defer cancel()
   558  
   559  		ctx, span := otrace.StartSpan(ctx, "Zero.checkQuorum", startOption)
   560  		defer span.End()
   561  		span.Annotatef(nil, "Node id: %d", n.Id)
   562  
   563  		if state, err := n.server.latestMembershipState(ctx); err == nil {
   564  			n.mu.Lock()
   565  			n.lastQuorum = time.Now()
   566  			n.mu.Unlock()
   567  			// Also do some connection cleanup.
   568  			conn.GetPools().RemoveInvalid(state)
   569  			span.Annotate(nil, "Updated lastQuorum")
   570  
   571  		} else if glog.V(1) {
   572  			span.Annotatef(nil, "Got error: %v", err)
   573  			glog.Warningf("Zero node: %#x unable to reach quorum. Error: %v", n.Id, err)
   574  		}
   575  	}
   576  
   577  	for {
   578  		select {
   579  		case <-ticker.C:
   580  			quorum()
   581  		case <-closer.HasBeenClosed():
   582  			return
   583  		}
   584  	}
   585  }
   586  
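        // snapshotPeriodically tries to take a snapshot every ten seconds, skipping if fewer than
        // 1000 entries have been applied since the last snapshot.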
   587  func (n *node) snapshotPeriodically(closer *y.Closer) {
   588  	defer closer.Done()
   589  	ticker := time.NewTicker(10 * time.Second)
   590  	defer ticker.Stop()
   591  
   592  	for {
   593  		select {
   594  		case <-ticker.C:
   595  			n.trySnapshot(1000)
   596  
   597  		case <-closer.HasBeenClosed():
   598  			return
   599  		}
   600  	}
   601  }
   602  
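        // trySnapshot creates a new snapshot of the membership state, but only if the synced
        // index is more than skip entries ahead of the existing snapshot.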
   603  func (n *node) trySnapshot(skip uint64) {
   604  	existing, err := n.Store.Snapshot()
   605  	x.Checkf(err, "Unable to get existing snapshot")
   606  	si := existing.Metadata.Index
   607  	idx := n.server.SyncedUntil()
   608  	if idx <= si+skip {
   609  		return
   610  	}
   611  
   612  	data, err := n.server.MarshalMembershipState()
   613  	x.Check(err)
   614  
   615  	err = n.Store.CreateSnapshot(idx, n.ConfState(), data)
   616  	x.Checkf(err, "While creating snapshot")
   617  	glog.Infof("Writing snapshot at index: %d, applied mark: %d\n", idx, n.Applied.DoneUntil())
   618  }
   619  
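        // Run is the main Raft loop for Zero. It ticks the Raft node, forwards read states, saves
        // entries and hard state to storage, applies snapshots, committed entries and conf
        // changes, sends messages, and advances Raft until the closer is signalled.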
   620  func (n *node) Run() {
   621  	var leader bool
   622  	ticker := time.NewTicker(100 * time.Millisecond)
   623  	defer ticker.Stop()
   624  
   625  	// Taking a snapshot can cause the select loop to block while deleting entries, so run
   626  	// it in a goroutine.
   627  	readStateCh := make(chan raft.ReadState, 100)
   628  	closer := y.NewCloser(5)
   629  	defer func() {
   630  		closer.SignalAndWait()
   631  		n.closer.Done()
   632  		glog.Infof("Zero Node.Run finished.")
   633  	}()
   634  
   635  	go n.snapshotPeriodically(closer)
   636  	go n.updateEnterpriseState(closer)
   637  	go n.updateZeroMembershipPeriodically(closer)
   638  	go n.checkQuorum(closer)
   639  	go n.RunReadIndexLoop(closer, readStateCh)
   640  	// We only stop runReadIndexLoop after the for loop below has finished interacting with it.
   641  	// That way we know sending to readStateCh will not deadlock.
   642  
   643  	var timer x.Timer
   644  	for {
   645  		select {
   646  		case <-n.closer.HasBeenClosed():
   647  			n.Raft().Stop()
   648  			return
   649  		case <-ticker.C:
   650  			n.Raft().Tick()
   651  		case rd := <-n.Raft().Ready():
   652  			timer.Start()
   653  			_, span := otrace.StartSpan(n.ctx, "Zero.RunLoop",
   654  				otrace.WithSampler(otrace.ProbabilitySampler(0.001)))
   655  			for _, rs := range rd.ReadStates {
   656  				// No need to use select-case-default on pushing to readStateCh. It is typically
   657  				// empty.
   658  				readStateCh <- rs
   659  			}
   660  			span.Annotatef(nil, "Pushed %d readstates", len(rd.ReadStates))
   661  
   662  			if rd.SoftState != nil {
   663  				if rd.RaftState == raft.StateLeader && !leader {
   664  					glog.Infoln("I've become the leader, updating leases.")
   665  					n.server.updateLeases()
   666  				}
   667  				leader = rd.RaftState == raft.StateLeader
   668  				// The Oracle stream would be closed once this node steps down as leader, and a
   669  				// predicate move would cancel any in-progress move on stepping down.
   670  				n.triggerLeaderChange()
   671  			}
   672  			if leader {
   673  				// Leader can send messages in parallel with writing to disk.
   674  				for _, msg := range rd.Messages {
   675  					n.Send(msg)
   676  				}
   677  			}
   678  			n.SaveToStorage(rd.HardState, rd.Entries, rd.Snapshot)
   679  			timer.Record("disk")
   680  			if rd.MustSync {
   681  				if err := n.Store.Sync(); err != nil {
   682  					glog.Errorf("Error while calling Store.Sync: %v", err)
   683  				}
   684  				timer.Record("sync")
   685  			}
   686  			span.Annotatef(nil, "Saved to storage")
   687  
   688  			if !raft.IsEmptySnap(rd.Snapshot) {
   689  				var state pb.MembershipState
   690  				x.Check(state.Unmarshal(rd.Snapshot.Data))
   691  				n.server.SetMembershipState(&state)
   692  			}
   693  
   694  			for _, entry := range rd.CommittedEntries {
   695  				n.Applied.Begin(entry.Index)
   696  				if entry.Type == raftpb.EntryConfChange {
   697  					n.applyConfChange(entry)
   698  					glog.Infof("Done applying conf change at %#x", n.Id)
   699  
   700  				} else if entry.Type == raftpb.EntryNormal {
   701  					key, err := n.applyProposal(entry)
   702  					if err != nil {
   703  						glog.Errorf("While applying proposal: %v\n", err)
   704  					}
   705  					n.Proposals.Done(key, err)
   706  
   707  				} else {
   708  					glog.Infof("Unhandled entry: %+v\n", entry)
   709  				}
   710  				n.Applied.Done(entry.Index)
   711  			}
   712  			span.Annotatef(nil, "Applied %d CommittedEntries", len(rd.CommittedEntries))
   713  
   714  			if !leader {
   715  				// Followers should send messages later.
   716  				for _, msg := range rd.Messages {
   717  					n.Send(msg)
   718  				}
   719  			}
   720  			span.Annotate(nil, "Sent messages")
   721  			timer.Record("proposals")
   722  
   723  			n.Raft().Advance()
   724  			span.Annotate(nil, "Advanced Raft")
   725  			timer.Record("advance")
   726  
   727  			span.End()
   728  			if timer.Total() > 200*time.Millisecond {
   729  				glog.Warningf(
   730  					"Raft.Ready took too long to process: %s."+
   731  						" Num entries: %d. MustSync: %v",
   732  					timer.String(), len(rd.Entries), rd.MustSync)
   733  			}
   734  		}
   735  	}
   736  }