
     1  // Copyright 2018 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     4  package raftbackstop
     6  import (
     7  	"bytes"
     9  	""
    10  	""
    11  	""
    12  	""
    13  	""
    14  	""
    16  	""
    17  )
    19  // RaftNode captures the part of the *raft.Raft API needed by the
    20  // backstop worker.
    21  type RaftNode interface {
    22  	State() raft.RaftState
    23  	GetConfiguration() raft.ConfigurationFuture
    24  }
    26  // Logger represents the logging methods called.
    27  type Logger interface {
    28  	Infof(message string, args ...interface{})
    29  	Debugf(message string, args ...interface{})
    30  }
// This worker monitors the state of the raft cluster it's passed. If
// it detects that only one API server remains and that server is
// non-voting, it force-appends a configuration message to the raft
// log so that the server can recover. This lets us handle the
// situation where we had two remaining machines (a leader and a
// non-voting follower) but the leader was removed.
// Config holds the values needed by the worker. Validate rejects a
// Config in which any of these fields is left at its zero value.
//
// Raft is the node whose state and configuration are inspected.
// LogStore is the log that the recovery configuration is appended to.
// Hub is used to subscribe to, and request, API server details.
// Logger receives the worker's info and debug messages.
// LocalID is the ID used to find the local server in both the API
// server details and the raft configuration.
type Config struct {
	Raft     RaftNode
	LogStore raft.LogStore
	Hub      *pubsub.StructuredHub
	Logger   Logger
	LocalID  raft.ServerID
}
    48  // Validate validates the raft worker configuration.
    49  func (config Config) Validate() error {
    50  	if config.Hub == nil {
    51  		return errors.NotValidf("nil Hub")
    52  	}
    53  	if config.Raft == nil {
    54  		return errors.NotValidf("nil Raft")
    55  	}
    56  	if config.LogStore == nil {
    57  		return errors.NotValidf("nil LogStore")
    58  	}
    59  	if config.Logger == nil {
    60  		return errors.NotValidf("nil Logger")
    61  	}
    62  	if config.LocalID == "" {
    63  		return errors.NotValidf("empty LocalID")
    64  	}
    65  	return nil
    66  }
    68  // NewWorker returns a worker responsible for recovering the raft
    69  // cluster when it's been reduced to one server that can't become
    70  // leader.
    71  func NewWorker(config Config) (worker.Worker, error) {
    72  	if err := config.Validate(); err != nil {
    73  		return nil, errors.Trace(err)
    74  	}
    75  	w := &backstopWorker{
    76  		config:        config,
    77  		serverDetails: make(chan apiserver.Details),
    78  	}
    79  	// Subscribe to API server address changes.
    80  	unsubscribe, err := config.Hub.Subscribe(
    81  		apiserver.DetailsTopic,
    82  		w.apiserverDetailsChanged,
    83  	)
    84  	if err != nil {
    85  		return nil, errors.Annotate(err, "subscribing to apiserver details")
    86  	}
    87  	// Now that we're subscribed, request the current API server details.
    88  	req := apiserver.DetailsRequest{
    89  		Requester: "raft-backstop",
    90  		LocalOnly: true,
    91  	}
    92  	if _, err := config.Hub.Publish(apiserver.DetailsRequestTopic, req); err != nil {
    93  		return nil, errors.Annotate(err, "requesting current apiserver details")
    94  	}
    96  	if err := catacomb.Invoke(catacomb.Plan{
    97  		Site: &w.catacomb,
    98  		Work: func() error {
    99  			defer unsubscribe()
   100  			return w.loop()
   101  		},
   102  	}); err != nil {
   103  		unsubscribe()
   104  		return nil, errors.Trace(err)
   105  	}
   106  	return w, nil
   107  }
// backstopWorker is the worker returned by NewWorker. Its lifetime is
// managed by catacomb, and config holds the validated configuration it
// was created with.
type backstopWorker struct {
	catacomb catacomb.Catacomb
	config   Config

	// serverDetails carries API server details from the hub callback
	// to the worker loop. configUpdated is set once the recovery
	// configuration has been stored, and reportedWaiting ensures the
	// "already updated" message is only logged once.
	serverDetails   chan apiserver.Details
	configUpdated   bool
	reportedWaiting bool
}
   118  // Kill is part of the worker.Worker interface.
   119  func (w *backstopWorker) Kill() {
   120  	w.catacomb.Kill(nil)
   121  }
   123  // Wait is part of the worker.Worker interface.
   124  func (w *backstopWorker) Wait() error {
   125  	return w.catacomb.Wait()
   126  }
   128  func (w *backstopWorker) loop() error {
   129  	for {
   130  		select {
   131  		case <-w.catacomb.Dying():
   132  			return w.catacomb.ErrDying()
   133  		case details := <-w.serverDetails:
   134  			err := w.maybeRecoverCluster(details)
   135  			if err != nil {
   136  				return errors.Trace(err)
   137  			}
   138  		}
   139  	}
   140  }
   142  func (w *backstopWorker) maybeRecoverCluster(details apiserver.Details) error {
   143  	if w.configUpdated {
   144  		if !w.reportedWaiting {
   145  			w.config.Logger.Infof("raft configuration already updated; waiting for raft worker restart")
   146  			w.reportedWaiting = true
   147  		}
   148  		return nil
   149  	}
   150  	if len(details.Servers) != 1 {
   151  		return nil
   152  	}
   153  	if _, found := details.Servers[string(w.config.LocalID)]; !found {
   154  		return nil
   155  	}
   156  	if w.config.Raft.State() == raft.Leader {
   157  		return nil
   158  	}
   160  	raftServers, err := w.getConfiguration()
   161  	if err != nil {
   162  		return errors.Annotate(err, "getting raft configuration")
   163  	}
   165  	numServers := len(raftServers)
   166  	localServer := raftServers[w.config.LocalID]
   167  	if localServer == nil {
   168  		return nil
   169  	}
   171  	localVote := localServer.Suffrage != raft.Nonvoter
   172  	if numServers == 1 && localVote {
   173  		// The server can vote and has quorum, so it can become leader.
   174  		return nil
   175  	}
   177  	err = w.recoverCluster(localServer)
   178  	return errors.Annotate(err, "recovering cluster")
   179  }
   181  func (w *backstopWorker) recoverCluster(server *raft.Server) error {
   182  	if server == nil {
   183  		return errors.Errorf("nil *server passed to recoverCluster")
   184  	}
   185  	w.config.Logger.Infof("remaining controller machine can't become raft leader - recovering the cluster")
   187  	// Make a configuration that can be written into the log, and append it.
   188  	newServer := *server
   189  	newServer.Suffrage = raft.Voter
   190  	configuration := raft.Configuration{
   191  		Servers: []raft.Server{newServer},
   192  	}
   193  	w.config.Logger.Debugf("appending recovery configuration: %#v", configuration)
   194  	data, err := encodeConfiguration(configuration)
   196  	// Work out the last term and index.
   197  	lastIndex, err := w.config.LogStore.LastIndex()
   198  	if err != nil {
   199  		return errors.Annotate(err, "getting last log index")
   200  	}
   201  	var lastLog raft.Log
   202  	err = w.config.LogStore.GetLog(lastIndex, &lastLog)
   203  	if err != nil {
   204  		return errors.Annotate(err, "getting last log entry")
   205  	}
   207  	// Prepare log record to add.
   208  	record := raft.Log{
   209  		Index: lastIndex + 1,
   210  		Term:  lastLog.Term,
   211  		Type:  raft.LogConfiguration,
   212  		Data:  data,
   213  	}
   214  	if err := w.config.LogStore.StoreLog(&record); err != nil {
   215  		return errors.Annotate(err, "storing recovery configuration")
   216  	}
   217  	w.configUpdated = true
   218  	return nil
   219  }
   221  func encodeConfiguration(config raft.Configuration) ([]byte, error) {
   222  	buf := bytes.NewBuffer(nil)
   223  	hd := codec.MsgpackHandle{}
   224  	enc := codec.NewEncoder(buf, &hd)
   225  	err := enc.Encode(config)
   226  	return buf.Bytes(), err
   227  }
   229  func (w *backstopWorker) getConfiguration() (map[raft.ServerID]*raft.Server, error) {
   230  	future := w.config.Raft.GetConfiguration()
   231  	err := w.waitFuture(future)
   232  	if err != nil {
   233  		return nil, errors.Trace(err)
   234  	}
   235  	servers := make(map[raft.ServerID]*raft.Server)
   236  	config := future.Configuration()
   237  	for i := range config.Servers {
   238  		server := config.Servers[i]
   239  		servers[server.ID] = &server
   240  	}
   241  	return servers, nil
   242  }
   244  // waitFuture waits for the future to return, or for the worker to be
   245  // killed, whichever happens first. If the worker is dying, then the
   246  // catacomb's ErrDying() is returned.
   247  func (w *backstopWorker) waitFuture(f raft.Future) error {
   248  	errch := make(chan error, 1)
   249  	go func() {
   250  		errch <- f.Error()
   251  	}()
   252  	select {
   253  	case <-w.catacomb.Dying():
   254  		return w.catacomb.ErrDying()
   255  	case err := <-errch:
   256  		return err
   257  	}
   258  }
   260  func (w *backstopWorker) apiserverDetailsChanged(topic string, details apiserver.Details, err error) {
   261  	if err != nil {
   262  		// This should never happen, so treat it as fatal.
   263  		w.catacomb.Kill(errors.Annotate(err, "apiserver details callback failed"))
   264  		return
   265  	}
   266  	select {
   267  	case w.serverDetails <- details:
   268  	case <-w.catacomb.Dying():
   269  	}
   270  }