github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/dbaccessor/worker.go

     1  // Copyright 2021 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package dbaccessor
     5  
     6  import (
     7  	"context"
     8  	"net"
     9  	"sync"
    10  	"time"
    11  
    12  	"github.com/juju/clock"
    13  	"github.com/juju/errors"
    14  	"github.com/juju/worker/v3"
    15  	"github.com/juju/worker/v3/catacomb"
    16  	"github.com/juju/worker/v3/dependency"
    17  
    18  	"github.com/juju/juju/core/database"
    19  	"github.com/juju/juju/database/app"
    20  	"github.com/juju/juju/database/dqlite"
    21  	"github.com/juju/juju/pubsub/apiserver"
    22  )
    23  
    24  const (
    25  	// errTryAgain indicates that the worker should try
    26  	// again later to start a DB tracker worker.
    27  	errTryAgain = errors.ConstError("DB node is nil, but worker is not dying; rescheduling TrackedDB start attempt")
    28  
    29  	// errNotReady indicates that we successfully created a new Dqlite app,
    30  	// but the Ready call timed out, and we are waiting for broadcast info.
    31  	errNotReady = errors.ConstError("started DB app, but it failed to become ready; waiting for topology updates")
    32  )
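
// isTransientStartupError is an illustrative sketch, not part of the original
// file: it shows how these constant errors are matched by callers. Because
// errors.ConstError values satisfy errors.Is, the runner's IsFatal check and
// initialiseDqlite below can treat errTryAgain and errNotReady as retryable.
func isTransientStartupError(err error) bool {
	return errors.Is(err, errTryAgain) || errors.Is(err, errNotReady)
}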
    33  
    34  // nodeShutdownTimeout is the timeout that we add to the context passed to
    35  // handoff/shutdown calls when shutting down the Dqlite node.
    36  const nodeShutdownTimeout = 30 * time.Second
    37  
    38  // NodeManager creates Dqlite `App` initialisation arguments and options.
    39  type NodeManager interface {
    40  	// IsExistingNode returns true if this machine or container has run a
    41  	// Dqlite node in the past.
    42  	IsExistingNode() (bool, error)
    43  
    44  	// IsLoopbackPreferred returns true if the Dqlite application should
    45  	// be bound to the loopback address.
    46  	IsLoopbackPreferred() bool
    47  
    48  	// IsLoopbackBound returns true if we are a cluster of one,
    49  	// and bound to the loopback IP address.
    50  	IsLoopbackBound(context.Context) (bool, error)
    51  
    52  	// EnsureDataDir ensures that a directory for Dqlite data exists at
    53  	// a path determined by the agent config, then returns that path.
    54  	EnsureDataDir() (string, error)
    55  
    56  	// ClusterServers returns the node information for
    57  	// Dqlite nodes configured to be in the cluster.
    58  	ClusterServers(context.Context) ([]dqlite.NodeInfo, error)
    59  
    60  	// SetClusterServers reconfigures the Dqlite cluster members.
    61  	SetClusterServers(context.Context, []dqlite.NodeInfo) error
    62  
    63  	// SetNodeInfo rewrites the local node information
    64  	// file in the Dqlite data directory.
    65  	SetNodeInfo(dqlite.NodeInfo) error
    66  
    67  	// SetClusterToLocalNode reconfigures the Dqlite cluster
    68  	// so that it has the local node as its only member.
    69  	SetClusterToLocalNode(ctx context.Context) error
    70  
    71  	// WithLogFuncOption returns a Dqlite application Option that will proxy Dqlite
    72  	// log output via this factory's logger where the level is recognised.
    73  	WithLogFuncOption() app.Option
    74  
    75  	// WithTracingOption returns a Dqlite application Option
    76  	// that will enable tracing of Dqlite operations.
    77  	WithTracingOption() app.Option
    78  
    79  	// WithAddressOption returns a Dqlite application Option
    80  	// for specifying the local address:port to use.
    81  	WithAddressOption(string) app.Option
    82  
    83  	// WithTLSOption returns a Dqlite application Option for TLS encryption
    84  	// of traffic between clients and clustered application nodes.
    85  	WithTLSOption() (app.Option, error)
    86  
    87  	// WithClusterOption returns a Dqlite application Option for initialising
    88  	// Dqlite as the member of a cluster with peers representing other controllers.
    89  	WithClusterOption([]string) app.Option
    90  }
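
// exampleComposeNodeOptions is an illustrative sketch, not part of the
// original file. It shows how NodeManager options are typically composed when
// starting a Dqlite application; the newApp argument stands in for
// WorkerConfig.NewApp, and TLS is assumed to be required because the node is
// clustering over a non-loopback address.
func exampleComposeNodeOptions(
	mgr NodeManager,
	newApp func(string, ...app.Option) (DBApp, error),
) (DBApp, error) {
	dataDir, err := mgr.EnsureDataDir()
	if err != nil {
		return nil, errors.Trace(err)
	}

	withTLS, err := mgr.WithTLSOption()
	if err != nil {
		return nil, errors.Trace(err)
	}

	// Logging and tracing options are always applied; see startDqliteNode.
	return newApp(dataDir,
		mgr.WithLogFuncOption(),
		mgr.WithTracingOption(),
		withTLS,
	)
}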
    91  
    92  // DBGetter describes the ability to supply a TrackedDB
    93  // reference for a particular database.
    94  type DBGetter interface {
    95  	// GetDB returns a TrackedDB reference for the dqlite-backed database that
    96  	// contains the data for the specified namespace.
    97  	// A NotFound error is returned if the worker is unaware of the requested DB.
    98  	GetDB(namespace string) (database.TrackedDB, error)
    99  }
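
// exampleUseDBGetter is an illustrative sketch, not part of the original file.
// It shows how a downstream consumer might use the DBGetter contract; the
// controller namespace is used here, while other namespaces are model UUIDs.
func exampleUseDBGetter(getter DBGetter) error {
	db, err := getter.GetDB(database.ControllerNS)
	if err != nil {
		// A NotFound error means the worker does not know about the namespace.
		return errors.Trace(err)
	}
	_ = db // Run transactions against the namespace via the TrackedDB.
	return nil
}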
   100  
   101  // dbRequest is used to pass requests for TrackedDB
   102  // instances into the worker loop.
   103  type dbRequest struct {
   104  	namespace string
   105  	done      chan struct{}
   106  }
   107  
   108  // makeDBRequest creates a new TrackedDB request for the input namespace.
   109  func makeDBRequest(namespace string) dbRequest {
   110  	return dbRequest{
   111  		namespace: namespace,
   112  		done:      make(chan struct{}),
   113  	}
   114  }
   115  
   116  // WorkerConfig encapsulates the configuration options for the
   117  // dbaccessor worker.
   118  type WorkerConfig struct {
   119  	NodeManager      NodeManager
   120  	Clock            clock.Clock
   121  	MetricsCollector *Collector
   122  
   123  	// Hub is the pub/sub central hub used to receive notifications
   124  	// about API server topology changes.
   125  	Hub         Hub
   126  	Logger      Logger
   127  	NewApp      func(string, ...app.Option) (DBApp, error)
   128  	NewDBWorker func(context.Context, DBApp, string, ...TrackedDBWorkerOption) (TrackedDB, error)
   129  
   130  	// ControllerID uniquely identifies the controller that this
   131  	// worker is running on. It is equivalent to the machine ID.
   132  	ControllerID string
   133  }
   134  
   135  // Validate ensures that the config values are valid.
   136  func (c *WorkerConfig) Validate() error {
   137  	if c.NodeManager == nil {
   138  		return errors.NotValidf("missing NodeManager")
   139  	}
   140  	if c.Clock == nil {
   141  		return errors.NotValidf("missing Clock")
   142  	}
   143  	if c.MetricsCollector == nil {
   144  		return errors.NotValidf("missing metrics collector")
   145  	}
   146  	if c.Hub == nil {
   147  		return errors.NotValidf("missing Hub")
   148  	}
   149  	if c.Logger == nil {
   150  		return errors.NotValidf("missing Logger")
   151  	}
   152  	if c.NewApp == nil {
   153  		return errors.NotValidf("missing NewApp")
   154  	}
   155  	if c.NewDBWorker == nil {
   156  		return errors.NotValidf("missing NewDBWorker")
   157  	}
   158  	return nil
   159  }
   160  
   161  type dbWorker struct {
   162  	cfg      WorkerConfig
   163  	catacomb catacomb.Catacomb
   164  
   165  	mu       sync.RWMutex
   166  	dbApp    DBApp
   167  	dbRunner *worker.Runner
   168  
   169  	// dbReady is used to signal that we can
   170  	// begin processing GetDB requests.
   171  	dbReady chan struct{}
   172  
   173  	// dbRequests is used to synchronise GetDB
   174  	// requests into this worker's event loop.
   175  	dbRequests chan dbRequest
   176  
   177  	// apiServerChanges is used to handle incoming changes
   178  	// to API server details within the worker loop.
   179  	apiServerChanges chan apiserver.Details
   180  }
   181  
   182  func newWorker(cfg WorkerConfig) (*dbWorker, error) {
   183  	var err error
   184  	if err = cfg.Validate(); err != nil {
   185  		return nil, errors.Trace(err)
   186  	}
   187  
   188  	w := &dbWorker{
   189  		cfg: cfg,
   190  		dbRunner: worker.NewRunner(worker.RunnerParams{
   191  			Clock: cfg.Clock,
   192  			// If a worker goes down, it has already attempted multiple
   193  			// retries internally, and in that case we do want the dbaccessor
   194  			// to go down. This will then bring up a new Dqlite app.
   195  			IsFatal: func(err error) bool {
   196  				// If there is a rebind while a worker is starting up, the
   197  				// dbApp will be nil and we return errTryAgain. In that case
   198  				// we don't want to kill the worker; we force the runner to
   199  				// try starting it again.
   200  				return !errors.Is(err, errTryAgain)
   201  			},
   202  			RestartDelay: time.Second * 10,
   203  		}),
   204  		dbReady:          make(chan struct{}),
   205  		dbRequests:       make(chan dbRequest),
   206  		apiServerChanges: make(chan apiserver.Details),
   207  	}
   208  
   209  	if err = catacomb.Invoke(catacomb.Plan{
   210  		Site: &w.catacomb,
   211  		Work: w.loop,
   212  		Init: []worker.Worker{
   213  			w.dbRunner,
   214  		},
   215  	}); err != nil {
   216  		return nil, errors.Trace(err)
   217  	}
   218  
   219  	return w, nil
   220  }
   221  
   222  func (w *dbWorker) loop() (err error) {
   223  	// The context here should not be tied to the catacomb, as such a context
   224  	// would be cancelled when the worker is stopped, and we want to give a
   225  	// chance for the Dqlite app to shut down gracefully.
   226  	// There is a timeout in shutdownDqlite to ensure that we don't block
   227  	// forever.
   228  	// We allow a very short time to check whether we should attempt to hand
   229  	// over to another node.
   230  	// If we can't determine that we *shouldn't* within the time window,
   231  	// we go ahead and make the attempt.
   232  	defer func() {
   233  		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
   234  		bs, _ := w.cfg.NodeManager.IsLoopbackBound(ctx)
   235  		w.shutdownDqlite(context.Background(), !bs)
   236  		cancel()
   237  	}()
   238  
   239  	extant, err := w.cfg.NodeManager.IsExistingNode()
   240  	if err != nil {
   241  		return errors.Trace(err)
   242  	}
   243  
   244  	// At this time, while Juju is using both Mongo and Dqlite, we piggyback
   245  	// off the peer-grouper, which applies any configured HA space and
   246  	// broadcasts clustering addresses. Once we do away with mongo,
   247  	// that worker will be replaced with a Dqlite-focussed analogue that does
   248  	// largely the same thing, though potentially disseminating changes via a
   249  	// mechanism other than pub/sub.
   250  	unsub, err := w.cfg.Hub.Subscribe(apiserver.DetailsTopic, w.handleAPIServerChangeMsg)
   251  	if err != nil {
   252  		return errors.Annotate(err, "subscribing to API server topology changes")
   253  	}
   254  	defer unsub()
   255  
   256  	// If this is an existing node, we start it up immediately.
   257  	// Otherwise, this host is entering an HA cluster, and we need to wait for
   258  	// the peer-grouper to determine and broadcast addresses satisfying the
   259  	// Juju HA space (if configured); request those details.
   260  	// Once received we can continue configuring this node as a member.
   261  	if extant {
   262  		if err := w.startExistingDqliteNode(); err != nil {
   263  			return errors.Trace(err)
   264  		}
   265  	} else {
   266  		if err := w.requestAPIServerDetails(); err != nil {
   267  			return errors.Trace(err)
   268  		}
   269  	}
   270  
   271  	for {
   272  		select {
   273  		case req := <-w.dbRequests:
   274  			if err := w.openDatabase(req.namespace); err != nil {
   275  				w.cfg.Logger.Errorf("opening database %q: %s", req.namespace, err.Error())
   276  			}
   277  			close(req.done)
   278  		case <-w.catacomb.Dying():
   279  			return w.catacomb.ErrDying()
   280  		case apiDetails := <-w.apiServerChanges:
   281  			if err := w.processAPIServerChange(apiDetails); err != nil {
   282  				return errors.Trace(err)
   283  			}
   284  		}
   285  	}
   286  }
   287  
   288  // Kill is part of the worker.Worker interface.
   289  func (w *dbWorker) Kill() {
   290  	w.catacomb.Kill(nil)
   291  }
   292  
   293  // Wait is part of the worker.Worker interface.
   294  func (w *dbWorker) Wait() error {
   295  	return w.catacomb.Wait()
   296  }
   297  
   298  // Report provides information for the engine report.
   299  func (w *dbWorker) Report() map[string]any {
   300  	w.mu.RLock()
   301  	defer w.mu.RUnlock()
   302  
   303  	// We need to guard against attempting to report when setting up or dying,
   304  	// so we don't end up panicking with missing information.
   305  	result := w.dbRunner.Report()
   306  
   307  	if w.dbApp == nil {
   308  		result["leader"] = ""
   309  		result["leader-id"] = uint64(0)
   310  		result["leader-role"] = ""
   311  		return result
   312  	}
   313  
   314  	ctx, cancel := w.scopedContext()
   315  	defer cancel()
   316  
   317  	var (
   318  		leader     string
   319  		leaderRole string
   320  		leaderID   uint64
   321  	)
   322  	if client, err := w.dbApp.Client(ctx); err == nil {
   323  		if nodeInfo, err := client.Leader(ctx); err == nil {
   324  			leaderID = nodeInfo.ID
   325  			leader = nodeInfo.Address
   326  			leaderRole = nodeInfo.Role.String()
   327  		}
   328  	}
   329  
   330  	result["leader-id"] = leaderID
   331  	result["leader"] = leader
   332  	result["leader-role"] = leaderRole
   333  
   334  	return result
   335  }
   336  
   337  // GetDB returns a TrackedDB reference for the dqlite-backed
   338  // database that contains the data for the specified namespace.
   339  // TODO (stickupkid): Before handing out any DB for any namespace,
   340  // we should first validate it exists in the controller list.
   341  // This should only be required if it's not the controller DB.
   342  func (w *dbWorker) GetDB(namespace string) (database.TrackedDB, error) {
   343  	// Ensure Dqlite is initialised.
   344  	select {
   345  	case <-w.dbReady:
   346  	case <-w.catacomb.Dying():
   347  		return nil, w.catacomb.ErrDying()
   348  	}
   349  
   350  	// Enqueue the request.
   351  	req := makeDBRequest(namespace)
   352  	select {
   353  	case w.dbRequests <- req:
   354  	case <-w.catacomb.Dying():
   355  		return nil, w.catacomb.ErrDying()
   356  	}
   357  
   358  	// Wait for the worker loop to indicate it's done.
   359  	select {
   360  	case <-req.done:
   361  	case <-w.catacomb.Dying():
   362  		return nil, w.catacomb.ErrDying()
   363  	}
   364  
   365  	// This will return a not found error if the request was not honoured.
   366  	// The error will be logged; we don't crash this worker for bad calls.
   367  	tracked, err := w.dbRunner.Worker(namespace, w.catacomb.Dying())
   368  	if err != nil {
   369  		return nil, errors.Trace(err)
   370  	}
   371  	return tracked.(database.TrackedDB), nil
   372  }
   373  
   374  // startExistingDqliteNode takes care of starting Dqlite
   375  // when this host has run a node previously.
   376  func (w *dbWorker) startExistingDqliteNode() error {
   377  	mgr := w.cfg.NodeManager
   378  	if mgr.IsLoopbackPreferred() {
   379  		w.cfg.Logger.Infof("host is configured to use loopback address as a Dqlite node")
   380  
   381  		return errors.Trace(w.initialiseDqlite())
   382  	}
   383  
   384  	w.cfg.Logger.Infof("host is configured to use cloud-local address as a Dqlite node")
   385  
   386  	ctx, cancel := w.scopedContext()
   387  	defer cancel()
   388  
   389  	asBootstrapped, err := mgr.IsLoopbackBound(ctx)
   390  	if err != nil {
   391  		return errors.Trace(err)
   392  	}
   393  
   394  	// If this existing node is not as-bootstrapped, i.e. not bound to the
   395  	// loopback address, then it is part of a cluster. The Dqlite Raft log and
   396  	// configuration in the Dqlite data directory will indicate the cluster
   397  	// members, but we need to ensure TLS for traffic between nodes explicitly.
   398  	var options []app.Option
   399  	if !asBootstrapped {
   400  		withTLS, err := mgr.WithTLSOption()
   401  		if err != nil {
   402  			return errors.Trace(err)
   403  		}
   404  		options = append(options, withTLS)
   405  	}
   406  
   407  	return errors.Trace(w.initialiseDqlite(options...))
   408  }
   409  
   410  func (w *dbWorker) initialiseDqlite(options ...app.Option) error {
   411  	if err := w.startDqliteNode(options...); err != nil {
   412  		if errors.Is(err, errNotReady) {
   413  			return nil
   414  		}
   415  		return errors.Trace(err)
   416  	}
   417  
   418  	// Open up the default controller database.
   419  	// Other database namespaces are opened lazily via GetDB calls.
   420  	// We don't need to apply the database schema here as the
   421  	// controller database is created during bootstrap.
   422  	if err := w.openDatabase(database.ControllerNS); err != nil {
   423  		return errors.Annotate(err, "opening controller database")
   424  	}
   425  
   426  	// Begin handling external requests.
   427  	close(w.dbReady)
   428  	return nil
   429  }
   430  
   431  func (w *dbWorker) startDqliteNode(options ...app.Option) error {
   432  	w.mu.Lock()
   433  	defer w.mu.Unlock()
   434  
   435  	if w.dbApp != nil {
   436  		return nil
   437  	}
   438  
   439  	mgr := w.cfg.NodeManager
   440  
   441  	dataDir, err := mgr.EnsureDataDir()
   442  	if err != nil {
   443  		return errors.Trace(err)
   444  	}
   445  
   446  	dqliteOptions := append(options,
   447  		mgr.WithLogFuncOption(),
   448  		mgr.WithTracingOption(),
   449  	)
   450  	if w.dbApp, err = w.cfg.NewApp(dataDir, dqliteOptions...); err != nil {
   451  		return errors.Trace(err)
   452  	}
   453  
   454  	ctx, pCancel := w.scopedContext()
   455  	defer pCancel()
   456  	ctx, cCancel := context.WithTimeout(ctx, time.Minute)
   457  	defer cCancel()
   458  
   459  	if err := w.dbApp.Ready(ctx); err != nil {
   460  		if errors.Is(err, context.DeadlineExceeded) {
   461  			// We don't know whether we were cancelled by the catacomb or by timeout.
   462  			// Request API server details in case we need to invoke a backstop
   463  			// scenario. If we are shutting down, this won't matter.
   464  			if err := w.dbApp.Close(); err != nil {
   465  				return errors.Trace(err)
   466  			}
   467  			w.dbApp = nil
   468  
   469  			if err := w.requestAPIServerDetails(); err != nil {
   470  				return errors.Annotatef(err, "requesting API server details")
   471  			}
   472  			return errNotReady
   473  		}
   474  		return errors.Annotatef(err, "ensuring Dqlite is ready to process changes")
   475  	}
   476  
   477  	w.cfg.Logger.Infof("serving Dqlite application (ID: %v)", w.dbApp.ID())
   478  
   479  	if c, err := w.dbApp.Client(ctx); err == nil {
   480  		if info, err := c.Cluster(ctx); err == nil {
   481  			w.cfg.Logger.Infof("current cluster: %#v", info)
   482  		}
   483  	}
   484  
   485  	return nil
   486  }
   487  
   488  // openDatabase starts a TrackedDB worker for the database with the input name.
   489  // It is called by initialiseDqlite to open the controller database,
   490  // and via GetDB to service downstream database requests.
   491  // It is important to note that the start function passed to StartWorker is not
   492  // invoked synchronously.
   493  // Since GetDB blocks until dbReady is closed, and initialiseDqlite waits for
   494  // the node to be ready, we can assume that we will never race with a nil dbApp
   495  // when first starting up.
   496  // Since the only way we can get into this race is during shutdown or a rebind,
   497  // it is safe to return ErrDying if the catacomb is dying when we detect a nil
   498  // dbApp, or errTryAgain to force the runner to retry starting the worker
   499  // again.
   500  func (w *dbWorker) openDatabase(namespace string) error {
   501  	// Note: Do not be tempted to create the worker outside of the StartWorker
   502  	// function. Doing so would create a potential data race if openDatabase is
   503  	// called multiple times for the same namespace.
   504  	err := w.dbRunner.StartWorker(namespace, func() (worker.Worker, error) {
   505  		w.mu.RLock()
   506  		defer w.mu.RUnlock()
   507  		if w.dbApp == nil {
   508  			// If the dbApp is nil, then we're either shutting down or
   509  			// rebinding the address. In either case, we don't want to
   510  			// start a new worker. We'll return ErrTryAgain to indicate
   511  			// that we should try again in a bit. This will continue until
   512  			// the dbApp is no longer nil.
   513  			select {
   514  			case <-w.catacomb.Dying():
   515  				return nil, w.catacomb.ErrDying()
   516  			default:
   517  				return nil, errTryAgain
   518  			}
   519  		}
   520  
   521  		ctx, cancel := w.scopedContext()
   522  		defer cancel()
   523  
   524  		return w.cfg.NewDBWorker(ctx, w.dbApp, namespace,
   525  			WithClock(w.cfg.Clock),
   526  			WithLogger(w.cfg.Logger),
   527  			WithMetricsCollector(w.cfg.MetricsCollector),
   528  		)
   529  	})
   530  	if errors.Is(err, errors.AlreadyExists) {
   531  		return nil
   532  	}
   533  	return errors.Trace(err)
   534  }
   535  
   536  // handleAPIServerChangeMsg is the callback supplied to the pub/sub
   537  // subscription for API server details. It effectively synchronises the
   538  // handling of such messages into the worker's event loop.
   539  func (w *dbWorker) handleAPIServerChangeMsg(_ string, apiDetails apiserver.Details, err error) {
   540  	if err != nil {
   541  		// This should never happen.
   542  		w.cfg.Logger.Errorf("pub/sub callback error: %v", err)
   543  		return
   544  	}
   545  
   546  	select {
   547  	case <-w.catacomb.Dying():
   548  	case w.apiServerChanges <- apiDetails:
   549  	}
   550  }
   551  
   552  // processAPIServerChange deals with cluster topology changes.
   553  // Note that this is always invoked from the worker loop and will never
   554  // race with Dqlite initialisation. If this is called then we either came
   555  // up successfully or we determined that we couldn't and are waiting.
   556  func (w *dbWorker) processAPIServerChange(apiDetails apiserver.Details) error {
   557  	log := w.cfg.Logger
   558  	log.Debugf("new API server details: %#v", apiDetails)
   559  
   560  	mgr := w.cfg.NodeManager
   561  	extant, err := mgr.IsExistingNode()
   562  	if err != nil {
   563  		return errors.Trace(err)
   564  	}
   565  
   566  	ctx, cancel := w.scopedContext()
   567  	defer cancel()
   568  
   569  	// If we prefer the loopback address, we shouldn't need to do anything.
   570  	// We double-check that we are bound to the loopback address; if not, we
   571  	// bounce the worker and try to resolve that on the next pass.
   572  	if mgr.IsLoopbackPreferred() {
   573  		if extant {
   574  			isLoopbackBound, err := mgr.IsLoopbackBound(ctx)
   575  			if err != nil {
   576  				return errors.Trace(err)
   577  			}
   578  			// Everything is fine, we're bound to the loopback address and
   579  			// can return early.
   580  			if isLoopbackBound {
   581  				return nil
   582  			}
   583  
   584  			// This should never happen, but we want to be conservative.
   585  			w.cfg.Logger.Warningf("existing Dqlite node is not bound to loopback, but should be; restarting worker")
   586  		}
   587  
   588  		// We don't have a Dqlite node, but somehow we got here; we should just
   589  		// bounce the worker and try again.
   590  		return dependency.ErrBounce
   591  	}
   592  
   593  	if extant {
   594  		asBootstrapped, err := mgr.IsLoopbackBound(ctx)
   595  		if err != nil {
   596  			return errors.Trace(err)
   597  		}
   598  
   599  		serverCount := len(apiDetails.Servers)
   600  
   601  		// If we are as-bootstrapped, check if we are entering HA and need to
   602  		// change our binding from the loopback IP to a local-cloud address.
   603  		if asBootstrapped {
   604  			if serverCount == 1 {
   605  				// This bootstrapped node is still the only one around.
   606  				// We don't need to do anything.
   607  				return nil
   608  			}
   609  
   610  			addr, err := w.bindAddrFromServerDetails(apiDetails)
   611  			if err != nil {
   612  				if errors.Is(err, errors.NotFound) {
   613  					w.cfg.Logger.Infof(err.Error())
   614  					return nil
   615  				}
   616  				return errors.Trace(err)
   617  			}
   618  
   619  			if err := w.rebindAddress(ctx, addr); err != nil {
   620  				return errors.Trace(err)
   621  			}
   622  
   623  			log.Infof("successfully reconfigured Dqlite; restarting worker")
   624  			return dependency.ErrBounce
   625  		}
   626  
   627  		// If we are an existing, previously clustered node,
   628  		// and the node is running, we have nothing to do.
   629  		w.mu.RLock()
   630  		running := w.dbApp != nil
   631  		w.mu.RUnlock()
   632  		if running {
   633  			return nil
   634  		}
   635  
   636  		// Make absolutely sure. We only reconfigure the cluster if the details
   637  		// indicate exactly one controller machine, and that machine is us.
   638  		if _, ok := apiDetails.Servers[w.cfg.ControllerID]; ok && serverCount == 1 {
   639  			log.Warningf("reconfiguring Dqlite cluster with this node as the only member")
   640  			if err := w.cfg.NodeManager.SetClusterToLocalNode(ctx); err != nil {
   641  				return errors.Annotatef(err, "reconfiguring Dqlite cluster")
   642  			}
   643  
   644  			log.Infof("successfully reconfigured Dqlite; restarting worker")
   645  			return dependency.ErrBounce
   646  		}
   647  
   648  		// Otherwise there is no deterministic course of action.
   649  		// We don't want to throw an error here, because it can result in churn
   650  		// when entering HA. Just try again to start.
   651  		log.Infof("unable to reconcile current controller and Dqlite cluster status; reattempting node start-up")
   652  		return errors.Trace(w.startExistingDqliteNode())
   653  	}
   654  
   655  	// Otherwise this is a node added by enabling HA,
   656  	// and we need to join to an existing cluster.
   657  	return errors.Trace(w.joinNodeToCluster(apiDetails))
   658  }
   659  
   660  // rebindAddress stops the current node and reconfigures the cluster so that
   661  // it is a single server bound to the input local-cloud address.
   662  // It should be called only for a cluster constituted by a single node
   663  // bound to the loopback IP address.
   664  func (w *dbWorker) rebindAddress(ctx context.Context, addr string) error {
   665  	// We only rebind the address when going into HA from a single node.
   666  	// Therefore, we do not have to worry about handing over responsibilities.
   667  	// Passing false ensures we come back up in the shortest time possible.
   668  	w.shutdownDqlite(ctx, false)
   669  
   670  	mgr := w.cfg.NodeManager
   671  	servers, err := mgr.ClusterServers(ctx)
   672  	if err != nil {
   673  		return errors.Trace(err)
   674  	}
   675  
   676  	// This should be implied by an earlier check of
   677  	// NodeManager.IsLoopbackBound, but we want to guard very
   678  	// conservatively against breaking established clusters.
   679  	if len(servers) != 1 {
   680  		w.cfg.Logger.Debugf("not a singular server; skipping address rebind")
   681  		return nil
   682  	}
   683  
   684  	// We need to preserve the port from the existing address.
   685  	_, port, err := net.SplitHostPort(servers[0].Address)
   686  	if err != nil {
   687  		return errors.Trace(err)
   688  	}
   689  	servers[0].Address = net.JoinHostPort(addr, port)
   690  
   691  	w.cfg.Logger.Infof("rebinding Dqlite node to %s", addr)
   692  	if err := mgr.SetClusterServers(ctx, servers); err != nil {
   693  		return errors.Trace(err)
   694  	}
   695  
   696  	return errors.Trace(mgr.SetNodeInfo(servers[0]))
   697  }
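
// examplePreservePort is an illustrative sketch, not part of the original
// file. It isolates the address surgery performed in rebindAddress: only the
// host part of the node address changes, while the existing Dqlite port is
// preserved.
func examplePreservePort(existing, newHost string) (string, error) {
	_, port, err := net.SplitHostPort(existing)
	if err != nil {
		return "", errors.Trace(err)
	}
	return net.JoinHostPort(newHost, port), nil
}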
   698  
   699  // joinNodeToCluster uses the input server details to determine a bind address
   700  // for this node, and one or more addresses of other nodes to cluster with.
   701  // It then uses these to initialise Dqlite.
   702  // If either the bind address or the cluster addresses cannot be determined,
   703  // we just return nil and keep waiting for further server detail messages.
   704  func (w *dbWorker) joinNodeToCluster(apiDetails apiserver.Details) error {
   705  	// Get our address from the API details.
   706  	localAddr, err := w.bindAddrFromServerDetails(apiDetails)
   707  	if err != nil {
   708  		if errors.Is(err, errors.NotFound) {
   709  			w.cfg.Logger.Infof(err.Error())
   710  			return nil
   711  		}
   712  		return errors.Trace(err)
   713  	}
   714  
   715  	// Then get the addresses of the other servers,
   716  	// so we can join the cluster.
   717  	var clusterAddrs []string
   718  	for id, server := range apiDetails.Servers {
   719  		hostPort := server.InternalAddress
   720  		if id != w.cfg.ControllerID && hostPort != "" {
   721  			addr, _, err := net.SplitHostPort(hostPort)
   722  			if err != nil {
   723  				return errors.Annotatef(err, "splitting host/port for %s", hostPort)
   724  			}
   725  			clusterAddrs = append(clusterAddrs, addr)
   726  		}
   727  	}
   728  	if len(clusterAddrs) == 0 {
   729  		w.cfg.Logger.Infof("no addresses available for this Dqlite node to join cluster")
   730  		return nil
   731  	}
   732  
   733  	w.cfg.Logger.Infof("joining Dqlite cluster")
   734  	mgr := w.cfg.NodeManager
   735  
   736  	withTLS, err := mgr.WithTLSOption()
   737  	if err != nil {
   738  		return errors.Trace(err)
   739  	}
   740  
   741  	return errors.Trace(w.initialiseDqlite(
   742  		mgr.WithAddressOption(localAddr), mgr.WithClusterOption(clusterAddrs), withTLS))
   743  }
   744  
   745  // bindAddrFromServerDetails returns the internal IP address from the
   746  // input details that corresponds with this controller machine.
   747  func (w *dbWorker) bindAddrFromServerDetails(apiDetails apiserver.Details) (string, error) {
   748  	hostPort := apiDetails.Servers[w.cfg.ControllerID].InternalAddress
   749  	if hostPort == "" {
   750  		return "", errors.NotFoundf("internal address for this Dqlite node to bind to")
   751  	}
   752  
   753  	addr, _, err := net.SplitHostPort(hostPort)
   754  	if err != nil {
   755  		return "", errors.Annotatef(err, "splitting host/port for %s", hostPort)
   756  	}
   757  
   758  	return addr, nil
   759  }
   760  
   761  // shutdownDqlite shuts down the local Dqlite node, making a best-effort
   762  // attempt at graceful handover when the input boolean is true.
   763  // If the worker is not shutting down permanently, Dqlite should be
   764  // reinitialised either directly or by bouncing the agent reasonably
   765  // soon after calling this method.
   766  func (w *dbWorker) shutdownDqlite(ctx context.Context, handover bool) {
   767  	w.cfg.Logger.Infof("shutting down Dqlite node")
   768  
   769  	w.mu.Lock()
   770  	defer w.mu.Unlock()
   771  
   772  	if w.dbApp == nil {
   773  		return
   774  	}
   775  
   776  	if handover {
   777  		// Set a bound on the time that we allow for hand off.
   778  		ctx, cancel := context.WithTimeout(ctx, nodeShutdownTimeout)
   779  		defer cancel()
   780  
   781  		if err := w.dbApp.Handover(ctx); err != nil {
   782  			w.cfg.Logger.Errorf("handing off Dqlite responsibilities: %v", err)
   783  		}
   784  	} else {
   785  		w.cfg.Logger.Infof("skipping Dqlite handover")
   786  	}
   787  
   788  	if err := w.dbApp.Close(); err != nil {
   789  		w.cfg.Logger.Errorf("closing Dqlite application: %v", err)
   790  	}
   791  
   792  	w.dbApp = nil
   793  }
   794  
   795  func (w *dbWorker) requestAPIServerDetails() error {
   796  	_, err := w.cfg.Hub.Publish(apiserver.DetailsRequestTopic, apiserver.DetailsRequest{
   797  		Requester: "db-accessor",
   798  		LocalOnly: true,
   799  	})
   800  	return errors.Trace(err)
   801  }
   802  
   803  // scopedContext returns a context that is in the scope of the worker lifetime.
   804  // The returned context is cancelled when the returned cancel function is
   805  // called, or when the worker begins dying.
   806  func (w *dbWorker) scopedContext() (context.Context, context.CancelFunc) {
   807  	ctx, cancel := context.WithCancel(context.Background())
   808  	return w.catacomb.Context(ctx), cancel
   809  }
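
// exampleWorkerLifecycle is an illustrative sketch, not part of the original
// file. It shows the expected lifecycle of the worker from a caller's
// perspective, assuming a fully populated WorkerConfig; in production the
// worker is wired up via the dependency engine rather than called directly.
func exampleWorkerLifecycle(cfg WorkerConfig) error {
	w, err := newWorker(cfg)
	if err != nil {
		return errors.Trace(err)
	}
	defer func() { _ = worker.Stop(w) }()

	// GetDB blocks until the Dqlite node is ready to serve requests.
	db, err := w.GetDB(database.ControllerNS)
	if err != nil {
		return errors.Trace(err)
	}
	_ = db
	return nil
}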