github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/dbaccessor/worker.go

// Copyright 2021 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package dbaccessor

import (
	"context"
	"net"
	"sync"
	"time"

	"github.com/juju/clock"
	"github.com/juju/errors"
	"github.com/juju/worker/v3"
	"github.com/juju/worker/v3/catacomb"
	"github.com/juju/worker/v3/dependency"

	"github.com/juju/juju/core/database"
	"github.com/juju/juju/database/app"
	"github.com/juju/juju/database/dqlite"
	"github.com/juju/juju/pubsub/apiserver"
)

const (
	// errTryAgain indicates that the worker should try
	// again later to start a DB tracker worker.
	errTryAgain = errors.ConstError("DB node is nil, but worker is not dying; rescheduling TrackedDB start attempt")

	// errNotReady indicates that we successfully created a new Dqlite app,
	// but the Ready call timed out, and we are waiting for broadcast info.
	errNotReady = errors.ConstError("started DB app, but it failed to become ready; waiting for topology updates")
)

// nodeShutdownTimeout is the timeout that we add to the context passed to
// handoff/shutdown calls when shutting down the Dqlite node.
const nodeShutdownTimeout = 30 * time.Second

// NodeManager creates Dqlite `App` initialisation arguments and options.
type NodeManager interface {
	// IsExistingNode returns true if this machine or container has run a
	// Dqlite node in the past.
	IsExistingNode() (bool, error)

	// IsLoopbackPreferred returns true if the Dqlite application should
	// be bound to the loopback address.
	IsLoopbackPreferred() bool

	// IsLoopbackBound returns true if we are a cluster of one,
	// and bound to the loopback IP address.
	IsLoopbackBound(context.Context) (bool, error)

	// EnsureDataDir ensures that a directory for Dqlite data exists at
	// a path determined by the agent config, then returns that path.
	EnsureDataDir() (string, error)

	// ClusterServers returns the node information for
	// Dqlite nodes configured to be in the cluster.
	ClusterServers(context.Context) ([]dqlite.NodeInfo, error)

	// SetClusterServers reconfigures the Dqlite cluster members.
	SetClusterServers(context.Context, []dqlite.NodeInfo) error

	// SetNodeInfo rewrites the local node information
	// file in the Dqlite data directory.
	SetNodeInfo(dqlite.NodeInfo) error

	// SetClusterToLocalNode reconfigures the Dqlite cluster
	// so that it has the local node as its only member.
	SetClusterToLocalNode(ctx context.Context) error

	// WithLogFuncOption returns a Dqlite application Option that will proxy Dqlite
	// log output via this factory's logger where the level is recognised.
	WithLogFuncOption() app.Option

	// WithTracingOption returns a Dqlite application Option
	// that will enable tracing of Dqlite operations.
	WithTracingOption() app.Option

	// WithAddressOption returns a Dqlite application Option
	// for specifying the local address:port to use.
	WithAddressOption(string) app.Option

	// WithTLSOption returns a Dqlite application Option for TLS encryption
	// of traffic between clients and clustered application nodes.
	WithTLSOption() (app.Option, error)

	// WithClusterOption returns a Dqlite application Option for initialising
	// Dqlite as the member of a cluster with peers representing other controllers.
	WithClusterOption([]string) app.Option
}

// DBGetter describes the ability to supply a sql.DB
// reference for a particular database.
type DBGetter interface {
	// GetDB returns a sql.DB reference for the dqlite-backed database that
	// contains the data for the specified namespace.
	// A NotFound error is returned if the worker is unaware of the requested DB.
	GetDB(namespace string) (database.TrackedDB, error)
}
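
// A minimal caller sketch for the DBGetter contract (illustrative only; the
// getter variable and the surrounding error handling are assumptions, not
// part of this package):
//
//	db, err := getter.GetDB(database.ControllerNS)
//	if errors.Is(err, errors.NotFound) {
//		// The worker does not know about this namespace.
//	} else if err != nil {
//		return errors.Trace(err)
//	}
//	// db is a database.TrackedDB for running transactions against the
//	// namespace's Dqlite database.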

// dbRequest is used to pass requests for TrackedDB
// instances into the worker loop.
type dbRequest struct {
	namespace string
	done      chan struct{}
}

// makeDBRequest creates a new TrackedDB request for the input namespace.
func makeDBRequest(namespace string) dbRequest {
	return dbRequest{
		namespace: namespace,
		done:      make(chan struct{}),
	}
}

// WorkerConfig encapsulates the configuration options for the
// dbaccessor worker.
type WorkerConfig struct {
	NodeManager      NodeManager
	Clock            clock.Clock
	MetricsCollector *Collector

	// Hub is the pub/sub central hub used to receive notifications
	// about API server topology changes.
	Hub         Hub
	Logger      Logger
	NewApp      func(string, ...app.Option) (DBApp, error)
	NewDBWorker func(context.Context, DBApp, string, ...TrackedDBWorkerOption) (TrackedDB, error)

	// ControllerID uniquely identifies the controller that this
	// worker is running on. It is equivalent to the machine ID.
	ControllerID string
}

// Validate ensures that the config values are valid.
func (c *WorkerConfig) Validate() error {
	if c.NodeManager == nil {
		return errors.NotValidf("missing NodeManager")
	}
	if c.Clock == nil {
		return errors.NotValidf("missing Clock")
	}
	if c.MetricsCollector == nil {
		return errors.NotValidf("missing metrics collector")
	}
	if c.Hub == nil {
		return errors.NotValidf("missing Hub")
	}
	if c.Logger == nil {
		return errors.NotValidf("missing Logger")
	}
	if c.NewApp == nil {
		return errors.NotValidf("missing NewApp")
	}
	if c.NewDBWorker == nil {
		return errors.NotValidf("missing NewDBWorker")
	}
	return nil
}

type dbWorker struct {
	cfg      WorkerConfig
	catacomb catacomb.Catacomb

	mu       sync.RWMutex
	dbApp    DBApp
	dbRunner *worker.Runner

	// dbReady is used to signal that we can
	// begin processing GetDB requests.
	dbReady chan struct{}

	// dbRequests is used to synchronise GetDB
	// requests into this worker's event loop.
	dbRequests chan dbRequest

	// apiServerChanges is used to handle incoming changes
	// to API server details within the worker loop.
	apiServerChanges chan apiserver.Details
}

func newWorker(cfg WorkerConfig) (*dbWorker, error) {
	var err error
	if err = cfg.Validate(); err != nil {
		return nil, errors.Trace(err)
	}

	w := &dbWorker{
		cfg: cfg,
		dbRunner: worker.NewRunner(worker.RunnerParams{
			Clock: cfg.Clock,
			// If a worker goes down after multiple retry attempts, we do
			// want the dbaccessor itself to go down; that will then bring
			// up a new Dqlite app.
			IsFatal: func(err error) bool {
				// If there is a rebind while a worker is starting up, the
				// dbApp will be nil. In that case we return errTryAgain
				// rather than killing the worker, forcing the runner to
				// try again.
				return !errors.Is(err, errTryAgain)
			},
			RestartDelay: time.Second * 10,
		}),
		dbReady:          make(chan struct{}),
		dbRequests:       make(chan dbRequest),
		apiServerChanges: make(chan apiserver.Details),
	}

	if err = catacomb.Invoke(catacomb.Plan{
		Site: &w.catacomb,
		Work: w.loop,
		Init: []worker.Worker{
			w.dbRunner,
		},
	}); err != nil {
		return nil, errors.Trace(err)
	}

	return w, nil
}

func (w *dbWorker) loop() (err error) {
	// The context here should not be tied to the catacomb, as such a context
	// would be cancelled when the worker is stopped, and we want to give a
	// chance for the Dqlite app to shut down gracefully.
	// There is a timeout in shutdownDqlite to ensure that we don't block
	// forever.
	// We allow a very short time to check whether we should attempt to hand
	// over to another node.
	// If we can't determine that we *shouldn't* within the time window,
	// we go ahead and make the attempt.
	defer func() {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		bs, _ := w.cfg.NodeManager.IsLoopbackBound(ctx)
		w.shutdownDqlite(context.Background(), !bs)
		cancel()
	}()

	extant, err := w.cfg.NodeManager.IsExistingNode()
	if err != nil {
		return errors.Trace(err)
	}

	// At this time, while Juju is using both Mongo and Dqlite, we piggyback
	// off the peer-grouper, which applies any configured HA space and
	// broadcasts clustering addresses. Once we do away with Mongo,
	// that worker will be replaced with a Dqlite-focussed analogue that does
	// largely the same thing, though potentially disseminating changes via a
	// mechanism other than pub/sub.
	unsub, err := w.cfg.Hub.Subscribe(apiserver.DetailsTopic, w.handleAPIServerChangeMsg)
	if err != nil {
		return errors.Annotate(err, "subscribing to API server topology changes")
	}
	defer unsub()

	// If this is an existing node, we start it up immediately.
	// Otherwise, this host is entering a HA cluster, and we need to wait for
	// the peer-grouper to determine and broadcast addresses satisfying the
	// Juju HA space (if configured); request those details.
	// Once received we can continue configuring this node as a member.
	if extant {
		if err := w.startExistingDqliteNode(); err != nil {
			return errors.Trace(err)
		}
	} else {
		if err := w.requestAPIServerDetails(); err != nil {
			return errors.Trace(err)
		}
	}

	for {
		select {
		case req := <-w.dbRequests:
			if err := w.openDatabase(req.namespace); err != nil {
				w.cfg.Logger.Errorf("opening database %q: %s", req.namespace, err.Error())
			}
			close(req.done)
		case <-w.catacomb.Dying():
			return w.catacomb.ErrDying()
		case apiDetails := <-w.apiServerChanges:
			if err := w.processAPIServerChange(apiDetails); err != nil {
				return errors.Trace(err)
			}
		}
	}
}

// Kill is part of the worker.Worker interface.
func (w *dbWorker) Kill() {
	w.catacomb.Kill(nil)
}

// Wait is part of the worker.Worker interface.
func (w *dbWorker) Wait() error {
	return w.catacomb.Wait()
}

// Report provides information for the engine report.
func (w *dbWorker) Report() map[string]any {
	w.mu.RLock()
	defer w.mu.RUnlock()

	// We need to guard against attempting to report when setting up or dying,
	// so we don't end up panicking with missing information.
	result := w.dbRunner.Report()

	if w.dbApp == nil {
		result["leader"] = ""
		result["leader-id"] = uint64(0)
		result["leader-role"] = ""
		return result
	}

	ctx, cancel := w.scopedContext()
	defer cancel()

	var (
		leader     string
		leaderRole string
		leaderID   uint64
	)
	if client, err := w.dbApp.Client(ctx); err == nil {
		if nodeInfo, err := client.Leader(ctx); err == nil {
			leaderID = nodeInfo.ID
			leader = nodeInfo.Address
			leaderRole = nodeInfo.Role.String()
		}
	}

	result["leader-id"] = leaderID
	result["leader"] = leader
	result["leader-role"] = leaderRole

	return result
}

// GetDB returns a TrackedDB reference for the dqlite-backed
// database that contains the data for the specified namespace.
// TODO (stickupkid): Before handing out any DB for any namespace,
// we should first validate it exists in the controller list.
// This should only be required if it's not the controller DB.
func (w *dbWorker) GetDB(namespace string) (database.TrackedDB, error) {
	// Ensure Dqlite is initialised.
	select {
	case <-w.dbReady:
	case <-w.catacomb.Dying():
		return nil, w.catacomb.ErrDying()
	}

	// Enqueue the request.
	req := makeDBRequest(namespace)
	select {
	case w.dbRequests <- req:
	case <-w.catacomb.Dying():
		return nil, w.catacomb.ErrDying()
	}

	// Wait for the worker loop to indicate it's done.
	select {
	case <-req.done:
	case <-w.catacomb.Dying():
		return nil, w.catacomb.ErrDying()
	}

	// This will return a not-found error if the request was not honoured.
	// The error will be logged - we don't crash this worker for bad calls.
	tracked, err := w.dbRunner.Worker(namespace, w.catacomb.Dying())
	if err != nil {
		return nil, errors.Trace(err)
	}
	return tracked.(database.TrackedDB), nil
}

// startExistingDqliteNode takes care of starting Dqlite
// when this host has run a node previously.
func (w *dbWorker) startExistingDqliteNode() error {
	mgr := w.cfg.NodeManager
	if mgr.IsLoopbackPreferred() {
		w.cfg.Logger.Infof("host is configured to use loopback address as a Dqlite node")

		return errors.Trace(w.initialiseDqlite())
	}

	w.cfg.Logger.Infof("host is configured to use cloud-local address as a Dqlite node")

	ctx, cancel := w.scopedContext()
	defer cancel()

	asBootstrapped, err := mgr.IsLoopbackBound(ctx)
	if err != nil {
		return errors.Trace(err)
	}

	// If this existing node is not in the as-bootstrapped state (bound to
	// the loopback address), then it is part of a cluster. The Dqlite Raft
	// log and configuration in the Dqlite data directory will indicate the
	// cluster members, but we need to ensure TLS for traffic between nodes
	// explicitly.
	var options []app.Option
	if !asBootstrapped {
		withTLS, err := mgr.WithTLSOption()
		if err != nil {
			return errors.Trace(err)
		}
		options = append(options, withTLS)
	}

	return errors.Trace(w.initialiseDqlite(options...))
}

func (w *dbWorker) initialiseDqlite(options ...app.Option) error {
	if err := w.startDqliteNode(options...); err != nil {
		if errors.Is(err, errNotReady) {
			return nil
		}
		return errors.Trace(err)
	}

	// Open up the default controller database.
	// Other database namespaces are opened lazily via GetDB calls.
	// We don't need to apply the database schema here as the
	// controller database is created during bootstrap.
	if err := w.openDatabase(database.ControllerNS); err != nil {
		return errors.Annotate(err, "opening controller database")
	}

	// Begin handling external requests.
	close(w.dbReady)
	return nil
}

func (w *dbWorker) startDqliteNode(options ...app.Option) error {
	w.mu.Lock()
	defer w.mu.Unlock()

	if w.dbApp != nil {
		return nil
	}

	mgr := w.cfg.NodeManager

	dataDir, err := mgr.EnsureDataDir()
	if err != nil {
		return errors.Trace(err)
	}

	dqliteOptions := append(options,
		mgr.WithLogFuncOption(),
		mgr.WithTracingOption(),
	)
	if w.dbApp, err = w.cfg.NewApp(dataDir, dqliteOptions...); err != nil {
		return errors.Trace(err)
	}

	ctx, pCancel := w.scopedContext()
	defer pCancel()
	ctx, cCancel := context.WithTimeout(ctx, time.Minute)
	defer cCancel()

	if err := w.dbApp.Ready(ctx); err != nil {
		if errors.Is(err, context.DeadlineExceeded) {
			// We don't know whether we were cancelled by the tomb or by the
			// timeout. Request API server details in case we need to invoke
			// a backstop scenario. If we are shutting down, this won't matter.
			if err := w.dbApp.Close(); err != nil {
				return errors.Trace(err)
			}
			w.dbApp = nil

			if err := w.requestAPIServerDetails(); err != nil {
				return errors.Annotatef(err, "requesting API server details")
			}
			return errNotReady
		}
		return errors.Annotatef(err, "ensuring Dqlite is ready to process changes")
	}

	w.cfg.Logger.Infof("serving Dqlite application (ID: %v)", w.dbApp.ID())

	if c, err := w.dbApp.Client(ctx); err == nil {
		if info, err := c.Cluster(ctx); err == nil {
			w.cfg.Logger.Infof("current cluster: %#v", info)
		}
	}

	return nil
}
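
// Note on the errNotReady path above: when Ready times out, the partially
// started app is closed and API server details are requested. The next
// apiserver.Details message is then handled by processAPIServerChange, which
// either joins this node to a cluster, reconfigures a lone node via
// SetClusterToLocalNode, or retries start-up.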

// openDatabase starts a TrackedDB worker for the database with the input name.
// It is called by initialiseDqlite to open the controller database,
// and via GetDB to service downstream database requests.
// It is important to note that the start function passed to StartWorker is not
// invoked synchronously.
// Since GetDB blocks until dbReady is closed, and initialiseDqlite waits for
// the node to be ready, we can assume that we will never race with a nil dbApp
// when first starting up.
// Since the only way we can get into this race is during shutdown or a rebind,
// it is safe to return ErrDying if the catacomb is dying when we detect a nil
// database, or errTryAgain to force the runner to retry starting the worker.
func (w *dbWorker) openDatabase(namespace string) error {
	// Note: do not be tempted to create the worker outside of the StartWorker
	// function. That would create a potential data race if openDatabase is
	// called multiple times for the same namespace.
	err := w.dbRunner.StartWorker(namespace, func() (worker.Worker, error) {
		w.mu.RLock()
		defer w.mu.RUnlock()
		if w.dbApp == nil {
			// If the dbApp is nil, then we're either shutting down or
			// rebinding the address. In either case, we don't want to
			// start a new worker. We return errTryAgain so that the
			// runner keeps retrying until the dbApp is no longer nil.
			select {
			case <-w.catacomb.Dying():
				return nil, w.catacomb.ErrDying()
			default:
				return nil, errTryAgain
			}
		}

		ctx, cancel := w.scopedContext()
		defer cancel()

		return w.cfg.NewDBWorker(ctx, w.dbApp, namespace,
			WithClock(w.cfg.Clock),
			WithLogger(w.cfg.Logger),
			WithMetricsCollector(w.cfg.MetricsCollector),
		)
	})
	if errors.Is(err, errors.AlreadyExists) {
		return nil
	}
	return errors.Trace(err)
}
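
// Note on retry behaviour: errTryAgain is deliberately treated as non-fatal
// by the runner configured in newWorker (its IsFatal returns false for it),
// so a start attempt that races with a nil dbApp is simply retried after the
// runner's RestartDelay rather than tearing down the dbaccessor worker.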

// handleAPIServerChangeMsg is the callback supplied to the pub/sub
// subscription for API server details. It effectively synchronises the
// handling of such messages into the worker's event loop.
func (w *dbWorker) handleAPIServerChangeMsg(_ string, apiDetails apiserver.Details, err error) {
	if err != nil {
		// This should never happen.
		w.cfg.Logger.Errorf("pub/sub callback error: %v", err)
		return
	}

	select {
	case <-w.catacomb.Dying():
	case w.apiServerChanges <- apiDetails:
	}
}

// processAPIServerChange deals with cluster topology changes.
// Note that this is always invoked from the worker loop and will never
// race with Dqlite initialisation. If this is called then we either came
// up successfully or we determined that we couldn't and are waiting.
func (w *dbWorker) processAPIServerChange(apiDetails apiserver.Details) error {
	log := w.cfg.Logger
	log.Debugf("new API server details: %#v", apiDetails)

	mgr := w.cfg.NodeManager
	extant, err := mgr.IsExistingNode()
	if err != nil {
		return errors.Trace(err)
	}

	ctx, cancel := w.scopedContext()
	defer cancel()

	// If we prefer the loopback address, we shouldn't need to do anything.
	// We double-check that we are bound to the loopback address; if not,
	// we bounce the worker and try to resolve that on the next pass.
	if mgr.IsLoopbackPreferred() {
		if extant {
			isLoopbackBound, err := mgr.IsLoopbackBound(ctx)
			if err != nil {
				return errors.Trace(err)
			}
			// Everything is fine; we're bound to the loopback address and
			// can return early.
			if isLoopbackBound {
				return nil
			}

			// This should never happen, but we want to be conservative.
			w.cfg.Logger.Warningf("existing Dqlite node is not bound to loopback; but should be; restarting worker")
		}

		// We don't have a Dqlite node, but somehow we got here;
		// just bounce the worker and try again.
		return dependency.ErrBounce
	}

	if extant {
		asBootstrapped, err := mgr.IsLoopbackBound(ctx)
		if err != nil {
			return errors.Trace(err)
		}

		serverCount := len(apiDetails.Servers)

		// If we are as-bootstrapped, check if we are entering HA and need to
		// change our binding from the loopback IP to a local-cloud address.
		if asBootstrapped {
			if serverCount == 1 {
				// This bootstrapped node is still the only one around.
				// We don't need to do anything.
				return nil
			}

			addr, err := w.bindAddrFromServerDetails(apiDetails)
			if err != nil {
				if errors.Is(err, errors.NotFound) {
					w.cfg.Logger.Infof(err.Error())
					return nil
				}
				return errors.Trace(err)
			}

			if err := w.rebindAddress(ctx, addr); err != nil {
				return errors.Trace(err)
			}

			log.Infof("successfully reconfigured Dqlite; restarting worker")
			return dependency.ErrBounce
		}

		// If we are an existing, previously clustered node,
		// and the node is running, we have nothing to do.
		w.mu.RLock()
		running := w.dbApp != nil
		w.mu.RUnlock()
		if running {
			return nil
		}

		// Make absolutely sure. We only reconfigure the cluster if the details
		// indicate exactly one controller machine, and that machine is us.
		if _, ok := apiDetails.Servers[w.cfg.ControllerID]; ok && serverCount == 1 {
			log.Warningf("reconfiguring Dqlite cluster with this node as the only member")
			if err := w.cfg.NodeManager.SetClusterToLocalNode(ctx); err != nil {
				return errors.Annotatef(err, "reconfiguring Dqlite cluster")
			}

			log.Infof("successfully reconfigured Dqlite; restarting worker")
			return dependency.ErrBounce
		}

		// Otherwise there is no deterministic course of action.
		// We don't want to throw an error here, because it can result in
		// churn when entering HA. Just try again to start.
		log.Infof("unable to reconcile current controller and Dqlite cluster status; reattempting node start-up")
		return errors.Trace(w.startExistingDqliteNode())
	}

	// Otherwise this is a node added by enabling HA,
	// and we need to join an existing cluster.
	return errors.Trace(w.joinNodeToCluster(apiDetails))
}

// rebindAddress stops the current node and reconfigures the cluster so that
// it is a single server bound to the input local-cloud address.
// It should be called only for a cluster constituted by a single node
// bound to the loopback IP address.
func (w *dbWorker) rebindAddress(ctx context.Context, addr string) error {
	// We only rebind the address when going into HA from a single node.
	// Therefore, we do not have to worry about handing over responsibilities.
	// Passing false ensures we come back up in the shortest time possible.
	w.shutdownDqlite(ctx, false)

	mgr := w.cfg.NodeManager
	servers, err := mgr.ClusterServers(ctx)
	if err != nil {
		return errors.Trace(err)
	}

	// This should be implied by an earlier check of
	// NodeManager.IsLoopbackBound, but we want to guard very
	// conservatively against breaking established clusters.
	if len(servers) != 1 {
		w.cfg.Logger.Debugf("not a singular server; skipping address rebind")
		return nil
	}

	// We need to preserve the port from the existing address.
	_, port, err := net.SplitHostPort(servers[0].Address)
	if err != nil {
		return errors.Trace(err)
	}
	servers[0].Address = net.JoinHostPort(addr, port)

	w.cfg.Logger.Infof("rebinding Dqlite node to %s", addr)
	if err := mgr.SetClusterServers(ctx, servers); err != nil {
		return errors.Trace(err)
	}

	return errors.Trace(mgr.SetNodeInfo(servers[0]))
}
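
// For illustration (the addresses here are hypothetical): if the single
// recorded cluster member is "127.0.0.1:17666" and the new local-cloud bind
// address is "10.0.0.5", the node info is rewritten to "10.0.0.5:17666",
// preserving the original port.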

// joinNodeToCluster uses the input server details to determine a bind address
// for this node, and one or more addresses of other nodes to cluster with.
// It then uses these to initialise Dqlite.
// If either the bind or cluster addresses cannot be determined,
// we just return nil and keep waiting for further server detail messages.
func (w *dbWorker) joinNodeToCluster(apiDetails apiserver.Details) error {
	// Get our address from the API details.
	localAddr, err := w.bindAddrFromServerDetails(apiDetails)
	if err != nil {
		if errors.Is(err, errors.NotFound) {
			w.cfg.Logger.Infof(err.Error())
			return nil
		}
		return errors.Trace(err)
	}

	// Then get addresses for any other of the servers,
	// so we can join the cluster.
	var clusterAddrs []string
	for id, server := range apiDetails.Servers {
		hostPort := server.InternalAddress
		if id != w.cfg.ControllerID && hostPort != "" {
			addr, _, err := net.SplitHostPort(hostPort)
			if err != nil {
				return errors.Annotatef(err, "splitting host/port for %s", hostPort)
			}
			clusterAddrs = append(clusterAddrs, addr)
		}
	}
	if len(clusterAddrs) == 0 {
		w.cfg.Logger.Infof("no addresses available for this Dqlite node to join cluster")
		return nil
	}

	w.cfg.Logger.Infof("joining Dqlite cluster")
	mgr := w.cfg.NodeManager

	withTLS, err := mgr.WithTLSOption()
	if err != nil {
		return errors.Trace(err)
	}

	return errors.Trace(w.initialiseDqlite(
		mgr.WithAddressOption(localAddr), mgr.WithClusterOption(clusterAddrs), withTLS))
}

// bindAddrFromServerDetails returns the internal IP address from the
// input details that corresponds with this controller machine.
func (w *dbWorker) bindAddrFromServerDetails(apiDetails apiserver.Details) (string, error) {
	hostPort := apiDetails.Servers[w.cfg.ControllerID].InternalAddress
	if hostPort == "" {
		return "", errors.NotFoundf("internal address for this Dqlite node to bind to")
	}

	addr, _, err := net.SplitHostPort(hostPort)
	if err != nil {
		return "", errors.Annotatef(err, "splitting host/port for %s", hostPort)
	}

	return addr, nil
}

// shutdownDqlite shuts down the local Dqlite node, making a best-effort
// attempt at graceful handover when the input boolean is true.
// If the worker is not shutting down permanently, Dqlite should be
// reinitialised either directly or by bouncing the agent reasonably
// soon after calling this method.
func (w *dbWorker) shutdownDqlite(ctx context.Context, handover bool) {
	w.cfg.Logger.Infof("shutting down Dqlite node")

	w.mu.Lock()
	defer w.mu.Unlock()

	if w.dbApp == nil {
		return
	}

	if handover {
		// Set a bound on the time that we allow for hand-off.
		ctx, cancel := context.WithTimeout(ctx, nodeShutdownTimeout)
		defer cancel()

		if err := w.dbApp.Handover(ctx); err != nil {
			w.cfg.Logger.Errorf("handing off Dqlite responsibilities: %v", err)
		}
	} else {
		w.cfg.Logger.Infof("skipping Dqlite handover")
	}

	if err := w.dbApp.Close(); err != nil {
		w.cfg.Logger.Errorf("closing Dqlite application: %v", err)
	}

	w.dbApp = nil
}
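
// Note: the handover attempt above is bounded by nodeShutdownTimeout (30
// seconds), whereas the decision about whether to hand over at all (see the
// deferred call in loop) is given only one second; if that check cannot
// complete in time, the worker errs on the side of attempting the handover.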

func (w *dbWorker) requestAPIServerDetails() error {
	_, err := w.cfg.Hub.Publish(apiserver.DetailsRequestTopic, apiserver.DetailsRequest{
		Requester: "db-accessor",
		LocalOnly: true,
	})
	return errors.Trace(err)
}

// scopedContext returns a context that is in the scope of the worker lifetime.
// The returned context is cancelled when the returned cancel function is
// called or when the worker's catacomb is killed.
func (w *dbWorker) scopedContext() (context.Context, context.CancelFunc) {
	ctx, cancel := context.WithCancel(context.Background())
	return w.catacomb.Context(ctx), cancel
}
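
// scopedContext is used throughout this worker in the pattern below
// (illustrative; it mirrors the call sites above):
//
//	ctx, cancel := w.scopedContext()
//	defer cancel()
//
// Because the context is derived via the catacomb, long-running Dqlite and
// NodeManager calls made with it are abandoned promptly when the worker dies.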