package dispatcher

import (
	"context"
	"fmt"
	"net"
	"strconv"
	"sync"
	"time"

	"github.com/docker/go-events"
	"github.com/docker/go-metrics"
	"github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/api/equality"
	"github.com/docker/swarmkit/ca"
	"github.com/docker/swarmkit/log"
	"github.com/docker/swarmkit/manager/drivers"
	"github.com/docker/swarmkit/manager/state/store"
	"github.com/docker/swarmkit/protobuf/ptypes"
	"github.com/docker/swarmkit/remotes"
	"github.com/docker/swarmkit/watch"
	gogotypes "github.com/gogo/protobuf/types"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

const (
	// DefaultHeartBeatPeriod is the default heartbeat period set in the
	// cluster config, and the fallback used when the cluster config is missing.
	DefaultHeartBeatPeriod       = 5 * time.Second
	defaultHeartBeatEpsilon      = 500 * time.Millisecond
	defaultGracePeriodMultiplier = 3
	defaultRateLimitPeriod       = 8 * time.Second

	// maxBatchItems is the threshold of queued writes that should
	// trigger an actual transaction to commit them to the shared store.
	maxBatchItems = 10000

	// maxBatchInterval needs to strike a balance between keeping
	// latency low, and realizing opportunities to combine many writes
	// into a single transaction. A fraction of a second feels about
	// right.
	maxBatchInterval = 100 * time.Millisecond

	modificationBatchLimit = 100
	batchingWaitTime       = 100 * time.Millisecond

	// defaultNodeDownPeriod specifies the default time period we
	// wait before moving tasks assigned to down nodes to ORPHANED
	// state.
	defaultNodeDownPeriod = 24 * time.Hour
)

var (
	// ErrNodeAlreadyRegistered is returned if a node with the same ID was
	// already registered with this dispatcher.
	ErrNodeAlreadyRegistered = errors.New("node already registered")
	// ErrNodeNotRegistered is returned if a node with the given ID wasn't
	// registered with this dispatcher.
	ErrNodeNotRegistered = errors.New("node not registered")
	// ErrSessionInvalid is returned when the session in use is no longer valid.
	// The node should re-register and start a new session.
	ErrSessionInvalid = errors.New("session invalid")
	// ErrNodeNotFound is returned when the Node doesn't exist in raft.
	ErrNodeNotFound = errors.New("node not found")

	// Scheduling delay timer.
	schedulingDelayTimer metrics.Timer
)

func init() {
	ns := metrics.NewNamespace("swarm", "dispatcher", nil)
	schedulingDelayTimer = ns.NewTimer("scheduling_delay",
		"Scheduling delay is the time a task takes to go from NEW to RUNNING state.")
	metrics.Register(ns)
}

// Config is the configuration for the Dispatcher. For defaults, use
// DefaultConfig.
type Config struct {
	HeartbeatPeriod  time.Duration
	HeartbeatEpsilon time.Duration
	// RateLimitPeriod specifies how often a node with the same ID can try to
	// register a new session.
	RateLimitPeriod       time.Duration
	GracePeriodMultiplier int
}
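
// A rough sketch of how these settings interact (the jitter and grace-period
// logic lives in the nodeStore/heartbeat helpers, not in this file, so this is
// a hedged summary rather than a specification): each node's heartbeat TTL is
// expected to fall within HeartbeatPeriod plus or minus HeartbeatEpsilon,
// e.g. roughly 4.5s-5.5s with the defaults, and GracePeriodMultiplier widens
// that window (about 3x, so ~15s) for nodes whose state is not yet confirmed,
// such as right after registration or a leadership change.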

// DefaultConfig returns the default config for the Dispatcher.
func DefaultConfig() *Config {
	return &Config{
		HeartbeatPeriod:       DefaultHeartBeatPeriod,
		HeartbeatEpsilon:      defaultHeartBeatEpsilon,
		RateLimitPeriod:       defaultRateLimitPeriod,
		GracePeriodMultiplier: defaultGracePeriodMultiplier,
	}
}

// Cluster is the interface that represents a raft cluster.
// manager/state/raft.Node implements it. This interface exists only to make
// unit testing easier.
type Cluster interface {
	GetMemberlist() map[uint64]*api.RaftMember
	SubscribePeers() (chan events.Event, func())
	MemoryStore() *store.MemoryStore
}

// nodeUpdate provides a new status and/or description to apply to a node
// object.
type nodeUpdate struct {
	status      *api.NodeStatus
	description *api.NodeDescription
}

// clusterUpdate is an object that stores an update to the cluster that should trigger
// a new session message. These are pointers to indicate the difference between
// "there is no update" and "update this to nil".
type clusterUpdate struct {
	managerUpdate      *[]*api.WeightedPeer
	bootstrapKeyUpdate *[]*api.EncryptionKey
	rootCAUpdate       *[]byte
}

// Dispatcher is responsible for dispatching tasks and tracking agent health.
type Dispatcher struct {
	// Mutex to synchronize access to dispatcher shared state e.g. nodes,
	// lastSeenManagers, networkBootstrapKeys etc.
	// TODO(anshul): This can potentially be removed and rpcRW used in its place.
	mu sync.Mutex
	// WaitGroup to handle the case when Stop() gets called before Run()
	// has finished initializing the dispatcher.
	wg sync.WaitGroup
	// This RWMutex synchronizes RPC handlers and the dispatcher stop().
	// The RPC handlers use the read lock while stop() uses the write lock
	// and acts as a barrier to shutdown.
	rpcRW                sync.RWMutex
	nodes                *nodeStore
	store                *store.MemoryStore
	lastSeenManagers     []*api.WeightedPeer
	networkBootstrapKeys []*api.EncryptionKey
	lastSeenRootCert     []byte
	config               *Config
	cluster              Cluster
	ctx                  context.Context
	cancel               context.CancelFunc
	clusterUpdateQueue   *watch.Queue
	dp                   *drivers.DriverProvider
	securityConfig       *ca.SecurityConfig

	taskUpdates     map[string]*api.TaskStatus // indexed by task ID
	taskUpdatesLock sync.Mutex

	nodeUpdates     map[string]nodeUpdate // indexed by node ID
	nodeUpdatesLock sync.Mutex

	downNodes *nodeStore

	processUpdatesTrigger chan struct{}

	// for waiting for the next task/node batch update
	processUpdatesLock sync.Mutex
	processUpdatesCond *sync.Cond
}

// New returns a new Dispatcher. The cluster interface (usually raft.Node) is
// provided later via Init.
func New() *Dispatcher {
	d := &Dispatcher{
		downNodes:             newNodeStore(defaultNodeDownPeriod, 0, 1, 0),
		processUpdatesTrigger: make(chan struct{}, 1),
	}

	d.processUpdatesCond = sync.NewCond(&d.processUpdatesLock)

	return d
}
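
// A rough sketch of the intended lifecycle on the leading manager (the caller
// code below is illustrative, not taken from this package):
//
//	d := New()
//	d.Init(cluster, DefaultConfig(), driverProvider, securityConfig)
//	go d.Run(ctx) // serve dispatcher RPCs while this manager is leader
//	...
//	d.Stop() // on losing leadership or shutting down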

// Init is used to initialize the dispatcher and
// is typically called before starting the dispatcher
// when a manager becomes a leader.
// The dispatcher is a grpc server, and unlike other components,
// it can't simply be recreated on becoming a leader.
// This function ensures the dispatcher restarts with a clean slate.
func (d *Dispatcher) Init(cluster Cluster, c *Config, dp *drivers.DriverProvider, securityConfig *ca.SecurityConfig) {
	d.cluster = cluster
	d.config = c
	d.securityConfig = securityConfig
	d.dp = dp
	d.store = cluster.MemoryStore()
	d.nodes = newNodeStore(c.HeartbeatPeriod, c.HeartbeatEpsilon, c.GracePeriodMultiplier, c.RateLimitPeriod)
}

func getWeightedPeers(cluster Cluster) []*api.WeightedPeer {
	members := cluster.GetMemberlist()
	var mgrs []*api.WeightedPeer
	for _, m := range members {
		mgrs = append(mgrs, &api.WeightedPeer{
			Peer: &api.Peer{
				NodeID: m.NodeID,
				Addr:   m.Addr,
			},

			// TODO(stevvooe): Calculate weight of manager selection based on
			// cluster-level observations, such as number of connections and
			// load.
			Weight: remotes.DefaultObservationWeight,
		})
	}
	return mgrs
}

// Run runs the dispatcher tasks which should be run on the leader dispatcher.
// The dispatcher can be stopped by cancelling ctx or by calling Stop().
func (d *Dispatcher) Run(ctx context.Context) error {
	ctx = log.WithModule(ctx, "dispatcher")
	log.G(ctx).Info("dispatcher starting")

	d.taskUpdatesLock.Lock()
	d.taskUpdates = make(map[string]*api.TaskStatus)
	d.taskUpdatesLock.Unlock()

	d.nodeUpdatesLock.Lock()
	d.nodeUpdates = make(map[string]nodeUpdate)
	d.nodeUpdatesLock.Unlock()

	d.mu.Lock()
	if d.isRunning() {
		d.mu.Unlock()
		return errors.New("dispatcher is already running")
	}
	if err := d.markNodesUnknown(ctx); err != nil {
		log.G(ctx).Errorf(`failed to move all nodes to "unknown" state: %v`, err)
	}
	configWatcher, cancel, err := store.ViewAndWatch(
		d.store,
		func(readTx store.ReadTx) error {
			clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
			if err != nil {
				return err
			}
			if len(clusters) == 1 {
				heartbeatPeriod, err := gogotypes.DurationFromProto(clusters[0].Spec.Dispatcher.HeartbeatPeriod)
				if err == nil && heartbeatPeriod > 0 {
					d.config.HeartbeatPeriod = heartbeatPeriod
				}
				if clusters[0].NetworkBootstrapKeys != nil {
					d.networkBootstrapKeys = clusters[0].NetworkBootstrapKeys
				}
				d.lastSeenRootCert = clusters[0].RootCA.CACert
			}
			return nil
		},
		api.EventUpdateCluster{},
	)
	if err != nil {
		d.mu.Unlock()
		return err
	}
	// set queue here to guarantee that Close will close it
	d.clusterUpdateQueue = watch.NewQueue()

	peerWatcher, peerCancel := d.cluster.SubscribePeers()
	defer peerCancel()
	d.lastSeenManagers = getWeightedPeers(d.cluster)

	defer cancel()
	d.ctx, d.cancel = context.WithCancel(ctx)
	ctx = d.ctx
	d.wg.Add(1)
	defer d.wg.Done()
	d.mu.Unlock()

	publishManagers := func(peers []*api.Peer) {
		var mgrs []*api.WeightedPeer
		for _, p := range peers {
			mgrs = append(mgrs, &api.WeightedPeer{
				Peer:   p,
				Weight: remotes.DefaultObservationWeight,
			})
		}
		d.mu.Lock()
		d.lastSeenManagers = mgrs
		d.mu.Unlock()
		d.clusterUpdateQueue.Publish(clusterUpdate{managerUpdate: &mgrs})
	}

	batchTimer := time.NewTimer(maxBatchInterval)
	defer batchTimer.Stop()

	for {
		select {
		case ev := <-peerWatcher:
			publishManagers(ev.([]*api.Peer))
		case <-d.processUpdatesTrigger:
			d.processUpdates(ctx)
			batchTimer.Stop()
			// drain the timer, if it has already expired
			select {
			case <-batchTimer.C:
			default:
			}
			batchTimer.Reset(maxBatchInterval)
		case <-batchTimer.C:
			d.processUpdates(ctx)
			// batch timer has already expired, so no need to drain
			batchTimer.Reset(maxBatchInterval)
		case v := <-configWatcher:
			cluster := v.(api.EventUpdateCluster)
			d.mu.Lock()
			if cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod != nil {
				// ignore error, since Spec has passed validation before
				heartbeatPeriod, _ := gogotypes.DurationFromProto(cluster.Cluster.Spec.Dispatcher.HeartbeatPeriod)
				if heartbeatPeriod != d.config.HeartbeatPeriod {
					// only call d.nodes.updatePeriod when heartbeatPeriod changes
					d.config.HeartbeatPeriod = heartbeatPeriod
					d.nodes.updatePeriod(d.config.HeartbeatPeriod, d.config.HeartbeatEpsilon, d.config.GracePeriodMultiplier)
				}
			}
			d.lastSeenRootCert = cluster.Cluster.RootCA.CACert
			d.networkBootstrapKeys = cluster.Cluster.NetworkBootstrapKeys
			d.mu.Unlock()
			d.clusterUpdateQueue.Publish(clusterUpdate{
				bootstrapKeyUpdate: &cluster.Cluster.NetworkBootstrapKeys,
				rootCAUpdate:       &cluster.Cluster.RootCA.CACert,
			})
		case <-ctx.Done():
			return nil
		}
	}
}

// Stop stops the dispatcher and closes all grpc streams.
func (d *Dispatcher) Stop() error {
	d.mu.Lock()
	if !d.isRunning() {
		d.mu.Unlock()
		return errors.New("dispatcher is already stopped")
	}

	log := log.G(d.ctx).WithField("method", "(*Dispatcher).Stop")
	log.Info("dispatcher stopping")
	d.cancel()
	d.mu.Unlock()

	d.processUpdatesLock.Lock()
	// when we called d.cancel(), there may be routines, servicing RPC calls to
	// the (*Dispatcher).Session endpoint, currently waiting at
	// d.processUpdatesCond.Wait() inside of (*Dispatcher).markNodeReady().
	//
	// these routines are typically woken by a call to
	// d.processUpdatesCond.Broadcast() at the end of
	// (*Dispatcher).processUpdates() as part of the main Run loop. However,
	// when d.cancel() is called, the main Run loop is stopped, and there are
	// no more opportunities for processUpdates to be called. Any calls to
	// Session would be stuck waiting on a call to Broadcast that will never
	// come.
	//
	// Further, because the rpcRW write lock cannot be obtained until every RPC
	// has exited and released its read lock, then Stop would be stuck forever.
	//
	// To avoid this case, we acquire the processUpdatesLock (so that no new
	// waits can start) and then do a Broadcast to wake all of the waiting
	// routines. Further, if any routines are waiting in markNodeReady to
	// acquire this lock, but not yet waiting, those routines will check the
	// context cancelation, see the context is canceled, and exit before doing
	// the Wait.
	//
	// This call to Broadcast must occur here. If we called Broadcast before
	// context cancelation, then some new routines could enter the wait. If we
	// call Broadcast after attempting to acquire the rpcRW lock, we will be
	// deadlocked. If we do this Broadcast without obtaining this lock (as is
	// done in the processUpdates method), then it would be possible for that
	// broadcast to come after the context cancelation check in markNodeReady,
	// but before the call to Wait.
	d.processUpdatesCond.Broadcast()
	d.processUpdatesLock.Unlock()

	// The active nodes list can be cleaned out only when all
	// existing RPCs have finished.
	// RPCs that start after rpcRW.Unlock() should find the context
	// cancelled and should fail organically.
	d.rpcRW.Lock()
	d.nodes.Clean()
	d.downNodes.Clean()
	d.rpcRW.Unlock()

	d.clusterUpdateQueue.Close()

	// TODO(anshul): This use of Wait() could be unsafe.
	// According to go's documentation on WaitGroup,
	// Add() with a positive delta that occur when the counter is zero
	// must happen before a Wait().
	// As is, dispatcher Stop() can race with Run().
	d.wg.Wait()

	return nil
}

func (d *Dispatcher) isRunningLocked() (context.Context, error) {
	d.mu.Lock()
	if !d.isRunning() {
		d.mu.Unlock()
		return nil, status.Errorf(codes.Aborted, "dispatcher is stopped")
	}
	ctx := d.ctx
	d.mu.Unlock()
	return ctx, nil
}

func (d *Dispatcher) markNodesUnknown(ctx context.Context) error {
	log := log.G(ctx).WithField("method", "(*Dispatcher).markNodesUnknown")
	var nodes []*api.Node
	var err error
	d.store.View(func(tx store.ReadTx) {
		nodes, err = store.FindNodes(tx, store.All)
	})
	if err != nil {
		return errors.Wrap(err, "failed to get list of nodes")
	}
	err = d.store.Batch(func(batch *store.Batch) error {
		for _, n := range nodes {
			err := batch.Update(func(tx store.Tx) error {
				// check if node is still here
				node := store.GetNode(tx, n.ID)
				if node == nil {
					return nil
				}
				// do not try to resurrect down nodes
				if node.Status.State == api.NodeStatus_DOWN {
					nodeCopy := node
					expireFunc := func() {
						log.Infof("moving tasks to orphaned state for node: %s", nodeCopy.ID)
						if err := d.moveTasksToOrphaned(nodeCopy.ID); err != nil {
							log.WithError(err).Errorf(`failed to move all tasks for node %s to "ORPHANED" state`, node.ID)
						}

						d.downNodes.Delete(nodeCopy.ID)
					}

					log.Infof(`node %s was found to be down when marking unknown on dispatcher start`, node.ID)
					d.downNodes.Add(nodeCopy, expireFunc)
					return nil
				}

				node.Status.State = api.NodeStatus_UNKNOWN
				node.Status.Message = `Node moved to "unknown" state due to leadership change in cluster`

				nodeID := node.ID

				expireFunc := func() {
					log := log.WithField("node", nodeID)
					log.Infof(`heartbeat expiration for node %s in state "unknown"`, nodeID)
					if err := d.markNodeNotReady(nodeID, api.NodeStatus_DOWN, `heartbeat failure for node in "unknown" state`); err != nil {
						log.WithError(err).Error(`failed deregistering node after heartbeat expiration for node in "unknown" state`)
					}
				}
				if err := d.nodes.AddUnknown(node, expireFunc); err != nil {
					return errors.Wrapf(err, `adding node %s in "unknown" state to node store failed`, nodeID)
				}
				if err := store.UpdateNode(tx, node); err != nil {
					return errors.Wrapf(err, "update for node %s failed", nodeID)
				}
				return nil
			})
			if err != nil {
				log.WithField("node", n.ID).WithError(err).Error(`failed to move node to "unknown" state`)
			}
		}
		return nil
	})
	return err
}

func (d *Dispatcher) isRunning() bool {
	if d.ctx == nil {
		return false
	}
	select {
	case <-d.ctx.Done():
		return false
	default:
	}
	return true
}

// markNodeReady updates the description of a node, updates its address, and
// sets its status to READY. This is used during registration when a new node
// description is provided, and during node updates when the node description
// changes.
func (d *Dispatcher) markNodeReady(ctx context.Context, nodeID string, description *api.NodeDescription, addr string) error {
	d.nodeUpdatesLock.Lock()
	d.nodeUpdates[nodeID] = nodeUpdate{
		status: &api.NodeStatus{
			State: api.NodeStatus_READY,
			Addr:  addr,
		},
		description: description,
	}
	numUpdates := len(d.nodeUpdates)
	d.nodeUpdatesLock.Unlock()

	// Node is marked ready. Remove the node from down nodes if it
	// is there.
	d.downNodes.Delete(nodeID)

	if numUpdates >= maxBatchItems {
		select {
		case d.processUpdatesTrigger <- struct{}{}:
		case <-ctx.Done():
			return ctx.Err()
		}

	}

	// Wait until the node update batch happens before unblocking register.
	d.processUpdatesLock.Lock()
	defer d.processUpdatesLock.Unlock()

	select {
	case <-ctx.Done():
		return ctx.Err()
	default:
	}
	d.processUpdatesCond.Wait()

	return nil
}

// nodeIPFromContext gets the node IP from the context of a grpc call.
func nodeIPFromContext(ctx context.Context) (string, error) {
	nodeInfo, err := ca.RemoteNode(ctx)
	if err != nil {
		return "", err
	}
	addr, _, err := net.SplitHostPort(nodeInfo.RemoteAddr)
	if err != nil {
		return "", errors.Wrap(err, "unable to get ip from addr:port")
	}
	return addr, nil
}

// register registers a node with this dispatcher.
func (d *Dispatcher) register(ctx context.Context, nodeID string, description *api.NodeDescription) (string, error) {
	logLocal := log.G(ctx).WithField("method", "(*Dispatcher).register")
	// prevent register until we're ready to accept it
	dctx, err := d.isRunningLocked()
	if err != nil {
		return "", err
	}

	if err := d.nodes.CheckRateLimit(nodeID); err != nil {
		return "", err
	}

	// TODO(stevvooe): Validate node specification.
	var node *api.Node
	d.store.View(func(tx store.ReadTx) {
		node = store.GetNode(tx, nodeID)
	})
	if node == nil {
		return "", ErrNodeNotFound
	}

	addr, err := nodeIPFromContext(ctx)
	if err != nil {
		logLocal.WithError(err).Debug("failed to get remote node IP")
	}

	if err := d.markNodeReady(dctx, nodeID, description, addr); err != nil {
		return "", err
	}

	expireFunc := func() {
		log.G(ctx).Debugf("heartbeat expiration for worker %s, setting worker status to NodeStatus_DOWN ", nodeID)
		if err := d.markNodeNotReady(nodeID, api.NodeStatus_DOWN, "heartbeat failure"); err != nil {
			log.G(ctx).WithError(err).Errorf("failed deregistering node after heartbeat expiration")
		}
	}

	rn := d.nodes.Add(node, expireFunc)
	logLocal.Infof("worker %s was successfully registered", nodeID)

	// NOTE(stevvooe): We need to be a little careful with re-registration. The
	// current implementation just matches the node id and then gives away the
	// sessionID. If we ever want to use sessionID as a secret, which we may
	// want to, this is giving away the keys to the kitchen.
	//
	// The right behavior is going to be informed by identity. Basically, each
	// time a node registers, we invalidate the session and issue a new
	// session, once identity is proven. This will cause misbehaved agents to
	// be kicked when multiple connections are made.
	return rn.SessionID, nil
}
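
// A note on sessions (a summary of how the code above and below fits together,
// not additional behavior): the session ID returned by register is expected on
// every subsequent RPC. UpdateTaskStatus, Tasks, Assignments, and Heartbeat all
// validate it against the node store, so a stale ID, for example after a
// re-registration, causes those calls to fail and the agent falls back to
// opening a new Session.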

// UpdateTaskStatus updates the status of a task. Nodes should send such
// updates on every status change of their tasks.
func (d *Dispatcher) UpdateTaskStatus(ctx context.Context, r *api.UpdateTaskStatusRequest) (*api.UpdateTaskStatusResponse, error) {
	d.rpcRW.RLock()
	defer d.rpcRW.RUnlock()

	dctx, err := d.isRunningLocked()
	if err != nil {
		return nil, err
	}

	nodeInfo, err := ca.RemoteNode(ctx)
	if err != nil {
		return nil, err
	}
	nodeID := nodeInfo.NodeID
	fields := logrus.Fields{
		"node.id":      nodeID,
		"node.session": r.SessionID,
		"method":       "(*Dispatcher).UpdateTaskStatus",
	}
	if nodeInfo.ForwardedBy != nil {
		fields["forwarder.id"] = nodeInfo.ForwardedBy.NodeID
	}
	log := log.G(ctx).WithFields(fields)

	if _, err := d.nodes.GetWithSession(nodeID, r.SessionID); err != nil {
		return nil, err
	}

	validTaskUpdates := make([]*api.UpdateTaskStatusRequest_TaskStatusUpdate, 0, len(r.Updates))

	// Validate task updates
	for _, u := range r.Updates {
		if u.Status == nil {
			log.WithField("task.id", u.TaskID).Warn("task report has nil status")
			continue
		}

		var t *api.Task
		d.store.View(func(tx store.ReadTx) {
			t = store.GetTask(tx, u.TaskID)
		})
		if t == nil {
			// Task may have been deleted
			log.WithField("task.id", u.TaskID).Debug("cannot find target task in store")
			continue
		}

		if t.NodeID != nodeID {
			err := status.Errorf(codes.PermissionDenied, "cannot update a task not assigned this node")
			log.WithField("task.id", u.TaskID).Error(err)
			return nil, err
		}

		validTaskUpdates = append(validTaskUpdates, u)
	}

	d.taskUpdatesLock.Lock()
	// Enqueue task updates
	for _, u := range validTaskUpdates {
		d.taskUpdates[u.TaskID] = u.Status
	}

	numUpdates := len(d.taskUpdates)
	d.taskUpdatesLock.Unlock()

	if numUpdates >= maxBatchItems {
		select {
		case d.processUpdatesTrigger <- struct{}{}:
		case <-dctx.Done():
		}
	}
	return nil, nil
}

func (d *Dispatcher) processUpdates(ctx context.Context) {
	var (
		taskUpdates map[string]*api.TaskStatus
		nodeUpdates map[string]nodeUpdate
	)
	d.taskUpdatesLock.Lock()
	if len(d.taskUpdates) != 0 {
		taskUpdates = d.taskUpdates
		d.taskUpdates = make(map[string]*api.TaskStatus)
	}
	d.taskUpdatesLock.Unlock()

	d.nodeUpdatesLock.Lock()
	if len(d.nodeUpdates) != 0 {
		nodeUpdates = d.nodeUpdates
		d.nodeUpdates = make(map[string]nodeUpdate)
	}
	d.nodeUpdatesLock.Unlock()

	if len(taskUpdates) == 0 && len(nodeUpdates) == 0 {
		return
	}

	log := log.G(ctx).WithFields(logrus.Fields{
		"method": "(*Dispatcher).processUpdates",
	})

	err := d.store.Batch(func(batch *store.Batch) error {
		for taskID, status := range taskUpdates {
			err := batch.Update(func(tx store.Tx) error {
				logger := log.WithField("task.id", taskID)
				task := store.GetTask(tx, taskID)
				if task == nil {
					// Task may have been deleted
					logger.Debug("cannot find target task in store")
					return nil
				}

				logger = logger.WithField("state.transition", fmt.Sprintf("%v->%v", task.Status.State, status.State))

				if task.Status == *status {
					logger.Debug("task status identical, ignoring")
					return nil
				}

				if task.Status.State > status.State {
					logger.Debug("task status invalid transition")
					return nil
				}

				// Update scheduling delay metric for running tasks.
				// We use the status update time on the leader to calculate the scheduling delay.
				// Because of this, the recorded scheduling delay will be an overestimate and include
				// the network delay between the worker and the leader.
				// This is not ideal, but it's a known overestimation, rather than using the status update time
				// from the worker node, which may cause unknown incorrect results due to possible clock skew.
				if status.State == api.TaskStateRunning {
					start := time.Unix(status.AppliedAt.GetSeconds(), int64(status.AppliedAt.GetNanos()))
					schedulingDelayTimer.UpdateSince(start)
				}

				task.Status = *status
				task.Status.AppliedBy = d.securityConfig.ClientTLSCreds.NodeID()
				task.Status.AppliedAt = ptypes.MustTimestampProto(time.Now())
				logger.Debugf("state for task %v updated to %v", task.GetID(), task.Status.State)
				if err := store.UpdateTask(tx, task); err != nil {
					logger.WithError(err).Error("failed to update task status")
					return nil
				}
				logger.Debug("dispatcher committed status update to store")
				return nil
			})
			if err != nil {
				log.WithError(err).Error("dispatcher task update transaction failed")
			}
		}

		for nodeID, nodeUpdate := range nodeUpdates {
			err := batch.Update(func(tx store.Tx) error {
				logger := log.WithField("node.id", nodeID)
				node := store.GetNode(tx, nodeID)
				if node == nil {
					logger.Errorf("node unavailable")
					return nil
				}

				if nodeUpdate.status != nil {
					node.Status.State = nodeUpdate.status.State
					node.Status.Message = nodeUpdate.status.Message
					if nodeUpdate.status.Addr != "" {
						node.Status.Addr = nodeUpdate.status.Addr
					}
				}
				if nodeUpdate.description != nil {
					node.Description = nodeUpdate.description
				}

				if err := store.UpdateNode(tx, node); err != nil {
					logger.WithError(err).Error("failed to update node status")
					return nil
				}
				logger.Debug("node status updated")
				return nil
			})
			if err != nil {
				log.WithError(err).Error("dispatcher node update transaction failed")
			}
		}

		return nil
	})
	if err != nil {
		log.WithError(err).Error("dispatcher batch failed")
	}

	d.processUpdatesCond.Broadcast()
}
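
// To summarize the flushing behavior implemented above and in Run (no new
// behavior, just the arithmetic): buffered task/node updates are committed
// either when a buffer reaches maxBatchItems (10000), which fires
// processUpdatesTrigger immediately, or when the maxBatchInterval timer
// (100ms) in the Run loop expires. Under light load an update therefore
// typically reaches the store within about 100ms of arriving at the
// dispatcher.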

// Tasks is a stream of task states for a node. Each message contains the full
// list of tasks which should be run on the node; if a task is not present in
// that list, it should be terminated.
func (d *Dispatcher) Tasks(r *api.TasksRequest, stream api.Dispatcher_TasksServer) error {
	d.rpcRW.RLock()
	defer d.rpcRW.RUnlock()

	dctx, err := d.isRunningLocked()
	if err != nil {
		return err
	}

	nodeInfo, err := ca.RemoteNode(stream.Context())
	if err != nil {
		return err
	}
	nodeID := nodeInfo.NodeID

	fields := logrus.Fields{
		"node.id":      nodeID,
		"node.session": r.SessionID,
		"method":       "(*Dispatcher).Tasks",
	}
	if nodeInfo.ForwardedBy != nil {
		fields["forwarder.id"] = nodeInfo.ForwardedBy.NodeID
	}
	log.G(stream.Context()).WithFields(fields).Debug("")

	if _, err = d.nodes.GetWithSession(nodeID, r.SessionID); err != nil {
		return err
	}

	tasksMap := make(map[string]*api.Task)
	nodeTasks, cancel, err := store.ViewAndWatch(
		d.store,
		func(readTx store.ReadTx) error {
			tasks, err := store.FindTasks(readTx, store.ByNodeID(nodeID))
			if err != nil {
				return err
			}
			for _, t := range tasks {
				tasksMap[t.ID] = t
			}
			return nil
		},
		api.EventCreateTask{Task: &api.Task{NodeID: nodeID},
			Checks: []api.TaskCheckFunc{api.TaskCheckNodeID}},
		api.EventUpdateTask{Task: &api.Task{NodeID: nodeID},
			Checks: []api.TaskCheckFunc{api.TaskCheckNodeID}},
		api.EventDeleteTask{Task: &api.Task{NodeID: nodeID},
			Checks: []api.TaskCheckFunc{api.TaskCheckNodeID}},
	)
	if err != nil {
		return err
	}
	defer cancel()

	for {
		if _, err := d.nodes.GetWithSession(nodeID, r.SessionID); err != nil {
			return err
		}

		var tasks []*api.Task
		for _, t := range tasksMap {
			// dispatcher only sends tasks that have been assigned to a node
			if t != nil && t.Status.State >= api.TaskStateAssigned {
				tasks = append(tasks, t)
			}
		}

		if err := stream.Send(&api.TasksMessage{Tasks: tasks}); err != nil {
			return err
		}

		// bursty events should be processed in batches and sent out as a snapshot
		var (
			modificationCnt int
			batchingTimer   *time.Timer
			batchingTimeout <-chan time.Time
		)

	batchingLoop:
		for modificationCnt < modificationBatchLimit {
			select {
			case event := <-nodeTasks:
				switch v := event.(type) {
				case api.EventCreateTask:
					tasksMap[v.Task.ID] = v.Task
					modificationCnt++
				case api.EventUpdateTask:
					if oldTask, exists := tasksMap[v.Task.ID]; exists {
						// States ASSIGNED and below are set by the orchestrator/scheduler,
						// not the agent, so tasks in these states need to be sent to the
						// agent even if nothing else has changed.
						if equality.TasksEqualStable(oldTask, v.Task) && v.Task.Status.State > api.TaskStateAssigned {
							// this update should not trigger action at agent
							tasksMap[v.Task.ID] = v.Task
							continue
						}
					}
					tasksMap[v.Task.ID] = v.Task
					modificationCnt++
				case api.EventDeleteTask:
					delete(tasksMap, v.Task.ID)
					modificationCnt++
				}
				if batchingTimer != nil {
					batchingTimer.Reset(batchingWaitTime)
				} else {
					batchingTimer = time.NewTimer(batchingWaitTime)
					batchingTimeout = batchingTimer.C
				}
			case <-batchingTimeout:
				break batchingLoop
			case <-stream.Context().Done():
				return stream.Context().Err()
			case <-dctx.Done():
				return dctx.Err()
			}
		}

		if batchingTimer != nil {
			batchingTimer.Stop()
		}
	}
}
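
// A note on the sequence numbers used by Assignments below (this only restates
// what its sendMessage closure does): the first message is a COMPLETE snapshot
// whose AppliesTo is the empty string, and every later INCREMENTAL message
// carries AppliesTo equal to the ResultsIn of the previous message, so an
// agent can, in principle, detect a missed or out-of-order update by comparing
// the two fields.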

// Assignments is a stream of assignments for a node. Each message contains
// either the full list of tasks and secrets for the node, or an incremental update.
func (d *Dispatcher) Assignments(r *api.AssignmentsRequest, stream api.Dispatcher_AssignmentsServer) error {
	d.rpcRW.RLock()
	defer d.rpcRW.RUnlock()

	dctx, err := d.isRunningLocked()
	if err != nil {
		return err
	}

	nodeInfo, err := ca.RemoteNode(stream.Context())
	if err != nil {
		return err
	}
	nodeID := nodeInfo.NodeID

	fields := logrus.Fields{
		"node.id":      nodeID,
		"node.session": r.SessionID,
		"method":       "(*Dispatcher).Assignments",
	}
	if nodeInfo.ForwardedBy != nil {
		fields["forwarder.id"] = nodeInfo.ForwardedBy.NodeID
	}
	log := log.G(stream.Context()).WithFields(fields)
	log.Debug("")

	if _, err = d.nodes.GetWithSession(nodeID, r.SessionID); err != nil {
		return err
	}

	var (
		sequence    int64
		appliesTo   string
		assignments = newAssignmentSet(log, d.dp)
	)

	sendMessage := func(msg api.AssignmentsMessage, assignmentType api.AssignmentsMessage_Type) error {
		sequence++
		msg.AppliesTo = appliesTo
		msg.ResultsIn = strconv.FormatInt(sequence, 10)
		appliesTo = msg.ResultsIn
		msg.Type = assignmentType

		return stream.Send(&msg)
	}

	// TODO(aaronl): Also send node secrets that should be exposed to
	// this node.
	nodeTasks, cancel, err := store.ViewAndWatch(
		d.store,
		func(readTx store.ReadTx) error {
			tasks, err := store.FindTasks(readTx, store.ByNodeID(nodeID))
			if err != nil {
				return err
			}

			for _, t := range tasks {
				assignments.addOrUpdateTask(readTx, t)
			}

			return nil
		},
		api.EventUpdateTask{Task: &api.Task{NodeID: nodeID},
			Checks: []api.TaskCheckFunc{api.TaskCheckNodeID}},
		api.EventDeleteTask{Task: &api.Task{NodeID: nodeID},
			Checks: []api.TaskCheckFunc{api.TaskCheckNodeID}},
	)
	if err != nil {
		return err
	}
	defer cancel()

	if err := sendMessage(assignments.message(), api.AssignmentsMessage_COMPLETE); err != nil {
		return err
	}

	for {
		// Check for session expiration
		if _, err := d.nodes.GetWithSession(nodeID, r.SessionID); err != nil {
			return err
		}

		// bursty events should be processed in batches and sent out together
		var (
			modificationCnt int
			batchingTimer   *time.Timer
			batchingTimeout <-chan time.Time
		)

		oneModification := func() {
			modificationCnt++

			if batchingTimer != nil {
				batchingTimer.Reset(batchingWaitTime)
			} else {
				batchingTimer = time.NewTimer(batchingWaitTime)
				batchingTimeout = batchingTimer.C
			}
		}

		// The batching loop waits for batchingWaitTime (100 ms) after the
		// most recent change, or until modificationBatchLimit is reached. The
		// worst case latency is modificationBatchLimit * batchingWaitTime,
		// which is 10 seconds.
	batchingLoop:
		for modificationCnt < modificationBatchLimit {
			select {
			case event := <-nodeTasks:
				switch v := event.(type) {
				// We don't monitor EventCreateTask because tasks are
				// never created in the ASSIGNED state. First tasks are
				// created by the orchestrator, then the scheduler moves
				// them to ASSIGNED. If this ever changes, we will need
				// to monitor task creations as well.
				case api.EventUpdateTask:
					d.store.View(func(readTx store.ReadTx) {
						if assignments.addOrUpdateTask(readTx, v.Task) {
							oneModification()
						}
					})
				case api.EventDeleteTask:
					if assignments.removeTask(v.Task) {
						oneModification()
					}
					// TODO(aaronl): For node secrets, we'll need to handle
					// EventCreateSecret.
				}
			case <-batchingTimeout:
				break batchingLoop
			case <-stream.Context().Done():
				return stream.Context().Err()
			case <-dctx.Done():
				return dctx.Err()
			}
		}

		if batchingTimer != nil {
			batchingTimer.Stop()
		}

		if modificationCnt > 0 {
			if err := sendMessage(assignments.message(), api.AssignmentsMessage_INCREMENTAL); err != nil {
				return err
			}
		}
	}
}

func (d *Dispatcher) moveTasksToOrphaned(nodeID string) error {
	err := d.store.Batch(func(batch *store.Batch) error {
		var (
			tasks []*api.Task
			err   error
		)

		d.store.View(func(tx store.ReadTx) {
			tasks, err = store.FindTasks(tx, store.ByNodeID(nodeID))
		})
		if err != nil {
			return err
		}

		for _, task := range tasks {
			// Tasks running on an unreachable node need to be marked as
			// orphaned since we have no idea whether the task is still running
			// or not.
			//
			// This only applies for tasks that could have made progress since
			// the agent became unreachable (assigned<->running)
			//
			// Tasks in a final state (e.g. rejected) *cannot* have made
			// progress, therefore there's no point in marking them as orphaned
			if task.Status.State >= api.TaskStateAssigned && task.Status.State <= api.TaskStateRunning {
				task.Status.State = api.TaskStateOrphaned
			}

			err := batch.Update(func(tx store.Tx) error {
				return store.UpdateTask(tx, task)
			})
			if err != nil {
				return err
			}

		}

		return nil
	})

	return err
}
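
// For context on how moveTasksToOrphaned gets used (a restatement of the
// surrounding code, not new behavior): when a node goes down, markNodeNotReady
// below places it in d.downNodes, which New() configured with
// defaultNodeDownPeriod (24h). Only if the node stays down for that long does
// the expire callback fire and move its ASSIGNED..RUNNING tasks to ORPHANED;
// a node that re-registers sooner is removed from downNodes by markNodeReady.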

// markNodeNotReady sets the node state to some state other than READY.
func (d *Dispatcher) markNodeNotReady(id string, state api.NodeStatus_State, message string) error {
	logLocal := log.G(d.ctx).WithField("method", "(*Dispatcher).markNodeNotReady")

	dctx, err := d.isRunningLocked()
	if err != nil {
		return err
	}

	// Node is down. Add it to down nodes so that we can keep
	// track of tasks assigned to the node.
	var node *api.Node
	d.store.View(func(readTx store.ReadTx) {
		node = store.GetNode(readTx, id)
		if node == nil {
			err = fmt.Errorf("could not find node %s while trying to add to down nodes store", id)
		}
	})
	if err != nil {
		return err
	}

	expireFunc := func() {
		log.G(dctx).Debugf(`worker timed-out %s in "down" state, moving all tasks to "ORPHANED" state`, id)
		if err := d.moveTasksToOrphaned(id); err != nil {
			log.G(dctx).WithError(err).Error(`failed to move all tasks to "ORPHANED" state`)
		}

		d.downNodes.Delete(id)
	}

	d.downNodes.Add(node, expireFunc)
	logLocal.Debugf("added node %s to down nodes list", node.ID)

	status := &api.NodeStatus{
		State:   state,
		Message: message,
	}

	d.nodeUpdatesLock.Lock()
	// pluck the description out of nodeUpdates. this protects against a case
	// where a node is marked ready and a description is added, but then the
	// node is immediately marked not ready. this preserves that description
	d.nodeUpdates[id] = nodeUpdate{status: status, description: d.nodeUpdates[id].description}
	numUpdates := len(d.nodeUpdates)
	d.nodeUpdatesLock.Unlock()

	if numUpdates >= maxBatchItems {
		select {
		case d.processUpdatesTrigger <- struct{}{}:
		case <-dctx.Done():
		}
	}

	if rn := d.nodes.Delete(id); rn == nil {
		return errors.Errorf("node %s is not found in local storage", id)
	}
	logLocal.Debugf("deleted node %s from node store", node.ID)

	return nil
}

// Heartbeat is the heartbeat method for nodes. It returns the new TTL in its
// response. A node should send its next heartbeat before now + TTL elapses;
// otherwise it will be deregistered from the dispatcher and its status will be
// updated to NodeStatus_DOWN.
func (d *Dispatcher) Heartbeat(ctx context.Context, r *api.HeartbeatRequest) (*api.HeartbeatResponse, error) {
	d.rpcRW.RLock()
	defer d.rpcRW.RUnlock()

	// TODO(anshul) Explore if it's possible to check context here without locking.
	if _, err := d.isRunningLocked(); err != nil {
		return nil, status.Errorf(codes.Aborted, "dispatcher is stopped")
	}

	nodeInfo, err := ca.RemoteNode(ctx)
	if err != nil {
		return nil, err
	}

	period, err := d.nodes.Heartbeat(nodeInfo.NodeID, r.SessionID)

	log.G(ctx).WithField("method", "(*Dispatcher).Heartbeat").Debugf("received heartbeat from worker %v, expect next heartbeat in %v", nodeInfo, period)
	return &api.HeartbeatResponse{Period: period}, err
}

func (d *Dispatcher) getManagers() []*api.WeightedPeer {
	d.mu.Lock()
	defer d.mu.Unlock()
	return d.lastSeenManagers
}

func (d *Dispatcher) getNetworkBootstrapKeys() []*api.EncryptionKey {
	d.mu.Lock()
	defer d.mu.Unlock()
	return d.networkBootstrapKeys
}

func (d *Dispatcher) getRootCACert() []byte {
	d.mu.Lock()
	defer d.mu.Unlock()
	return d.lastSeenRootCert
}

// Session is a stream which controls the agent's connection. Each message
// contains a list of backup managers with weights. There is also a special
// boolean field Disconnect which, if true, indicates that the node should
// reconnect to another manager immediately.
func (d *Dispatcher) Session(r *api.SessionRequest, stream api.Dispatcher_SessionServer) error {
	d.rpcRW.RLock()
	defer d.rpcRW.RUnlock()

	dctx, err := d.isRunningLocked()
	if err != nil {
		return err
	}

	ctx := stream.Context()

	nodeInfo, err := ca.RemoteNode(ctx)
	if err != nil {
		return err
	}
	nodeID := nodeInfo.NodeID

	var sessionID string
	if _, err := d.nodes.GetWithSession(nodeID, r.SessionID); err != nil {
		// register the node.
		sessionID, err = d.register(ctx, nodeID, r.Description)
		if err != nil {
			return err
		}
	} else {
		sessionID = r.SessionID
		// get the node IP addr
		addr, err := nodeIPFromContext(stream.Context())
		if err != nil {
			log.G(ctx).WithError(err).Debug("failed to get remote node IP")
		}
		// update the node description
		if err := d.markNodeReady(dctx, nodeID, r.Description, addr); err != nil {
			return err
		}
	}

	fields := logrus.Fields{
		"node.id":      nodeID,
		"node.session": sessionID,
		"method":       "(*Dispatcher).Session",
	}
	if nodeInfo.ForwardedBy != nil {
		fields["forwarder.id"] = nodeInfo.ForwardedBy.NodeID
	}
	log := log.G(ctx).WithFields(fields)

	var nodeObj *api.Node
	nodeUpdates, cancel, err := store.ViewAndWatch(d.store, func(readTx store.ReadTx) error {
		nodeObj = store.GetNode(readTx, nodeID)
		return nil
	}, api.EventUpdateNode{Node: &api.Node{ID: nodeID},
		Checks: []api.NodeCheckFunc{api.NodeCheckID}},
	)
	if cancel != nil {
		defer cancel()
	}

	if err != nil {
		log.WithError(err).Error("ViewAndWatch Node failed")
	}

	if _, err = d.nodes.GetWithSession(nodeID, sessionID); err != nil {
		return err
	}

	clusterUpdatesCh, clusterCancel := d.clusterUpdateQueue.Watch()
	defer clusterCancel()

	if err := stream.Send(&api.SessionMessage{
		SessionID:            sessionID,
		Node:                 nodeObj,
		Managers:             d.getManagers(),
		NetworkBootstrapKeys: d.getNetworkBootstrapKeys(),
		RootCA:               d.getRootCACert(),
	}); err != nil {
		return err
	}

	// disconnectNode is a helper that forcibly shuts down the connection.
	disconnectNode := func() error {
		log.Infof("dispatcher session dropped, marking node %s down", nodeID)
		if err := d.markNodeNotReady(nodeID, api.NodeStatus_DISCONNECTED, "node is currently trying to find new manager"); err != nil {
			log.WithError(err).Error("failed to remove node")
		}
		// still return an abort if the transport closure was ineffective.
		return status.Errorf(codes.Aborted, "node must disconnect")
	}

	for {
		// After each message send, we need to check that the node's sessionID
		// hasn't changed. If it has, we will shut down the stream and make the
		// node re-register.
		node, err := d.nodes.GetWithSession(nodeID, sessionID)
		if err != nil {
			return err
		}

		var (
			disconnect bool
			mgrs       []*api.WeightedPeer
			netKeys    []*api.EncryptionKey
			rootCert   []byte
		)

		select {
		case ev := <-clusterUpdatesCh:
			update := ev.(clusterUpdate)
			if update.managerUpdate != nil {
				mgrs = *update.managerUpdate
			}
			if update.bootstrapKeyUpdate != nil {
				netKeys = *update.bootstrapKeyUpdate
			}
			if update.rootCAUpdate != nil {
				rootCert = *update.rootCAUpdate
			}
		case ev := <-nodeUpdates:
			nodeObj = ev.(api.EventUpdateNode).Node
		case <-stream.Context().Done():
			return stream.Context().Err()
		case <-node.Disconnect:
			disconnect = true
		case <-dctx.Done():
			disconnect = true
		}
		if mgrs == nil {
			mgrs = d.getManagers()
		}
		if netKeys == nil {
			netKeys = d.getNetworkBootstrapKeys()
		}
		if rootCert == nil {
			rootCert = d.getRootCACert()
		}

		if err := stream.Send(&api.SessionMessage{
			SessionID:            sessionID,
			Node:                 nodeObj,
			Managers:             mgrs,
			NetworkBootstrapKeys: netKeys,
			RootCA:               rootCert,
		}); err != nil {
			return err
		}
		if disconnect {
			return disconnectNode()
		}
	}
}