package cluster

import (
	"encoding/json"
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"google.golang.org/grpc"

	"github.com/Sirupsen/logrus"
	"github.com/docker/distribution/digest"
	"github.com/docker/docker/daemon/cluster/convert"
	executorpkg "github.com/docker/docker/daemon/cluster/executor"
	"github.com/docker/docker/daemon/cluster/executor/container"
	"github.com/docker/docker/errors"
	"github.com/docker/docker/opts"
	"github.com/docker/docker/pkg/ioutils"
	"github.com/docker/docker/runconfig"
	apitypes "github.com/docker/engine-api/types"
	types "github.com/docker/engine-api/types/swarm"
	swarmagent "github.com/docker/swarmkit/agent"
	swarmapi "github.com/docker/swarmkit/api"
	"golang.org/x/net/context"
)

const swarmDirName = "swarm"
const controlSocket = "control.sock"
const swarmConnectTimeout = 20 * time.Second
const stateFile = "docker-state.json"
const defaultAddr = "0.0.0.0:2377"

const (
	initialReconnectDelay = 100 * time.Millisecond
	maxReconnectDelay     = 30 * time.Second
)

// ErrNoSwarm is returned on leaving a cluster that was never initialized
var ErrNoSwarm = fmt.Errorf("This node is not part of Swarm")

// ErrSwarmExists is returned on initialize or join request for a cluster that has already been activated
var ErrSwarmExists = fmt.Errorf("This node is already part of a Swarm cluster. Use \"docker swarm leave\" to leave this cluster and join another one.")

// ErrPendingSwarmExists is returned on initialize or join request for a cluster that is already processing a similar request but has not succeeded yet.
var ErrPendingSwarmExists = fmt.Errorf("This node is processing an existing join request that has not succeeded yet. Use \"docker swarm leave\" to cancel the current request.")

// ErrSwarmJoinTimeoutReached is returned when cluster join could not complete before timeout was reached.
var ErrSwarmJoinTimeoutReached = fmt.Errorf("Timeout was reached before node was joined. Attempt to join the cluster will continue in the background. Use \"docker info\" command to see the current Swarm status of your node.")

// defaultSpec contains some sane defaults if cluster options are missing on init
var defaultSpec = types.Spec{
	Raft: types.RaftConfig{
		SnapshotInterval:           10000,
		KeepOldSnapshots:           0,
		LogEntriesForSlowFollowers: 500,
		HeartbeatTick:              1,
		ElectionTick:               3,
	},
	CAConfig: types.CAConfig{
		NodeCertExpiry: 90 * 24 * time.Hour,
	},
	Dispatcher: types.DispatcherConfig{
		HeartbeatPeriod: uint64((5 * time.Second).Nanoseconds()),
	},
	Orchestration: types.OrchestrationConfig{
		TaskHistoryRetentionLimit: 10,
	},
}

type state struct {
	ListenAddr string
}

// Config provides values for Cluster.
type Config struct {
	Root    string
	Name    string
	Backend executorpkg.Backend
}

// Cluster provides capabilities to participate in a cluster as a worker or a
// manager.
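// Mutable fields are guarded by the embedded sync.RWMutex; configEvent
// carries notifications about changes in cluster participation and is exposed
// through ListenClusterEvents.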
type Cluster struct {
	sync.RWMutex
	root           string
	config         Config
	configEvent    chan struct{} // todo: make this array and goroutine safe
	node           *swarmagent.Node
	conn           *grpc.ClientConn
	client         swarmapi.ControlClient
	ready          bool
	listenAddr     string
	err            error
	reconnectDelay time.Duration
	stop           bool
	cancelDelay    func()
}

// New creates a new Cluster instance using provided config.
func New(config Config) (*Cluster, error) {
	root := filepath.Join(config.Root, swarmDirName)
	if err := os.MkdirAll(root, 0700); err != nil {
		return nil, err
	}
	c := &Cluster{
		root:           root,
		config:         config,
		configEvent:    make(chan struct{}, 10),
		reconnectDelay: initialReconnectDelay,
	}

	st, err := c.loadState()
	if err != nil {
		if os.IsNotExist(err) {
			return c, nil
		}
		return nil, err
	}

	n, ctx, err := c.startNewNode(false, st.ListenAddr, "", "", "", false)
	if err != nil {
		return nil, err
	}

	select {
	case <-time.After(swarmConnectTimeout):
		logrus.Errorf("swarm component could not be started before timeout was reached")
	case <-n.Ready():
	case <-ctx.Done():
	}
	if ctx.Err() != nil {
		return nil, fmt.Errorf("swarm component could not be started")
	}
	go c.reconnectOnFailure(ctx)
	return c, nil
}

func (c *Cluster) loadState() (*state, error) {
	dt, err := ioutil.ReadFile(filepath.Join(c.root, stateFile))
	if err != nil {
		return nil, err
	}
	// missing certificate means no actual state to restore from
	if _, err := os.Stat(filepath.Join(c.root, "certificates/swarm-node.crt")); err != nil {
		if os.IsNotExist(err) {
			c.clearState()
		}
		return nil, err
	}
	var st state
	if err := json.Unmarshal(dt, &st); err != nil {
		return nil, err
	}
	return &st, nil
}

func (c *Cluster) saveState() error {
	dt, err := json.Marshal(state{ListenAddr: c.listenAddr})
	if err != nil {
		return err
	}
	return ioutils.AtomicWriteFile(filepath.Join(c.root, stateFile), dt, 0600)
}

func (c *Cluster) reconnectOnFailure(ctx context.Context) {
	for {
		<-ctx.Done()
		c.Lock()
		if c.stop || c.node != nil {
			c.Unlock()
			return
		}
		c.reconnectDelay *= 2
		if c.reconnectDelay > maxReconnectDelay {
			c.reconnectDelay = maxReconnectDelay
		}
		logrus.Warnf("Restarting swarm in %.2f seconds", c.reconnectDelay.Seconds())
		delayCtx, cancel := context.WithTimeout(context.Background(), c.reconnectDelay)
		c.cancelDelay = cancel
		c.Unlock()
		<-delayCtx.Done()
		if delayCtx.Err() != context.DeadlineExceeded {
			return
		}
		c.Lock()
		if c.node != nil {
			c.Unlock()
			return
		}
		var err error
		_, ctx, err = c.startNewNode(false, c.listenAddr, c.getRemoteAddress(), "", "", false)
		if err != nil {
			c.err = err
			ctx = delayCtx
		}
		c.Unlock()
	}
}

func (c *Cluster) startNewNode(forceNewCluster bool, listenAddr, joinAddr, secret, cahash string, ismanager bool) (*swarmagent.Node, context.Context, error) {
	if err := c.config.Backend.IsSwarmCompatible(); err != nil {
		return nil, nil, err
	}
	c.node = nil
	c.cancelDelay = nil
	node, err := swarmagent.NewNode(&swarmagent.NodeConfig{
		Hostname:         c.config.Name,
		ForceNewCluster:  forceNewCluster,
		ListenControlAPI: filepath.Join(c.root, controlSocket),
		ListenRemoteAPI:  listenAddr,
		JoinAddr:         joinAddr,
		StateDir:         c.root,
		CAHash:           cahash,
		Secret:           secret,
		Executor:         container.NewExecutor(c.config.Backend),
		HeartbeatTick:    1,
		ElectionTick:     3,
		IsManager:        ismanager,
	})
	if err != nil {
		return nil, nil, err
	}
	ctx, cancel := context.WithCancel(context.Background())
	if err := node.Start(ctx); err != nil {
		return nil, nil, err
	}

	c.node = node
	c.listenAddr = listenAddr
	c.saveState()
	c.config.Backend.SetClusterProvider(c)
	go func() {
		err := node.Err(ctx)
		if err != nil {
			logrus.Errorf("cluster exited with error: %v", err)
		}
		c.Lock()
		c.conn = nil
		c.client = nil
		c.node = nil
		c.ready = false
		c.err = err
		c.Unlock()
		cancel()
	}()

	go func() {
		select {
		case <-node.Ready():
			c.Lock()
			c.reconnectDelay = initialReconnectDelay
			c.Unlock()
		case <-ctx.Done():
		}
		if ctx.Err() == nil {
			c.Lock()
			c.ready = true
			c.err = nil
			c.Unlock()
		}
		c.configEvent <- struct{}{}
	}()

	go func() {
		for conn := range node.ListenControlSocket(ctx) {
			c.Lock()
			if c.conn != conn {
				c.client = swarmapi.NewControlClient(conn)
			}
			if c.conn != nil {
				c.client = nil
			}
			c.conn = conn
			c.Unlock()
			c.configEvent <- struct{}{}
		}
	}()

	return node, ctx, nil
}

// Init initializes new cluster from user provided request.
func (c *Cluster) Init(req types.InitRequest) (string, error) {
	c.Lock()
	if node := c.node; node != nil {
		c.Unlock()
		if !req.ForceNewCluster {
			return "", errSwarmExists(node)
		}
		ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
		defer cancel()
		c.cancelReconnect()
		if err := c.node.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
			return "", err
		}
		c.Lock()
		c.node = nil
		c.conn = nil
		c.ready = false
	}

	if err := validateAndSanitizeInitRequest(&req); err != nil {
		c.Unlock()
		return "", err
	}

	// todo: check current state existing
	n, ctx, err := c.startNewNode(req.ForceNewCluster, req.ListenAddr, "", "", "", false)
	if err != nil {
		c.Unlock()
		return "", err
	}
	c.Unlock()

	select {
	case <-n.Ready():
		if err := initClusterSpec(n, req.Spec); err != nil {
			return "", err
		}
		go c.reconnectOnFailure(ctx)
		return n.NodeID(), nil
	case <-ctx.Done():
		c.RLock()
		defer c.RUnlock()
		if c.err != nil {
			if !req.ForceNewCluster { // if failure on first attempt don't keep state
				if err := c.clearState(); err != nil {
					return "", err
				}
			}
			return "", c.err
		}
		return "", ctx.Err()
	}
}

// Join makes current Cluster part of an existing swarm cluster.
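// Join blocks until the node becomes ready, the certificate request is
// reported as pending (in which case the caller is asked to accept the node on
// an existing manager), or swarmConnectTimeout elapses, in which case
// ErrSwarmJoinTimeoutReached is returned and the attempt continues in the
// background.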
func (c *Cluster) Join(req types.JoinRequest) error {
	c.Lock()
	if node := c.node; node != nil {
		c.Unlock()
		return errSwarmExists(node)
	}
	if err := validateAndSanitizeJoinRequest(&req); err != nil {
		c.Unlock()
		return err
	}
	// todo: check current state existing
	n, ctx, err := c.startNewNode(false, req.ListenAddr, req.RemoteAddrs[0], req.Secret, req.CACertHash, req.Manager)
	if err != nil {
		c.Unlock()
		return err
	}
	c.Unlock()

	certificateRequested := n.CertificateRequested()
	for {
		select {
		case <-certificateRequested:
			if n.NodeMembership() == swarmapi.NodeMembershipPending {
				return fmt.Errorf("Your node is in the process of joining the cluster but needs to be accepted by existing cluster member.\nTo accept this node into cluster run \"docker node accept %v\" in an existing cluster manager. Use \"docker info\" command to see the current Swarm status of your node.", n.NodeID())
			}
			certificateRequested = nil
		case <-time.After(swarmConnectTimeout):
			// attempt to connect will continue in background, also reconnecting
			go c.reconnectOnFailure(ctx)
			return ErrSwarmJoinTimeoutReached
		case <-n.Ready():
			go c.reconnectOnFailure(ctx)
			return nil
		case <-ctx.Done():
			c.RLock()
			defer c.RUnlock()
			if c.err != nil {
				return c.err
			}
			return ctx.Err()
		}
	}
}

func (c *Cluster) cancelReconnect() {
	c.stop = true
	if c.cancelDelay != nil {
		c.cancelDelay()
		c.cancelDelay = nil
	}
}

// Leave shuts down Cluster and removes current state.
func (c *Cluster) Leave(force bool) error {
	c.Lock()
	node := c.node
	if node == nil {
		c.Unlock()
		return ErrNoSwarm
	}

	if node.Manager() != nil && !force {
		msg := "You are attempting to leave cluster on a node that is participating as a manager. "
		if c.isActiveManager() {
			active, reachable, unreachable, err := c.managerStats()
			if err == nil {
				if active && reachable-2 <= unreachable {
					if reachable == 1 && unreachable == 0 {
						msg += "Leaving last manager will remove all current state of the cluster. Use `--force` to ignore this message. "
						c.Unlock()
						return fmt.Errorf(msg)
					}
					msg += fmt.Sprintf("Leaving cluster will leave you with %v managers out of %v. This means Raft quorum will be lost and your cluster will become inaccessible. ", reachable-1, reachable+unreachable)
				}
			}
		} else {
			msg += "Doing so may lose the consensus of your cluster. "
		}

		msg += "Only way to restore a cluster that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to ignore this message."
		c.Unlock()
		return fmt.Errorf(msg)
	}
	c.cancelReconnect()
	c.Unlock()

	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	if err := node.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
		return err
	}
	if nodeID := node.NodeID(); nodeID != "" {
		for _, id := range c.config.Backend.ListContainersForNode(nodeID) {
			if err := c.config.Backend.ContainerRm(id, &apitypes.ContainerRmConfig{ForceRemove: true}); err != nil {
				logrus.Errorf("error removing %v: %v", id, err)
			}
		}
	}
	c.Lock()
	defer c.Unlock()
	c.node = nil
	c.conn = nil
	c.ready = false
	c.configEvent <- struct{}{}
	// todo: cleanup optional?
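	// clearState removes everything under c.root and detaches the cluster
	// provider from the backend (see clearState below).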
	if err := c.clearState(); err != nil {
		return err
	}
	return nil
}

func (c *Cluster) clearState() error {
	// todo: backup this data instead of removing?
	if err := os.RemoveAll(c.root); err != nil {
		return err
	}
	if err := os.MkdirAll(c.root, 0700); err != nil {
		return err
	}
	c.config.Backend.SetClusterProvider(nil)
	return nil
}

func (c *Cluster) getRequestContext() context.Context { // TODO: not needed when requests don't block on quorum lost
	ctx, _ := context.WithTimeout(context.Background(), 5*time.Second)
	return ctx
}

// Inspect retrieves the configuration properties of a managed swarm cluster.
func (c *Cluster) Inspect() (types.Swarm, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return types.Swarm{}, c.errNoManager()
	}

	swarm, err := getSwarm(c.getRequestContext(), c.client)
	if err != nil {
		return types.Swarm{}, err
	}

	return convert.SwarmFromGRPC(*swarm), nil
}

// Update updates configuration of a managed swarm cluster.
func (c *Cluster) Update(version uint64, spec types.Spec) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	swarm, err := getSwarm(c.getRequestContext(), c.client)
	if err != nil {
		return err
	}

	swarmSpec, err := convert.SwarmSpecToGRPCandMerge(spec, &swarm.Spec)
	if err != nil {
		return err
	}

	_, err = c.client.UpdateCluster(
		c.getRequestContext(),
		&swarmapi.UpdateClusterRequest{
			ClusterID: swarm.ID,
			Spec:      &swarmSpec,
			ClusterVersion: &swarmapi.Version{
				Index: version,
			},
		},
	)
	return err
}

// IsManager returns true if Cluster is participating as a manager.
func (c *Cluster) IsManager() bool {
	c.RLock()
	defer c.RUnlock()
	return c.isActiveManager()
}

// IsAgent returns true if Cluster is participating as a worker/agent.
func (c *Cluster) IsAgent() bool {
	c.RLock()
	defer c.RUnlock()
	return c.ready
}

// GetListenAddress returns the listening address for current manager's
// consensus and dispatcher APIs.
func (c *Cluster) GetListenAddress() string {
	c.RLock()
	defer c.RUnlock()
	if c.conn != nil {
		return c.listenAddr
	}
	return ""
}

// GetRemoteAddress returns a known advertise address of a remote manager if
// available.
// todo: change to array/connect with info
func (c *Cluster) GetRemoteAddress() string {
	c.RLock()
	defer c.RUnlock()
	return c.getRemoteAddress()
}

func (c *Cluster) getRemoteAddress() string {
	if c.node == nil {
		return ""
	}
	nodeID := c.node.NodeID()
	for _, r := range c.node.Remotes() {
		if r.NodeID != nodeID {
			return r.Addr
		}
	}
	return ""
}

// ListenClusterEvents returns a channel that receives messages on cluster
// participation changes.
// todo: make cancelable and accessible to multiple callers
func (c *Cluster) ListenClusterEvents() <-chan struct{} {
	return c.configEvent
}

// Info returns information about the current cluster state.
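// LocalNodeState is derived from the in-memory state: inactive when no swarm
// node is running (or error while a restart of the swarm component is
// pending), pending once a node has been started, and active once the node has
// reported ready.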
func (c *Cluster) Info() types.Info {
	var info types.Info
	c.RLock()
	defer c.RUnlock()

	if c.node == nil {
		info.LocalNodeState = types.LocalNodeStateInactive
		if c.cancelDelay != nil {
			info.LocalNodeState = types.LocalNodeStateError
		}
	} else {
		info.LocalNodeState = types.LocalNodeStatePending
		if c.ready {
			info.LocalNodeState = types.LocalNodeStateActive
		}
	}
	if c.err != nil {
		info.Error = c.err.Error()
	}

	if c.isActiveManager() {
		info.ControlAvailable = true
		if r, err := c.client.ListNodes(c.getRequestContext(), &swarmapi.ListNodesRequest{}); err == nil {
			info.Nodes = len(r.Nodes)
			for _, n := range r.Nodes {
				if n.ManagerStatus != nil {
					info.Managers++
				}
			}
		}

		if swarm, err := getSwarm(c.getRequestContext(), c.client); err == nil && swarm != nil {
			info.CACertHash = swarm.RootCA.CACertHash
		}
	}

	if c.node != nil {
		for _, r := range c.node.Remotes() {
			info.RemoteManagers = append(info.RemoteManagers, types.Peer{NodeID: r.NodeID, Addr: r.Addr})
		}
		info.NodeID = c.node.NodeID()
	}

	return info
}

// isActiveManager should not be called without a read lock
func (c *Cluster) isActiveManager() bool {
	return c.conn != nil
}

// errNoManager returns an error describing why manager commands can't be used.
// Call with read lock.
func (c *Cluster) errNoManager() error {
	if c.node == nil {
		return fmt.Errorf("This node is not a Swarm manager. Use \"docker swarm init\" or \"docker swarm join --manager\" to connect this node to Swarm and try again.")
	}
	if c.node.Manager() != nil {
		return fmt.Errorf("This node is not a Swarm manager. Manager is being prepared or has trouble connecting to the cluster.")
	}
	return fmt.Errorf("This node is not a Swarm manager. Worker nodes can't be used to view or modify cluster state. Please run this command on a manager node or promote the current node to a manager.")
}

// GetServices returns all services of a managed swarm cluster.
func (c *Cluster) GetServices(options apitypes.ServiceListOptions) ([]types.Service, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	filters, err := newListServicesFilters(options.Filter)
	if err != nil {
		return nil, err
	}
	r, err := c.client.ListServices(
		c.getRequestContext(),
		&swarmapi.ListServicesRequest{Filters: filters})
	if err != nil {
		return nil, err
	}

	var services []types.Service

	for _, service := range r.Services {
		services = append(services, convert.ServiceFromGRPC(*service))
	}

	return services, nil
}

// CreateService creates a new service in a managed swarm cluster.
func (c *Cluster) CreateService(s types.ServiceSpec) (string, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return "", c.errNoManager()
	}

	ctx := c.getRequestContext()

	err := populateNetworkID(ctx, c.client, &s)
	if err != nil {
		return "", err
	}

	serviceSpec, err := convert.ServiceSpecToGRPC(s)
	if err != nil {
		return "", err
	}
	r, err := c.client.CreateService(ctx, &swarmapi.CreateServiceRequest{Spec: &serviceSpec})
	if err != nil {
		return "", err
	}

	return r.Service.ID, nil
}

// GetService returns a service based on an ID or name.
func (c *Cluster) GetService(input string) (types.Service, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return types.Service{}, c.errNoManager()
	}

	service, err := getService(c.getRequestContext(), c.client, input)
	if err != nil {
		return types.Service{}, err
	}
	return convert.ServiceFromGRPC(*service), nil
}

// UpdateService updates existing service to match new properties.
func (c *Cluster) UpdateService(serviceID string, version uint64, spec types.ServiceSpec) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	serviceSpec, err := convert.ServiceSpecToGRPC(spec)
	if err != nil {
		return err
	}

	_, err = c.client.UpdateService(
		c.getRequestContext(),
		&swarmapi.UpdateServiceRequest{
			ServiceID: serviceID,
			Spec:      &serviceSpec,
			ServiceVersion: &swarmapi.Version{
				Index: version,
			},
		},
	)
	return err
}

// RemoveService removes a service from a managed swarm cluster.
func (c *Cluster) RemoveService(input string) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	service, err := getService(c.getRequestContext(), c.client, input)
	if err != nil {
		return err
	}

	if _, err := c.client.RemoveService(c.getRequestContext(), &swarmapi.RemoveServiceRequest{ServiceID: service.ID}); err != nil {
		return err
	}
	return nil
}

// GetNodes returns a list of all nodes known to a cluster.
func (c *Cluster) GetNodes(options apitypes.NodeListOptions) ([]types.Node, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	filters, err := newListNodesFilters(options.Filter)
	if err != nil {
		return nil, err
	}
	r, err := c.client.ListNodes(
		c.getRequestContext(),
		&swarmapi.ListNodesRequest{Filters: filters})
	if err != nil {
		return nil, err
	}

	nodes := []types.Node{}

	for _, node := range r.Nodes {
		nodes = append(nodes, convert.NodeFromGRPC(*node))
	}
	return nodes, nil
}

// GetNode returns a node based on an ID or name.
func (c *Cluster) GetNode(input string) (types.Node, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return types.Node{}, c.errNoManager()
	}

	node, err := getNode(c.getRequestContext(), c.client, input)
	if err != nil {
		return types.Node{}, err
	}
	return convert.NodeFromGRPC(*node), nil
}

// UpdateNode updates an existing node's properties.
func (c *Cluster) UpdateNode(nodeID string, version uint64, spec types.NodeSpec) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	nodeSpec, err := convert.NodeSpecToGRPC(spec)
	if err != nil {
		return err
	}

	_, err = c.client.UpdateNode(
		c.getRequestContext(),
		&swarmapi.UpdateNodeRequest{
			NodeID: nodeID,
			Spec:   &nodeSpec,
			NodeVersion: &swarmapi.Version{
				Index: version,
			},
		},
	)
	return err
}

// RemoveNode removes a node from a cluster
func (c *Cluster) RemoveNode(input string) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	ctx := c.getRequestContext()

	node, err := getNode(ctx, c.client, input)
	if err != nil {
		return err
	}

	if _, err := c.client.RemoveNode(ctx, &swarmapi.RemoveNodeRequest{NodeID: node.ID}); err != nil {
		return err
	}
	return nil
}

// GetTasks returns a list of tasks matching the filter options.
func (c *Cluster) GetTasks(options apitypes.TaskListOptions) ([]types.Task, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	filters, err := newListTasksFilters(options.Filter)
	if err != nil {
		return nil, err
	}
	r, err := c.client.ListTasks(
		c.getRequestContext(),
		&swarmapi.ListTasksRequest{Filters: filters})
	if err != nil {
		return nil, err
	}

	tasks := []types.Task{}

	for _, task := range r.Tasks {
		tasks = append(tasks, convert.TaskFromGRPC(*task))
	}
	return tasks, nil
}

// GetTask returns a task by an ID.
func (c *Cluster) GetTask(input string) (types.Task, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return types.Task{}, c.errNoManager()
	}

	task, err := getTask(c.getRequestContext(), c.client, input)
	if err != nil {
		return types.Task{}, err
	}
	return convert.TaskFromGRPC(*task), nil
}

// GetNetwork returns a cluster network by an ID.
func (c *Cluster) GetNetwork(input string) (apitypes.NetworkResource, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return apitypes.NetworkResource{}, c.errNoManager()
	}

	network, err := getNetwork(c.getRequestContext(), c.client, input)
	if err != nil {
		return apitypes.NetworkResource{}, err
	}
	return convert.BasicNetworkFromGRPC(*network), nil
}

// GetNetworks returns all current cluster managed networks.
func (c *Cluster) GetNetworks() ([]apitypes.NetworkResource, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	r, err := c.client.ListNetworks(c.getRequestContext(), &swarmapi.ListNetworksRequest{})
	if err != nil {
		return nil, err
	}

	var networks []apitypes.NetworkResource

	for _, network := range r.Networks {
		networks = append(networks, convert.BasicNetworkFromGRPC(*network))
	}

	return networks, nil
}

// CreateNetwork creates a new cluster managed network.
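// Requests to create a network with a pre-defined name (as reported by
// runconfig.IsPreDefinedNetwork) are rejected with a forbidden error.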
func (c *Cluster) CreateNetwork(s apitypes.NetworkCreateRequest) (string, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return "", c.errNoManager()
	}

	if runconfig.IsPreDefinedNetwork(s.Name) {
		err := fmt.Errorf("%s is a pre-defined network and cannot be created", s.Name)
		return "", errors.NewRequestForbiddenError(err)
	}

	networkSpec := convert.BasicNetworkCreateToGRPC(s)
	r, err := c.client.CreateNetwork(c.getRequestContext(), &swarmapi.CreateNetworkRequest{Spec: &networkSpec})
	if err != nil {
		return "", err
	}

	return r.Network.ID, nil
}

// RemoveNetwork removes a cluster network.
func (c *Cluster) RemoveNetwork(input string) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	network, err := getNetwork(c.getRequestContext(), c.client, input)
	if err != nil {
		return err
	}

	if _, err := c.client.RemoveNetwork(c.getRequestContext(), &swarmapi.RemoveNetworkRequest{NetworkID: network.ID}); err != nil {
		return err
	}
	return nil
}

func populateNetworkID(ctx context.Context, c swarmapi.ControlClient, s *types.ServiceSpec) error {
	for i, n := range s.Networks {
		apiNetwork, err := getNetwork(ctx, c, n.Target)
		if err != nil {
			return err
		}
		s.Networks[i].Target = apiNetwork.ID
	}
	return nil
}

func getNetwork(ctx context.Context, c swarmapi.ControlClient, input string) (*swarmapi.Network, error) {
	// GetNetwork to match via full ID.
	rg, err := c.GetNetwork(ctx, &swarmapi.GetNetworkRequest{NetworkID: input})
	if err != nil {
		// If any error (including NotFound), ListNetworks to match via ID prefix and full name.
		rl, err := c.ListNetworks(ctx, &swarmapi.ListNetworksRequest{Filters: &swarmapi.ListNetworksRequest_Filters{Names: []string{input}}})
		if err != nil || len(rl.Networks) == 0 {
			rl, err = c.ListNetworks(ctx, &swarmapi.ListNetworksRequest{Filters: &swarmapi.ListNetworksRequest_Filters{IDPrefixes: []string{input}}})
		}

		if err != nil {
			return nil, err
		}

		if len(rl.Networks) == 0 {
			return nil, fmt.Errorf("network %s not found", input)
		}

		if l := len(rl.Networks); l > 1 {
			return nil, fmt.Errorf("network %s is ambiguous (%d matches found)", input, l)
		}

		return rl.Networks[0], nil
	}
	return rg.Network, nil
}

// Cleanup stops active swarm node. This is run before daemon shutdown.
func (c *Cluster) Cleanup() {
	c.Lock()
	node := c.node
	if node == nil {
		c.Unlock()
		return
	}

	if c.isActiveManager() {
		active, reachable, unreachable, err := c.managerStats()
		if err == nil {
			singlenode := active && reachable == 1 && unreachable == 0
			if active && !singlenode && reachable-2 <= unreachable {
				logrus.Errorf("Leaving cluster with %v managers left out of %v. Raft quorum will be lost.", reachable-1, reachable+unreachable)
			}
		}
	}
	c.cancelReconnect()
	c.Unlock()
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	if err := node.Stop(ctx); err != nil {
		logrus.Errorf("error cleaning up cluster: %v", err)
	}
	c.Lock()
	c.node = nil
	c.ready = false
	c.conn = nil
	c.Unlock()
}

func (c *Cluster) managerStats() (current bool, reachable int, unreachable int, err error) {
	ctx, _ := context.WithTimeout(context.Background(), 3*time.Second)
	nodes, err := c.client.ListNodes(ctx, &swarmapi.ListNodesRequest{})
	if err != nil {
		return false, 0, 0, err
	}
	for _, n := range nodes.Nodes {
		if n.ManagerStatus != nil {
			if n.ManagerStatus.Reachability == swarmapi.RaftMemberStatus_REACHABLE {
				reachable++
				if n.ID == c.node.NodeID() {
					current = true
				}
			}
			if n.ManagerStatus.Reachability == swarmapi.RaftMemberStatus_UNREACHABLE {
				unreachable++
			}
		}
	}
	return
}

func validateAndSanitizeInitRequest(req *types.InitRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}

	spec := &req.Spec
	// provide sane defaults instead of erroring
	if spec.Name == "" {
		spec.Name = "default"
	}
	if spec.Raft.SnapshotInterval == 0 {
		spec.Raft.SnapshotInterval = defaultSpec.Raft.SnapshotInterval
	}
	if spec.Raft.LogEntriesForSlowFollowers == 0 {
		spec.Raft.LogEntriesForSlowFollowers = defaultSpec.Raft.LogEntriesForSlowFollowers
	}
	if spec.Raft.ElectionTick == 0 {
		spec.Raft.ElectionTick = defaultSpec.Raft.ElectionTick
	}
	if spec.Raft.HeartbeatTick == 0 {
		spec.Raft.HeartbeatTick = defaultSpec.Raft.HeartbeatTick
	}
	if spec.Dispatcher.HeartbeatPeriod == 0 {
		spec.Dispatcher.HeartbeatPeriod = defaultSpec.Dispatcher.HeartbeatPeriod
	}
	if spec.CAConfig.NodeCertExpiry == 0 {
		spec.CAConfig.NodeCertExpiry = defaultSpec.CAConfig.NodeCertExpiry
	}
	if spec.Orchestration.TaskHistoryRetentionLimit == 0 {
		spec.Orchestration.TaskHistoryRetentionLimit = defaultSpec.Orchestration.TaskHistoryRetentionLimit
	}
	return nil
}

func validateAndSanitizeJoinRequest(req *types.JoinRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}
	if len(req.RemoteAddrs) == 0 {
		return fmt.Errorf("at least 1 RemoteAddr is required to join")
	}
	for i := range req.RemoteAddrs {
		req.RemoteAddrs[i], err = validateAddr(req.RemoteAddrs[i])
		if err != nil {
			return fmt.Errorf("invalid remoteAddr %q: %v", req.RemoteAddrs[i], err)
		}
	}
	if req.CACertHash != "" {
		if _, err := digest.ParseDigest(req.CACertHash); err != nil {
			return fmt.Errorf("invalid CACertHash %q, %v", req.CACertHash, err)
		}
	}
	return nil
}

func validateAddr(addr string) (string, error) {
	if addr == "" {
		return addr, fmt.Errorf("invalid empty address")
	}
	newaddr, err := opts.ParseTCPAddr(addr, defaultAddr)
	if err != nil {
		return addr, nil
	}
	return strings.TrimPrefix(newaddr, "tcp://"), nil
}

func errSwarmExists(node *swarmagent.Node) error {
	if node.NodeMembership() != swarmapi.NodeMembershipAccepted {
		return ErrPendingSwarmExists
	}
	return ErrSwarmExists
}

func initClusterSpec(node *swarmagent.Node, spec types.Spec) error {
	ctx, _ := context.WithTimeout(context.Background(), 5*time.Second)
	for conn := range node.ListenControlSocket(ctx) {
		if ctx.Err() != nil {
			return ctx.Err()
		}
		if conn != nil {
			client := swarmapi.NewControlClient(conn)
			var cluster *swarmapi.Cluster
			for i := 0; ; i++ {
				lcr, err := client.ListClusters(ctx, &swarmapi.ListClustersRequest{})
				if err != nil {
					return fmt.Errorf("error on listing clusters: %v", err)
				}
				if len(lcr.Clusters) == 0 {
					if i < 10 {
						time.Sleep(200 * time.Millisecond)
						continue
					}
					return fmt.Errorf("empty list of clusters was returned")
				}
				cluster = lcr.Clusters[0]
				break
			}
			newspec, err := convert.SwarmSpecToGRPCandMerge(spec, &cluster.Spec)
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			_, err = client.UpdateCluster(ctx, &swarmapi.UpdateClusterRequest{
				ClusterID:      cluster.ID,
				ClusterVersion: &cluster.Meta.Version,
				Spec:           &newspec,
			})
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			return nil
		}
	}
	return ctx.Err()
}
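// Usage sketch (illustrative only, not part of this package): the daemon
// creates the cluster component once at startup and serves the swarm-related
// API endpoints through it. The backend value and import alias below are
// assumptions about the caller, not definitions from this file.
//
//	c, err := cluster.New(cluster.Config{
//		Root:    "/var/lib/docker",
//		Name:    hostname,
//		Backend: daemonBackend, // the daemon's executorpkg.Backend implementation
//	})
//	if err != nil {
//		return err
//	}
//	// e.g. while handling "docker swarm init":
//	nodeID, err := c.Init(swarmtypes.InitRequest{ListenAddr: "0.0.0.0:2377"})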