github.com/portworx/docker@v1.12.1/daemon/cluster/cluster.go

package cluster

import (
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"google.golang.org/grpc"

	"github.com/Sirupsen/logrus"
	"github.com/docker/docker/daemon/cluster/convert"
	executorpkg "github.com/docker/docker/daemon/cluster/executor"
	"github.com/docker/docker/daemon/cluster/executor/container"
	"github.com/docker/docker/errors"
	"github.com/docker/docker/opts"
	"github.com/docker/docker/pkg/ioutils"
	"github.com/docker/docker/runconfig"
	apitypes "github.com/docker/engine-api/types"
	"github.com/docker/engine-api/types/filters"
	types "github.com/docker/engine-api/types/swarm"
	swarmagent "github.com/docker/swarmkit/agent"
	swarmapi "github.com/docker/swarmkit/api"
	"golang.org/x/net/context"
)

const swarmDirName = "swarm"
const controlSocket = "control.sock"
const swarmConnectTimeout = 20 * time.Second
const swarmRequestTimeout = 20 * time.Second
const stateFile = "docker-state.json"
const defaultAddr = "0.0.0.0:2377"

const (
	initialReconnectDelay = 100 * time.Millisecond
	maxReconnectDelay     = 30 * time.Second
)

// ErrNoSwarm is returned on leaving a cluster that was never initialized
var ErrNoSwarm = fmt.Errorf("This node is not part of a swarm")

// ErrSwarmExists is returned on initialize or join request for a cluster that has already been activated
var ErrSwarmExists = fmt.Errorf("This node is already part of a swarm. Use \"docker swarm leave\" to leave this swarm and join another one.")

// ErrPendingSwarmExists is returned on initialize or join request for a cluster that is already processing a similar request but has not succeeded yet.
var ErrPendingSwarmExists = fmt.Errorf("This node is processing an existing join request that has not succeeded yet. Use \"docker swarm leave\" to cancel the current request.")

// ErrSwarmJoinTimeoutReached is returned when cluster join could not complete before timeout was reached.
var ErrSwarmJoinTimeoutReached = fmt.Errorf("Timeout was reached before node was joined. The attempt to join the swarm will continue in the background. Use the \"docker info\" command to see the current swarm status of your node.")

// defaultSpec contains some sane defaults if cluster options are missing on init
var defaultSpec = types.Spec{
	Raft: types.RaftConfig{
		SnapshotInterval:           10000,
		KeepOldSnapshots:           0,
		LogEntriesForSlowFollowers: 500,
		HeartbeatTick:              1,
		ElectionTick:               3,
	},
	CAConfig: types.CAConfig{
		NodeCertExpiry: 90 * 24 * time.Hour,
	},
	Dispatcher: types.DispatcherConfig{
		HeartbeatPeriod: uint64((5 * time.Second).Nanoseconds()),
	},
	Orchestration: types.OrchestrationConfig{
		TaskHistoryRetentionLimit: 10,
	},
}

type state struct {
	// LocalAddr is this machine's local IP or hostname, if specified.
	LocalAddr string
	// RemoteAddr is the address that was given to "swarm join". It is used
	// to find LocalAddr if necessary.
	RemoteAddr string
	// ListenAddr is the address we bind to, including a port.
	ListenAddr string
	// AdvertiseAddr is the address other nodes should connect to,
	// including a port.
	AdvertiseAddr string
}
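
// For reference, this struct is persisted as JSON by saveState (see stateFile below).
// Since the fields carry no json tags, the keys match the Go field names; an
// illustrative example (the values here are made up):
//
//	{"LocalAddr":"192.168.1.10","RemoteAddr":"192.168.1.20:2377","ListenAddr":"0.0.0.0:2377","AdvertiseAddr":"192.168.1.10:2377"}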

// NetworkSubnetsProvider exposes functions for retrieving the subnets
// of networks managed by Docker, so they can be filtered.
type NetworkSubnetsProvider interface {
	V4Subnets() []net.IPNet
	V6Subnets() []net.IPNet
}

// Config provides values for Cluster.
type Config struct {
	Root                   string
	Name                   string
	Backend                executorpkg.Backend
	NetworkSubnetsProvider NetworkSubnetsProvider

	// DefaultAdvertiseAddr is the default host/IP or network interface to use
	// if no AdvertiseAddr value is specified.
	DefaultAdvertiseAddr string
}

// Cluster provides capabilities to participate in a cluster as a worker or a
// manager.
type Cluster struct {
	sync.RWMutex
	*node
	root            string
	config          Config
	configEvent     chan struct{} // todo: make this array and goroutine safe
	localAddr       string
	actualLocalAddr string // after resolution, not persisted
	remoteAddr      string
	listenAddr      string
	advertiseAddr   string
	stop            bool
	err             error
	cancelDelay     func()
}

type node struct {
	*swarmagent.Node
	done           chan struct{}
	ready          bool
	conn           *grpc.ClientConn
	client         swarmapi.ControlClient
	reconnectDelay time.Duration
}

// New creates a new Cluster instance using provided config.
func New(config Config) (*Cluster, error) {
	root := filepath.Join(config.Root, swarmDirName)
	if err := os.MkdirAll(root, 0700); err != nil {
		return nil, err
	}
	c := &Cluster{
		root:        root,
		config:      config,
		configEvent: make(chan struct{}, 10),
	}

	st, err := c.loadState()
	if err != nil {
		if os.IsNotExist(err) {
			return c, nil
		}
		return nil, err
	}

	n, err := c.startNewNode(false, st.LocalAddr, st.RemoteAddr, st.ListenAddr, st.AdvertiseAddr, "", "")
	if err != nil {
		return nil, err
	}

	select {
	case <-time.After(swarmConnectTimeout):
		logrus.Errorf("swarm component could not be started before timeout was reached")
	case <-n.Ready():
	case <-n.done:
		return nil, fmt.Errorf("swarm component could not be started: %v", c.err)
	}
	go c.reconnectOnFailure(n)
	return c, nil
}

func (c *Cluster) loadState() (*state, error) {
	dt, err := ioutil.ReadFile(filepath.Join(c.root, stateFile))
	if err != nil {
		return nil, err
	}
	// missing certificate means no actual state to restore from
	if _, err := os.Stat(filepath.Join(c.root, "certificates/swarm-node.crt")); err != nil {
		if os.IsNotExist(err) {
			c.clearState()
		}
		return nil, err
	}
	var st state
	if err := json.Unmarshal(dt, &st); err != nil {
		return nil, err
	}
	return &st, nil
}

func (c *Cluster) saveState() error {
	dt, err := json.Marshal(state{
		LocalAddr:     c.localAddr,
		RemoteAddr:    c.remoteAddr,
		ListenAddr:    c.listenAddr,
		AdvertiseAddr: c.advertiseAddr,
	})
	if err != nil {
		return err
	}
	return ioutils.AtomicWriteFile(filepath.Join(c.root, stateFile), dt, 0600)
}

func (c *Cluster) reconnectOnFailure(n *node) {
	for {
		<-n.done
		c.Lock()
		if c.stop || c.node != nil {
			c.Unlock()
			return
		}
		n.reconnectDelay *= 2
		if n.reconnectDelay > maxReconnectDelay {
			n.reconnectDelay = maxReconnectDelay
		}
		logrus.Warnf("Restarting swarm in %.2f seconds", n.reconnectDelay.Seconds())
		delayCtx, cancel := context.WithTimeout(context.Background(), n.reconnectDelay)
		c.cancelDelay = cancel
		c.Unlock()
		<-delayCtx.Done()
		if delayCtx.Err() != context.DeadlineExceeded {
			return
		}
		c.Lock()
		if c.node != nil {
			c.Unlock()
			return
		}
		var err error
		n, err = c.startNewNode(false, c.localAddr, c.getRemoteAddress(), c.listenAddr, c.advertiseAddr, c.getRemoteAddress(), "")
		if err != nil {
			c.err = err
			close(n.done)
		}
		c.Unlock()
	}
}

func (c *Cluster) startNewNode(forceNewCluster bool, localAddr, remoteAddr, listenAddr, advertiseAddr, joinAddr, joinToken string) (*node, error) {
	if err := c.config.Backend.IsSwarmCompatible(); err != nil {
		return nil, err
	}

	actualLocalAddr := localAddr
	if actualLocalAddr == "" {
		// If localAddr was not specified, resolve it automatically
		// based on the route to joinAddr. localAddr can only be left
		// empty on "join".
		listenHost, _, err := net.SplitHostPort(listenAddr)
		if err != nil {
			return nil, fmt.Errorf("could not parse listen address: %v", err)
		}

		listenAddrIP := net.ParseIP(listenHost)
		if listenAddrIP == nil || !listenAddrIP.IsUnspecified() {
			actualLocalAddr = listenHost
		} else {
			if remoteAddr == "" {
				// Should never happen except using swarms created by
				// old versions that didn't save remoteAddr.
				remoteAddr = "8.8.8.8:53"
			}
			conn, err := net.Dial("udp", remoteAddr)
			if err != nil {
				return nil, fmt.Errorf("could not find local IP address: %v", err)
			}
			localHostPort := conn.LocalAddr().String()
			actualLocalAddr, _, _ = net.SplitHostPort(localHostPort)
			conn.Close()
		}
	}

	c.node = nil
	c.cancelDelay = nil
	c.stop = false
	n, err := swarmagent.NewNode(&swarmagent.NodeConfig{
		Hostname:           c.config.Name,
		ForceNewCluster:    forceNewCluster,
		ListenControlAPI:   filepath.Join(c.root, controlSocket),
		ListenRemoteAPI:    listenAddr,
		AdvertiseRemoteAPI: advertiseAddr,
		JoinAddr:           joinAddr,
		StateDir:           c.root,
		JoinToken:          joinToken,
		Executor:           container.NewExecutor(c.config.Backend),
		HeartbeatTick:      1,
		ElectionTick:       3,
	})
	if err != nil {
		return nil, err
	}
	ctx := context.Background()
	if err := n.Start(ctx); err != nil {
		return nil, err
	}
	node := &node{
		Node:           n,
		done:           make(chan struct{}),
		reconnectDelay: initialReconnectDelay,
	}
	c.node = node
	c.localAddr = localAddr
	c.actualLocalAddr = actualLocalAddr // not saved
	c.remoteAddr = remoteAddr
	c.listenAddr = listenAddr
	c.advertiseAddr = advertiseAddr
	c.saveState()

	c.config.Backend.SetClusterProvider(c)
	go func() {
		err := n.Err(ctx)
		if err != nil {
			logrus.Errorf("cluster exited with error: %v", err)
		}
		c.Lock()
		c.node = nil
		c.err = err
		c.Unlock()
		close(node.done)
	}()

	go func() {
		select {
		case <-n.Ready():
			c.Lock()
			node.ready = true
			c.err = nil
			c.Unlock()
		case <-ctx.Done():
		}
		c.configEvent <- struct{}{}
	}()

	go func() {
		for conn := range n.ListenControlSocket(ctx) {
			c.Lock()
			if node.conn != conn {
				if conn == nil {
					node.client = nil
				} else {
					node.client = swarmapi.NewControlClient(conn)
				}
			}
			node.conn = conn
			c.Unlock()
			c.configEvent <- struct{}{}
		}
	}()

	return node, nil
}
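
// For orientation, the pieces of on-disk state referenced directly in this file all
// live under c.root (<Config.Root>/swarm); swarmkit keeps additional state of its own
// in the same directory. The layout shown here is illustrative, not exhaustive:
//
//	swarm/
//	├── control.sock                    // manager control API socket (controlSocket)
//	├── docker-state.json               // persisted addresses (stateFile, saveState/loadState)
//	└── certificates/swarm-node.crt     // node certificate probed by loadState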

// Init initializes new cluster from user provided request.
func (c *Cluster) Init(req types.InitRequest) (string, error) {
	c.Lock()
	if node := c.node; node != nil {
		if !req.ForceNewCluster {
			c.Unlock()
			return "", ErrSwarmExists
		}
		if err := c.stopNode(); err != nil {
			c.Unlock()
			return "", err
		}
	}

	if err := validateAndSanitizeInitRequest(&req); err != nil {
		c.Unlock()
		return "", err
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		c.Unlock()
		return "", err
	}

	advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
	if err != nil {
		c.Unlock()
		return "", err
	}

	localAddr := listenHost

	// If the advertise address is not one of the system's
	// addresses, we also require a listen address.
	listenAddrIP := net.ParseIP(listenHost)
	if listenAddrIP != nil && listenAddrIP.IsUnspecified() {
		advertiseIP := net.ParseIP(advertiseHost)
		if advertiseIP == nil {
			// not an IP
			c.Unlock()
			return "", errMustSpecifyListenAddr
		}

		systemIPs := listSystemIPs()

		found := false
		for _, systemIP := range systemIPs {
			if systemIP.Equal(advertiseIP) {
				found = true
				break
			}
		}
		if !found {
			c.Unlock()
			return "", errMustSpecifyListenAddr
		}
		localAddr = advertiseIP.String()
	}

	// todo: check current state existing
	n, err := c.startNewNode(req.ForceNewCluster, localAddr, "", net.JoinHostPort(listenHost, listenPort), net.JoinHostPort(advertiseHost, advertisePort), "", "")
	if err != nil {
		c.Unlock()
		return "", err
	}
	c.Unlock()

	select {
	case <-n.Ready():
		if err := initClusterSpec(n, req.Spec); err != nil {
			return "", err
		}
		go c.reconnectOnFailure(n)
		return n.NodeID(), nil
	case <-n.done:
		c.RLock()
		defer c.RUnlock()
		if !req.ForceNewCluster { // if failure on first attempt don't keep state
			if err := c.clearState(); err != nil {
				return "", err
			}
		}
		return "", c.err
	}
}

// Join makes current Cluster part of an existing swarm cluster.
func (c *Cluster) Join(req types.JoinRequest) error {
	c.Lock()
	if node := c.node; node != nil {
		c.Unlock()
		return ErrSwarmExists
	}
	if err := validateAndSanitizeJoinRequest(&req); err != nil {
		c.Unlock()
		return err
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		c.Unlock()
		return err
	}

	var advertiseAddr string
	advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
	// For joining, we don't need to provide an advertise address,
	// since the remote side can detect it.
	if err == nil {
		advertiseAddr = net.JoinHostPort(advertiseHost, advertisePort)
	}

	// todo: check current state existing
	n, err := c.startNewNode(false, "", req.RemoteAddrs[0], net.JoinHostPort(listenHost, listenPort), advertiseAddr, req.RemoteAddrs[0], req.JoinToken)
	if err != nil {
		c.Unlock()
		return err
	}
	c.Unlock()

	select {
	case <-time.After(swarmConnectTimeout):
		// attempt to connect will continue in background, also reconnecting
		go c.reconnectOnFailure(n)
		return ErrSwarmJoinTimeoutReached
	case <-n.Ready():
		go c.reconnectOnFailure(n)
		return nil
	case <-n.done:
		c.RLock()
		defer c.RUnlock()
		return c.err
	}
}

// stopNode is a helper that stops the active c.node and waits until it has
// shut down. Call while keeping the cluster lock.
func (c *Cluster) stopNode() error {
	if c.node == nil {
		return nil
	}
	c.stop = true
	if c.cancelDelay != nil {
		c.cancelDelay()
		c.cancelDelay = nil
	}
	node := c.node
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	// TODO: can't hold lock on stop because it calls back to network
	c.Unlock()
	defer c.Lock()
	if err := node.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
		return err
	}
	<-node.done
	return nil
}

// Leave shuts down Cluster and removes current state.
func (c *Cluster) Leave(force bool) error {
	c.Lock()
	node := c.node
	if node == nil {
		c.Unlock()
		return ErrNoSwarm
	}

	if node.Manager() != nil && !force {
		msg := "You are attempting to leave the swarm on a node that is participating as a manager. "
		if c.isActiveManager() {
			active, reachable, unreachable, err := c.managerStats()
			if err == nil {
				if active && reachable-2 <= unreachable {
					if reachable == 1 && unreachable == 0 {
						msg += "Removing the last manager erases all current state of the swarm. Use `--force` to ignore this message. "
						c.Unlock()
						return fmt.Errorf(msg)
					}
					msg += fmt.Sprintf("Removing this node leaves %v managers out of %v. Without a Raft quorum your swarm will be inaccessible. ", reachable-1, reachable+unreachable)
				}
			}
		} else {
			msg += "Doing so may lose the consensus of your cluster. "
		}

		msg += "The only way to restore a swarm that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to suppress this message."
		c.Unlock()
		return fmt.Errorf(msg)
	}
	if err := c.stopNode(); err != nil {
		c.Unlock()
		return err
	}
	c.Unlock()
	if nodeID := node.NodeID(); nodeID != "" {
		for _, id := range c.config.Backend.ListContainersForNode(nodeID) {
			if err := c.config.Backend.ContainerRm(id, &apitypes.ContainerRmConfig{ForceRemove: true}); err != nil {
				logrus.Errorf("error removing %v: %v", id, err)
			}
		}
	}
	c.configEvent <- struct{}{}
	// todo: cleanup optional?
	if err := c.clearState(); err != nil {
		return err
	}
	return nil
}

func (c *Cluster) clearState() error {
	// todo: backup this data instead of removing?
	if err := os.RemoveAll(c.root); err != nil {
		return err
	}
	if err := os.MkdirAll(c.root, 0700); err != nil {
		return err
	}
	c.config.Backend.SetClusterProvider(nil)
	return nil
}

func (c *Cluster) getRequestContext() (context.Context, func()) { // TODO: not needed when requests don't block on quorum lost
	return context.WithTimeout(context.Background(), swarmRequestTimeout)
}

// Inspect retrieves the configuration properties of a managed swarm cluster.
func (c *Cluster) Inspect() (types.Swarm, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return types.Swarm{}, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	swarm, err := getSwarm(ctx, c.client)
	if err != nil {
		return types.Swarm{}, err
	}

	return convert.SwarmFromGRPC(*swarm), nil
}

// Update updates configuration of a managed swarm cluster.
func (c *Cluster) Update(version uint64, spec types.Spec, flags types.UpdateFlags) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	swarm, err := getSwarm(ctx, c.client)
	if err != nil {
		return err
	}

	swarmSpec, err := convert.SwarmSpecToGRPC(spec)
	if err != nil {
		return err
	}

	_, err = c.client.UpdateCluster(
		ctx,
		&swarmapi.UpdateClusterRequest{
			ClusterID: swarm.ID,
			Spec:      &swarmSpec,
			ClusterVersion: &swarmapi.Version{
				Index: version,
			},
			Rotation: swarmapi.JoinTokenRotation{
				RotateWorkerToken:  flags.RotateWorkerToken,
				RotateManagerToken: flags.RotateManagerToken,
			},
		},
	)
	return err
}

// IsManager returns true if Cluster is participating as a manager.
func (c *Cluster) IsManager() bool {
	c.RLock()
	defer c.RUnlock()
	return c.isActiveManager()
}

// IsAgent returns true if Cluster is participating as a worker/agent.
func (c *Cluster) IsAgent() bool {
	c.RLock()
	defer c.RUnlock()
	return c.node != nil && c.ready
}

// GetLocalAddress returns the local address.
func (c *Cluster) GetLocalAddress() string {
	c.RLock()
	defer c.RUnlock()
	return c.actualLocalAddr
}

// GetAdvertiseAddress returns the remotely reachable address of this node.
func (c *Cluster) GetAdvertiseAddress() string {
	c.RLock()
	defer c.RUnlock()
	if c.advertiseAddr != "" {
		advertiseHost, _, _ := net.SplitHostPort(c.advertiseAddr)
		return advertiseHost
	}
	return c.actualLocalAddr
}

// GetRemoteAddress returns a known advertise address of a remote manager if
// available.
// todo: change to array/connect with info
func (c *Cluster) GetRemoteAddress() string {
	c.RLock()
	defer c.RUnlock()
	return c.getRemoteAddress()
}

func (c *Cluster) getRemoteAddress() string {
	if c.node == nil {
		return ""
	}
	nodeID := c.node.NodeID()
	for _, r := range c.node.Remotes() {
		if r.NodeID != nodeID {
			return r.Addr
		}
	}
	return ""
}

// ListenClusterEvents returns a channel that receives messages on cluster
// participation changes.
// todo: make cancelable and accessible to multiple callers
func (c *Cluster) ListenClusterEvents() <-chan struct{} {
	return c.configEvent
}

// Info returns information about the current cluster state.
func (c *Cluster) Info() types.Info {
	info := types.Info{
		NodeAddr: c.GetAdvertiseAddress(),
	}

	c.RLock()
	defer c.RUnlock()

	if c.node == nil {
		info.LocalNodeState = types.LocalNodeStateInactive
		if c.cancelDelay != nil {
			info.LocalNodeState = types.LocalNodeStateError
		}
	} else {
		info.LocalNodeState = types.LocalNodeStatePending
		if c.ready == true {
			info.LocalNodeState = types.LocalNodeStateActive
		}
	}
	if c.err != nil {
		info.Error = c.err.Error()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	if c.isActiveManager() {
		info.ControlAvailable = true
		swarm, err := c.Inspect()
		if err != nil {
			info.Error = err.Error()
		}

		// Strip JoinTokens
		info.Cluster = swarm.ClusterInfo

		if r, err := c.client.ListNodes(ctx, &swarmapi.ListNodesRequest{}); err == nil {
			info.Nodes = len(r.Nodes)
			for _, n := range r.Nodes {
				if n.ManagerStatus != nil {
					info.Managers = info.Managers + 1
				}
			}
		}
	}

	if c.node != nil {
		for _, r := range c.node.Remotes() {
			info.RemoteManagers = append(info.RemoteManagers, types.Peer{NodeID: r.NodeID, Addr: r.Addr})
		}
		info.NodeID = c.node.NodeID()
	}

	return info
}

// isActiveManager should not be called without a read lock
func (c *Cluster) isActiveManager() bool {
	return c.node != nil && c.conn != nil
}

// errNoManager returns error describing why manager commands can't be used.
// Call with read lock.
func (c *Cluster) errNoManager() error {
	if c.node == nil {
		return fmt.Errorf("This node is not a swarm manager. Use \"docker swarm init\" or \"docker swarm join\" to connect this node to swarm and try again.")
	}
	if c.node.Manager() != nil {
		return fmt.Errorf("This node is not a swarm manager. Manager is being prepared or has trouble connecting to the cluster.")
	}
	return fmt.Errorf("This node is not a swarm manager. Worker nodes can't be used to view or modify cluster state. Please run this command on a manager node or promote the current node to a manager.")
}

// GetServices returns all services of a managed swarm cluster.
func (c *Cluster) GetServices(options apitypes.ServiceListOptions) ([]types.Service, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	filters, err := newListServicesFilters(options.Filter)
	if err != nil {
		return nil, err
	}
	ctx, cancel := c.getRequestContext()
	defer cancel()

	r, err := c.client.ListServices(
		ctx,
		&swarmapi.ListServicesRequest{Filters: filters})
	if err != nil {
		return nil, err
	}

	services := []types.Service{}

	for _, service := range r.Services {
		services = append(services, convert.ServiceFromGRPC(*service))
	}

	return services, nil
}
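
// An illustrative caller-side sketch of the filter plumbing above (assuming the "name"
// filter is among those accepted by newListServicesFilters in this package):
//
//	opts := apitypes.ServiceListOptions{Filter: filters.NewArgs()}
//	opts.Filter.Add("name", "web")
//	services, err := c.GetServices(opts)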

// CreateService creates a new service in a managed swarm cluster.
func (c *Cluster) CreateService(s types.ServiceSpec, encodedAuth string) (string, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return "", c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	err := c.populateNetworkID(ctx, c.client, &s)
	if err != nil {
		return "", err
	}

	serviceSpec, err := convert.ServiceSpecToGRPC(s)
	if err != nil {
		return "", err
	}

	if encodedAuth != "" {
		ctnr := serviceSpec.Task.GetContainer()
		if ctnr == nil {
			return "", fmt.Errorf("service does not use container tasks")
		}
		ctnr.PullOptions = &swarmapi.ContainerSpec_PullOptions{RegistryAuth: encodedAuth}
	}

	r, err := c.client.CreateService(ctx, &swarmapi.CreateServiceRequest{Spec: &serviceSpec})
	if err != nil {
		return "", err
	}

	return r.Service.ID, nil
}

// GetService returns a service based on an ID or name.
func (c *Cluster) GetService(input string) (types.Service, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return types.Service{}, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	service, err := getService(ctx, c.client, input)
	if err != nil {
		return types.Service{}, err
	}
	return convert.ServiceFromGRPC(*service), nil
}

// UpdateService updates existing service to match new properties.
func (c *Cluster) UpdateService(serviceID string, version uint64, spec types.ServiceSpec, encodedAuth string) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	err := c.populateNetworkID(ctx, c.client, &spec)
	if err != nil {
		return err
	}

	serviceSpec, err := convert.ServiceSpecToGRPC(spec)
	if err != nil {
		return err
	}

	if encodedAuth != "" {
		ctnr := serviceSpec.Task.GetContainer()
		if ctnr == nil {
			return fmt.Errorf("service does not use container tasks")
		}
		ctnr.PullOptions = &swarmapi.ContainerSpec_PullOptions{RegistryAuth: encodedAuth}
	} else {
		// If encodedAuth isn't being updated, we shouldn't lose it; keep using
		// the registry auth that was already present on the service.
		currentService, err := getService(ctx, c.client, serviceID)
		if err != nil {
			return err
		}
		ctnr := currentService.Spec.Task.GetContainer()
		if ctnr == nil {
			return fmt.Errorf("service does not use container tasks")
		}
		serviceSpec.Task.GetContainer().PullOptions = ctnr.PullOptions
	}

	_, err = c.client.UpdateService(
		ctx,
		&swarmapi.UpdateServiceRequest{
			ServiceID: serviceID,
			Spec:      &serviceSpec,
			ServiceVersion: &swarmapi.Version{
				Index: version,
			},
		},
	)
	return err
}

// RemoveService removes a service from a managed swarm cluster.
func (c *Cluster) RemoveService(input string) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	service, err := getService(ctx, c.client, input)
	if err != nil {
		return err
	}

	if _, err := c.client.RemoveService(ctx, &swarmapi.RemoveServiceRequest{ServiceID: service.ID}); err != nil {
		return err
	}
	return nil
}

// GetNodes returns a list of all nodes known to a cluster.
func (c *Cluster) GetNodes(options apitypes.NodeListOptions) ([]types.Node, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	filters, err := newListNodesFilters(options.Filter)
	if err != nil {
		return nil, err
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	r, err := c.client.ListNodes(
		ctx,
		&swarmapi.ListNodesRequest{Filters: filters})
	if err != nil {
		return nil, err
	}

	nodes := []types.Node{}

	for _, node := range r.Nodes {
		nodes = append(nodes, convert.NodeFromGRPC(*node))
	}
	return nodes, nil
}

// GetNode returns a node based on an ID or name.
func (c *Cluster) GetNode(input string) (types.Node, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return types.Node{}, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	node, err := getNode(ctx, c.client, input)
	if err != nil {
		return types.Node{}, err
	}
	return convert.NodeFromGRPC(*node), nil
}

// UpdateNode updates an existing node's properties.
func (c *Cluster) UpdateNode(nodeID string, version uint64, spec types.NodeSpec) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	nodeSpec, err := convert.NodeSpecToGRPC(spec)
	if err != nil {
		return err
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	_, err = c.client.UpdateNode(
		ctx,
		&swarmapi.UpdateNodeRequest{
			NodeID: nodeID,
			Spec:   &nodeSpec,
			NodeVersion: &swarmapi.Version{
				Index: version,
			},
		},
	)
	return err
}

// RemoveNode removes a node from a cluster.
func (c *Cluster) RemoveNode(input string, force bool) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	node, err := getNode(ctx, c.client, input)
	if err != nil {
		return err
	}

	if _, err := c.client.RemoveNode(ctx, &swarmapi.RemoveNodeRequest{NodeID: node.ID, Force: force}); err != nil {
		return err
	}
	return nil
}

// GetTasks returns a list of tasks matching the filter options.
func (c *Cluster) GetTasks(options apitypes.TaskListOptions) ([]types.Task, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	byName := func(filter filters.Args) error {
		if filter.Include("service") {
			serviceFilters := filter.Get("service")
			for _, serviceFilter := range serviceFilters {
				service, err := c.GetService(serviceFilter)
				if err != nil {
					return err
				}
				filter.Del("service", serviceFilter)
				filter.Add("service", service.ID)
			}
		}
		if filter.Include("node") {
			nodeFilters := filter.Get("node")
			for _, nodeFilter := range nodeFilters {
				node, err := c.GetNode(nodeFilter)
				if err != nil {
					return err
				}
				filter.Del("node", nodeFilter)
				filter.Add("node", node.ID)
			}
		}
		return nil
	}

	filters, err := newListTasksFilters(options.Filter, byName)
	if err != nil {
		return nil, err
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	r, err := c.client.ListTasks(
		ctx,
		&swarmapi.ListTasksRequest{Filters: filters})
	if err != nil {
		return nil, err
	}

	tasks := []types.Task{}

	for _, task := range r.Tasks {
		tasks = append(tasks, convert.TaskFromGRPC(*task))
	}
	return tasks, nil
}

// GetTask returns a task by an ID.
func (c *Cluster) GetTask(input string) (types.Task, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return types.Task{}, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	task, err := getTask(ctx, c.client, input)
	if err != nil {
		return types.Task{}, err
	}
	return convert.TaskFromGRPC(*task), nil
}

// GetNetwork returns a cluster network by an ID.
func (c *Cluster) GetNetwork(input string) (apitypes.NetworkResource, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return apitypes.NetworkResource{}, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	network, err := getNetwork(ctx, c.client, input)
	if err != nil {
		return apitypes.NetworkResource{}, err
	}
	return convert.BasicNetworkFromGRPC(*network), nil
}

// GetNetworks returns all current cluster managed networks.
func (c *Cluster) GetNetworks() ([]apitypes.NetworkResource, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	r, err := c.client.ListNetworks(ctx, &swarmapi.ListNetworksRequest{})
	if err != nil {
		return nil, err
	}

	var networks []apitypes.NetworkResource

	for _, network := range r.Networks {
		networks = append(networks, convert.BasicNetworkFromGRPC(*network))
	}

	return networks, nil
}

// CreateNetwork creates a new cluster managed network.
func (c *Cluster) CreateNetwork(s apitypes.NetworkCreateRequest) (string, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return "", c.errNoManager()
	}

	if runconfig.IsPreDefinedNetwork(s.Name) {
		err := fmt.Errorf("%s is a pre-defined network and cannot be created", s.Name)
		return "", errors.NewRequestForbiddenError(err)
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	networkSpec := convert.BasicNetworkCreateToGRPC(s)
	r, err := c.client.CreateNetwork(ctx, &swarmapi.CreateNetworkRequest{Spec: &networkSpec})
	if err != nil {
		return "", err
	}

	return r.Network.ID, nil
}

// RemoveNetwork removes a cluster network.
func (c *Cluster) RemoveNetwork(input string) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	network, err := getNetwork(ctx, c.client, input)
	if err != nil {
		return err
	}

	if _, err := c.client.RemoveNetwork(ctx, &swarmapi.RemoveNetworkRequest{NetworkID: network.ID}); err != nil {
		return err
	}
	return nil
}

func (c *Cluster) populateNetworkID(ctx context.Context, client swarmapi.ControlClient, s *types.ServiceSpec) error {
	for i, n := range s.Networks {
		apiNetwork, err := getNetwork(ctx, client, n.Target)
		if err != nil {
			if ln, _ := c.config.Backend.FindNetwork(n.Target); ln != nil && !ln.Info().Dynamic() {
				err = fmt.Errorf("network %s is not eligible for docker services", ln.Name())
				return errors.NewRequestForbiddenError(err)
			}
			return err
		}
		s.Networks[i].Target = apiNetwork.ID
	}
	return nil
}

func getNetwork(ctx context.Context, c swarmapi.ControlClient, input string) (*swarmapi.Network, error) {
	// GetNetwork to match via full ID.
	rg, err := c.GetNetwork(ctx, &swarmapi.GetNetworkRequest{NetworkID: input})
	if err != nil {
		// If any error (including NotFound), ListNetworks to match via ID prefix and full name.
		rl, err := c.ListNetworks(ctx, &swarmapi.ListNetworksRequest{Filters: &swarmapi.ListNetworksRequest_Filters{Names: []string{input}}})
		if err != nil || len(rl.Networks) == 0 {
			rl, err = c.ListNetworks(ctx, &swarmapi.ListNetworksRequest{Filters: &swarmapi.ListNetworksRequest_Filters{IDPrefixes: []string{input}}})
		}

		if err != nil {
			return nil, err
		}

		if len(rl.Networks) == 0 {
			return nil, fmt.Errorf("network %s not found", input)
		}

		if l := len(rl.Networks); l > 1 {
			return nil, fmt.Errorf("network %s is ambiguous (%d matches found)", input, l)
		}

		return rl.Networks[0], nil
	}
	return rg.Network, nil
}

// Cleanup stops active swarm node. This is run before daemon shutdown.
func (c *Cluster) Cleanup() {
	c.Lock()
	node := c.node
	if node == nil {
		c.Unlock()
		return
	}
	defer c.Unlock()
	if c.isActiveManager() {
		active, reachable, unreachable, err := c.managerStats()
		if err == nil {
			singlenode := active && reachable == 1 && unreachable == 0
			if active && !singlenode && reachable-2 <= unreachable {
				logrus.Errorf("Leaving cluster with %v managers left out of %v. Raft quorum will be lost.", reachable-1, reachable+unreachable)
			}
		}
	}
	c.stopNode()
}

func (c *Cluster) managerStats() (current bool, reachable int, unreachable int, err error) {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	nodes, err := c.client.ListNodes(ctx, &swarmapi.ListNodesRequest{})
	if err != nil {
		return false, 0, 0, err
	}
	for _, n := range nodes.Nodes {
		if n.ManagerStatus != nil {
			if n.ManagerStatus.Reachability == swarmapi.RaftMemberStatus_REACHABLE {
				reachable++
				if n.ID == c.node.NodeID() {
					current = true
				}
			}
			if n.ManagerStatus.Reachability == swarmapi.RaftMemberStatus_UNREACHABLE {
				unreachable++
			}
		}
	}
	return
}

func validateAndSanitizeInitRequest(req *types.InitRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}

	spec := &req.Spec
	// provide sane defaults instead of erroring
	if spec.Name == "" {
		spec.Name = "default"
	}
	if spec.Raft.SnapshotInterval == 0 {
		spec.Raft.SnapshotInterval = defaultSpec.Raft.SnapshotInterval
	}
	if spec.Raft.LogEntriesForSlowFollowers == 0 {
		spec.Raft.LogEntriesForSlowFollowers = defaultSpec.Raft.LogEntriesForSlowFollowers
	}
	if spec.Raft.ElectionTick == 0 {
		spec.Raft.ElectionTick = defaultSpec.Raft.ElectionTick
	}
	if spec.Raft.HeartbeatTick == 0 {
		spec.Raft.HeartbeatTick = defaultSpec.Raft.HeartbeatTick
	}
	if spec.Dispatcher.HeartbeatPeriod == 0 {
		spec.Dispatcher.HeartbeatPeriod = defaultSpec.Dispatcher.HeartbeatPeriod
	}
	if spec.CAConfig.NodeCertExpiry == 0 {
		spec.CAConfig.NodeCertExpiry = defaultSpec.CAConfig.NodeCertExpiry
	}
	if spec.Orchestration.TaskHistoryRetentionLimit == 0 {
		spec.Orchestration.TaskHistoryRetentionLimit = defaultSpec.Orchestration.TaskHistoryRetentionLimit
	}
	return nil
}

func validateAndSanitizeJoinRequest(req *types.JoinRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}
	if len(req.RemoteAddrs) == 0 {
		return fmt.Errorf("at least 1 RemoteAddr is required to join")
	}
	for i := range req.RemoteAddrs {
		req.RemoteAddrs[i], err = validateAddr(req.RemoteAddrs[i])
		if err != nil {
			return fmt.Errorf("invalid remoteAddr %q: %v", req.RemoteAddrs[i], err)
		}
	}
	return nil
}

func validateAddr(addr string) (string, error) {
	if addr == "" {
		return addr, fmt.Errorf("invalid empty address")
	}
	newaddr, err := opts.ParseTCPAddr(addr, defaultAddr)
	if err != nil {
		return addr, nil
	}
	return strings.TrimPrefix(newaddr, "tcp://"), nil
}

func initClusterSpec(node *node, spec types.Spec) error {
	ctx, _ := context.WithTimeout(context.Background(), 5*time.Second)
	for conn := range node.ListenControlSocket(ctx) {
		if ctx.Err() != nil {
			return ctx.Err()
		}
		if conn != nil {
			client := swarmapi.NewControlClient(conn)
			var cluster *swarmapi.Cluster
			for i := 0; ; i++ {
				lcr, err := client.ListClusters(ctx, &swarmapi.ListClustersRequest{})
				if err != nil {
					return fmt.Errorf("error on listing clusters: %v", err)
				}
				if len(lcr.Clusters) == 0 {
					if i < 10 {
						time.Sleep(200 * time.Millisecond)
						continue
					}
					return fmt.Errorf("empty list of clusters was returned")
				}
				cluster = lcr.Clusters[0]
				break
			}
			newspec, err := convert.SwarmSpecToGRPC(spec)
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			_, err = client.UpdateCluster(ctx, &swarmapi.UpdateClusterRequest{
				ClusterID:      cluster.ID,
				ClusterVersion: &cluster.Meta.Version,
				Spec:           &newspec,
			})
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			return nil
		}
	}
	return ctx.Err()
}
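
// Rough lifecycle of this component as driven by the daemon (an illustrative sketch;
// the real call sites live in the daemon and API router, and daemonBackend here stands
// for a hypothetical executorpkg.Backend implementation):
//
//	c, err := cluster.New(cluster.Config{Root: "/var/lib/docker", Backend: daemonBackend})
//	...
//	nodeID, err := c.Init(types.InitRequest{ListenAddr: "0.0.0.0:2377"})
//	...
//	c.Cleanup() // on daemon shutdown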