github.com/dpiddy/docker@v1.12.2-rc1/daemon/cluster/cluster.go

package cluster

import (
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"google.golang.org/grpc"

	"github.com/Sirupsen/logrus"
	"github.com/docker/docker/daemon/cluster/convert"
	executorpkg "github.com/docker/docker/daemon/cluster/executor"
	"github.com/docker/docker/daemon/cluster/executor/container"
	"github.com/docker/docker/errors"
	"github.com/docker/docker/opts"
	"github.com/docker/docker/pkg/ioutils"
	"github.com/docker/docker/runconfig"
	apitypes "github.com/docker/engine-api/types"
	"github.com/docker/engine-api/types/filters"
	types "github.com/docker/engine-api/types/swarm"
	swarmagent "github.com/docker/swarmkit/agent"
	swarmapi "github.com/docker/swarmkit/api"
	"golang.org/x/net/context"
)

const swarmDirName = "swarm"
const controlSocket = "control.sock"
const swarmConnectTimeout = 20 * time.Second
const swarmRequestTimeout = 20 * time.Second
const stateFile = "docker-state.json"
const defaultAddr = "0.0.0.0:2377"

const (
	initialReconnectDelay = 100 * time.Millisecond
	maxReconnectDelay     = 30 * time.Second
)

// ErrNoSwarm is returned on leaving a cluster that was never initialized
var ErrNoSwarm = fmt.Errorf("This node is not part of a swarm")

// ErrSwarmExists is returned on initialize or join request for a cluster that has already been activated
var ErrSwarmExists = fmt.Errorf("This node is already part of a swarm. Use \"docker swarm leave\" to leave this swarm and join another one.")

// ErrPendingSwarmExists is returned on initialize or join request for a cluster that is already processing a similar request but has not succeeded yet.
var ErrPendingSwarmExists = fmt.Errorf("This node is processing an existing join request that has not succeeded yet. Use \"docker swarm leave\" to cancel the current request.")

// ErrSwarmJoinTimeoutReached is returned when cluster join could not complete before timeout was reached.
var ErrSwarmJoinTimeoutReached = fmt.Errorf("Timeout was reached before node was joined. The attempt to join the swarm will continue in the background. Use the \"docker info\" command to see the current swarm status of your node.")

// defaultSpec contains some sane defaults if cluster options are missing on init
var defaultSpec = types.Spec{
	Raft: types.RaftConfig{
		SnapshotInterval:           10000,
		KeepOldSnapshots:           0,
		LogEntriesForSlowFollowers: 500,
		HeartbeatTick:              1,
		ElectionTick:               3,
	},
	CAConfig: types.CAConfig{
		NodeCertExpiry: 90 * 24 * time.Hour,
	},
	Dispatcher: types.DispatcherConfig{
		HeartbeatPeriod: uint64((5 * time.Second).Nanoseconds()),
	},
	Orchestration: types.OrchestrationConfig{
		TaskHistoryRetentionLimit: 10,
	},
}

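// state is the subset of the node's configuration that is persisted to
// docker-state.json (see saveState/loadState) so the daemon can restart its
// swarm node with the same addresses after a daemon restart.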
type state struct {
	// LocalAddr is this machine's local IP or hostname, if specified.
	LocalAddr string
	// RemoteAddr is the address that was given to "swarm join". It is used
	// to find LocalAddr if necessary.
	RemoteAddr string
	// ListenAddr is the address we bind to, including a port.
	ListenAddr string
	// AdvertiseAddr is the address other nodes should connect to,
	// including a port.
	AdvertiseAddr string
}

// NetworkSubnetsProvider exposes functions for retrieving the subnets
// of networks managed by Docker, so they can be filtered.
type NetworkSubnetsProvider interface {
	V4Subnets() []net.IPNet
	V6Subnets() []net.IPNet
}

// Config provides values for Cluster.
type Config struct {
	Root                   string
	Name                   string
	Backend                executorpkg.Backend
	NetworkSubnetsProvider NetworkSubnetsProvider

	// DefaultAdvertiseAddr is the default host/IP or network interface to use
	// if no AdvertiseAddr value is specified.
	DefaultAdvertiseAddr string
}

// Cluster provides capabilities to participate in a cluster as a worker or a
// manager.
type Cluster struct {
	sync.RWMutex
	*node
	root            string
	config          Config
	configEvent     chan struct{} // todo: make this array and goroutine safe
	localAddr       string
	actualLocalAddr string // after resolution, not persisted
	remoteAddr      string
	listenAddr      string
	advertiseAddr   string
	stop            bool
	err             error
	cancelDelay     func()
}

type node struct {
	*swarmagent.Node
	done           chan struct{}
	ready          bool
	conn           *grpc.ClientConn
	client         swarmapi.ControlClient
	reconnectDelay time.Duration
}

// New creates a new Cluster instance using provided config.
func New(config Config) (*Cluster, error) {
	root := filepath.Join(config.Root, swarmDirName)
	if err := os.MkdirAll(root, 0700); err != nil {
		return nil, err
	}
	c := &Cluster{
		root:        root,
		config:      config,
		configEvent: make(chan struct{}, 10),
	}

	st, err := c.loadState()
	if err != nil {
		if os.IsNotExist(err) {
			return c, nil
		}
		return nil, err
	}

	n, err := c.startNewNode(false, st.LocalAddr, st.RemoteAddr, st.ListenAddr, st.AdvertiseAddr, "", "")
	if err != nil {
		return nil, err
	}

	select {
	case <-time.After(swarmConnectTimeout):
		logrus.Errorf("swarm component could not be started before timeout was reached")
	case <-n.Ready():
	case <-n.done:
		return nil, fmt.Errorf("swarm component could not be started: %v", c.err)
	}
	go c.reconnectOnFailure(n)
	return c, nil
}

func (c *Cluster) loadState() (*state, error) {
	dt, err := ioutil.ReadFile(filepath.Join(c.root, stateFile))
	if err != nil {
		return nil, err
	}
	// missing certificate means no actual state to restore from
	if _, err := os.Stat(filepath.Join(c.root, "certificates/swarm-node.crt")); err != nil {
		if os.IsNotExist(err) {
			c.clearState()
		}
		return nil, err
	}
	var st state
	if err := json.Unmarshal(dt, &st); err != nil {
		return nil, err
	}
	return &st, nil
}

func (c *Cluster) saveState() error {
	dt, err := json.Marshal(state{
		LocalAddr:     c.localAddr,
		RemoteAddr:    c.remoteAddr,
		ListenAddr:    c.listenAddr,
		AdvertiseAddr: c.advertiseAddr,
	})
	if err != nil {
		return err
	}
	return ioutils.AtomicWriteFile(filepath.Join(c.root, stateFile), dt, 0600)
}

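// reconnectOnFailure restarts the swarm node whenever it stops unexpectedly,
// doubling the delay between attempts up to maxReconnectDelay. It returns as
// soon as the cluster is being stopped, the delay is canceled, or another
// node has already been started.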
func (c *Cluster) reconnectOnFailure(n *node) {
	for {
		<-n.done
		c.Lock()
		if c.stop || c.node != nil {
			c.Unlock()
			return
		}
		n.reconnectDelay *= 2
		if n.reconnectDelay > maxReconnectDelay {
			n.reconnectDelay = maxReconnectDelay
		}
		logrus.Warnf("Restarting swarm in %.2f seconds", n.reconnectDelay.Seconds())
		delayCtx, cancel := context.WithTimeout(context.Background(), n.reconnectDelay)
		c.cancelDelay = cancel
		c.Unlock()
		<-delayCtx.Done()
		if delayCtx.Err() != context.DeadlineExceeded {
			return
		}
		c.Lock()
		if c.node != nil {
			c.Unlock()
			return
		}
		var err error
		n, err = c.startNewNode(false, c.localAddr, c.getRemoteAddress(), c.listenAddr, c.advertiseAddr, c.getRemoteAddress(), "")
		if err != nil {
			c.err = err
			close(n.done)
		}
		c.Unlock()
	}
}

func (c *Cluster) startNewNode(forceNewCluster bool, localAddr, remoteAddr, listenAddr, advertiseAddr, joinAddr, joinToken string) (*node, error) {
	if err := c.config.Backend.IsSwarmCompatible(); err != nil {
		return nil, err
	}

	actualLocalAddr := localAddr
	if actualLocalAddr == "" {
		// If localAddr was not specified, resolve it automatically
		// based on the route to joinAddr. localAddr can only be left
		// empty on "join".
		listenHost, _, err := net.SplitHostPort(listenAddr)
		if err != nil {
			return nil, fmt.Errorf("could not parse listen address: %v", err)
		}

		listenAddrIP := net.ParseIP(listenHost)
		if listenAddrIP == nil || !listenAddrIP.IsUnspecified() {
			actualLocalAddr = listenHost
		} else {
			if remoteAddr == "" {
				// Should never happen except using swarms created by
				// old versions that didn't save remoteAddr.
				remoteAddr = "8.8.8.8:53"
			}
			conn, err := net.Dial("udp", remoteAddr)
			if err != nil {
				return nil, fmt.Errorf("could not find local IP address: %v", err)
			}
			localHostPort := conn.LocalAddr().String()
			actualLocalAddr, _, _ = net.SplitHostPort(localHostPort)
			conn.Close()
		}
	}

	c.node = nil
	c.cancelDelay = nil
	c.stop = false
	n, err := swarmagent.NewNode(&swarmagent.NodeConfig{
		Hostname:           c.config.Name,
		ForceNewCluster:    forceNewCluster,
		ListenControlAPI:   filepath.Join(c.root, controlSocket),
		ListenRemoteAPI:    listenAddr,
		AdvertiseRemoteAPI: advertiseAddr,
		JoinAddr:           joinAddr,
		StateDir:           c.root,
		JoinToken:          joinToken,
		Executor:           container.NewExecutor(c.config.Backend),
		HeartbeatTick:      1,
		ElectionTick:       3,
	})
	if err != nil {
		return nil, err
	}
	ctx := context.Background()
	if err := n.Start(ctx); err != nil {
		return nil, err
	}
	node := &node{
		Node:           n,
		done:           make(chan struct{}),
		reconnectDelay: initialReconnectDelay,
	}
	c.node = node
	c.localAddr = localAddr
	c.actualLocalAddr = actualLocalAddr // not saved
	c.remoteAddr = remoteAddr
	c.listenAddr = listenAddr
	c.advertiseAddr = advertiseAddr
	c.saveState()

	c.config.Backend.SetClusterProvider(c)
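
	// The goroutines below track the node for its lifetime: the first records
	// the exit error and closes node.done, the second marks the node ready
	// once swarmkit reports readiness, and the third swaps the control client
	// whenever the local control-socket connection changes.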
	go func() {
		err := n.Err(ctx)
		if err != nil {
			logrus.Errorf("cluster exited with error: %v", err)
		}
		c.Lock()
		c.node = nil
		c.err = err
		c.Unlock()
		close(node.done)
	}()

	go func() {
		select {
		case <-n.Ready():
			c.Lock()
			node.ready = true
			c.err = nil
			c.Unlock()
		case <-ctx.Done():
		}
		c.configEvent <- struct{}{}
	}()

	go func() {
		for conn := range n.ListenControlSocket(ctx) {
			c.Lock()
			if node.conn != conn {
				if conn == nil {
					node.client = nil
				} else {
					node.client = swarmapi.NewControlClient(conn)
				}
			}
			node.conn = conn
			c.Unlock()
			c.configEvent <- struct{}{}
		}
	}()

	return node, nil
}

// Init initializes a new cluster from a user-provided request.
func (c *Cluster) Init(req types.InitRequest) (string, error) {
	c.Lock()
	if node := c.node; node != nil {
		if !req.ForceNewCluster {
			c.Unlock()
			return "", ErrSwarmExists
		}
		if err := c.stopNode(); err != nil {
			c.Unlock()
			return "", err
		}
	}

	if err := validateAndSanitizeInitRequest(&req); err != nil {
		c.Unlock()
		return "", err
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		c.Unlock()
		return "", err
	}

	advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
	if err != nil {
		c.Unlock()
		return "", err
	}

	localAddr := listenHost

	// If the advertise address is not one of the system's
	// addresses, we also require a listen address.
	listenAddrIP := net.ParseIP(listenHost)
	if listenAddrIP != nil && listenAddrIP.IsUnspecified() {
		advertiseIP := net.ParseIP(advertiseHost)
		if advertiseIP == nil {
			// not an IP
			c.Unlock()
			return "", errMustSpecifyListenAddr
		}

		systemIPs := listSystemIPs()

		found := false
		for _, systemIP := range systemIPs {
			if systemIP.Equal(advertiseIP) {
				found = true
				break
			}
		}
		if !found {
			c.Unlock()
			return "", errMustSpecifyListenAddr
		}
		localAddr = advertiseIP.String()
	}

	// todo: check current state existing
	n, err := c.startNewNode(req.ForceNewCluster, localAddr, "", net.JoinHostPort(listenHost, listenPort), net.JoinHostPort(advertiseHost, advertisePort), "", "")
	if err != nil {
		c.Unlock()
		return "", err
	}
	c.Unlock()

	select {
	case <-n.Ready():
		if err := initClusterSpec(n, req.Spec); err != nil {
			return "", err
		}
		go c.reconnectOnFailure(n)
		return n.NodeID(), nil
	case <-n.done:
		c.RLock()
		defer c.RUnlock()
		if !req.ForceNewCluster { // if failure on first attempt don't keep state
			if err := c.clearState(); err != nil {
				return "", err
			}
		}
		return "", c.err
	}
}

// Join makes the current Cluster part of an existing swarm cluster.
func (c *Cluster) Join(req types.JoinRequest) error {
	c.Lock()
	if node := c.node; node != nil {
		c.Unlock()
		return ErrSwarmExists
	}
	if err := validateAndSanitizeJoinRequest(&req); err != nil {
		c.Unlock()
		return err
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		c.Unlock()
		return err
	}

	var advertiseAddr string
	if req.AdvertiseAddr != "" {
		advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
		// For joining, we don't need to provide an advertise address,
		// since the remote side can detect it.
		if err == nil {
			advertiseAddr = net.JoinHostPort(advertiseHost, advertisePort)
		}
	}

	// todo: check current state existing
	n, err := c.startNewNode(false, "", req.RemoteAddrs[0], net.JoinHostPort(listenHost, listenPort), advertiseAddr, req.RemoteAddrs[0], req.JoinToken)
	if err != nil {
		c.Unlock()
		return err
	}
	c.Unlock()

	select {
	case <-time.After(swarmConnectTimeout):
		// attempt to connect will continue in background, also reconnecting
		go c.reconnectOnFailure(n)
		return ErrSwarmJoinTimeoutReached
	case <-n.Ready():
		go c.reconnectOnFailure(n)
		return nil
	case <-n.done:
		c.RLock()
		defer c.RUnlock()
		return c.err
	}
}

// stopNode is a helper that stops the active c.node and waits until it has
// shut down. Call while keeping the cluster lock.
func (c *Cluster) stopNode() error {
	if c.node == nil {
		return nil
	}
	c.stop = true
	if c.cancelDelay != nil {
		c.cancelDelay()
		c.cancelDelay = nil
	}
	node := c.node
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	// TODO: can't hold lock on stop because it calls back to network
	c.Unlock()
	defer c.Lock()
	if err := node.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
		return err
	}
	<-node.done
	return nil
}

// Leave shuts down the Cluster and removes current state.
func (c *Cluster) Leave(force bool) error {
	c.Lock()
	node := c.node
	if node == nil {
		c.Unlock()
		return ErrNoSwarm
	}

	if node.Manager() != nil && !force {
		msg := "You are attempting to leave the swarm on a node that is participating as a manager. "
		if c.isActiveManager() {
			active, reachable, unreachable, err := c.managerStats()
			if err == nil {
				if active && reachable-2 <= unreachable {
					if reachable == 1 && unreachable == 0 {
						msg += "Removing the last manager erases all current state of the swarm. Use `--force` to ignore this message. "
						c.Unlock()
						return fmt.Errorf(msg)
					}
					msg += fmt.Sprintf("Removing this node leaves %v managers out of %v. Without a Raft quorum your swarm will be inaccessible. ", reachable-1, reachable+unreachable)
				}
			}
		} else {
			msg += "Doing so may lose the consensus of your cluster. "
		}

		msg += "The only way to restore a swarm that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to suppress this message."
		c.Unlock()
		return fmt.Errorf(msg)
	}
	if err := c.stopNode(); err != nil {
		c.Unlock()
		return err
	}
	c.Unlock()
	if nodeID := node.NodeID(); nodeID != "" {
		for _, id := range c.config.Backend.ListContainersForNode(nodeID) {
			if err := c.config.Backend.ContainerRm(id, &apitypes.ContainerRmConfig{ForceRemove: true}); err != nil {
				logrus.Errorf("error removing %v: %v", id, err)
			}
		}
	}
	c.configEvent <- struct{}{}
	// todo: cleanup optional?
	if err := c.clearState(); err != nil {
		return err
	}
	return nil
}

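// clearState removes the swarm state directory, recreates it empty, and
// detaches the cluster provider from the backend.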
func (c *Cluster) clearState() error {
	// todo: backup this data instead of removing?
	if err := os.RemoveAll(c.root); err != nil {
		return err
	}
	if err := os.MkdirAll(c.root, 0700); err != nil {
		return err
	}
	c.config.Backend.SetClusterProvider(nil)
	return nil
}

func (c *Cluster) getRequestContext() (context.Context, func()) { // TODO: not needed when requests don't block on quorum lost
	return context.WithTimeout(context.Background(), swarmRequestTimeout)
}

// Inspect retrieves the configuration properties of a managed swarm cluster.
func (c *Cluster) Inspect() (types.Swarm, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return types.Swarm{}, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	swarm, err := getSwarm(ctx, c.client)
	if err != nil {
		return types.Swarm{}, err
	}

	return convert.SwarmFromGRPC(*swarm), nil
}

// Update updates configuration of a managed swarm cluster.
func (c *Cluster) Update(version uint64, spec types.Spec, flags types.UpdateFlags) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	swarm, err := getSwarm(ctx, c.client)
	if err != nil {
		return err
	}

	swarmSpec, err := convert.SwarmSpecToGRPC(spec)
	if err != nil {
		return err
	}

	_, err = c.client.UpdateCluster(
		ctx,
		&swarmapi.UpdateClusterRequest{
			ClusterID: swarm.ID,
			Spec:      &swarmSpec,
			ClusterVersion: &swarmapi.Version{
				Index: version,
			},
			Rotation: swarmapi.JoinTokenRotation{
				RotateWorkerToken:  flags.RotateWorkerToken,
				RotateManagerToken: flags.RotateManagerToken,
			},
		},
	)
	return err
}

// IsManager returns true if Cluster is participating as a manager.
func (c *Cluster) IsManager() bool {
	c.RLock()
	defer c.RUnlock()
	return c.isActiveManager()
}

// IsAgent returns true if Cluster is participating as a worker/agent.
func (c *Cluster) IsAgent() bool {
	c.RLock()
	defer c.RUnlock()
	return c.node != nil && c.ready
}

// GetLocalAddress returns the local address.
func (c *Cluster) GetLocalAddress() string {
	c.RLock()
	defer c.RUnlock()
	return c.actualLocalAddr
}

// GetListenAddress returns the listen address.
func (c *Cluster) GetListenAddress() string {
	c.RLock()
	defer c.RUnlock()
	return c.listenAddr
}

// GetAdvertiseAddress returns the remotely reachable address of this node.
func (c *Cluster) GetAdvertiseAddress() string {
	c.RLock()
	defer c.RUnlock()
	if c.advertiseAddr != "" {
		advertiseHost, _, _ := net.SplitHostPort(c.advertiseAddr)
		return advertiseHost
	}
	return c.actualLocalAddr
}

// GetRemoteAddress returns a known advertise address of a remote manager if
// available.
// todo: change to array/connect with info
func (c *Cluster) GetRemoteAddress() string {
	c.RLock()
	defer c.RUnlock()
	return c.getRemoteAddress()
}

func (c *Cluster) getRemoteAddress() string {
	if c.node == nil {
		return ""
	}
	nodeID := c.node.NodeID()
	for _, r := range c.node.Remotes() {
		if r.NodeID != nodeID {
			return r.Addr
		}
	}
	return ""
}

// ListenClusterEvents returns a channel that receives messages on cluster
// participation changes.
// todo: make cancelable and accessible to multiple callers
func (c *Cluster) ListenClusterEvents() <-chan struct{} {
	return c.configEvent
}

// Info returns information about the current cluster state.
func (c *Cluster) Info() types.Info {
	info := types.Info{
		NodeAddr: c.GetAdvertiseAddress(),
	}

	c.RLock()
	defer c.RUnlock()

	if c.node == nil {
		info.LocalNodeState = types.LocalNodeStateInactive
		if c.cancelDelay != nil {
			info.LocalNodeState = types.LocalNodeStateError
		}
	} else {
		info.LocalNodeState = types.LocalNodeStatePending
		if c.ready {
			info.LocalNodeState = types.LocalNodeStateActive
		}
	}
	if c.err != nil {
		info.Error = c.err.Error()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	if c.isActiveManager() {
		info.ControlAvailable = true
		swarm, err := c.Inspect()
		if err != nil {
			info.Error = err.Error()
		}

		// Strip JoinTokens
		info.Cluster = swarm.ClusterInfo

		if r, err := c.client.ListNodes(ctx, &swarmapi.ListNodesRequest{}); err == nil {
			info.Nodes = len(r.Nodes)
			for _, n := range r.Nodes {
				if n.ManagerStatus != nil {
					info.Managers = info.Managers + 1
				}
			}
		}
	}

	if c.node != nil {
		for _, r := range c.node.Remotes() {
			info.RemoteManagers = append(info.RemoteManagers, types.Peer{NodeID: r.NodeID, Addr: r.Addr})
		}
		info.NodeID = c.node.NodeID()
	}

	return info
}

// isActiveManager should not be called without a read lock
func (c *Cluster) isActiveManager() bool {
	return c.node != nil && c.conn != nil
}

// errNoManager returns an error describing why manager commands can't be used.
// Call with read lock.
func (c *Cluster) errNoManager() error {
	if c.node == nil {
		return fmt.Errorf("This node is not a swarm manager. Use \"docker swarm init\" or \"docker swarm join\" to connect this node to swarm and try again.")
	}
	if c.node.Manager() != nil {
		return fmt.Errorf("This node is not a swarm manager. Manager is being prepared or has trouble connecting to the cluster.")
	}
	return fmt.Errorf("This node is not a swarm manager. Worker nodes can't be used to view or modify cluster state. Please run this command on a manager node or promote the current node to a manager.")
}

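// The accessors below follow a common pattern for manager-only requests: take
// the read lock, verify that this node is an active manager, and issue the
// gRPC call with a context bounded by swarmRequestTimeout.
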
// GetServices returns all services of a managed swarm cluster.
func (c *Cluster) GetServices(options apitypes.ServiceListOptions) ([]types.Service, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	filters, err := newListServicesFilters(options.Filter)
	if err != nil {
		return nil, err
	}
	ctx, cancel := c.getRequestContext()
	defer cancel()

	r, err := c.client.ListServices(
		ctx,
		&swarmapi.ListServicesRequest{Filters: filters})
	if err != nil {
		return nil, err
	}

	services := []types.Service{}

	for _, service := range r.Services {
		services = append(services, convert.ServiceFromGRPC(*service))
	}

	return services, nil
}

// CreateService creates a new service in a managed swarm cluster.
func (c *Cluster) CreateService(s types.ServiceSpec, encodedAuth string) (string, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return "", c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	err := c.populateNetworkID(ctx, c.client, &s)
	if err != nil {
		return "", err
	}

	serviceSpec, err := convert.ServiceSpecToGRPC(s)
	if err != nil {
		return "", err
	}

	if encodedAuth != "" {
		ctnr := serviceSpec.Task.GetContainer()
		if ctnr == nil {
			return "", fmt.Errorf("service does not use container tasks")
		}
		ctnr.PullOptions = &swarmapi.ContainerSpec_PullOptions{RegistryAuth: encodedAuth}
	}

	r, err := c.client.CreateService(ctx, &swarmapi.CreateServiceRequest{Spec: &serviceSpec})
	if err != nil {
		return "", err
	}

	return r.Service.ID, nil
}

// GetService returns a service based on an ID or name.
func (c *Cluster) GetService(input string) (types.Service, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return types.Service{}, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	service, err := getService(ctx, c.client, input)
	if err != nil {
		return types.Service{}, err
	}
	return convert.ServiceFromGRPC(*service), nil
}

// UpdateService updates an existing service to match new properties.
func (c *Cluster) UpdateService(serviceID string, version uint64, spec types.ServiceSpec, encodedAuth string) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	err := c.populateNetworkID(ctx, c.client, &spec)
	if err != nil {
		return err
	}

	serviceSpec, err := convert.ServiceSpecToGRPC(spec)
	if err != nil {
		return err
	}

	if encodedAuth != "" {
		ctnr := serviceSpec.Task.GetContainer()
		if ctnr == nil {
			return fmt.Errorf("service does not use container tasks")
		}
		ctnr.PullOptions = &swarmapi.ContainerSpec_PullOptions{RegistryAuth: encodedAuth}
	} else {
		// If encodedAuth isn't being updated, keep the registry auth that is
		// already present rather than losing it.
		currentService, err := getService(ctx, c.client, serviceID)
		if err != nil {
			return err
		}
		ctnr := currentService.Spec.Task.GetContainer()
		if ctnr == nil {
			return fmt.Errorf("service does not use container tasks")
		}
		serviceSpec.Task.GetContainer().PullOptions = ctnr.PullOptions
	}

	_, err = c.client.UpdateService(
		ctx,
		&swarmapi.UpdateServiceRequest{
			ServiceID: serviceID,
			Spec:      &serviceSpec,
			ServiceVersion: &swarmapi.Version{
				Index: version,
			},
		},
	)
	return err
}

// RemoveService removes a service from a managed swarm cluster.
func (c *Cluster) RemoveService(input string) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	service, err := getService(ctx, c.client, input)
	if err != nil {
		return err
	}

	if _, err := c.client.RemoveService(ctx, &swarmapi.RemoveServiceRequest{ServiceID: service.ID}); err != nil {
		return err
	}
	return nil
}

// GetNodes returns a list of all nodes known to a cluster.
func (c *Cluster) GetNodes(options apitypes.NodeListOptions) ([]types.Node, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	filters, err := newListNodesFilters(options.Filter)
	if err != nil {
		return nil, err
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	r, err := c.client.ListNodes(
		ctx,
		&swarmapi.ListNodesRequest{Filters: filters})
	if err != nil {
		return nil, err
	}

	nodes := []types.Node{}

	for _, node := range r.Nodes {
		nodes = append(nodes, convert.NodeFromGRPC(*node))
	}
	return nodes, nil
}

// GetNode returns a node based on an ID or name.
func (c *Cluster) GetNode(input string) (types.Node, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return types.Node{}, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	node, err := getNode(ctx, c.client, input)
	if err != nil {
		return types.Node{}, err
	}
	return convert.NodeFromGRPC(*node), nil
}

// UpdateNode updates an existing node's properties.
func (c *Cluster) UpdateNode(nodeID string, version uint64, spec types.NodeSpec) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	nodeSpec, err := convert.NodeSpecToGRPC(spec)
	if err != nil {
		return err
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	_, err = c.client.UpdateNode(
		ctx,
		&swarmapi.UpdateNodeRequest{
			NodeID: nodeID,
			Spec:   &nodeSpec,
			NodeVersion: &swarmapi.Version{
				Index: version,
			},
		},
	)
	return err
}

// RemoveNode removes a node from a cluster.
func (c *Cluster) RemoveNode(input string, force bool) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	node, err := getNode(ctx, c.client, input)
	if err != nil {
		return err
	}

	if _, err := c.client.RemoveNode(ctx, &swarmapi.RemoveNodeRequest{NodeID: node.ID, Force: force}); err != nil {
		return err
	}
	return nil
}

// GetTasks returns a list of tasks matching the filter options.
func (c *Cluster) GetTasks(options apitypes.TaskListOptions) ([]types.Task, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	// byName resolves service and node filter values (names or IDs) to IDs
	// before the filters are passed to ListTasks.
	byName := func(filter filters.Args) error {
		if filter.Include("service") {
			serviceFilters := filter.Get("service")
			for _, serviceFilter := range serviceFilters {
				service, err := c.GetService(serviceFilter)
				if err != nil {
					return err
				}
				filter.Del("service", serviceFilter)
				filter.Add("service", service.ID)
			}
		}
		if filter.Include("node") {
			nodeFilters := filter.Get("node")
			for _, nodeFilter := range nodeFilters {
				node, err := c.GetNode(nodeFilter)
				if err != nil {
					return err
				}
				filter.Del("node", nodeFilter)
				filter.Add("node", node.ID)
			}
		}
		return nil
	}

	filters, err := newListTasksFilters(options.Filter, byName)
	if err != nil {
		return nil, err
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	r, err := c.client.ListTasks(
		ctx,
		&swarmapi.ListTasksRequest{Filters: filters})
	if err != nil {
		return nil, err
	}

	tasks := []types.Task{}

	for _, task := range r.Tasks {
		tasks = append(tasks, convert.TaskFromGRPC(*task))
	}
	return tasks, nil
}

// GetTask returns a task by an ID.
func (c *Cluster) GetTask(input string) (types.Task, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return types.Task{}, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	task, err := getTask(ctx, c.client, input)
	if err != nil {
		return types.Task{}, err
	}
	return convert.TaskFromGRPC(*task), nil
}

// GetNetwork returns a cluster network by an ID.
func (c *Cluster) GetNetwork(input string) (apitypes.NetworkResource, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return apitypes.NetworkResource{}, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	network, err := getNetwork(ctx, c.client, input)
	if err != nil {
		return apitypes.NetworkResource{}, err
	}
	return convert.BasicNetworkFromGRPC(*network), nil
}

// GetNetworks returns all current cluster managed networks.
func (c *Cluster) GetNetworks() ([]apitypes.NetworkResource, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	r, err := c.client.ListNetworks(ctx, &swarmapi.ListNetworksRequest{})
	if err != nil {
		return nil, err
	}

	var networks []apitypes.NetworkResource

	for _, network := range r.Networks {
		networks = append(networks, convert.BasicNetworkFromGRPC(*network))
	}

	return networks, nil
}

// CreateNetwork creates a new cluster managed network.
func (c *Cluster) CreateNetwork(s apitypes.NetworkCreateRequest) (string, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return "", c.errNoManager()
	}

	if runconfig.IsPreDefinedNetwork(s.Name) {
		err := fmt.Errorf("%s is a pre-defined network and cannot be created", s.Name)
		return "", errors.NewRequestForbiddenError(err)
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	networkSpec := convert.BasicNetworkCreateToGRPC(s)
	r, err := c.client.CreateNetwork(ctx, &swarmapi.CreateNetworkRequest{Spec: &networkSpec})
	if err != nil {
		return "", err
	}

	return r.Network.ID, nil
}

// RemoveNetwork removes a cluster network.
func (c *Cluster) RemoveNetwork(input string) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	network, err := getNetwork(ctx, c.client, input)
	if err != nil {
		return err
	}

	if _, err := c.client.RemoveNetwork(ctx, &swarmapi.RemoveNetworkRequest{NetworkID: network.ID}); err != nil {
		return err
	}
	return nil
}

func (c *Cluster) populateNetworkID(ctx context.Context, client swarmapi.ControlClient, s *types.ServiceSpec) error {
	for i, n := range s.Networks {
		apiNetwork, err := getNetwork(ctx, client, n.Target)
		if err != nil {
			if ln, _ := c.config.Backend.FindNetwork(n.Target); ln != nil && !ln.Info().Dynamic() {
				err = fmt.Errorf("network %s is not eligible for docker services", ln.Name())
				return errors.NewRequestForbiddenError(err)
			}
			return err
		}
		s.Networks[i].Target = apiNetwork.ID
	}
	return nil
}

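// getNetwork resolves input first as a full network ID, then as a name, then
// as an ID prefix, and returns an error if no network or more than one
// network matches.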
func getNetwork(ctx context.Context, c swarmapi.ControlClient, input string) (*swarmapi.Network, error) {
	// GetNetwork to match via full ID.
	rg, err := c.GetNetwork(ctx, &swarmapi.GetNetworkRequest{NetworkID: input})
	if err != nil {
		// If any error (including NotFound), ListNetworks to match via ID prefix and full name.
		rl, err := c.ListNetworks(ctx, &swarmapi.ListNetworksRequest{Filters: &swarmapi.ListNetworksRequest_Filters{Names: []string{input}}})
		if err != nil || len(rl.Networks) == 0 {
			rl, err = c.ListNetworks(ctx, &swarmapi.ListNetworksRequest{Filters: &swarmapi.ListNetworksRequest_Filters{IDPrefixes: []string{input}}})
		}

		if err != nil {
			return nil, err
		}

		if len(rl.Networks) == 0 {
			return nil, fmt.Errorf("network %s not found", input)
		}

		if l := len(rl.Networks); l > 1 {
			return nil, fmt.Errorf("network %s is ambiguous (%d matches found)", input, l)
		}

		return rl.Networks[0], nil
	}
	return rg.Network, nil
}

// Cleanup stops the active swarm node. This is run before daemon shutdown.
func (c *Cluster) Cleanup() {
	c.Lock()
	node := c.node
	if node == nil {
		c.Unlock()
		return
	}
	defer c.Unlock()
	if c.isActiveManager() {
		active, reachable, unreachable, err := c.managerStats()
		if err == nil {
			singlenode := active && reachable == 1 && unreachable == 0
			if active && !singlenode && reachable-2 <= unreachable {
				logrus.Errorf("Leaving cluster with %v managers left out of %v. Raft quorum will be lost.", reachable-1, reachable+unreachable)
			}
		}
	}
	c.stopNode()
}

func (c *Cluster) managerStats() (current bool, reachable int, unreachable int, err error) {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	nodes, err := c.client.ListNodes(ctx, &swarmapi.ListNodesRequest{})
	if err != nil {
		return false, 0, 0, err
	}
	for _, n := range nodes.Nodes {
		if n.ManagerStatus != nil {
			if n.ManagerStatus.Reachability == swarmapi.RaftMemberStatus_REACHABLE {
				reachable++
				if n.ID == c.node.NodeID() {
					current = true
				}
			}
			if n.ManagerStatus.Reachability == swarmapi.RaftMemberStatus_UNREACHABLE {
				unreachable++
			}
		}
	}
	return
}

func validateAndSanitizeInitRequest(req *types.InitRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}

	spec := &req.Spec
	// provide sane defaults instead of erroring
	if spec.Name == "" {
		spec.Name = "default"
	}
	if spec.Raft.SnapshotInterval == 0 {
		spec.Raft.SnapshotInterval = defaultSpec.Raft.SnapshotInterval
	}
	if spec.Raft.LogEntriesForSlowFollowers == 0 {
		spec.Raft.LogEntriesForSlowFollowers = defaultSpec.Raft.LogEntriesForSlowFollowers
	}
	if spec.Raft.ElectionTick == 0 {
		spec.Raft.ElectionTick = defaultSpec.Raft.ElectionTick
	}
	if spec.Raft.HeartbeatTick == 0 {
		spec.Raft.HeartbeatTick = defaultSpec.Raft.HeartbeatTick
	}
	if spec.Dispatcher.HeartbeatPeriod == 0 {
		spec.Dispatcher.HeartbeatPeriod = defaultSpec.Dispatcher.HeartbeatPeriod
	}
	if spec.CAConfig.NodeCertExpiry == 0 {
		spec.CAConfig.NodeCertExpiry = defaultSpec.CAConfig.NodeCertExpiry
	}
	if spec.Orchestration.TaskHistoryRetentionLimit == 0 {
		spec.Orchestration.TaskHistoryRetentionLimit = defaultSpec.Orchestration.TaskHistoryRetentionLimit
	}
	return nil
}

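// validateAndSanitizeJoinRequest normalizes the listen address and every
// remote address in the request, and requires at least one remote address.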
func validateAndSanitizeJoinRequest(req *types.JoinRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}
	if len(req.RemoteAddrs) == 0 {
		return fmt.Errorf("at least 1 RemoteAddr is required to join")
	}
	for i := range req.RemoteAddrs {
		req.RemoteAddrs[i], err = validateAddr(req.RemoteAddrs[i])
		if err != nil {
			return fmt.Errorf("invalid remoteAddr %q: %v", req.RemoteAddrs[i], err)
		}
	}
	return nil
}

func validateAddr(addr string) (string, error) {
	if addr == "" {
		return addr, fmt.Errorf("invalid empty address")
	}
	newaddr, err := opts.ParseTCPAddr(addr, defaultAddr)
	if err != nil {
		return addr, nil
	}
	return strings.TrimPrefix(newaddr, "tcp://"), nil
}

func initClusterSpec(node *node, spec types.Spec) error {
	ctx, _ := context.WithTimeout(context.Background(), 5*time.Second)
	for conn := range node.ListenControlSocket(ctx) {
		if ctx.Err() != nil {
			return ctx.Err()
		}
		if conn != nil {
			client := swarmapi.NewControlClient(conn)
			var cluster *swarmapi.Cluster
			for i := 0; ; i++ {
				lcr, err := client.ListClusters(ctx, &swarmapi.ListClustersRequest{})
				if err != nil {
					return fmt.Errorf("error on listing clusters: %v", err)
				}
				if len(lcr.Clusters) == 0 {
					if i < 10 {
						time.Sleep(200 * time.Millisecond)
						continue
					}
					return fmt.Errorf("empty list of clusters was returned")
				}
				cluster = lcr.Clusters[0]
				break
			}
			newspec, err := convert.SwarmSpecToGRPC(spec)
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			_, err = client.UpdateCluster(ctx, &swarmapi.UpdateClusterRequest{
				ClusterID:      cluster.ID,
				ClusterVersion: &cluster.Meta.Version,
				Spec:           &newspec,
			})
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			return nil
		}
	}
	return ctx.Err()
}