github.com/brandon-bethke-neudesic/moby@v1.13.1/daemon/cluster/cluster.go

package cluster

import (
	"crypto/x509"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"io"
	"io/ioutil"
	"net"
	"os"
	"path/filepath"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/Sirupsen/logrus"
	"github.com/docker/distribution/digest"
	distreference "github.com/docker/distribution/reference"
	apierrors "github.com/docker/docker/api/errors"
	apitypes "github.com/docker/docker/api/types"
	"github.com/docker/docker/api/types/backend"
	"github.com/docker/docker/api/types/filters"
	"github.com/docker/docker/api/types/network"
	types "github.com/docker/docker/api/types/swarm"
	"github.com/docker/docker/daemon/cluster/convert"
	executorpkg "github.com/docker/docker/daemon/cluster/executor"
	"github.com/docker/docker/daemon/cluster/executor/container"
	"github.com/docker/docker/daemon/logger"
	"github.com/docker/docker/opts"
	"github.com/docker/docker/pkg/ioutils"
	"github.com/docker/docker/pkg/signal"
	"github.com/docker/docker/pkg/stdcopy"
	"github.com/docker/docker/reference"
	"github.com/docker/docker/runconfig"
	swarmapi "github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/manager/encryption"
	swarmnode "github.com/docker/swarmkit/node"
	"github.com/docker/swarmkit/protobuf/ptypes"
	"github.com/pkg/errors"
	"golang.org/x/net/context"
	"google.golang.org/grpc"
)

const swarmDirName = "swarm"
const controlSocket = "control.sock"
const swarmConnectTimeout = 20 * time.Second
const swarmRequestTimeout = 20 * time.Second
const stateFile = "docker-state.json"
const defaultAddr = "0.0.0.0:2377"

const (
	initialReconnectDelay = 100 * time.Millisecond
	maxReconnectDelay     = 30 * time.Second
	contextPrefix         = "com.docker.swarm"
)

// ErrNoSwarm is returned on leaving a cluster that was never initialized
var ErrNoSwarm = fmt.Errorf("This node is not part of a swarm")

// ErrSwarmExists is returned on initialize or join request for a cluster that has already been activated
var ErrSwarmExists = fmt.Errorf("This node is already part of a swarm. Use \"docker swarm leave\" to leave this swarm and join another one.")

// ErrPendingSwarmExists is returned on initialize or join request for a cluster that is already processing a similar request but has not succeeded yet.
var ErrPendingSwarmExists = fmt.Errorf("This node is processing an existing join request that has not succeeded yet. Use \"docker swarm leave\" to cancel the current request.")

// ErrSwarmJoinTimeoutReached is returned when cluster join could not complete before timeout was reached.
var ErrSwarmJoinTimeoutReached = fmt.Errorf("Timeout was reached before node was joined. The attempt to join the swarm will continue in the background. Use the \"docker info\" command to see the current swarm status of your node.")

// ErrSwarmLocked is returned if the swarm is encrypted and needs a key to unlock it.
var ErrSwarmLocked = fmt.Errorf("Swarm is encrypted and needs to be unlocked before it can be used. Please use \"docker swarm unlock\" to unlock it.")

// ErrSwarmCertificatesExpired is returned if docker was not started for the whole validity period and the certificates had no chance to renew automatically.
var ErrSwarmCertificatesExpired = errors.New("Swarm certificates have expired. To replace them, leave the swarm and join again.")

// NetworkSubnetsProvider exposes functions for retrieving the subnets
// of networks managed by Docker, so they can be filtered.
type NetworkSubnetsProvider interface {
	V4Subnets() []net.IPNet
	V6Subnets() []net.IPNet
}

// Config provides values for Cluster.
type Config struct {
	Root                   string
	Name                   string
	Backend                executorpkg.Backend
	NetworkSubnetsProvider NetworkSubnetsProvider

	// DefaultAdvertiseAddr is the default host/IP or network interface to use
	// if no AdvertiseAddr value is specified.
	DefaultAdvertiseAddr string

	// path to store runtime state, such as the swarm control socket
	RuntimeRoot string
}

// Cluster provides capabilities to participate in a cluster as a worker or a
// manager.
type Cluster struct {
	sync.RWMutex
	*node
	root            string
	runtimeRoot     string
	config          Config
	configEvent     chan struct{} // todo: make this array and goroutine safe
	actualLocalAddr string        // after resolution, not persisted
	stop            bool
	err             error
	cancelDelay     func()
	attachers       map[string]*attacher
	locked          bool
	lastNodeConfig  *nodeStartConfig
}

// attacher manages the in-memory attachment state of a container
// attachment to a global scope network managed by swarm manager. It
// helps in identifying the attachment ID via the taskID and the
// corresponding attachment configuration obtained from the manager.
type attacher struct {
	taskID           string
	config           *network.NetworkingConfig
	attachWaitCh     chan *network.NetworkingConfig
	attachCompleteCh chan struct{}
	detachWaitCh     chan struct{}
}

type node struct {
	*swarmnode.Node
	done           chan struct{}
	ready          bool
	conn           *grpc.ClientConn
	client         swarmapi.ControlClient
	logs           swarmapi.LogsClient
	reconnectDelay time.Duration
	config         nodeStartConfig
}

// nodeStartConfig holds configuration needed to start a new node. Exported
// fields of this structure are saved to disk in json. Unexported fields
// contain data that shouldn't be persisted between daemon reloads.
type nodeStartConfig struct {
	// LocalAddr is this machine's local IP or hostname, if specified.
	LocalAddr string
	// RemoteAddr is the address that was given to "swarm join". It is used
	// to find LocalAddr if necessary.
	RemoteAddr string
	// ListenAddr is the address we bind to, including a port.
	ListenAddr string
	// AdvertiseAddr is the address other nodes should connect to,
	// including a port.
	AdvertiseAddr string

	joinAddr        string
	forceNewCluster bool
	joinToken       string
	lockKey         []byte
	autolock        bool
}

// New creates a new Cluster instance using provided config.
func New(config Config) (*Cluster, error) {
	root := filepath.Join(config.Root, swarmDirName)
	if err := os.MkdirAll(root, 0700); err != nil {
		return nil, err
	}
	if config.RuntimeRoot == "" {
		config.RuntimeRoot = root
	}
	if err := os.MkdirAll(config.RuntimeRoot, 0700); err != nil {
		return nil, err
	}
	c := &Cluster{
		root:        root,
		config:      config,
		configEvent: make(chan struct{}, 10),
		runtimeRoot: config.RuntimeRoot,
		attachers:   make(map[string]*attacher),
	}

	nodeConfig, err := c.loadState()
	if err != nil {
		if os.IsNotExist(err) {
			return c, nil
		}
		return nil, err
	}

	n, err := c.startNewNode(*nodeConfig)
	if err != nil {
		return nil, err
	}

	select {
	case <-time.After(swarmConnectTimeout):
		logrus.Error("swarm component could not be started before timeout was reached")
	case <-n.Ready():
	case <-n.done:
		if errors.Cause(c.err) == ErrSwarmLocked {
			return c, nil
		}
		if err, ok := errors.Cause(c.err).(x509.CertificateInvalidError); ok && err.Reason == x509.Expired {
			c.err = ErrSwarmCertificatesExpired
			return c, nil
		}
		return nil, fmt.Errorf("swarm component could not be started: %v", c.err)
	}
	go c.reconnectOnFailure(n)
	return c, nil
}

func (c *Cluster) loadState() (*nodeStartConfig, error) {
	dt, err := ioutil.ReadFile(filepath.Join(c.root, stateFile))
	if err != nil {
		return nil, err
	}
	// missing certificate means no actual state to restore from
	if _, err := os.Stat(filepath.Join(c.root, "certificates/swarm-node.crt")); err != nil {
		if os.IsNotExist(err) {
			c.clearState()
		}
		return nil, err
	}
	var st nodeStartConfig
	if err := json.Unmarshal(dt, &st); err != nil {
		return nil, err
	}
	return &st, nil
}

func (c *Cluster) saveState(config nodeStartConfig) error {
	dt, err := json.Marshal(config)
	if err != nil {
		return err
	}
	return ioutils.AtomicWriteFile(filepath.Join(c.root, stateFile), dt, 0600)
}

func (c *Cluster) reconnectOnFailure(n *node) {
	for {
		<-n.done
		c.Lock()
		if c.stop || c.node != nil {
			c.Unlock()
			return
		}
		n.reconnectDelay *= 2
		if n.reconnectDelay > maxReconnectDelay {
			n.reconnectDelay = maxReconnectDelay
		}
		logrus.Warnf("Restarting swarm in %.2f seconds", n.reconnectDelay.Seconds())
		delayCtx, cancel := context.WithTimeout(context.Background(), n.reconnectDelay)
		c.cancelDelay = cancel
		c.Unlock()
		<-delayCtx.Done()
		if delayCtx.Err() != context.DeadlineExceeded {
			return
		}
		c.Lock()
		if c.node != nil {
			c.Unlock()
			return
		}
		var err error
		config := n.config
		config.RemoteAddr = c.getRemoteAddress()
		config.joinAddr = config.RemoteAddr
		n, err = c.startNewNode(config)
		if err != nil {
			c.err = err
			close(n.done)
		}
		c.Unlock()
	}
}

func (c *Cluster) startNewNode(conf nodeStartConfig) (*node, error) {
	if err := c.config.Backend.IsSwarmCompatible(); err != nil {
		return nil, err
	}

	actualLocalAddr := conf.LocalAddr
	if actualLocalAddr == "" {
		// If localAddr was not specified, resolve it automatically
		// based on the route to joinAddr. localAddr can only be left
		// empty on "join".
		listenHost, _, err := net.SplitHostPort(conf.ListenAddr)
		if err != nil {
			return nil, fmt.Errorf("could not parse listen address: %v", err)
		}

		listenAddrIP := net.ParseIP(listenHost)
		if listenAddrIP == nil || !listenAddrIP.IsUnspecified() {
			actualLocalAddr = listenHost
		} else {
			if conf.RemoteAddr == "" {
				// Should never happen except using swarms created by
				// old versions that didn't save remoteAddr.
				conf.RemoteAddr = "8.8.8.8:53"
			}
			conn, err := net.Dial("udp", conf.RemoteAddr)
			if err != nil {
				return nil, fmt.Errorf("could not find local IP address: %v", err)
			}
			localHostPort := conn.LocalAddr().String()
			actualLocalAddr, _, _ = net.SplitHostPort(localHostPort)
			conn.Close()
		}
	}

	var control string
	if runtime.GOOS == "windows" {
		control = `\\.\pipe\` + controlSocket
	} else {
		control = filepath.Join(c.runtimeRoot, controlSocket)
	}

	c.node = nil
	c.cancelDelay = nil
	c.stop = false
	n, err := swarmnode.New(&swarmnode.Config{
		Hostname:           c.config.Name,
		ForceNewCluster:    conf.forceNewCluster,
		ListenControlAPI:   control,
		ListenRemoteAPI:    conf.ListenAddr,
		AdvertiseRemoteAPI: conf.AdvertiseAddr,
		JoinAddr:           conf.joinAddr,
		StateDir:           c.root,
		JoinToken:          conf.joinToken,
		Executor:           container.NewExecutor(c.config.Backend),
		HeartbeatTick:      1,
		ElectionTick:       3,
		UnlockKey:          conf.lockKey,
		AutoLockManagers:   conf.autolock,
		PluginGetter:       c.config.Backend.PluginGetter(),
	})

	if err != nil {
		return nil, err
	}
	ctx := context.Background()
	if err := n.Start(ctx); err != nil {
		return nil, err
	}
	node := &node{
		Node:           n,
		done:           make(chan struct{}),
		reconnectDelay: initialReconnectDelay,
		config:         conf,
	}
	c.node = node
	c.actualLocalAddr = actualLocalAddr // not saved
	c.saveState(conf)

	c.config.Backend.DaemonJoinsCluster(c)
	go func() {
		err := detectLockedError(n.Err(ctx))
		if err != nil {
			logrus.Errorf("cluster exited with error: %v", err)
		}
		c.Lock()
		c.node = nil
		c.err = err
		if errors.Cause(err) == ErrSwarmLocked {
			c.locked = true
			confClone := conf
			c.lastNodeConfig = &confClone
		}
		c.Unlock()
		close(node.done)
	}()

	go func() {
		select {
		case <-n.Ready():
			c.Lock()
			node.ready = true
			c.err = nil
			c.Unlock()
		case <-ctx.Done():
		}
		c.configEvent <- struct{}{}
	}()

	go func() {
		for conn := range n.ListenControlSocket(ctx) {
			c.Lock()
			if node.conn != conn {
				if conn == nil {
					node.client = nil
					node.logs = nil
				} else {
					node.client = swarmapi.NewControlClient(conn)
					node.logs = swarmapi.NewLogsClient(conn)
				}
			}
			node.conn = conn
			c.Unlock()
			c.configEvent <- struct{}{}
		}
	}()

	return node, nil
}

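// The helper below is an illustrative sketch, not part of the upstream file:
// it isolates the local-address discovery trick used in startNewNode above.
// Dialing a UDP "connection" to the remote manager address sends no packets,
// but it makes the kernel select the outbound interface, whose address can
// then be read back from conn.LocalAddr().
func exampleResolveOutboundAddr(remoteAddr string) (string, error) {
	conn, err := net.Dial("udp", remoteAddr) // e.g. the address given to "swarm join"
	if err != nil {
		return "", err
	}
	defer conn.Close()
	host, _, err := net.SplitHostPort(conn.LocalAddr().String())
	return host, err
}
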
// Init initializes new cluster from user provided request.
func (c *Cluster) Init(req types.InitRequest) (string, error) {
	c.Lock()
	if c.swarmExists() {
		if !req.ForceNewCluster {
			c.Unlock()
			return "", ErrSwarmExists
		}
		if err := c.stopNode(); err != nil {
			c.Unlock()
			return "", err
		}
	}

	if err := validateAndSanitizeInitRequest(&req); err != nil {
		c.Unlock()
		return "", err
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		c.Unlock()
		return "", err
	}

	advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
	if err != nil {
		c.Unlock()
		return "", err
	}

	localAddr := listenHost

	// If the local address is undetermined, the advertise address
	// will be used as local address, if it belongs to this system.
	// If the advertise address is not local, then we try to find
	// a system address to use as local address. If this fails,
	// we give up and ask user to pass the listen address.
	if net.ParseIP(localAddr).IsUnspecified() {
		advertiseIP := net.ParseIP(advertiseHost)

		found := false
		for _, systemIP := range listSystemIPs() {
			if systemIP.Equal(advertiseIP) {
				localAddr = advertiseIP.String()
				found = true
				break
			}
		}

		if !found {
			ip, err := c.resolveSystemAddr()
			if err != nil {
				c.Unlock()
				logrus.Warnf("Could not find a local address: %v", err)
				return "", errMustSpecifyListenAddr
			}
			localAddr = ip.String()
		}
	}

	// todo: check current state existing
	n, err := c.startNewNode(nodeStartConfig{
		forceNewCluster: req.ForceNewCluster,
		autolock:        req.AutoLockManagers,
		LocalAddr:       localAddr,
		ListenAddr:      net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr:   net.JoinHostPort(advertiseHost, advertisePort),
	})
	if err != nil {
		c.Unlock()
		return "", err
	}
	c.Unlock()

	select {
	case <-n.Ready():
		if err := initClusterSpec(n, req.Spec); err != nil {
			return "", err
		}
		go c.reconnectOnFailure(n)
		return n.NodeID(), nil
	case <-n.done:
		c.RLock()
		defer c.RUnlock()
		if !req.ForceNewCluster { // if failure on first attempt don't keep state
			if err := c.clearState(); err != nil {
				return "", err
			}
		}
		return "", c.err
	}
}

// Join makes current Cluster part of an existing swarm cluster.
func (c *Cluster) Join(req types.JoinRequest) error {
	c.Lock()
	if c.swarmExists() {
		c.Unlock()
		return ErrSwarmExists
	}
	if err := validateAndSanitizeJoinRequest(&req); err != nil {
		c.Unlock()
		return err
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		c.Unlock()
		return err
	}

	var advertiseAddr string
	if req.AdvertiseAddr != "" {
		advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
		// For joining, we don't need to provide an advertise address,
		// since the remote side can detect it.
		if err == nil {
			advertiseAddr = net.JoinHostPort(advertiseHost, advertisePort)
		}
	}

	// todo: check current state existing
	n, err := c.startNewNode(nodeStartConfig{
		RemoteAddr:    req.RemoteAddrs[0],
		ListenAddr:    net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr: advertiseAddr,
		joinAddr:      req.RemoteAddrs[0],
		joinToken:     req.JoinToken,
	})
	if err != nil {
		c.Unlock()
		return err
	}
	c.Unlock()

	select {
	case <-time.After(swarmConnectTimeout):
		// attempt to connect will continue in background, but reconnect only if it didn't fail
		go func() {
			select {
			case <-n.Ready():
				c.reconnectOnFailure(n)
			case <-n.done:
				logrus.Errorf("failed to join the cluster: %+v", c.err)
			}
		}()
		return ErrSwarmJoinTimeoutReached
	case <-n.Ready():
		go c.reconnectOnFailure(n)
		return nil
	case <-n.done:
		c.RLock()
		defer c.RUnlock()
		return c.err
	}
}

// GetUnlockKey returns the unlock key for the swarm.
func (c *Cluster) GetUnlockKey() (string, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return "", c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	client := swarmapi.NewCAClient(c.conn)

	r, err := client.GetUnlockKey(ctx, &swarmapi.GetUnlockKeyRequest{})
	if err != nil {
		return "", err
	}

	if len(r.UnlockKey) == 0 {
		// no key
		return "", nil
	}

	return encryption.HumanReadableKey(r.UnlockKey), nil
}

// UnlockSwarm provides a key to decrypt data that is encrypted at rest.
func (c *Cluster) UnlockSwarm(req types.UnlockRequest) error {
	c.RLock()
	if !c.isActiveManager() {
		if err := c.errNoManager(); err != ErrSwarmLocked {
			c.RUnlock()
			return err
		}
	}

	if c.node != nil || c.locked != true {
		c.RUnlock()
		return errors.New("swarm is not locked")
	}
	c.RUnlock()

	key, err := encryption.ParseHumanReadableKey(req.UnlockKey)
	if err != nil {
		return err
	}

	c.Lock()
	config := *c.lastNodeConfig
	config.lockKey = key
	n, err := c.startNewNode(config)
	if err != nil {
		c.Unlock()
		return err
	}
	c.Unlock()
	select {
	case <-n.Ready():
	case <-n.done:
		if errors.Cause(c.err) == ErrSwarmLocked {
			return errors.New("swarm could not be unlocked: invalid key provided")
		}
		return fmt.Errorf("swarm component could not be started: %v", c.err)
	}
	go c.reconnectOnFailure(n)
	return nil
}

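// Illustrative sketch (not in the upstream file): the raw unlock key that
// swarmkit returns is rendered for humans with encryption.HumanReadableKey
// (see GetUnlockKey above) and parsed back into bytes with
// encryption.ParseHumanReadableKey (see UnlockSwarm above), so the value
// printed by "docker swarm unlock-key" round-trips through these two helpers.
func exampleUnlockKeyRoundTrip(raw []byte) ([]byte, error) {
	human := encryption.HumanReadableKey(raw)
	return encryption.ParseHumanReadableKey(human)
}
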
// stopNode is a helper that stops the active c.node and waits until it has
// shut down. Call while keeping the cluster lock.
func (c *Cluster) stopNode() error {
	if c.node == nil {
		return nil
	}
	c.stop = true
	if c.cancelDelay != nil {
		c.cancelDelay()
		c.cancelDelay = nil
	}
	node := c.node
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	// TODO: can't hold lock on stop because it calls back to network
	c.Unlock()
	defer c.Lock()
	if err := node.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
		return err
	}
	<-node.done
	return nil
}

func removingManagerCausesLossOfQuorum(reachable, unreachable int) bool {
	return reachable-2 <= unreachable
}

func isLastManager(reachable, unreachable int) bool {
	return reachable == 1 && unreachable == 0
}

// Leave shuts down Cluster and removes current state.
func (c *Cluster) Leave(force bool) error {
	c.Lock()
	node := c.node
	if node == nil {
		if c.locked {
			c.locked = false
			c.lastNodeConfig = nil
			c.Unlock()
		} else if c.err == ErrSwarmCertificatesExpired {
			c.err = nil
			c.Unlock()
		} else {
			c.Unlock()
			return ErrNoSwarm
		}
	} else {
		if node.Manager() != nil && !force {
			msg := "You are attempting to leave the swarm on a node that is participating as a manager. "
			if c.isActiveManager() {
				active, reachable, unreachable, err := c.managerStats()
				if err == nil {
					if active && removingManagerCausesLossOfQuorum(reachable, unreachable) {
						if isLastManager(reachable, unreachable) {
							msg += "Removing the last manager erases all current state of the swarm. Use `--force` to ignore this message. "
							c.Unlock()
							return fmt.Errorf(msg)
						}
						msg += fmt.Sprintf("Removing this node leaves %v managers out of %v. Without a Raft quorum your swarm will be inaccessible. ", reachable-1, reachable+unreachable)
					}
				}
			} else {
				msg += "Doing so may lose the consensus of your cluster. "
			}

			msg += "The only way to restore a swarm that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to suppress this message."
			c.Unlock()
			return fmt.Errorf(msg)
		}
		if err := c.stopNode(); err != nil {
			logrus.Errorf("failed to shut down cluster node: %v", err)
			signal.DumpStacks("")
			c.Unlock()
			return err
		}
		c.Unlock()
		if nodeID := node.NodeID(); nodeID != "" {
			nodeContainers, err := c.listContainerForNode(nodeID)
			if err != nil {
				return err
			}
			for _, id := range nodeContainers {
				if err := c.config.Backend.ContainerRm(id, &apitypes.ContainerRmConfig{ForceRemove: true}); err != nil {
					logrus.Errorf("error removing %v: %v", id, err)
				}
			}
		}
	}
	c.configEvent <- struct{}{}
	// todo: cleanup optional?
	if err := c.clearState(); err != nil {
		return err
	}

	return nil
}

func (c *Cluster) listContainerForNode(nodeID string) ([]string, error) {
	var ids []string
	filters := filters.NewArgs()
	filters.Add("label", fmt.Sprintf("com.docker.swarm.node.id=%s", nodeID))
	containers, err := c.config.Backend.Containers(&apitypes.ContainerListOptions{
		Filters: filters,
	})
	if err != nil {
		return []string{}, err
	}
	for _, c := range containers {
		ids = append(ids, c.ID)
	}
	return ids, nil
}

func (c *Cluster) clearState() error {
	// todo: backup this data instead of removing?
	if err := os.RemoveAll(c.root); err != nil {
		return err
	}
	if err := os.MkdirAll(c.root, 0700); err != nil {
		return err
	}
	c.config.Backend.DaemonLeavesCluster()
	return nil
}

func (c *Cluster) getRequestContext() (context.Context, func()) { // TODO: not needed when requests don't block on quorum lost
	return context.WithTimeout(context.Background(), swarmRequestTimeout)
}

// Inspect retrieves the configuration properties of a managed swarm cluster.
func (c *Cluster) Inspect() (types.Swarm, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return types.Swarm{}, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	swarm, err := getSwarm(ctx, c.client)
	if err != nil {
		return types.Swarm{}, err
	}

	return convert.SwarmFromGRPC(*swarm), nil
}

// Update updates configuration of a managed swarm cluster.
func (c *Cluster) Update(version uint64, spec types.Spec, flags types.UpdateFlags) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	swarm, err := getSwarm(ctx, c.client)
	if err != nil {
		return err
	}

	// In update, the client should provide the complete spec of the swarm, including
	// Name and Labels. If a field is specified with 0 or nil, then the default value
	// will be used by swarmkit.
	clusterSpec, err := convert.SwarmSpecToGRPC(spec)
	if err != nil {
		return err
	}

	_, err = c.client.UpdateCluster(
		ctx,
		&swarmapi.UpdateClusterRequest{
			ClusterID: swarm.ID,
			Spec:      &clusterSpec,
			ClusterVersion: &swarmapi.Version{
				Index: version,
			},
			Rotation: swarmapi.KeyRotation{
				WorkerJoinToken:  flags.RotateWorkerToken,
				ManagerJoinToken: flags.RotateManagerToken,
				ManagerUnlockKey: flags.RotateManagerUnlockKey,
			},
		},
	)
	return err
}

// IsManager returns true if Cluster is participating as a manager.
func (c *Cluster) IsManager() bool {
	c.RLock()
	defer c.RUnlock()
	return c.isActiveManager()
}

// IsAgent returns true if Cluster is participating as a worker/agent.
func (c *Cluster) IsAgent() bool {
	c.RLock()
	defer c.RUnlock()
	return c.node != nil && c.ready
}

// GetLocalAddress returns the local address.
func (c *Cluster) GetLocalAddress() string {
	c.RLock()
	defer c.RUnlock()
	return c.actualLocalAddr
}

// GetListenAddress returns the listen address.
func (c *Cluster) GetListenAddress() string {
	c.RLock()
	defer c.RUnlock()
	if c.node != nil {
		return c.node.config.ListenAddr
	}
	return ""
}

// GetAdvertiseAddress returns the remotely reachable address of this node.
func (c *Cluster) GetAdvertiseAddress() string {
	c.RLock()
	defer c.RUnlock()
	if c.node != nil && c.node.config.AdvertiseAddr != "" {
		advertiseHost, _, _ := net.SplitHostPort(c.node.config.AdvertiseAddr)
		return advertiseHost
	}
	return c.actualLocalAddr
}

// GetRemoteAddress returns a known advertise address of a remote manager if
// available.
// todo: change to array/connect with info
func (c *Cluster) GetRemoteAddress() string {
	c.RLock()
	defer c.RUnlock()
	return c.getRemoteAddress()
}

func (c *Cluster) getRemoteAddress() string {
	if c.node == nil {
		return ""
	}
	nodeID := c.node.NodeID()
	for _, r := range c.node.Remotes() {
		if r.NodeID != nodeID {
			return r.Addr
		}
	}
	return ""
}

// ListenClusterEvents returns a channel that receives messages on cluster
// participation changes.
// todo: make cancelable and accessible to multiple callers
func (c *Cluster) ListenClusterEvents() <-chan struct{} {
	return c.configEvent
}

// Info returns information about the current cluster state.
func (c *Cluster) Info() types.Info {
	info := types.Info{
		NodeAddr: c.GetAdvertiseAddress(),
	}

	c.RLock()
	defer c.RUnlock()

	if c.node == nil {
		info.LocalNodeState = types.LocalNodeStateInactive
		if c.cancelDelay != nil {
			info.LocalNodeState = types.LocalNodeStateError
		}
		if c.locked {
			info.LocalNodeState = types.LocalNodeStateLocked
		} else if c.err == ErrSwarmCertificatesExpired {
			info.LocalNodeState = types.LocalNodeStateError
		}
	} else {
		info.LocalNodeState = types.LocalNodeStatePending
		if c.ready == true {
			info.LocalNodeState = types.LocalNodeStateActive
		} else if c.locked {
			info.LocalNodeState = types.LocalNodeStateLocked
		}
	}
	if c.err != nil {
		info.Error = c.err.Error()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	if c.isActiveManager() {
		info.ControlAvailable = true
		swarm, err := c.Inspect()
		if err != nil {
			info.Error = err.Error()
		}

		// Strip JoinTokens
		info.Cluster = swarm.ClusterInfo

		if r, err := c.client.ListNodes(ctx, &swarmapi.ListNodesRequest{}); err == nil {
			info.Nodes = len(r.Nodes)
			for _, n := range r.Nodes {
				if n.ManagerStatus != nil {
					info.Managers = info.Managers + 1
				}
			}
		}
	}

	if c.node != nil {
		for _, r := range c.node.Remotes() {
			info.RemoteManagers = append(info.RemoteManagers, types.Peer{NodeID: r.NodeID, Addr: r.Addr})
		}
		info.NodeID = c.node.NodeID()
	}

	return info
}

// isActiveManager should not be called without a read lock
func (c *Cluster) isActiveManager() bool {
	return c.node != nil && c.conn != nil
}

// swarmExists should not be called without a read lock
func (c *Cluster) swarmExists() bool {
	return c.node != nil || c.locked || c.err == ErrSwarmCertificatesExpired
}

// errNoManager returns error describing why manager commands can't be used.
// Call with read lock.
func (c *Cluster) errNoManager() error {
	if c.node == nil {
		if c.locked {
			return ErrSwarmLocked
		}
		if c.err == ErrSwarmCertificatesExpired {
			return ErrSwarmCertificatesExpired
		}
		return fmt.Errorf("This node is not a swarm manager. Use \"docker swarm init\" or \"docker swarm join\" to connect this node to swarm and try again.")
	}
	if c.node.Manager() != nil {
		return fmt.Errorf("This node is not a swarm manager. Manager is being prepared or has trouble connecting to the cluster.")
	}
	return fmt.Errorf("This node is not a swarm manager. Worker nodes can't be used to view or modify cluster state. Please run this command on a manager node or promote the current node to a manager.")
}

// GetServices returns all services of a managed swarm cluster.
func (c *Cluster) GetServices(options apitypes.ServiceListOptions) ([]types.Service, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	filters, err := newListServicesFilters(options.Filters)
	if err != nil {
		return nil, err
	}
	ctx, cancel := c.getRequestContext()
	defer cancel()

	r, err := c.client.ListServices(
		ctx,
		&swarmapi.ListServicesRequest{Filters: filters})
	if err != nil {
		return nil, err
	}

	services := []types.Service{}

	for _, service := range r.Services {
		services = append(services, convert.ServiceFromGRPC(*service))
	}

	return services, nil
}

// imageWithDigestString takes an image such as name or name:tag
// and returns the image pinned to a digest, such as name@sha256:34234...
// Due to the difference between the docker/docker/reference, and the
// docker/distribution/reference packages, we're parsing the image twice.
// As the two packages converge, this function should be simplified.
// TODO(nishanttotla): After the packages converge, the function must
// convert distreference.Named -> distreference.Canonical, and the logic simplified.
func (c *Cluster) imageWithDigestString(ctx context.Context, image string, authConfig *apitypes.AuthConfig) (string, error) {
	if _, err := digest.ParseDigest(image); err == nil {
		return "", errors.New("image reference is an image ID")
	}
	ref, err := distreference.ParseNamed(image)
	if err != nil {
		return "", err
	}
	// only query registry if not a canonical reference (i.e. with digest)
	if _, ok := ref.(distreference.Canonical); !ok {
		// create a docker/docker/reference Named object because GetRepository needs it
		dockerRef, err := reference.ParseNamed(image)
		if err != nil {
			return "", err
		}
		dockerRef = reference.WithDefaultTag(dockerRef)
		namedTaggedRef, ok := dockerRef.(reference.NamedTagged)
		if !ok {
			return "", fmt.Errorf("unable to cast image to NamedTagged reference object")
		}

		repo, _, err := c.config.Backend.GetRepository(ctx, namedTaggedRef, authConfig)
		if err != nil {
			return "", err
		}
		dscrptr, err := repo.Tags(ctx).Get(ctx, namedTaggedRef.Tag())
		if err != nil {
			return "", err
		}

		namedDigestedRef, err := distreference.WithDigest(distreference.EnsureTagged(ref), dscrptr.Digest)
		if err != nil {
			return "", err
		}
		return namedDigestedRef.String(), nil
	}
	// reference already contains a digest, so just return it
	return ref.String(), nil
}

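// For illustration (hypothetical values): given "nginx:latest",
// imageWithDigestString asks the registry for the digest of the "latest" tag
// and returns a pinned reference such as
// "docker.io/library/nginx@sha256:34234...", so every task created from the
// service spec runs the same image even if the tag is later repointed.
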
// CreateService creates a new service in a managed swarm cluster.
func (c *Cluster) CreateService(s types.ServiceSpec, encodedAuth string) (*apitypes.ServiceCreateResponse, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	err := c.populateNetworkID(ctx, c.client, &s)
	if err != nil {
		return nil, err
	}

	serviceSpec, err := convert.ServiceSpecToGRPC(s)
	if err != nil {
		return nil, err
	}

	ctnr := serviceSpec.Task.GetContainer()
	if ctnr == nil {
		return nil, fmt.Errorf("service does not use container tasks")
	}

	if encodedAuth != "" {
		ctnr.PullOptions = &swarmapi.ContainerSpec_PullOptions{RegistryAuth: encodedAuth}
	}

	// retrieve auth config from encoded auth
	authConfig := &apitypes.AuthConfig{}
	if encodedAuth != "" {
		if err := json.NewDecoder(base64.NewDecoder(base64.URLEncoding, strings.NewReader(encodedAuth))).Decode(authConfig); err != nil {
			logrus.Warnf("invalid authconfig: %v", err)
		}
	}

	resp := &apitypes.ServiceCreateResponse{}

	// pin image by digest
	if os.Getenv("DOCKER_SERVICE_PREFER_OFFLINE_IMAGE") != "1" {
		digestImage, err := c.imageWithDigestString(ctx, ctnr.Image, authConfig)
		if err != nil {
			logrus.Warnf("unable to pin image %s to digest: %s", ctnr.Image, err.Error())
			resp.Warnings = append(resp.Warnings, fmt.Sprintf("unable to pin image %s to digest: %s", ctnr.Image, err.Error()))
		} else if ctnr.Image != digestImage {
			logrus.Debugf("pinning image %s by digest: %s", ctnr.Image, digestImage)
			ctnr.Image = digestImage
		} else {
			logrus.Debugf("creating service using supplied digest reference %s", ctnr.Image)
		}
	}

	r, err := c.client.CreateService(ctx, &swarmapi.CreateServiceRequest{Spec: &serviceSpec})
	if err != nil {
		return nil, err
	}

	resp.ID = r.Service.ID
	return resp, nil
}

// GetService returns a service based on an ID or name.
func (c *Cluster) GetService(input string) (types.Service, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return types.Service{}, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	service, err := getService(ctx, c.client, input)
	if err != nil {
		return types.Service{}, err
	}
	return convert.ServiceFromGRPC(*service), nil
}

// UpdateService updates existing service to match new properties.
func (c *Cluster) UpdateService(serviceIDOrName string, version uint64, spec types.ServiceSpec, encodedAuth string, registryAuthFrom string) (*apitypes.ServiceUpdateResponse, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	err := c.populateNetworkID(ctx, c.client, &spec)
	if err != nil {
		return nil, err
	}

	serviceSpec, err := convert.ServiceSpecToGRPC(spec)
	if err != nil {
		return nil, err
	}

	currentService, err := getService(ctx, c.client, serviceIDOrName)
	if err != nil {
		return nil, err
	}

	newCtnr := serviceSpec.Task.GetContainer()
	if newCtnr == nil {
		return nil, fmt.Errorf("service does not use container tasks")
	}

	if encodedAuth != "" {
		newCtnr.PullOptions = &swarmapi.ContainerSpec_PullOptions{RegistryAuth: encodedAuth}
	} else {
		// this is needed because if the encodedAuth isn't being updated then we
		// shouldn't lose it, and continue to use the one that was already present
		var ctnr *swarmapi.ContainerSpec
		switch registryAuthFrom {
		case apitypes.RegistryAuthFromSpec, "":
			ctnr = currentService.Spec.Task.GetContainer()
		case apitypes.RegistryAuthFromPreviousSpec:
			if currentService.PreviousSpec == nil {
				return nil, fmt.Errorf("service does not have a previous spec")
			}
			ctnr = currentService.PreviousSpec.Task.GetContainer()
		default:
			return nil, fmt.Errorf("unsupported registryAuthFromValue")
		}
		if ctnr == nil {
			return nil, fmt.Errorf("service does not use container tasks")
		}
		newCtnr.PullOptions = ctnr.PullOptions
		// update encodedAuth so it can be used to pin image by digest
		if ctnr.PullOptions != nil {
			encodedAuth = ctnr.PullOptions.RegistryAuth
		}
	}

	// retrieve auth config from encoded auth
	authConfig := &apitypes.AuthConfig{}
	if encodedAuth != "" {
		if err := json.NewDecoder(base64.NewDecoder(base64.URLEncoding, strings.NewReader(encodedAuth))).Decode(authConfig); err != nil {
			logrus.Warnf("invalid authconfig: %v", err)
		}
	}

	resp := &apitypes.ServiceUpdateResponse{}

	// pin image by digest
	if os.Getenv("DOCKER_SERVICE_PREFER_OFFLINE_IMAGE") != "1" {
		digestImage, err := c.imageWithDigestString(ctx, newCtnr.Image, authConfig)
		if err != nil {
			logrus.Warnf("unable to pin image %s to digest: %s", newCtnr.Image, err.Error())
			resp.Warnings = append(resp.Warnings, fmt.Sprintf("unable to pin image %s to digest: %s", newCtnr.Image, err.Error()))
		} else if newCtnr.Image != digestImage {
			logrus.Debugf("pinning image %s by digest: %s", newCtnr.Image, digestImage)
			newCtnr.Image = digestImage
		} else {
			logrus.Debugf("updating service using supplied digest reference %s", newCtnr.Image)
		}
	}

	_, err = c.client.UpdateService(
		ctx,
		&swarmapi.UpdateServiceRequest{
			ServiceID: currentService.ID,
			Spec:      &serviceSpec,
			ServiceVersion: &swarmapi.Version{
				Index: version,
			},
		},
	)

	return resp, err
}

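// For illustration: when UpdateService receives no new encodedAuth, the
// existing PullOptions are carried over either from the service's current
// spec (apitypes.RegistryAuthFromSpec, the default) or from its previous spec
// (apitypes.RegistryAuthFromPreviousSpec), so a plain "docker service update"
// does not drop registry credentials that were supplied at create time.
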
// RemoveService removes a service from a managed swarm cluster.
func (c *Cluster) RemoveService(input string) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	service, err := getService(ctx, c.client, input)
	if err != nil {
		return err
	}

	if _, err := c.client.RemoveService(ctx, &swarmapi.RemoveServiceRequest{ServiceID: service.ID}); err != nil {
		return err
	}
	return nil
}

// ServiceLogs collects service logs and writes them back to `config.OutStream`
func (c *Cluster) ServiceLogs(ctx context.Context, input string, config *backend.ContainerLogsConfig, started chan struct{}) error {
	c.RLock()
	if !c.isActiveManager() {
		c.RUnlock()
		return c.errNoManager()
	}

	service, err := getService(ctx, c.client, input)
	if err != nil {
		c.RUnlock()
		return err
	}

	stream, err := c.logs.SubscribeLogs(ctx, &swarmapi.SubscribeLogsRequest{
		Selector: &swarmapi.LogSelector{
			ServiceIDs: []string{service.ID},
		},
		Options: &swarmapi.LogSubscriptionOptions{
			Follow: config.Follow,
		},
	})
	if err != nil {
		c.RUnlock()
		return err
	}

	wf := ioutils.NewWriteFlusher(config.OutStream)
	defer wf.Close()
	close(started)
	wf.Flush()

	outStream := stdcopy.NewStdWriter(wf, stdcopy.Stdout)
	errStream := stdcopy.NewStdWriter(wf, stdcopy.Stderr)

	// Release the lock before starting the stream.
	c.RUnlock()
	for {
		// Check the context before doing anything.
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		subscribeMsg, err := stream.Recv()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		for _, msg := range subscribeMsg.Messages {
			data := []byte{}

			if config.Timestamps {
				ts, err := ptypes.Timestamp(msg.Timestamp)
				if err != nil {
					return err
				}
				data = append(data, []byte(ts.Format(logger.TimeFormat)+" ")...)
			}

			data = append(data, []byte(fmt.Sprintf("%s.node.id=%s,%s.service.id=%s,%s.task.id=%s ",
				contextPrefix, msg.Context.NodeID,
				contextPrefix, msg.Context.ServiceID,
				contextPrefix, msg.Context.TaskID,
			))...)

			data = append(data, msg.Data...)

			switch msg.Stream {
			case swarmapi.LogStreamStdout:
				outStream.Write(data)
			case swarmapi.LogStreamStderr:
				errStream.Write(data)
			}
		}
	}
}

// GetNodes returns a list of all nodes known to a cluster.
func (c *Cluster) GetNodes(options apitypes.NodeListOptions) ([]types.Node, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	filters, err := newListNodesFilters(options.Filters)
	if err != nil {
		return nil, err
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	r, err := c.client.ListNodes(
		ctx,
		&swarmapi.ListNodesRequest{Filters: filters})
	if err != nil {
		return nil, err
	}

	nodes := []types.Node{}

	for _, node := range r.Nodes {
		nodes = append(nodes, convert.NodeFromGRPC(*node))
	}
	return nodes, nil
}

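// For illustration (hypothetical IDs): with contextPrefix defined above, each
// line emitted by ServiceLogs is prefixed like
//
//	com.docker.swarm.node.id=abc,com.docker.swarm.service.id=def,com.docker.swarm.task.id=ghi <log data>
//
// and is then demultiplexed onto the stdout/stderr stdcopy streams.
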
// GetNode returns a node based on an ID or name.
func (c *Cluster) GetNode(input string) (types.Node, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return types.Node{}, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	node, err := getNode(ctx, c.client, input)
	if err != nil {
		return types.Node{}, err
	}
	return convert.NodeFromGRPC(*node), nil
}

// UpdateNode updates an existing node's properties.
func (c *Cluster) UpdateNode(input string, version uint64, spec types.NodeSpec) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	nodeSpec, err := convert.NodeSpecToGRPC(spec)
	if err != nil {
		return err
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	currentNode, err := getNode(ctx, c.client, input)
	if err != nil {
		return err
	}

	_, err = c.client.UpdateNode(
		ctx,
		&swarmapi.UpdateNodeRequest{
			NodeID: currentNode.ID,
			Spec:   &nodeSpec,
			NodeVersion: &swarmapi.Version{
				Index: version,
			},
		},
	)
	return err
}

// RemoveNode removes a node from a cluster
func (c *Cluster) RemoveNode(input string, force bool) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	node, err := getNode(ctx, c.client, input)
	if err != nil {
		return err
	}

	if _, err := c.client.RemoveNode(ctx, &swarmapi.RemoveNodeRequest{NodeID: node.ID, Force: force}); err != nil {
		return err
	}
	return nil
}

// GetTasks returns a list of tasks matching the filter options.
func (c *Cluster) GetTasks(options apitypes.TaskListOptions) ([]types.Task, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	byName := func(filter filters.Args) error {
		if filter.Include("service") {
			serviceFilters := filter.Get("service")
			for _, serviceFilter := range serviceFilters {
				service, err := c.GetService(serviceFilter)
				if err != nil {
					return err
				}
				filter.Del("service", serviceFilter)
				filter.Add("service", service.ID)
			}
		}
		if filter.Include("node") {
			nodeFilters := filter.Get("node")
			for _, nodeFilter := range nodeFilters {
				node, err := c.GetNode(nodeFilter)
				if err != nil {
					return err
				}
				filter.Del("node", nodeFilter)
				filter.Add("node", node.ID)
			}
		}
		return nil
	}

	filters, err := newListTasksFilters(options.Filters, byName)
	if err != nil {
		return nil, err
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	r, err := c.client.ListTasks(
		ctx,
		&swarmapi.ListTasksRequest{Filters: filters})
	if err != nil {
		return nil, err
	}

	tasks := []types.Task{}

	for _, task := range r.Tasks {
		if task.Spec.GetContainer() != nil {
			tasks = append(tasks, convert.TaskFromGRPC(*task))
		}
	}
	return tasks, nil
}

// GetTask returns a task by an ID.
func (c *Cluster) GetTask(input string) (types.Task, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return types.Task{}, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	task, err := getTask(ctx, c.client, input)
	if err != nil {
		return types.Task{}, err
	}
	return convert.TaskFromGRPC(*task), nil
}

// GetNetwork returns a cluster network by an ID.
func (c *Cluster) GetNetwork(input string) (apitypes.NetworkResource, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return apitypes.NetworkResource{}, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	network, err := getNetwork(ctx, c.client, input)
	if err != nil {
		return apitypes.NetworkResource{}, err
	}
	return convert.BasicNetworkFromGRPC(*network), nil
}

func (c *Cluster) getNetworks(filters *swarmapi.ListNetworksRequest_Filters) ([]apitypes.NetworkResource, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	r, err := c.client.ListNetworks(ctx, &swarmapi.ListNetworksRequest{Filters: filters})
	if err != nil {
		return nil, err
	}

	var networks []apitypes.NetworkResource

	for _, network := range r.Networks {
		networks = append(networks, convert.BasicNetworkFromGRPC(*network))
	}

	return networks, nil
}

// GetNetworks returns all current cluster managed networks.
func (c *Cluster) GetNetworks() ([]apitypes.NetworkResource, error) {
	return c.getNetworks(nil)
}

// GetNetworksByName returns cluster managed networks by name.
// It is ok to have multiple networks here. #18864
func (c *Cluster) GetNetworksByName(name string) ([]apitypes.NetworkResource, error) {
	// Note that swarmapi.GetNetworkRequest.Name is not functional.
	// So we cannot just use that with c.GetNetwork.
	return c.getNetworks(&swarmapi.ListNetworksRequest_Filters{
		Names: []string{name},
	})
}

func attacherKey(target, containerID string) string {
	return containerID + ":" + target
}

// UpdateAttachment signals the attachment config to the attachment
// waiter who is trying to start or attach the container to the
// network.
func (c *Cluster) UpdateAttachment(target, containerID string, config *network.NetworkingConfig) error {
	c.RLock()
	attacher, ok := c.attachers[attacherKey(target, containerID)]
	c.RUnlock()
	if !ok || attacher == nil {
		return fmt.Errorf("could not find attacher for container %s to network %s", containerID, target)
	}

	attacher.attachWaitCh <- config
	close(attacher.attachWaitCh)
	return nil
}

// WaitForDetachment waits for the container to stop or detach from
// the network.
func (c *Cluster) WaitForDetachment(ctx context.Context, networkName, networkID, taskID, containerID string) error {
	c.RLock()
	attacher, ok := c.attachers[attacherKey(networkName, containerID)]
	if !ok {
		attacher, ok = c.attachers[attacherKey(networkID, containerID)]
	}
	if c.node == nil || c.node.Agent() == nil {
		c.RUnlock()
		return fmt.Errorf("invalid cluster node while waiting for detachment")
	}

	agent := c.node.Agent()
	c.RUnlock()

	if ok && attacher != nil &&
		attacher.detachWaitCh != nil &&
		attacher.attachCompleteCh != nil {
		// Attachment may be in progress still so wait for
		// attachment to complete.
		select {
		case <-attacher.attachCompleteCh:
		case <-ctx.Done():
			return ctx.Err()
		}

		if attacher.taskID == taskID {
			select {
			case <-attacher.detachWaitCh:
			case <-ctx.Done():
				return ctx.Err()
			}
		}
	}

	return agent.ResourceAllocator().DetachNetwork(ctx, taskID)
}

// AttachNetwork generates an attachment request towards the manager.
func (c *Cluster) AttachNetwork(target string, containerID string, addresses []string) (*network.NetworkingConfig, error) {
	aKey := attacherKey(target, containerID)
	c.Lock()
	if c.node == nil || c.node.Agent() == nil {
		c.Unlock()
		return nil, fmt.Errorf("invalid cluster node while attaching to network")
	}
	if attacher, ok := c.attachers[aKey]; ok {
		c.Unlock()
		return attacher.config, nil
	}

	agent := c.node.Agent()
	attachWaitCh := make(chan *network.NetworkingConfig)
	detachWaitCh := make(chan struct{})
	attachCompleteCh := make(chan struct{})
	c.attachers[aKey] = &attacher{
		attachWaitCh:     attachWaitCh,
		attachCompleteCh: attachCompleteCh,
		detachWaitCh:     detachWaitCh,
	}
	c.Unlock()

	ctx, cancel := c.getRequestContext()
	defer cancel()

	taskID, err := agent.ResourceAllocator().AttachNetwork(ctx, containerID, target, addresses)
	if err != nil {
		c.Lock()
		delete(c.attachers, aKey)
		c.Unlock()
		return nil, fmt.Errorf("Could not attach to network %s: %v", target, err)
	}

	c.Lock()
	c.attachers[aKey].taskID = taskID
	close(attachCompleteCh)
	c.Unlock()

	logrus.Debugf("Successfully attached to network %s with tid %s", target, taskID)

	var config *network.NetworkingConfig
	select {
	case config = <-attachWaitCh:
	case <-ctx.Done():
		return nil, fmt.Errorf("attaching to network failed, make sure your network options are correct and check manager logs: %v", ctx.Err())
	}

	c.Lock()
	c.attachers[aKey].config = config
	c.Unlock()
	return config, nil
}

// DetachNetwork unblocks the waiters waiting on WaitForDetachment so
// that a request to detach can be generated towards the manager.
func (c *Cluster) DetachNetwork(target string, containerID string) error {
	aKey := attacherKey(target, containerID)

	c.Lock()
	attacher, ok := c.attachers[aKey]
	delete(c.attachers, aKey)
	c.Unlock()

	if !ok {
		return fmt.Errorf("could not find network attachment for container %s to network %s", containerID, target)
	}

	close(attacher.detachWaitCh)
	return nil
}

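// For illustration (hypothetical IDs): attachment state is keyed by
// attacherKey(target, containerID), i.e. "<containerID>:<target>", so
// AttachNetwork, UpdateAttachment, WaitForDetachment and DetachNetwork all
// address the same entry, and one container can track separate attachment
// state per network.
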
// CreateNetwork creates a new cluster managed network.
func (c *Cluster) CreateNetwork(s apitypes.NetworkCreateRequest) (string, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return "", c.errNoManager()
	}

	if runconfig.IsPreDefinedNetwork(s.Name) {
		err := fmt.Errorf("%s is a pre-defined network and cannot be created", s.Name)
		return "", apierrors.NewRequestForbiddenError(err)
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	networkSpec := convert.BasicNetworkCreateToGRPC(s)
	r, err := c.client.CreateNetwork(ctx, &swarmapi.CreateNetworkRequest{Spec: &networkSpec})
	if err != nil {
		return "", err
	}

	return r.Network.ID, nil
}

// RemoveNetwork removes a cluster network.
func (c *Cluster) RemoveNetwork(input string) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	network, err := getNetwork(ctx, c.client, input)
	if err != nil {
		return err
	}

	if _, err := c.client.RemoveNetwork(ctx, &swarmapi.RemoveNetworkRequest{NetworkID: network.ID}); err != nil {
		return err
	}
	return nil
}

func (c *Cluster) populateNetworkID(ctx context.Context, client swarmapi.ControlClient, s *types.ServiceSpec) error {
	// Always prefer NetworkAttachmentConfigs from TaskTemplate
	// but fallback to service spec for backward compatibility
	networks := s.TaskTemplate.Networks
	if len(networks) == 0 {
		networks = s.Networks
	}

	for i, n := range networks {
		apiNetwork, err := getNetwork(ctx, client, n.Target)
		if err != nil {
			if ln, _ := c.config.Backend.FindNetwork(n.Target); ln != nil && !ln.Info().Dynamic() {
				err = fmt.Errorf("The network %s cannot be used with services. Only networks scoped to the swarm can be used, such as those created with the overlay driver.", ln.Name())
				return apierrors.NewRequestForbiddenError(err)
			}
			return err
		}
		networks[i].Target = apiNetwork.ID
	}
	return nil
}

func getNetwork(ctx context.Context, c swarmapi.ControlClient, input string) (*swarmapi.Network, error) {
	// GetNetwork to match via full ID.
	rg, err := c.GetNetwork(ctx, &swarmapi.GetNetworkRequest{NetworkID: input})
	if err != nil {
		// If any error (including NotFound), ListNetworks to match via ID prefix and full name.
		rl, err := c.ListNetworks(ctx, &swarmapi.ListNetworksRequest{Filters: &swarmapi.ListNetworksRequest_Filters{Names: []string{input}}})
		if err != nil || len(rl.Networks) == 0 {
			rl, err = c.ListNetworks(ctx, &swarmapi.ListNetworksRequest{Filters: &swarmapi.ListNetworksRequest_Filters{IDPrefixes: []string{input}}})
		}

		if err != nil {
			return nil, err
		}

		if len(rl.Networks) == 0 {
			return nil, fmt.Errorf("network %s not found", input)
		}

		if l := len(rl.Networks); l > 1 {
			return nil, fmt.Errorf("network %s is ambiguous (%d matches found)", input, l)
		}

		return rl.Networks[0], nil
	}
	return rg.Network, nil
}

// Cleanup stops active swarm node. This is run before daemon shutdown.
func (c *Cluster) Cleanup() {
	c.Lock()
	node := c.node
	if node == nil {
		c.Unlock()
		return
	}
	defer c.Unlock()
	if c.isActiveManager() {
		active, reachable, unreachable, err := c.managerStats()
		if err == nil {
			singlenode := active && isLastManager(reachable, unreachable)
			if active && !singlenode && removingManagerCausesLossOfQuorum(reachable, unreachable) {
				logrus.Errorf("Leaving cluster with %v managers left out of %v. Raft quorum will be lost.", reachable-1, reachable+unreachable)
			}
		}
	}
	c.stopNode()
}

func (c *Cluster) managerStats() (current bool, reachable int, unreachable int, err error) {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	nodes, err := c.client.ListNodes(ctx, &swarmapi.ListNodesRequest{})
	if err != nil {
		return false, 0, 0, err
	}
	for _, n := range nodes.Nodes {
		if n.ManagerStatus != nil {
			if n.ManagerStatus.Reachability == swarmapi.RaftMemberStatus_REACHABLE {
				reachable++
				if n.ID == c.node.NodeID() {
					current = true
				}
			}
			if n.ManagerStatus.Reachability == swarmapi.RaftMemberStatus_UNREACHABLE {
				unreachable++
			}
		}
	}
	return
}

func validateAndSanitizeInitRequest(req *types.InitRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}

	if req.Spec.Annotations.Name == "" {
		req.Spec.Annotations.Name = "default"
	} else if req.Spec.Annotations.Name != "default" {
		return errors.New(`swarm spec must be named "default"`)
	}

	return nil
}

func validateAndSanitizeJoinRequest(req *types.JoinRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}
	if len(req.RemoteAddrs) == 0 {
		return fmt.Errorf("at least 1 RemoteAddr is required to join")
	}
	for i := range req.RemoteAddrs {
		req.RemoteAddrs[i], err = validateAddr(req.RemoteAddrs[i])
		if err != nil {
			return fmt.Errorf("invalid remoteAddr %q: %v", req.RemoteAddrs[i], err)
		}
	}
	return nil
}

func validateAddr(addr string) (string, error) {
	if addr == "" {
		return addr, fmt.Errorf("invalid empty address")
	}
	newaddr, err := opts.ParseTCPAddr(addr, defaultAddr)
	if err != nil {
		return addr, nil
	}
	return strings.TrimPrefix(newaddr, "tcp://"), nil
}

func initClusterSpec(node *node, spec types.Spec) error {
	ctx, _ := context.WithTimeout(context.Background(), 5*time.Second)
	for conn := range node.ListenControlSocket(ctx) {
		if ctx.Err() != nil {
			return ctx.Err()
		}
		if conn != nil {
			client := swarmapi.NewControlClient(conn)
			var cluster *swarmapi.Cluster
			for i := 0; ; i++ {
				lcr, err := client.ListClusters(ctx, &swarmapi.ListClustersRequest{})
				if err != nil {
					return fmt.Errorf("error on listing clusters: %v", err)
				}
				if len(lcr.Clusters) == 0 {
					if i < 10 {
						time.Sleep(200 * time.Millisecond)
						continue
					}
					return fmt.Errorf("empty list of clusters was returned")
				}
				cluster = lcr.Clusters[0]
				break
			}
			// In init, we take the initial default values from swarmkit, and merge
			// any non-nil or non-zero value from spec into the GRPC spec. This will
			// leave the default value alone.
			// Note that this is different from Update(), as in Update() we expect
			// the user to specify the complete spec of the cluster (as they already
			// know the existing one and know which field to update).
			clusterSpec, err := convert.MergeSwarmSpecToGRPC(spec, cluster.Spec)
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			_, err = client.UpdateCluster(ctx, &swarmapi.UpdateClusterRequest{
				ClusterID:      cluster.ID,
				ClusterVersion: &cluster.Meta.Version,
				Spec:           &clusterSpec,
			})
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			return nil
		}
	}
	return ctx.Err()
}

func detectLockedError(err error) error {
	if err == swarmnode.ErrInvalidUnlockKey {
		return errors.WithStack(ErrSwarmLocked)
	}
	return err
}
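
// exampleValidateAddr is an illustrative sketch, not part of the upstream
// file: validateAddr completes a bare host with the default swarm port taken
// from defaultAddr and strips the "tcp://" prefix, so "192.168.1.1" becomes
// "192.168.1.1:2377".
func exampleValidateAddr() (string, error) {
	return validateAddr("192.168.1.1")
}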