// github.com/openshift/moby-moby@v1.13.2-0.20170601211448-f5ec1e2936dc/daemon/cluster/cluster.go

package cluster

import (
	"crypto/x509"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"io"
	"io/ioutil"
	"net"
	"os"
	"path/filepath"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/Sirupsen/logrus"
	"github.com/docker/distribution/digest"
	distreference "github.com/docker/distribution/reference"
	apierrors "github.com/docker/docker/api/errors"
	apitypes "github.com/docker/docker/api/types"
	"github.com/docker/docker/api/types/backend"
	"github.com/docker/docker/api/types/filters"
	"github.com/docker/docker/api/types/network"
	types "github.com/docker/docker/api/types/swarm"
	"github.com/docker/docker/daemon/cluster/convert"
	executorpkg "github.com/docker/docker/daemon/cluster/executor"
	"github.com/docker/docker/daemon/cluster/executor/container"
	"github.com/docker/docker/daemon/logger"
	"github.com/docker/docker/opts"
	"github.com/docker/docker/pkg/ioutils"
	"github.com/docker/docker/pkg/signal"
	"github.com/docker/docker/pkg/stdcopy"
	"github.com/docker/docker/reference"
	"github.com/docker/docker/runconfig"
	swarmapi "github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/manager/encryption"
	swarmnode "github.com/docker/swarmkit/node"
	"github.com/docker/swarmkit/protobuf/ptypes"
	"github.com/pkg/errors"
	"golang.org/x/net/context"
	"google.golang.org/grpc"
)

const swarmDirName = "swarm"
const controlSocket = "control.sock"
const swarmConnectTimeout = 20 * time.Second
const swarmRequestTimeout = 20 * time.Second
const stateFile = "docker-state.json"
const defaultAddr = "0.0.0.0:2377"

const (
	initialReconnectDelay = 100 * time.Millisecond
	maxReconnectDelay     = 30 * time.Second
	contextPrefix         = "com.docker.swarm"
)

// ErrNoSwarm is returned on leaving a cluster that was never initialized
var ErrNoSwarm = fmt.Errorf("This node is not part of a swarm")

// ErrSwarmExists is returned on initialize or join request for a cluster that has already been activated
var ErrSwarmExists = fmt.Errorf("This node is already part of a swarm. Use \"docker swarm leave\" to leave this swarm and join another one.")

// ErrPendingSwarmExists is returned on initialize or join request for a cluster that is already processing a similar request but has not succeeded yet.
var ErrPendingSwarmExists = fmt.Errorf("This node is processing an existing join request that has not succeeded yet. Use \"docker swarm leave\" to cancel the current request.")

// ErrSwarmJoinTimeoutReached is returned when cluster join could not complete before timeout was reached.
var ErrSwarmJoinTimeoutReached = fmt.Errorf("Timeout was reached before node was joined. The attempt to join the swarm will continue in the background. Use the \"docker info\" command to see the current swarm status of your node.")

// ErrSwarmLocked is returned if the swarm is encrypted and needs a key to unlock it.
var ErrSwarmLocked = fmt.Errorf("Swarm is encrypted and needs to be unlocked before it can be used. Please use \"docker swarm unlock\" to unlock it.")

// ErrSwarmCertificatesExpired is returned if docker was not started for the whole validity period and they had no chance to renew automatically.
var ErrSwarmCertificatesExpired = errors.New("Swarm certificates have expired. To replace them, leave the swarm and join again.")

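// The sentinel errors above are compared by identity throughout this file,
// usually through errors.Cause because swarmkit may wrap them. A minimal
// sketch of the pattern used at the call sites below (not a verbatim call
// site):
//
//	if errors.Cause(c.err) == ErrSwarmLocked {
//		// report the locked swarm and point the user at "docker swarm unlock"
//	}
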
// NetworkSubnetsProvider exposes functions for retrieving the subnets
// of networks managed by Docker, so they can be filtered.
type NetworkSubnetsProvider interface {
	V4Subnets() []net.IPNet
	V6Subnets() []net.IPNet
}

// Config provides values for Cluster.
type Config struct {
	Root                   string
	Name                   string
	Backend                executorpkg.Backend
	NetworkSubnetsProvider NetworkSubnetsProvider

	// DefaultAdvertiseAddr is the default host/IP or network interface to use
	// if no AdvertiseAddr value is specified.
	DefaultAdvertiseAddr string

	// path to store runtime state, such as the swarm control socket
	RuntimeRoot string
}

// Cluster provides capabilities to participate in a cluster as a worker or a
// manager.
type Cluster struct {
	sync.RWMutex
	*node
	root            string
	runtimeRoot     string
	config          Config
	configEvent     chan struct{} // todo: make this array and goroutine safe
	actualLocalAddr string        // after resolution, not persisted
	stop            bool
	err             error
	cancelDelay     func()
	attachers       map[string]*attacher
	locked          bool
	lastNodeConfig  *nodeStartConfig
}

// attacher manages the in-memory attachment state of a container
// attachment to a global scope network managed by swarm manager. It
// helps in identifying the attachment ID via the taskID and the
// corresponding attachment configuration obtained from the manager.
type attacher struct {
	taskID           string
	config           *network.NetworkingConfig
	attachWaitCh     chan *network.NetworkingConfig
	attachCompleteCh chan struct{}
	detachWaitCh     chan struct{}
}

type node struct {
	*swarmnode.Node
	done           chan struct{}
	ready          bool
	conn           *grpc.ClientConn
	client         swarmapi.ControlClient
	logs           swarmapi.LogsClient
	reconnectDelay time.Duration
	config         nodeStartConfig
}

// nodeStartConfig holds configuration needed to start a new node. Exported
// fields of this structure are saved to disk in json. Unexported fields
// contain data that shouldn't be persisted between daemon reloads.
type nodeStartConfig struct {
	// LocalAddr is this machine's local IP or hostname, if specified.
	LocalAddr string
	// RemoteAddr is the address that was given to "swarm join". It is used
	// to find LocalAddr if necessary.
	RemoteAddr string
	// ListenAddr is the address we bind to, including a port.
	ListenAddr string
	// AdvertiseAddr is the address other nodes should connect to,
	// including a port.
	AdvertiseAddr   string
	joinAddr        string
	forceNewCluster bool
	joinToken       string
	lockKey         []byte
	autolock        bool
}

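// A nodeStartConfig is persisted by saveState as docker-state.json and read
// back by loadState. Only the exported fields survive the round trip; with
// Go's default JSON field naming the file looks roughly like this (values
// are illustrative, not taken from a real cluster):
//
//	{"LocalAddr":"192.168.1.10","RemoteAddr":"192.168.1.2:2377","ListenAddr":"0.0.0.0:2377","AdvertiseAddr":"192.168.1.10:2377"}
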
// New creates a new Cluster instance using provided config.
func New(config Config) (*Cluster, error) {
	root := filepath.Join(config.Root, swarmDirName)
	if err := os.MkdirAll(root, 0700); err != nil {
		return nil, err
	}
	if config.RuntimeRoot == "" {
		config.RuntimeRoot = root
	}
	if err := os.MkdirAll(config.RuntimeRoot, 0700); err != nil {
		return nil, err
	}
	c := &Cluster{
		root:        root,
		config:      config,
		configEvent: make(chan struct{}, 10),
		runtimeRoot: config.RuntimeRoot,
		attachers:   make(map[string]*attacher),
	}

	nodeConfig, err := c.loadState()
	if err != nil {
		if os.IsNotExist(err) {
			return c, nil
		}
		return nil, err
	}

	n, err := c.startNewNode(*nodeConfig)
	if err != nil {
		return nil, err
	}

	select {
	case <-time.After(swarmConnectTimeout):
		logrus.Error("swarm component could not be started before timeout was reached")
	case <-n.Ready():
	case <-n.done:
		if errors.Cause(c.err) == ErrSwarmLocked {
			return c, nil
		}
		if err, ok := errors.Cause(c.err).(x509.CertificateInvalidError); ok && err.Reason == x509.Expired {
			c.err = ErrSwarmCertificatesExpired
			return c, nil
		}
		return nil, fmt.Errorf("swarm component could not be started: %v", c.err)
	}
	go c.reconnectOnFailure(n)
	return c, nil
}

func (c *Cluster) loadState() (*nodeStartConfig, error) {
	dt, err := ioutil.ReadFile(filepath.Join(c.root, stateFile))
	if err != nil {
		return nil, err
	}
	// missing certificate means no actual state to restore from
	if _, err := os.Stat(filepath.Join(c.root, "certificates/swarm-node.crt")); err != nil {
		if os.IsNotExist(err) {
			c.clearState()
		}
		return nil, err
	}
	var st nodeStartConfig
	if err := json.Unmarshal(dt, &st); err != nil {
		return nil, err
	}
	return &st, nil
}

func (c *Cluster) saveState(config nodeStartConfig) error {
	dt, err := json.Marshal(config)
	if err != nil {
		return err
	}
	return ioutils.AtomicWriteFile(filepath.Join(c.root, stateFile), dt, 0600)
}

func (c *Cluster) reconnectOnFailure(n *node) {
	for {
		<-n.done
		c.Lock()
		if c.stop || c.node != nil {
			c.Unlock()
			return
		}
		n.reconnectDelay *= 2
		if n.reconnectDelay > maxReconnectDelay {
			n.reconnectDelay = maxReconnectDelay
		}
		logrus.Warnf("Restarting swarm in %.2f seconds", n.reconnectDelay.Seconds())
		delayCtx, cancel := context.WithTimeout(context.Background(), n.reconnectDelay)
		c.cancelDelay = cancel
		c.Unlock()
		<-delayCtx.Done()
		if delayCtx.Err() != context.DeadlineExceeded {
			return
		}
		c.Lock()
		if c.node != nil {
			c.Unlock()
			return
		}
		var err error
		config := n.config
		config.RemoteAddr = c.getRemoteAddress()
		config.joinAddr = config.RemoteAddr
		n, err = c.startNewNode(config)
		if err != nil {
			c.err = err
			close(n.done)
		}
		c.Unlock()
	}
}

func (c *Cluster) startNewNode(conf nodeStartConfig) (*node, error) {
	if err := c.config.Backend.IsSwarmCompatible(); err != nil {
		return nil, err
	}

	actualLocalAddr := conf.LocalAddr
	if actualLocalAddr == "" {
		// If localAddr was not specified, resolve it automatically
		// based on the route to joinAddr. localAddr can only be left
		// empty on "join".
		listenHost, _, err := net.SplitHostPort(conf.ListenAddr)
		if err != nil {
			return nil, fmt.Errorf("could not parse listen address: %v", err)
		}

		listenAddrIP := net.ParseIP(listenHost)
		if listenAddrIP == nil || !listenAddrIP.IsUnspecified() {
			actualLocalAddr = listenHost
		} else {
			if conf.RemoteAddr == "" {
				// Should never happen except using swarms created by
				// old versions that didn't save remoteAddr.
				conf.RemoteAddr = "8.8.8.8:53"
			}
			conn, err := net.Dial("udp", conf.RemoteAddr)
			if err != nil {
				return nil, fmt.Errorf("could not find local IP address: %v", err)
			}
			localHostPort := conn.LocalAddr().String()
			actualLocalAddr, _, _ = net.SplitHostPort(localHostPort)
			conn.Close()
		}
	}

	var control string
	if runtime.GOOS == "windows" {
		control = `\\.\pipe\` + controlSocket
	} else {
		control = filepath.Join(c.runtimeRoot, controlSocket)
	}

	c.node = nil
	c.cancelDelay = nil
	c.stop = false
	n, err := swarmnode.New(&swarmnode.Config{
		Hostname:           c.config.Name,
		ForceNewCluster:    conf.forceNewCluster,
		ListenControlAPI:   control,
		ListenRemoteAPI:    conf.ListenAddr,
		AdvertiseRemoteAPI: conf.AdvertiseAddr,
		JoinAddr:           conf.joinAddr,
		StateDir:           c.root,
		JoinToken:          conf.joinToken,
		Executor:           container.NewExecutor(c.config.Backend),
		HeartbeatTick:      1,
		ElectionTick:       3,
		UnlockKey:          conf.lockKey,
		AutoLockManagers:   conf.autolock,
		PluginGetter:       c.config.Backend.PluginGetter(),
	})

	if err != nil {
		return nil, err
	}
	ctx := context.Background()
	if err := n.Start(ctx); err != nil {
		return nil, err
	}
	node := &node{
		Node:           n,
		done:           make(chan struct{}),
		reconnectDelay: initialReconnectDelay,
		config:         conf,
	}
	c.node = node
	c.actualLocalAddr = actualLocalAddr // not saved
	c.saveState(conf)

	c.config.Backend.DaemonJoinsCluster(c)
	go func() {
		err := detectLockedError(n.Err(ctx))
		if err != nil {
			logrus.Errorf("cluster exited with error: %v", err)
		}
		c.Lock()
		c.node = nil
		c.err = err
		if errors.Cause(err) == ErrSwarmLocked {
			c.locked = true
			confClone := conf
			c.lastNodeConfig = &confClone
		}
		c.Unlock()
		close(node.done)
	}()

	go func() {
		select {
		case <-n.Ready():
			c.Lock()
			node.ready = true
			c.err = nil
			c.Unlock()
		case <-ctx.Done():
		}
		c.configEvent <- struct{}{}
	}()

	go func() {
		for conn := range n.ListenControlSocket(ctx) {
			c.Lock()
			if node.conn != conn {
				if conn == nil {
					node.client = nil
					node.logs = nil
				} else {
					node.client = swarmapi.NewControlClient(conn)
					node.logs = swarmapi.NewLogsClient(conn)
				}
			}
			node.conn = conn
			c.Unlock()
			c.configEvent <- struct{}{}
		}
	}()

	return node, nil
}

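// startNewNode leaves three goroutines running for the lifetime of the node:
// one waits on n.Err and records ErrSwarmLocked via detectLockedError before
// closing node.done, one flips node.ready once swarmkit reports Ready, and
// one rebuilds the control and logs gRPC clients whenever the control socket
// connection changes. All three notify listeners through c.configEvent.
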
// Init initializes new cluster from user provided request.
func (c *Cluster) Init(req types.InitRequest) (string, error) {
	c.Lock()
	if c.swarmExists() {
		if !req.ForceNewCluster {
			c.Unlock()
			return "", ErrSwarmExists
		}
		if err := c.stopNode(); err != nil {
			c.Unlock()
			return "", err
		}
	}

	if err := validateAndSanitizeInitRequest(&req); err != nil {
		c.Unlock()
		return "", apierrors.NewBadRequestError(err)
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		c.Unlock()
		return "", err
	}

	advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
	if err != nil {
		c.Unlock()
		return "", err
	}

	localAddr := listenHost

	// If the local address is undetermined, the advertise address
	// will be used as local address, if it belongs to this system.
	// If the advertise address is not local, then we try to find
	// a system address to use as local address. If this fails,
	// we give up and ask user to pass the listen address.
	if net.ParseIP(localAddr).IsUnspecified() {
		advertiseIP := net.ParseIP(advertiseHost)

		found := false
		for _, systemIP := range listSystemIPs() {
			if systemIP.Equal(advertiseIP) {
				localAddr = advertiseIP.String()
				found = true
				break
			}
		}

		if !found {
			ip, err := c.resolveSystemAddr()
			if err != nil {
				c.Unlock()
				logrus.Warnf("Could not find a local address: %v", err)
				return "", errMustSpecifyListenAddr
			}
			localAddr = ip.String()
		}
	}

	// todo: check current state existing
	n, err := c.startNewNode(nodeStartConfig{
		forceNewCluster: req.ForceNewCluster,
		autolock:        req.AutoLockManagers,
		LocalAddr:       localAddr,
		ListenAddr:      net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr:   net.JoinHostPort(advertiseHost, advertisePort),
	})
	if err != nil {
		c.Unlock()
		return "", err
	}
	c.Unlock()

	select {
	case <-n.Ready():
		if err := initClusterSpec(n, req.Spec); err != nil {
			return "", err
		}
		go c.reconnectOnFailure(n)
		return n.NodeID(), nil
	case <-n.done:
		c.RLock()
		defer c.RUnlock()
		if !req.ForceNewCluster { // if failure on first attempt don't keep state
			if err := c.clearState(); err != nil {
				return "", err
			}
		}
		return "", c.err
	}
}

// Join makes current Cluster part of an existing swarm cluster.
func (c *Cluster) Join(req types.JoinRequest) error {
	c.Lock()
	if c.swarmExists() {
		c.Unlock()
		return ErrSwarmExists
	}
	if err := validateAndSanitizeJoinRequest(&req); err != nil {
		c.Unlock()
		return apierrors.NewBadRequestError(err)
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		c.Unlock()
		return err
	}

	var advertiseAddr string
	if req.AdvertiseAddr != "" {
		advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
		// For joining, we don't need to provide an advertise address,
		// since the remote side can detect it.
		if err == nil {
			advertiseAddr = net.JoinHostPort(advertiseHost, advertisePort)
		}
	}

	// todo: check current state existing
	n, err := c.startNewNode(nodeStartConfig{
		RemoteAddr:    req.RemoteAddrs[0],
		ListenAddr:    net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr: advertiseAddr,
		joinAddr:      req.RemoteAddrs[0],
		joinToken:     req.JoinToken,
	})
	if err != nil {
		c.Unlock()
		return err
	}
	c.Unlock()

	select {
	case <-time.After(swarmConnectTimeout):
		// attempt to connect will continue in background, but reconnect only if it didn't fail
		go func() {
			select {
			case <-n.Ready():
				c.reconnectOnFailure(n)
			case <-n.done:
				logrus.Errorf("failed to join the cluster: %+v", c.err)
			}
		}()
		return ErrSwarmJoinTimeoutReached
	case <-n.Ready():
		go c.reconnectOnFailure(n)
		return nil
	case <-n.done:
		c.RLock()
		defer c.RUnlock()
		return c.err
	}
}

// GetUnlockKey returns the unlock key for the swarm.
func (c *Cluster) GetUnlockKey() (string, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return "", c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	client := swarmapi.NewCAClient(c.conn)

	r, err := client.GetUnlockKey(ctx, &swarmapi.GetUnlockKeyRequest{})
	if err != nil {
		return "", err
	}

	if len(r.UnlockKey) == 0 {
		// no key
		return "", nil
	}

	return encryption.HumanReadableKey(r.UnlockKey), nil
}

// UnlockSwarm provides a key to decrypt data that is encrypted at rest.
func (c *Cluster) UnlockSwarm(req types.UnlockRequest) error {
	c.RLock()
	if !c.isActiveManager() {
		if err := c.errNoManager(); err != ErrSwarmLocked {
			c.RUnlock()
			return err
		}
	}

	if c.node != nil || c.locked != true {
		c.RUnlock()
		return errors.New("swarm is not locked")
	}
	c.RUnlock()

	key, err := encryption.ParseHumanReadableKey(req.UnlockKey)
	if err != nil {
		return err
	}

	c.Lock()
	config := *c.lastNodeConfig
	config.lockKey = key
	n, err := c.startNewNode(config)
	if err != nil {
		c.Unlock()
		return err
	}
	c.Unlock()
	select {
	case <-n.Ready():
	case <-n.done:
		if errors.Cause(c.err) == ErrSwarmLocked {
			return errors.New("swarm could not be unlocked: invalid key provided")
		}
		return fmt.Errorf("swarm component could not be started: %v", c.err)
	}
	go c.reconnectOnFailure(n)
	return nil
}

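// Note that UnlockSwarm does not decrypt anything in place: it restarts the
// swarmkit node with the parsed key stored in nodeStartConfig.lockKey and
// waits for it to come up, translating a repeated ErrSwarmLocked into an
// "invalid key provided" error for the caller.
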
// stopNode is a helper that stops the active c.node and waits until it has
// shut down. Call while keeping the cluster lock.
func (c *Cluster) stopNode() error {
	if c.node == nil {
		return nil
	}
	c.stop = true
	if c.cancelDelay != nil {
		c.cancelDelay()
		c.cancelDelay = nil
	}
	node := c.node
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	// TODO: can't hold lock on stop because it calls back to network
	c.Unlock()
	defer c.Lock()
	if err := node.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
		return err
	}
	<-node.done
	return nil
}

func removingManagerCausesLossOfQuorum(reachable, unreachable int) bool {
	return reachable-2 <= unreachable
}

func isLastManager(reachable, unreachable int) bool {
	return reachable == 1 && unreachable == 0
}

// Leave shuts down Cluster and removes current state.
func (c *Cluster) Leave(force bool) error {
	c.Lock()
	node := c.node
	if node == nil {
		if c.locked {
			c.locked = false
			c.lastNodeConfig = nil
			c.Unlock()
		} else if c.err == ErrSwarmCertificatesExpired {
			c.err = nil
			c.Unlock()
		} else {
			c.Unlock()
			return ErrNoSwarm
		}
	} else {
		if node.Manager() != nil && !force {
			msg := "You are attempting to leave the swarm on a node that is participating as a manager. "
			if c.isActiveManager() {
				active, reachable, unreachable, err := c.managerStats()
				if err == nil {
					if active && removingManagerCausesLossOfQuorum(reachable, unreachable) {
						if isLastManager(reachable, unreachable) {
							msg += "Removing the last manager erases all current state of the swarm. Use `--force` to ignore this message. "
							c.Unlock()
							return fmt.Errorf(msg)
						}
						msg += fmt.Sprintf("Removing this node leaves %v managers out of %v. Without a Raft quorum your swarm will be inaccessible. ", reachable-1, reachable+unreachable)
					}
				}
			} else {
				msg += "Doing so may lose the consensus of your cluster. "
			}

			msg += "The only way to restore a swarm that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to suppress this message."
			c.Unlock()
			return fmt.Errorf(msg)
		}
		if err := c.stopNode(); err != nil {
			logrus.Errorf("failed to shut down cluster node: %v", err)
			signal.DumpStacks("")
			c.Unlock()
			return err
		}
		c.Unlock()
		if nodeID := node.NodeID(); nodeID != "" {
			nodeContainers, err := c.listContainerForNode(nodeID)
			if err != nil {
				return err
			}
			for _, id := range nodeContainers {
				if err := c.config.Backend.ContainerRm(id, &apitypes.ContainerRmConfig{ForceRemove: true}); err != nil {
					logrus.Errorf("error removing %v: %v", id, err)
				}
			}
		}
	}
	c.configEvent <- struct{}{}
	// todo: cleanup optional?
	if err := c.clearState(); err != nil {
		return err
	}

	return nil
}

func (c *Cluster) listContainerForNode(nodeID string) ([]string, error) {
	var ids []string
	filters := filters.NewArgs()
	filters.Add("label", fmt.Sprintf("com.docker.swarm.node.id=%s", nodeID))
	containers, err := c.config.Backend.Containers(&apitypes.ContainerListOptions{
		Filters: filters,
	})
	if err != nil {
		return []string{}, err
	}
	for _, c := range containers {
		ids = append(ids, c.ID)
	}
	return ids, nil
}

func (c *Cluster) clearState() error {
	// todo: backup this data instead of removing?
	if err := os.RemoveAll(c.root); err != nil {
		return err
	}
	if err := os.MkdirAll(c.root, 0700); err != nil {
		return err
	}
	c.config.Backend.DaemonLeavesCluster()
	return nil
}

func (c *Cluster) getRequestContext() (context.Context, func()) { // TODO: not needed when requests don't block on quorum lost
	return context.WithTimeout(context.Background(), swarmRequestTimeout)
}

// Inspect retrieves the configuration properties of a managed swarm cluster.
func (c *Cluster) Inspect() (types.Swarm, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return types.Swarm{}, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	swarm, err := getSwarm(ctx, c.client)
	if err != nil {
		return types.Swarm{}, err
	}

	return convert.SwarmFromGRPC(*swarm), nil
}

// Update updates configuration of a managed swarm cluster.
func (c *Cluster) Update(version uint64, spec types.Spec, flags types.UpdateFlags) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	swarm, err := getSwarm(ctx, c.client)
	if err != nil {
		return err
	}

	// In update, client should provide the complete spec of the swarm, including
	// Name and Labels. If a field is specified with 0 or nil, then the default value
	// will be used by swarmkit.
	clusterSpec, err := convert.SwarmSpecToGRPC(spec)
	if err != nil {
		return apierrors.NewBadRequestError(err)
	}

	_, err = c.client.UpdateCluster(
		ctx,
		&swarmapi.UpdateClusterRequest{
			ClusterID: swarm.ID,
			Spec:      &clusterSpec,
			ClusterVersion: &swarmapi.Version{
				Index: version,
			},
			Rotation: swarmapi.KeyRotation{
				WorkerJoinToken:  flags.RotateWorkerToken,
				ManagerJoinToken: flags.RotateManagerToken,
				ManagerUnlockKey: flags.RotateManagerUnlockKey,
			},
		},
	)
	return err
}

// IsManager returns true if Cluster is participating as a manager.
func (c *Cluster) IsManager() bool {
	c.RLock()
	defer c.RUnlock()
	return c.isActiveManager()
}

// IsAgent returns true if Cluster is participating as a worker/agent.
func (c *Cluster) IsAgent() bool {
	c.RLock()
	defer c.RUnlock()
	return c.node != nil && c.ready
}

// GetLocalAddress returns the local address.
func (c *Cluster) GetLocalAddress() string {
	c.RLock()
	defer c.RUnlock()
	return c.actualLocalAddr
}

// GetListenAddress returns the listen address.
func (c *Cluster) GetListenAddress() string {
	c.RLock()
	defer c.RUnlock()
	if c.node != nil {
		return c.node.config.ListenAddr
	}
	return ""
}

// GetAdvertiseAddress returns the remotely reachable address of this node.
func (c *Cluster) GetAdvertiseAddress() string {
	c.RLock()
	defer c.RUnlock()
	if c.node != nil && c.node.config.AdvertiseAddr != "" {
		advertiseHost, _, _ := net.SplitHostPort(c.node.config.AdvertiseAddr)
		return advertiseHost
	}
	return c.actualLocalAddr
}

// GetRemoteAddress returns a known advertise address of a remote manager if
// available.
// todo: change to array/connect with info
func (c *Cluster) GetRemoteAddress() string {
	c.RLock()
	defer c.RUnlock()
	return c.getRemoteAddress()
}

func (c *Cluster) getRemoteAddress() string {
	if c.node == nil {
		return ""
	}
	nodeID := c.node.NodeID()
	for _, r := range c.node.Remotes() {
		if r.NodeID != nodeID {
			return r.Addr
		}
	}
	return ""
}

// ListenClusterEvents returns a channel that receives messages on cluster
// participation changes.
// todo: make cancelable and accessible to multiple callers
func (c *Cluster) ListenClusterEvents() <-chan struct{} {
	return c.configEvent
}

// Info returns information about the current cluster state.
func (c *Cluster) Info() types.Info {
	info := types.Info{
		NodeAddr: c.GetAdvertiseAddress(),
	}

	c.RLock()
	defer c.RUnlock()

	if c.node == nil {
		info.LocalNodeState = types.LocalNodeStateInactive
		if c.cancelDelay != nil {
			info.LocalNodeState = types.LocalNodeStateError
		}
		if c.locked {
			info.LocalNodeState = types.LocalNodeStateLocked
		} else if c.err == ErrSwarmCertificatesExpired {
			info.LocalNodeState = types.LocalNodeStateError
		}
	} else {
		info.LocalNodeState = types.LocalNodeStatePending
		if c.ready == true {
			info.LocalNodeState = types.LocalNodeStateActive
		} else if c.locked {
			info.LocalNodeState = types.LocalNodeStateLocked
		}
	}
	if c.err != nil {
		info.Error = c.err.Error()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	if c.isActiveManager() {
		info.ControlAvailable = true
		swarm, err := c.Inspect()
		if err != nil {
			info.Error = err.Error()
		}

		// Strip JoinTokens
		info.Cluster = swarm.ClusterInfo

		if r, err := c.client.ListNodes(ctx, &swarmapi.ListNodesRequest{}); err == nil {
			info.Nodes = len(r.Nodes)
			for _, n := range r.Nodes {
				if n.ManagerStatus != nil {
					info.Managers = info.Managers + 1
				}
			}
		}
	}

	if c.node != nil {
		for _, r := range c.node.Remotes() {
			info.RemoteManagers = append(info.RemoteManagers, types.Peer{NodeID: r.NodeID, Addr: r.Addr})
		}
		info.NodeID = c.node.NodeID()
	}

	return info
}

// isActiveManager should not be called without a read lock
func (c *Cluster) isActiveManager() bool {
	return c.node != nil && c.conn != nil
}

// swarmExists should not be called without a read lock
func (c *Cluster) swarmExists() bool {
	return c.node != nil || c.locked || c.err == ErrSwarmCertificatesExpired
}

// errNoManager returns error describing why manager commands can't be used.
// Call with read lock.
func (c *Cluster) errNoManager() error {
	if c.node == nil {
		if c.locked {
			return ErrSwarmLocked
		}
		if c.err == ErrSwarmCertificatesExpired {
			return ErrSwarmCertificatesExpired
		}
		return fmt.Errorf("This node is not a swarm manager. Use \"docker swarm init\" or \"docker swarm join\" to connect this node to swarm and try again.")
	}
	if c.node.Manager() != nil {
		return fmt.Errorf("This node is not a swarm manager. Manager is being prepared or has trouble connecting to the cluster.")
	}
	return fmt.Errorf("This node is not a swarm manager. Worker nodes can't be used to view or modify cluster state. Please run this command on a manager node or promote the current node to a manager.")
}

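// Most of the read-only manager operations below follow the same pattern:
// take the read lock, fail fast through errNoManager if this node is not an
// active manager, and then issue the swarmkit gRPC call under the
// swarmRequestTimeout deadline returned by getRequestContext.
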
// GetServices returns all services of a managed swarm cluster.
func (c *Cluster) GetServices(options apitypes.ServiceListOptions) ([]types.Service, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	filters, err := newListServicesFilters(options.Filters)
	if err != nil {
		return nil, err
	}
	ctx, cancel := c.getRequestContext()
	defer cancel()

	r, err := c.client.ListServices(
		ctx,
		&swarmapi.ListServicesRequest{Filters: filters})
	if err != nil {
		return nil, err
	}

	services := []types.Service{}

	for _, service := range r.Services {
		services = append(services, convert.ServiceFromGRPC(*service))
	}

	return services, nil
}

// imageWithDigestString takes an image such as name or name:tag
// and returns the image pinned to a digest, such as name@sha256:34234...
// Due to the difference between the docker/docker/reference, and the
// docker/distribution/reference packages, we're parsing the image twice.
// As the two packages converge, this function should be simplified.
// TODO(nishanttotla): After the packages converge, the function must
// convert distreference.Named -> distreference.Canonical, and the logic simplified.
func (c *Cluster) imageWithDigestString(ctx context.Context, image string, authConfig *apitypes.AuthConfig) (string, error) {
	if _, err := digest.ParseDigest(image); err == nil {
		return "", errors.New("image reference is an image ID")
	}
	ref, err := distreference.ParseNamed(image)
	if err != nil {
		return "", err
	}
	// only query registry if not a canonical reference (i.e. with digest)
	if _, ok := ref.(distreference.Canonical); !ok {
		// create a docker/docker/reference Named object because GetRepository needs it
		dockerRef, err := reference.ParseNamed(image)
		if err != nil {
			return "", err
		}
		dockerRef = reference.WithDefaultTag(dockerRef)
		namedTaggedRef, ok := dockerRef.(reference.NamedTagged)
		if !ok {
			return "", fmt.Errorf("unable to cast image to NamedTagged reference object")
		}

		repo, _, err := c.config.Backend.GetRepository(ctx, namedTaggedRef, authConfig)
		if err != nil {
			return "", err
		}
		dscrptr, err := repo.Tags(ctx).Get(ctx, namedTaggedRef.Tag())
		if err != nil {
			return "", err
		}

		namedDigestedRef, err := distreference.WithDigest(distreference.EnsureTagged(ref), dscrptr.Digest)
		if err != nil {
			return "", err
		}
		return namedDigestedRef.String(), nil
	}
	// reference already contains a digest, so just return it
	return ref.String(), nil
}

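// CreateService and UpdateService below use imageWithDigestString to pin the
// service image by digest before submitting the spec. If pinning fails, the
// failure is logged and surfaced as a response warning rather than an error,
// and setting DOCKER_SERVICE_PREFER_OFFLINE_IMAGE=1 skips the registry
// lookup entirely.
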
// CreateService creates a new service in a managed swarm cluster.
func (c *Cluster) CreateService(s types.ServiceSpec, encodedAuth string) (*apitypes.ServiceCreateResponse, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	err := c.populateNetworkID(ctx, c.client, &s)
	if err != nil {
		return nil, err
	}

	serviceSpec, err := convert.ServiceSpecToGRPC(s)
	if err != nil {
		return nil, apierrors.NewBadRequestError(err)
	}

	ctnr := serviceSpec.Task.GetContainer()
	if ctnr == nil {
		return nil, fmt.Errorf("service does not use container tasks")
	}

	if encodedAuth != "" {
		ctnr.PullOptions = &swarmapi.ContainerSpec_PullOptions{RegistryAuth: encodedAuth}
	}

	// retrieve auth config from encoded auth
	authConfig := &apitypes.AuthConfig{}
	if encodedAuth != "" {
		if err := json.NewDecoder(base64.NewDecoder(base64.URLEncoding, strings.NewReader(encodedAuth))).Decode(authConfig); err != nil {
			logrus.Warnf("invalid authconfig: %v", err)
		}
	}

	resp := &apitypes.ServiceCreateResponse{}

	// pin image by digest
	if os.Getenv("DOCKER_SERVICE_PREFER_OFFLINE_IMAGE") != "1" {
		digestImage, err := c.imageWithDigestString(ctx, ctnr.Image, authConfig)
		if err != nil {
			logrus.Warnf("unable to pin image %s to digest: %s", ctnr.Image, err.Error())
			resp.Warnings = append(resp.Warnings, fmt.Sprintf("unable to pin image %s to digest: %s", ctnr.Image, err.Error()))
		} else if ctnr.Image != digestImage {
			logrus.Debugf("pinning image %s by digest: %s", ctnr.Image, digestImage)
			ctnr.Image = digestImage
		} else {
			logrus.Debugf("creating service using supplied digest reference %s", ctnr.Image)
		}

		// Replace the context with a fresh one.
		// If we timed out while communicating with the
		// registry, then "ctx" will already be expired, which
		// would cause CreateService below to fail. Reusing
		// "ctx" could make it impossible to create a service
		// if the registry is slow or unresponsive.
		var newCancel func()
		ctx, newCancel = c.getRequestContext()
		defer newCancel()
	}

	r, err := c.client.CreateService(ctx, &swarmapi.CreateServiceRequest{Spec: &serviceSpec})
	if err != nil {
		return nil, err
	}

	resp.ID = r.Service.ID
	return resp, nil
}

// GetService returns a service based on an ID or name.
func (c *Cluster) GetService(input string) (types.Service, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return types.Service{}, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	service, err := getService(ctx, c.client, input)
	if err != nil {
		return types.Service{}, err
	}
	return convert.ServiceFromGRPC(*service), nil
}

// UpdateService updates existing service to match new properties.
func (c *Cluster) UpdateService(serviceIDOrName string, version uint64, spec types.ServiceSpec, encodedAuth string, registryAuthFrom string) (*apitypes.ServiceUpdateResponse, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	err := c.populateNetworkID(ctx, c.client, &spec)
	if err != nil {
		return nil, err
	}

	serviceSpec, err := convert.ServiceSpecToGRPC(spec)
	if err != nil {
		return nil, apierrors.NewBadRequestError(err)
	}

	currentService, err := getService(ctx, c.client, serviceIDOrName)
	if err != nil {
		return nil, err
	}

	newCtnr := serviceSpec.Task.GetContainer()
	if newCtnr == nil {
		return nil, fmt.Errorf("service does not use container tasks")
	}

	if encodedAuth != "" {
		newCtnr.PullOptions = &swarmapi.ContainerSpec_PullOptions{RegistryAuth: encodedAuth}
	} else {
		// this is needed because if the encodedAuth isn't being updated then we
		// shouldn't lose it, and continue to use the one that was already present
		var ctnr *swarmapi.ContainerSpec
		switch registryAuthFrom {
		case apitypes.RegistryAuthFromSpec, "":
			ctnr = currentService.Spec.Task.GetContainer()
		case apitypes.RegistryAuthFromPreviousSpec:
			if currentService.PreviousSpec == nil {
				return nil, fmt.Errorf("service does not have a previous spec")
			}
			ctnr = currentService.PreviousSpec.Task.GetContainer()
		default:
			return nil, fmt.Errorf("unsupported registryAuthFromValue")
		}
		if ctnr == nil {
			return nil, fmt.Errorf("service does not use container tasks")
		}
		newCtnr.PullOptions = ctnr.PullOptions
		// update encodedAuth so it can be used to pin image by digest
		if ctnr.PullOptions != nil {
			encodedAuth = ctnr.PullOptions.RegistryAuth
		}
	}

	// retrieve auth config from encoded auth
	authConfig := &apitypes.AuthConfig{}
	if encodedAuth != "" {
		if err := json.NewDecoder(base64.NewDecoder(base64.URLEncoding, strings.NewReader(encodedAuth))).Decode(authConfig); err != nil {
			logrus.Warnf("invalid authconfig: %v", err)
		}
	}

	resp := &apitypes.ServiceUpdateResponse{}

	// pin image by digest
	if os.Getenv("DOCKER_SERVICE_PREFER_OFFLINE_IMAGE") != "1" {
		digestImage, err := c.imageWithDigestString(ctx, newCtnr.Image, authConfig)
		if err != nil {
			logrus.Warnf("unable to pin image %s to digest: %s", newCtnr.Image, err.Error())
			resp.Warnings = append(resp.Warnings, fmt.Sprintf("unable to pin image %s to digest: %s", newCtnr.Image, err.Error()))
		} else if newCtnr.Image != digestImage {
			logrus.Debugf("pinning image %s by digest: %s", newCtnr.Image, digestImage)
			newCtnr.Image = digestImage
		} else {
			logrus.Debugf("updating service using supplied digest reference %s", newCtnr.Image)
		}

		// Replace the context with a fresh one.
		// If we timed out while communicating with the
		// registry, then "ctx" will already be expired, which
		// would cause UpdateService below to fail. Reusing
		// "ctx" could make it impossible to create a service
		// if the registry is slow or unresponsive.
		var newCancel func()
		ctx, newCancel = c.getRequestContext()
		defer newCancel()
	}

	_, err = c.client.UpdateService(
		ctx,
		&swarmapi.UpdateServiceRequest{
			ServiceID: currentService.ID,
			Spec:      &serviceSpec,
			ServiceVersion: &swarmapi.Version{
				Index: version,
			},
		},
	)

	return resp, err
}

// RemoveService removes a service from a managed swarm cluster.
func (c *Cluster) RemoveService(input string) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	service, err := getService(ctx, c.client, input)
	if err != nil {
		return err
	}

	if _, err := c.client.RemoveService(ctx, &swarmapi.RemoveServiceRequest{ServiceID: service.ID}); err != nil {
		return err
	}
	return nil
}

// ServiceLogs collects service logs and writes them back to `config.OutStream`
func (c *Cluster) ServiceLogs(ctx context.Context, input string, config *backend.ContainerLogsConfig, started chan struct{}) error {
	c.RLock()
	if !c.isActiveManager() {
		c.RUnlock()
		return c.errNoManager()
	}

	service, err := getService(ctx, c.client, input)
	if err != nil {
		c.RUnlock()
		return err
	}

	stream, err := c.logs.SubscribeLogs(ctx, &swarmapi.SubscribeLogsRequest{
		Selector: &swarmapi.LogSelector{
			ServiceIDs: []string{service.ID},
		},
		Options: &swarmapi.LogSubscriptionOptions{
			Follow: config.Follow,
		},
	})
	if err != nil {
		c.RUnlock()
		return err
	}

	wf := ioutils.NewWriteFlusher(config.OutStream)
	defer wf.Close()
	close(started)
	wf.Flush()

	outStream := stdcopy.NewStdWriter(wf, stdcopy.Stdout)
	errStream := stdcopy.NewStdWriter(wf, stdcopy.Stderr)

	// Release the lock before starting the stream.
	c.RUnlock()
	for {
		// Check the context before doing anything.
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		subscribeMsg, err := stream.Recv()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		for _, msg := range subscribeMsg.Messages {
			data := []byte{}

			if config.Timestamps {
				ts, err := ptypes.Timestamp(msg.Timestamp)
				if err != nil {
					return err
				}
				data = append(data, []byte(ts.Format(logger.TimeFormat)+" ")...)
			}

			data = append(data, []byte(fmt.Sprintf("%s.node.id=%s,%s.service.id=%s,%s.task.id=%s ",
				contextPrefix, msg.Context.NodeID,
				contextPrefix, msg.Context.ServiceID,
				contextPrefix, msg.Context.TaskID,
			))...)

			data = append(data, msg.Data...)

			switch msg.Stream {
			case swarmapi.LogStreamStdout:
				outStream.Write(data)
			case swarmapi.LogStreamStderr:
				errStream.Write(data)
			}
		}
	}
}

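// ServiceLogs multiplexes stdout and stderr through stdcopy and prefixes
// every message with its swarm context, so a raw line in the stream looks
// roughly like this (IDs are illustrative):
//
//	com.docker.swarm.node.id=abc123,com.docker.swarm.service.id=def456,com.docker.swarm.task.id=ghi789 <log payload>
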
// GetNodes returns a list of all nodes known to a cluster.
func (c *Cluster) GetNodes(options apitypes.NodeListOptions) ([]types.Node, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	filters, err := newListNodesFilters(options.Filters)
	if err != nil {
		return nil, err
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	r, err := c.client.ListNodes(
		ctx,
		&swarmapi.ListNodesRequest{Filters: filters})
	if err != nil {
		return nil, err
	}

	nodes := []types.Node{}

	for _, node := range r.Nodes {
		nodes = append(nodes, convert.NodeFromGRPC(*node))
	}
	return nodes, nil
}

// GetNode returns a node based on an ID.
func (c *Cluster) GetNode(input string) (types.Node, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return types.Node{}, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	node, err := getNode(ctx, c.client, input)
	if err != nil {
		return types.Node{}, err
	}
	return convert.NodeFromGRPC(*node), nil
}

// UpdateNode updates existing nodes properties.
func (c *Cluster) UpdateNode(input string, version uint64, spec types.NodeSpec) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	nodeSpec, err := convert.NodeSpecToGRPC(spec)
	if err != nil {
		return apierrors.NewBadRequestError(err)
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	currentNode, err := getNode(ctx, c.client, input)
	if err != nil {
		return err
	}

	_, err = c.client.UpdateNode(
		ctx,
		&swarmapi.UpdateNodeRequest{
			NodeID: currentNode.ID,
			Spec:   &nodeSpec,
			NodeVersion: &swarmapi.Version{
				Index: version,
			},
		},
	)
	return err
}

// RemoveNode removes a node from a cluster
func (c *Cluster) RemoveNode(input string, force bool) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	node, err := getNode(ctx, c.client, input)
	if err != nil {
		return err
	}

	if _, err := c.client.RemoveNode(ctx, &swarmapi.RemoveNodeRequest{NodeID: node.ID, Force: force}); err != nil {
		return err
	}
	return nil
}

// GetTasks returns a list of tasks matching the filter options.
func (c *Cluster) GetTasks(options apitypes.TaskListOptions) ([]types.Task, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	byName := func(filter filters.Args) error {
		if filter.Include("service") {
			serviceFilters := filter.Get("service")
			for _, serviceFilter := range serviceFilters {
				service, err := c.GetService(serviceFilter)
				if err != nil {
					return err
				}
				filter.Del("service", serviceFilter)
				filter.Add("service", service.ID)
			}
		}
		if filter.Include("node") {
			nodeFilters := filter.Get("node")
			for _, nodeFilter := range nodeFilters {
				node, err := c.GetNode(nodeFilter)
				if err != nil {
					return err
				}
				filter.Del("node", nodeFilter)
				filter.Add("node", node.ID)
			}
		}
		return nil
	}

	filters, err := newListTasksFilters(options.Filters, byName)
	if err != nil {
		return nil, err
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	r, err := c.client.ListTasks(
		ctx,
		&swarmapi.ListTasksRequest{Filters: filters})
	if err != nil {
		return nil, err
	}

	tasks := []types.Task{}

	for _, task := range r.Tasks {
		if task.Spec.GetContainer() != nil {
			tasks = append(tasks, convert.TaskFromGRPC(*task))
		}
	}
	return tasks, nil
}

// GetTask returns a task by an ID.
func (c *Cluster) GetTask(input string) (types.Task, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return types.Task{}, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	task, err := getTask(ctx, c.client, input)
	if err != nil {
		return types.Task{}, err
	}
	return convert.TaskFromGRPC(*task), nil
}

// GetNetwork returns a cluster network by an ID.
func (c *Cluster) GetNetwork(input string) (apitypes.NetworkResource, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return apitypes.NetworkResource{}, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	network, err := getNetwork(ctx, c.client, input)
	if err != nil {
		return apitypes.NetworkResource{}, err
	}
	return convert.BasicNetworkFromGRPC(*network), nil
}

func (c *Cluster) getNetworks(filters *swarmapi.ListNetworksRequest_Filters) ([]apitypes.NetworkResource, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return nil, c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	r, err := c.client.ListNetworks(ctx, &swarmapi.ListNetworksRequest{Filters: filters})
	if err != nil {
		return nil, err
	}

	var networks []apitypes.NetworkResource

	for _, network := range r.Networks {
		networks = append(networks, convert.BasicNetworkFromGRPC(*network))
	}

	return networks, nil
}

// GetNetworks returns all current cluster managed networks.
func (c *Cluster) GetNetworks() ([]apitypes.NetworkResource, error) {
	return c.getNetworks(nil)
}

// GetNetworksByName returns cluster managed networks by name.
// It is ok to have multiple networks here. #18864
func (c *Cluster) GetNetworksByName(name string) ([]apitypes.NetworkResource, error) {
	// Note that swarmapi.GetNetworkRequest.Name is not functional.
	// So we cannot just use that with c.GetNetwork.
	return c.getNetworks(&swarmapi.ListNetworksRequest_Filters{
		Names: []string{name},
	})
}

func attacherKey(target, containerID string) string {
	return containerID + ":" + target
}

// UpdateAttachment signals the attachment config to the attachment
// waiter who is trying to start or attach the container to the
// network.
func (c *Cluster) UpdateAttachment(target, containerID string, config *network.NetworkingConfig) error {
	c.RLock()
	attacher, ok := c.attachers[attacherKey(target, containerID)]
	c.RUnlock()
	if !ok || attacher == nil {
		return fmt.Errorf("could not find attacher for container %s to network %s", containerID, target)
	}

	attacher.attachWaitCh <- config
	close(attacher.attachWaitCh)
	return nil
}

// WaitForDetachment waits for the container to stop or detach from
// the network.
func (c *Cluster) WaitForDetachment(ctx context.Context, networkName, networkID, taskID, containerID string) error {
	c.RLock()
	attacher, ok := c.attachers[attacherKey(networkName, containerID)]
	if !ok {
		attacher, ok = c.attachers[attacherKey(networkID, containerID)]
	}
	if c.node == nil || c.node.Agent() == nil {
		c.RUnlock()
		return fmt.Errorf("invalid cluster node while waiting for detachment")
	}

	agent := c.node.Agent()
	c.RUnlock()

	if ok && attacher != nil &&
		attacher.detachWaitCh != nil &&
		attacher.attachCompleteCh != nil {
		// Attachment may be in progress still so wait for
		// attachment to complete.
		select {
		case <-attacher.attachCompleteCh:
		case <-ctx.Done():
			return ctx.Err()
		}

		if attacher.taskID == taskID {
			select {
			case <-attacher.detachWaitCh:
			case <-ctx.Done():
				return ctx.Err()
			}
		}
	}

	return agent.ResourceAllocator().DetachNetwork(ctx, taskID)
}

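// The attachment handshake is coordinated entirely through the attacher
// channels declared near the top of this file: AttachNetwork blocks on
// attachWaitCh until UpdateAttachment delivers the NetworkingConfig from the
// manager, attachCompleteCh is closed once the task ID is known, and
// DetachNetwork closes detachWaitCh to release WaitForDetachment.
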
// AttachNetwork generates an attachment request towards the manager.
func (c *Cluster) AttachNetwork(target string, containerID string, addresses []string) (*network.NetworkingConfig, error) {
	aKey := attacherKey(target, containerID)
	c.Lock()
	if c.node == nil || c.node.Agent() == nil {
		c.Unlock()
		return nil, fmt.Errorf("invalid cluster node while attaching to network")
	}
	if attacher, ok := c.attachers[aKey]; ok {
		c.Unlock()
		return attacher.config, nil
	}

	agent := c.node.Agent()
	attachWaitCh := make(chan *network.NetworkingConfig)
	detachWaitCh := make(chan struct{})
	attachCompleteCh := make(chan struct{})
	c.attachers[aKey] = &attacher{
		attachWaitCh:     attachWaitCh,
		attachCompleteCh: attachCompleteCh,
		detachWaitCh:     detachWaitCh,
	}
	c.Unlock()

	ctx, cancel := c.getRequestContext()
	defer cancel()

	taskID, err := agent.ResourceAllocator().AttachNetwork(ctx, containerID, target, addresses)
	if err != nil {
		c.Lock()
		delete(c.attachers, aKey)
		c.Unlock()
		return nil, fmt.Errorf("Could not attach to network %s: %v", target, err)
	}

	c.Lock()
	c.attachers[aKey].taskID = taskID
	close(attachCompleteCh)
	c.Unlock()

	logrus.Debugf("Successfully attached to network %s with task id %s", target, taskID)

	release := func() {
		ctx, cancel := c.getRequestContext()
		defer cancel()
		if err := agent.ResourceAllocator().DetachNetwork(ctx, taskID); err != nil {
			logrus.Errorf("Failed remove network attachment %s to network %s on allocation failure: %v",
				taskID, target, err)
		}
	}

	var config *network.NetworkingConfig
	select {
	case config = <-attachWaitCh:
	case <-ctx.Done():
		release()
		return nil, fmt.Errorf("attaching to network failed, make sure your network options are correct and check manager logs: %v", ctx.Err())
	}

	c.Lock()
	c.attachers[aKey].config = config
	c.Unlock()

	logrus.Debugf("Successfully allocated resources on network %s for task id %s", target, taskID)

	return config, nil
}

// DetachNetwork unblocks the waiters waiting on WaitForDetachment so
// that a request to detach can be generated towards the manager.
func (c *Cluster) DetachNetwork(target string, containerID string) error {
	aKey := attacherKey(target, containerID)

	c.Lock()
	attacher, ok := c.attachers[aKey]
	delete(c.attachers, aKey)
	c.Unlock()

	if !ok {
		return fmt.Errorf("could not find network attachment for container %s to network %s", containerID, target)
	}

	close(attacher.detachWaitCh)
	return nil
}

// CreateNetwork creates a new cluster managed network.
func (c *Cluster) CreateNetwork(s apitypes.NetworkCreateRequest) (string, error) {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return "", c.errNoManager()
	}

	if runconfig.IsPreDefinedNetwork(s.Name) {
		err := fmt.Errorf("%s is a pre-defined network and cannot be created", s.Name)
		return "", apierrors.NewRequestForbiddenError(err)
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	networkSpec := convert.BasicNetworkCreateToGRPC(s)
	r, err := c.client.CreateNetwork(ctx, &swarmapi.CreateNetworkRequest{Spec: &networkSpec})
	if err != nil {
		return "", err
	}

	return r.Network.ID, nil
}

// RemoveNetwork removes a cluster network.
func (c *Cluster) RemoveNetwork(input string) error {
	c.RLock()
	defer c.RUnlock()

	if !c.isActiveManager() {
		return c.errNoManager()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	network, err := getNetwork(ctx, c.client, input)
	if err != nil {
		return err
	}

	if _, err := c.client.RemoveNetwork(ctx, &swarmapi.RemoveNetworkRequest{NetworkID: network.ID}); err != nil {
		return err
	}
	return nil
}

func (c *Cluster) populateNetworkID(ctx context.Context, client swarmapi.ControlClient, s *types.ServiceSpec) error {
	// Always prefer NetworkAttachmentConfigs from TaskTemplate
	// but fallback to service spec for backward compatibility
	networks := s.TaskTemplate.Networks
	if len(networks) == 0 {
		networks = s.Networks
	}

	for i, n := range networks {
		apiNetwork, err := getNetwork(ctx, client, n.Target)
		if err != nil {
			if ln, _ := c.config.Backend.FindNetwork(n.Target); ln != nil && !ln.Info().Dynamic() {
				err = fmt.Errorf("The network %s cannot be used with services. Only networks scoped to the swarm can be used, such as those created with the overlay driver.", ln.Name())
				return apierrors.NewRequestForbiddenError(err)
			}
			return err
		}
		networks[i].Target = apiNetwork.ID
	}
	return nil
}

func getNetwork(ctx context.Context, c swarmapi.ControlClient, input string) (*swarmapi.Network, error) {
	// GetNetwork to match via full ID.
	rg, err := c.GetNetwork(ctx, &swarmapi.GetNetworkRequest{NetworkID: input})
	if err != nil {
		// If any error (including NotFound), ListNetworks to match via ID prefix and full name.
		rl, err := c.ListNetworks(ctx, &swarmapi.ListNetworksRequest{Filters: &swarmapi.ListNetworksRequest_Filters{Names: []string{input}}})
		if err != nil || len(rl.Networks) == 0 {
			rl, err = c.ListNetworks(ctx, &swarmapi.ListNetworksRequest{Filters: &swarmapi.ListNetworksRequest_Filters{IDPrefixes: []string{input}}})
		}

		if err != nil {
			return nil, err
		}

		if len(rl.Networks) == 0 {
			return nil, fmt.Errorf("network %s not found", input)
		}

		if l := len(rl.Networks); l > 1 {
			return nil, fmt.Errorf("network %s is ambiguous (%d matches found)", input, l)
		}

		return rl.Networks[0], nil
	}
	return rg.Network, nil
}

// Cleanup stops active swarm node. This is run before daemon shutdown.
func (c *Cluster) Cleanup() {
	c.Lock()
	node := c.node
	if node == nil {
		c.Unlock()
		return
	}
	defer c.Unlock()
	if c.isActiveManager() {
		active, reachable, unreachable, err := c.managerStats()
		if err == nil {
			singlenode := active && isLastManager(reachable, unreachable)
			if active && !singlenode && removingManagerCausesLossOfQuorum(reachable, unreachable) {
				logrus.Errorf("Leaving cluster with %v managers left out of %v. Raft quorum will be lost.", reachable-1, reachable+unreachable)
			}
		}
	}
	c.stopNode()
}

func (c *Cluster) managerStats() (current bool, reachable int, unreachable int, err error) {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	nodes, err := c.client.ListNodes(ctx, &swarmapi.ListNodesRequest{})
	if err != nil {
		return false, 0, 0, err
	}
	for _, n := range nodes.Nodes {
		if n.ManagerStatus != nil {
			if n.ManagerStatus.Reachability == swarmapi.RaftMemberStatus_REACHABLE {
				reachable++
				if n.ID == c.node.NodeID() {
					current = true
				}
			}
			if n.ManagerStatus.Reachability == swarmapi.RaftMemberStatus_UNREACHABLE {
				unreachable++
			}
		}
	}
	return
}

func validateAndSanitizeInitRequest(req *types.InitRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}

	if req.Spec.Annotations.Name == "" {
		req.Spec.Annotations.Name = "default"
	} else if req.Spec.Annotations.Name != "default" {
		return errors.New(`swarm spec must be named "default"`)
	}

	return nil
}

func validateAndSanitizeJoinRequest(req *types.JoinRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}
	if len(req.RemoteAddrs) == 0 {
		return fmt.Errorf("at least 1 RemoteAddr is required to join")
	}
	for i := range req.RemoteAddrs {
		req.RemoteAddrs[i], err = validateAddr(req.RemoteAddrs[i])
		if err != nil {
			return fmt.Errorf("invalid remoteAddr %q: %v", req.RemoteAddrs[i], err)
		}
	}
	return nil
}

func validateAddr(addr string) (string, error) {
	if addr == "" {
		return addr, fmt.Errorf("invalid empty address")
	}
	newaddr, err := opts.ParseTCPAddr(addr, defaultAddr)
	if err != nil {
		return addr, nil
	}
	return strings.TrimPrefix(newaddr, "tcp://"), nil
}

func initClusterSpec(node *node, spec types.Spec) error {
	ctx, _ := context.WithTimeout(context.Background(), 5*time.Second)
	for conn := range node.ListenControlSocket(ctx) {
		if ctx.Err() != nil {
			return ctx.Err()
		}
		if conn != nil {
			client := swarmapi.NewControlClient(conn)
			var cluster *swarmapi.Cluster
			for i := 0; ; i++ {
				lcr, err := client.ListClusters(ctx, &swarmapi.ListClustersRequest{})
				if err != nil {
					return fmt.Errorf("error on listing clusters: %v", err)
				}
				if len(lcr.Clusters) == 0 {
					if i < 10 {
						time.Sleep(200 * time.Millisecond)
						continue
					}
					return fmt.Errorf("empty list of clusters was returned")
				}
				cluster = lcr.Clusters[0]
				break
			}
			// In init, we take the initial default values from swarmkit, and merge
			// any non nil or 0 value from spec to GRPC spec. This will leave the
			// default value alone.
			// Note that this is different from Update(), as in Update() we expect
			// the user to specify the complete spec of the cluster (as they already
			// know the existing one and know which field to update).
			clusterSpec, err := convert.MergeSwarmSpecToGRPC(spec, cluster.Spec)
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			_, err = client.UpdateCluster(ctx, &swarmapi.UpdateClusterRequest{
				ClusterID:      cluster.ID,
				ClusterVersion: &cluster.Meta.Version,
				Spec:           &clusterSpec,
			})
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			return nil
		}
	}
	return ctx.Err()
}

func detectLockedError(err error) error {
	if err == swarmnode.ErrInvalidUnlockKey {
		return errors.WithStack(ErrSwarmLocked)
	}
	return err
}