github.com/flavio/docker@v0.1.3-0.20170117145210-f63d1a6eec47/daemon/cluster/cluster.go

package cluster

//
// ## Swarmkit integration
//
// Cluster - static configurable object for accessing everything swarm related.
// Contains methods for connecting and controlling the cluster. Exists always,
// even if swarm mode is not enabled.
//
// NodeRunner - Manager for starting the swarmkit node. It is present if and
// only if swarm mode is enabled. Implements a backoff restart loop in case of
// errors.
//
// NodeState - Information about the current node status including access to
// gRPC clients if a manager is active.
//
// ### Locking
//
// `cluster.controlMutex` - taken for the whole lifecycle of the processes that
// can reconfigure the cluster (init/join/leave etc.). Ensures that one
// reconfiguration action has fully completed before another can start.
//
// `cluster.mu` - taken when the actual changes in cluster configuration
// happen. Different from `controlMutex` because in some cases we need to
// access current cluster state even if a long-running reconfiguration is
// going on. For example, the network stack may ask for the current cluster
// state in the middle of the shutdown. Any time the current cluster state is
// asked for, you should take the read lock of `cluster.mu`. If you are writing
// an API responder that returns synchronously, hold `cluster.mu.RLock()` for
// the duration of the whole handler function. That ensures that the node will
// not be shut down until the handler has finished.
//
// NodeRunner implements its own internal locks that should not be used outside
// of the struct. Instead, you should just call the `nodeRunner.State()` method
// to get the current state of the cluster (you still need `cluster.mu.RLock()`
// to access the `cluster.nr` reference itself). Most of the changes in
// NodeRunner happen because of an external event (network problem, unexpected
// swarmkit error) and Docker shouldn't take any locks that delay these changes
// from happening.
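//
// As a rough illustration of the read-lock convention described above (a
// sketch only, not code from this file; `someHandler` and its signature are
// hypothetical, while `currentNodeState`, `IsActiveManager` and `errNoManager`
// are defined below):
//
//	func (c *Cluster) someHandler() (nodeState, error) {
//		c.mu.RLock()
//		defer c.mu.RUnlock() // held for the whole synchronous handler
//
//		state := c.currentNodeState()
//		if !state.IsActiveManager() {
//			return nodeState{}, c.errNoManager(state)
//		}
//		// ... use state (e.g. state.controlClient) while the read lock is held ...
//		return state, nil
//	}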
//

import (
	"crypto/x509"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"io"
	"net"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/Sirupsen/logrus"
	distreference "github.com/docker/distribution/reference"
	apierrors "github.com/docker/docker/api/errors"
	apitypes "github.com/docker/docker/api/types"
	"github.com/docker/docker/api/types/backend"
	"github.com/docker/docker/api/types/filters"
	"github.com/docker/docker/api/types/network"
	types "github.com/docker/docker/api/types/swarm"
	"github.com/docker/docker/daemon/cluster/convert"
	executorpkg "github.com/docker/docker/daemon/cluster/executor"
	"github.com/docker/docker/daemon/logger"
	"github.com/docker/docker/opts"
	"github.com/docker/docker/pkg/ioutils"
	"github.com/docker/docker/pkg/signal"
	"github.com/docker/docker/pkg/stdcopy"
	"github.com/docker/docker/reference"
	"github.com/docker/docker/runconfig"
	swarmapi "github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/manager/encryption"
	swarmnode "github.com/docker/swarmkit/node"
	"github.com/docker/swarmkit/protobuf/ptypes"
	"github.com/opencontainers/go-digest"
	"github.com/pkg/errors"
	"golang.org/x/net/context"
)

const swarmDirName = "swarm"
const controlSocket = "control.sock"
const swarmConnectTimeout = 20 * time.Second
const swarmRequestTimeout = 20 * time.Second
const stateFile = "docker-state.json"
const defaultAddr = "0.0.0.0:2377"

const (
	initialReconnectDelay = 100 * time.Millisecond
	maxReconnectDelay     = 30 * time.Second
	contextPrefix         = "com.docker.swarm"
)

// errNoSwarm is returned on leaving a cluster that was never initialized
var errNoSwarm = errors.New("This node is not part of a swarm")

// errSwarmExists is returned on initialize or join request for a cluster that has already been activated
var errSwarmExists = errors.New("This node is already part of a swarm. Use \"docker swarm leave\" to leave this swarm and join another one.")

// errSwarmJoinTimeoutReached is returned when cluster join could not complete before timeout was reached.
var errSwarmJoinTimeoutReached = errors.New("Timeout was reached before node was joined. The attempt to join the swarm will continue in the background. Use the \"docker info\" command to see the current swarm status of your node.")

// errSwarmLocked is returned if the swarm is encrypted and needs a key to unlock it.
var errSwarmLocked = errors.New("Swarm is encrypted and needs to be unlocked before it can be used. Please use \"docker swarm unlock\" to unlock it.")

// errSwarmCertificatesExpired is returned if docker was not started for the whole validity period and the certificates had no chance to be renewed automatically.
var errSwarmCertificatesExpired = errors.New("Swarm certificates have expired. To replace them, leave the swarm and join again.")

// NetworkSubnetsProvider exposes functions for retrieving the subnets
// of networks managed by Docker, so they can be filtered.
type NetworkSubnetsProvider interface {
	V4Subnets() []net.IPNet
	V6Subnets() []net.IPNet
}

// Config provides values for Cluster.
type Config struct {
	Root                   string
	Name                   string
	Backend                executorpkg.Backend
	NetworkSubnetsProvider NetworkSubnetsProvider

	// DefaultAdvertiseAddr is the default host/IP or network interface to use
	// if no AdvertiseAddr value is specified.
	DefaultAdvertiseAddr string

	// path to store runtime state, such as the swarm control socket
	RuntimeRoot string
}

// Cluster provides capabilities to participate in a cluster as a worker or a
// manager.
type Cluster struct {
	mu           sync.RWMutex
	controlMutex sync.RWMutex // protect init/join/leave user operations
	nr           *nodeRunner
	root         string
	runtimeRoot  string
	config       Config
	configEvent  chan struct{} // todo: make this array and goroutine safe
	attachers    map[string]*attacher
}

// attacher manages the in-memory attachment state of a container
// attachment to a global scope network managed by swarm manager. It
// helps in identifying the attachment ID via the taskID and the
// corresponding attachment configuration obtained from the manager.
type attacher struct {
	taskID           string
	config           *network.NetworkingConfig
	attachWaitCh     chan *network.NetworkingConfig
	attachCompleteCh chan struct{}
	detachWaitCh     chan struct{}
}

// New creates a new Cluster instance using provided config.
func New(config Config) (*Cluster, error) {
	root := filepath.Join(config.Root, swarmDirName)
	if err := os.MkdirAll(root, 0700); err != nil {
		return nil, err
	}
	if config.RuntimeRoot == "" {
		config.RuntimeRoot = root
	}
	if err := os.MkdirAll(config.RuntimeRoot, 0700); err != nil {
		return nil, err
	}
	c := &Cluster{
		root:        root,
		config:      config,
		configEvent: make(chan struct{}, 10),
		runtimeRoot: config.RuntimeRoot,
		attachers:   make(map[string]*attacher),
	}

	nodeConfig, err := loadPersistentState(root)
	if err != nil {
		if os.IsNotExist(err) {
			return c, nil
		}
		return nil, err
	}

	nr, err := c.newNodeRunner(*nodeConfig)
	if err != nil {
		return nil, err
	}
	c.nr = nr

	select {
	case <-time.After(swarmConnectTimeout):
		logrus.Error("swarm component could not be started before timeout was reached")
	case err := <-nr.Ready():
		if err != nil {
			if errors.Cause(err) == errSwarmLocked {
				return c, nil
			}
			if err, ok := errors.Cause(c.nr.err).(x509.CertificateInvalidError); ok && err.Reason == x509.Expired {
				return c, nil
			}
			return nil, errors.Wrap(err, "swarm component could not be started")
		}
	}
	return c, nil
}

func (c *Cluster) newNodeRunner(conf nodeStartConfig) (*nodeRunner, error) {
	if err := c.config.Backend.IsSwarmCompatible(); err != nil {
		return nil, err
	}

	actualLocalAddr := conf.LocalAddr
	if actualLocalAddr == "" {
		// If localAddr was not specified, resolve it automatically
		// based on the route to joinAddr. localAddr can only be left
		// empty on "join".
		listenHost, _, err := net.SplitHostPort(conf.ListenAddr)
		if err != nil {
			return nil, fmt.Errorf("could not parse listen address: %v", err)
		}

		listenAddrIP := net.ParseIP(listenHost)
		if listenAddrIP == nil || !listenAddrIP.IsUnspecified() {
			actualLocalAddr = listenHost
		} else {
			if conf.RemoteAddr == "" {
				// Should never happen except using swarms created by
				// old versions that didn't save remoteAddr.
				conf.RemoteAddr = "8.8.8.8:53"
			}
			conn, err := net.Dial("udp", conf.RemoteAddr)
			if err != nil {
				return nil, fmt.Errorf("could not find local IP address: %v", err)
			}
			localHostPort := conn.LocalAddr().String()
			actualLocalAddr, _, _ = net.SplitHostPort(localHostPort)
			conn.Close()
		}
	}

	nr := &nodeRunner{cluster: c}
	nr.actualLocalAddr = actualLocalAddr

	if err := nr.Start(conf); err != nil {
		return nil, err
	}

	c.config.Backend.SetClusterProvider(c)

	return nr, nil
}

// Init initializes a new cluster from a user-provided request.
func (c *Cluster) Init(req types.InitRequest) (string, error) {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()
	c.mu.Lock()
	if c.nr != nil {
		if req.ForceNewCluster {
			if err := c.nr.Stop(); err != nil {
				c.mu.Unlock()
				return "", err
			}
		} else {
			c.mu.Unlock()
			return "", errSwarmExists
		}
	}
	c.mu.Unlock()

	if err := validateAndSanitizeInitRequest(&req); err != nil {
		return "", apierrors.NewBadRequestError(err)
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		return "", err
	}

	advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
	if err != nil {
		return "", err
	}

	localAddr := listenHost

	// If the local address is undetermined, the advertise address
	// will be used as local address, if it belongs to this system.
	// If the advertise address is not local, then we try to find
	// a system address to use as local address. If this fails,
	// we give up and ask the user to pass the listen address.
	if net.ParseIP(localAddr).IsUnspecified() {
		advertiseIP := net.ParseIP(advertiseHost)

		found := false
		for _, systemIP := range listSystemIPs() {
			if systemIP.Equal(advertiseIP) {
				localAddr = advertiseIP.String()
				found = true
				break
			}
		}

		if !found {
			ip, err := c.resolveSystemAddr()
			if err != nil {
				logrus.Warnf("Could not find a local address: %v", err)
				return "", errMustSpecifyListenAddr
			}
			localAddr = ip.String()
		}
	}

	if !req.ForceNewCluster {
		clearPersistentState(c.root)
	}

	nr, err := c.newNodeRunner(nodeStartConfig{
		forceNewCluster: req.ForceNewCluster,
		autolock:        req.AutoLockManagers,
		LocalAddr:       localAddr,
		ListenAddr:      net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr:   net.JoinHostPort(advertiseHost, advertisePort),
		availability:    req.Availability,
	})
	if err != nil {
		return "", err
	}
	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	if err := <-nr.Ready(); err != nil {
		if !req.ForceNewCluster { // if failure on first attempt don't keep state
			if err := clearPersistentState(c.root); err != nil {
				return "", err
			}
		}
		if err != nil {
			c.mu.Lock()
			c.nr = nil
			c.mu.Unlock()
		}
		return "", err
	}
	state := nr.State()
	if state.swarmNode == nil { // should never happen but protect from panic
		return "", errors.New("invalid cluster state for spec initialization")
	}
	if err := initClusterSpec(state.swarmNode, req.Spec); err != nil {
		return "", err
	}
	return state.NodeID(), nil
}

// Join makes the current Cluster part of an existing swarm cluster.
func (c *Cluster) Join(req types.JoinRequest) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()
	c.mu.Lock()
	if c.nr != nil {
		c.mu.Unlock()
		return errSwarmExists
	}
	c.mu.Unlock()

	if err := validateAndSanitizeJoinRequest(&req); err != nil {
		return apierrors.NewBadRequestError(err)
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		return err
	}

	var advertiseAddr string
	if req.AdvertiseAddr != "" {
		advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
		// For joining, we don't need to provide an advertise address,
		// since the remote side can detect it.
		if err == nil {
			advertiseAddr = net.JoinHostPort(advertiseHost, advertisePort)
		}
	}

	clearPersistentState(c.root)

	nr, err := c.newNodeRunner(nodeStartConfig{
		RemoteAddr:    req.RemoteAddrs[0],
		ListenAddr:    net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr: advertiseAddr,
		joinAddr:      req.RemoteAddrs[0],
		joinToken:     req.JoinToken,
		availability:  req.Availability,
	})
	if err != nil {
		return err
	}

	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	select {
	case <-time.After(swarmConnectTimeout):
		return errSwarmJoinTimeoutReached
	case err := <-nr.Ready():
		if err != nil {
			c.mu.Lock()
			c.nr = nil
			c.mu.Unlock()
		}
		return err
	}
}

// GetUnlockKey returns the unlock key for the swarm.
func (c *Cluster) GetUnlockKey() (string, error) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	if !state.IsActiveManager() {
		return "", c.errNoManager(state)
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	client := swarmapi.NewCAClient(state.grpcConn)

	r, err := client.GetUnlockKey(ctx, &swarmapi.GetUnlockKeyRequest{})
	if err != nil {
		return "", err
	}

	if len(r.UnlockKey) == 0 {
		// no key
		return "", nil
	}

	return encryption.HumanReadableKey(r.UnlockKey), nil
}

// UnlockSwarm provides a key to decrypt data that is encrypted at rest.
func (c *Cluster) UnlockSwarm(req types.UnlockRequest) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()

	c.mu.RLock()
	state := c.currentNodeState()

	if !state.IsActiveManager() {
		// when the manager is not active, return an error unless the
		// swarm is locked
		if err := c.errNoManager(state); err != errSwarmLocked {
			c.mu.RUnlock()
			return err
		}
	} else {
		// when the manager is active, return a "not locked" error
		c.mu.RUnlock()
		return errors.New("swarm is not locked")
	}

	// code only reaches here when the swarm is locked
	nr := c.nr
	c.mu.RUnlock()

	key, err := encryption.ParseHumanReadableKey(req.UnlockKey)
	if err != nil {
		return err
	}

	config := nr.config
	config.lockKey = key
	if err := nr.Stop(); err != nil {
		return err
	}
	nr, err = c.newNodeRunner(config)
	if err != nil {
		return err
	}

	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	if err := <-nr.Ready(); err != nil {
		if errors.Cause(err) == errSwarmLocked {
			return errors.New("swarm could not be unlocked: invalid key provided")
		}
		return fmt.Errorf("swarm component could not be started: %v", err)
	}
	return nil
}

// Leave shuts down Cluster and removes current state.
func (c *Cluster) Leave(force bool) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()

	c.mu.Lock()
	nr := c.nr
	if nr == nil {
		c.mu.Unlock()
		return errNoSwarm
	}

	state := c.currentNodeState()

	if errors.Cause(state.err) == errSwarmLocked && !force {
		// leaving a locked swarm without --force is not allowed
		c.mu.Unlock()
		return errors.New("Swarm is encrypted and locked. Please unlock it first or use `--force` to ignore this message.")
	}

	if state.IsManager() && !force {
		msg := "You are attempting to leave the swarm on a node that is participating as a manager. "
		if state.IsActiveManager() {
			active, reachable, unreachable, err := managerStats(state.controlClient, state.NodeID())
			if err == nil {
				if active && removingManagerCausesLossOfQuorum(reachable, unreachable) {
					if isLastManager(reachable, unreachable) {
						msg += "Removing the last manager erases all current state of the swarm. Use `--force` to ignore this message. "
						c.mu.Unlock()
						return errors.New(msg)
					}
					msg += fmt.Sprintf("Removing this node leaves %v managers out of %v. Without a Raft quorum your swarm will be inaccessible. ", reachable-1, reachable+unreachable)
				}
			}
		} else {
			msg += "Doing so may lose the consensus of your cluster. "
		}

		msg += "The only way to restore a swarm that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to suppress this message."
		c.mu.Unlock()
		return errors.New(msg)
	}
	// release readers in here
	if err := nr.Stop(); err != nil {
		logrus.Errorf("failed to shut down cluster node: %v", err)
		signal.DumpStacks("")
		c.mu.Unlock()
		return err
	}
	c.nr = nil
	c.mu.Unlock()
	if nodeID := state.NodeID(); nodeID != "" {
		nodeContainers, err := c.listContainerForNode(nodeID)
		if err != nil {
			return err
		}
		for _, id := range nodeContainers {
			if err := c.config.Backend.ContainerRm(id, &apitypes.ContainerRmConfig{ForceRemove: true}); err != nil {
				logrus.Errorf("error removing %v: %v", id, err)
			}
		}
	}

	c.configEvent <- struct{}{}
	// todo: cleanup optional?
	if err := clearPersistentState(c.root); err != nil {
		return err
	}
	c.config.Backend.SetClusterProvider(nil)
	return nil
}

func (c *Cluster) listContainerForNode(nodeID string) ([]string, error) {
	var ids []string
	filters := filters.NewArgs()
	filters.Add("label", fmt.Sprintf("com.docker.swarm.node.id=%s", nodeID))
	containers, err := c.config.Backend.Containers(&apitypes.ContainerListOptions{
		Filters: filters,
	})
	if err != nil {
		return []string{}, err
	}
	for _, c := range containers {
		ids = append(ids, c.ID)
	}
	return ids, nil
}

func (c *Cluster) getRequestContext() (context.Context, func()) { // TODO: not needed when requests don't block on quorum lost
	return context.WithTimeout(context.Background(), swarmRequestTimeout)
}

// Inspect retrieves the configuration properties of a managed swarm cluster.
func (c *Cluster) Inspect() (types.Swarm, error) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	if !state.IsActiveManager() {
		return types.Swarm{}, c.errNoManager(state)
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	swarm, err := getSwarm(ctx, state.controlClient)
	if err != nil {
		return types.Swarm{}, err
	}

	return convert.SwarmFromGRPC(*swarm), nil
}

// Update updates configuration of a managed swarm cluster.
func (c *Cluster) Update(version uint64, spec types.Spec, flags types.UpdateFlags) error {
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	if !state.IsActiveManager() {
		return c.errNoManager(state)
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	swarm, err := getSwarm(ctx, state.controlClient)
	if err != nil {
		return err
	}

	// In update, the client should provide the complete spec of the swarm,
	// including Name and Labels. If a field is specified with 0 or nil, then
	// the default value will be passed to swarmkit.
	clusterSpec, err := convert.SwarmSpecToGRPC(spec)
	if err != nil {
		return apierrors.NewBadRequestError(err)
	}

	_, err = state.controlClient.UpdateCluster(
		ctx,
		&swarmapi.UpdateClusterRequest{
			ClusterID: swarm.ID,
			Spec:      &clusterSpec,
			ClusterVersion: &swarmapi.Version{
				Index: version,
			},
			Rotation: swarmapi.KeyRotation{
				WorkerJoinToken:  flags.RotateWorkerToken,
				ManagerJoinToken: flags.RotateManagerToken,
				ManagerUnlockKey: flags.RotateManagerUnlockKey,
			},
		},
	)
	return err
}

// IsManager returns true if Cluster is participating as a manager.
func (c *Cluster) IsManager() bool {
	c.mu.RLock()
	defer c.mu.RUnlock()
	return c.currentNodeState().IsActiveManager()
}

// IsAgent returns true if Cluster is participating as a worker/agent.
func (c *Cluster) IsAgent() bool {
	c.mu.RLock()
	defer c.mu.RUnlock()
	return c.currentNodeState().status == types.LocalNodeStateActive
}

// GetLocalAddress returns the local address.
func (c *Cluster) GetLocalAddress() string {
	c.mu.RLock()
	defer c.mu.RUnlock()
	return c.currentNodeState().actualLocalAddr
}

// GetListenAddress returns the listen address.
func (c *Cluster) GetListenAddress() string {
	c.mu.RLock()
	defer c.mu.RUnlock()
	if c.nr != nil {
		return c.nr.config.ListenAddr
	}
	return ""
}

// GetAdvertiseAddress returns the remotely reachable address of this node.
func (c *Cluster) GetAdvertiseAddress() string {
	c.mu.RLock()
	defer c.mu.RUnlock()
	if c.nr != nil && c.nr.config.AdvertiseAddr != "" {
		advertiseHost, _, _ := net.SplitHostPort(c.nr.config.AdvertiseAddr)
		return advertiseHost
	}
	return c.currentNodeState().actualLocalAddr
}

// GetRemoteAddress returns a known advertise address of a remote manager if
// available.
// todo: change to array/connect with info
func (c *Cluster) GetRemoteAddress() string {
	c.mu.RLock()
	defer c.mu.RUnlock()
	return c.getRemoteAddress()
}

func (c *Cluster) getRemoteAddress() string {
	state := c.currentNodeState()
	if state.swarmNode == nil {
		return ""
	}
	nodeID := state.swarmNode.NodeID()
	for _, r := range state.swarmNode.Remotes() {
		if r.NodeID != nodeID {
			return r.Addr
		}
	}
	return ""
}

// ListenClusterEvents returns a channel that receives messages on cluster
// participation changes.
// todo: make cancelable and accessible to multiple callers
func (c *Cluster) ListenClusterEvents() <-chan struct{} {
	return c.configEvent
}

// Info returns information about the current cluster state.
func (c *Cluster) Info() types.Info {
	info := types.Info{
		NodeAddr: c.GetAdvertiseAddress(),
	}
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	info.LocalNodeState = state.status
	if state.err != nil {
		info.Error = state.err.Error()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	if state.IsActiveManager() {
		info.ControlAvailable = true
		swarm, err := c.Inspect()
		if err != nil {
			info.Error = err.Error()
		}

		// Strip JoinTokens
		info.Cluster = swarm.ClusterInfo

		if r, err := state.controlClient.ListNodes(ctx, &swarmapi.ListNodesRequest{}); err == nil {
			info.Nodes = len(r.Nodes)
			for _, n := range r.Nodes {
				if n.ManagerStatus != nil {
					info.Managers = info.Managers + 1
				}
			}
		}
	}

	if state.swarmNode != nil {
		for _, r := range state.swarmNode.Remotes() {
			info.RemoteManagers = append(info.RemoteManagers, types.Peer{NodeID: r.NodeID, Addr: r.Addr})
		}
		info.NodeID = state.swarmNode.NodeID()
	}

	return info
}

// currentNodeState should not be called without a read lock
func (c *Cluster) currentNodeState() nodeState {
	return c.nr.State()
}

// errNoManager returns error describing why manager commands can't be used.
// Call with read lock.
func (c *Cluster) errNoManager(st nodeState) error {
	if st.swarmNode == nil {
		if errors.Cause(st.err) == errSwarmLocked {
			return errSwarmLocked
		}
		if st.err == errSwarmCertificatesExpired {
			return errSwarmCertificatesExpired
		}
		return errors.New("This node is not a swarm manager. Use \"docker swarm init\" or \"docker swarm join\" to connect this node to swarm and try again.")
	}
	if st.swarmNode.Manager() != nil {
		return errors.New("This node is not a swarm manager. Manager is being prepared or has trouble connecting to the cluster.")
	}
	return errors.New("This node is not a swarm manager. Worker nodes can't be used to view or modify cluster state. Please run this command on a manager node or promote the current node to a manager.")
}

// GetServices returns all services of a managed swarm cluster.
func (c *Cluster) GetServices(options apitypes.ServiceListOptions) ([]types.Service, error) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	if !state.IsActiveManager() {
		return nil, c.errNoManager(state)
	}

	filters, err := newListServicesFilters(options.Filters)
	if err != nil {
		return nil, err
	}
	ctx, cancel := c.getRequestContext()
	defer cancel()

	r, err := state.controlClient.ListServices(
		ctx,
		&swarmapi.ListServicesRequest{Filters: filters})
	if err != nil {
		return nil, err
	}

	services := []types.Service{}

	for _, service := range r.Services {
		services = append(services, convert.ServiceFromGRPC(*service))
	}

	return services, nil
}

// imageWithDigestString takes an image such as name or name:tag
// and returns the image pinned to a digest, such as name@sha256:34234...
// Due to the difference between the docker/docker/reference and the
// docker/distribution/reference packages, we're parsing the image twice.
// As the two packages converge, this function should be simplified.
// TODO(nishanttotla): After the packages converge, the function must
// convert distreference.Named -> distreference.Canonical, and the logic simplified.
func (c *Cluster) imageWithDigestString(ctx context.Context, image string, authConfig *apitypes.AuthConfig) (string, error) {
	if _, err := digest.Parse(image); err == nil {
		return "", errors.New("image reference is an image ID")
	}
	ref, err := distreference.ParseNamed(image)
	if err != nil {
		return "", err
	}
	// only query registry if not a canonical reference (i.e. with digest)
	if _, ok := ref.(distreference.Canonical); !ok {
		// create a docker/docker/reference Named object because GetRepository needs it
		dockerRef, err := reference.ParseNamed(image)
		if err != nil {
			return "", err
		}
		dockerRef = reference.WithDefaultTag(dockerRef)
		namedTaggedRef, ok := dockerRef.(reference.NamedTagged)
		if !ok {
			return "", errors.New("unable to cast image to NamedTagged reference object")
		}

		repo, _, err := c.config.Backend.GetRepository(ctx, namedTaggedRef, authConfig)
		if err != nil {
			return "", err
		}
		dscrptr, err := repo.Tags(ctx).Get(ctx, namedTaggedRef.Tag())
		if err != nil {
			return "", err
		}

		namedDigestedRef, err := distreference.WithDigest(distreference.EnsureTagged(ref), dscrptr.Digest)
		if err != nil {
			return "", err
		}
		return namedDigestedRef.String(), nil
	}
	// reference already contains a digest, so just return it
	return ref.String(), nil
}

// CreateService creates a new service in a managed swarm cluster.
func (c *Cluster) CreateService(s types.ServiceSpec, encodedAuth string) (*apitypes.ServiceCreateResponse, error) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	if !state.IsActiveManager() {
		return nil, c.errNoManager(state)
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	err := c.populateNetworkID(ctx, state.controlClient, &s)
	if err != nil {
		return nil, err
	}

	serviceSpec, err := convert.ServiceSpecToGRPC(s)
	if err != nil {
		return nil, apierrors.NewBadRequestError(err)
	}

	ctnr := serviceSpec.Task.GetContainer()
	if ctnr == nil {
		return nil, errors.New("service does not use container tasks")
	}

	if encodedAuth != "" {
		ctnr.PullOptions = &swarmapi.ContainerSpec_PullOptions{RegistryAuth: encodedAuth}
	}

	// retrieve auth config from encoded auth
	authConfig := &apitypes.AuthConfig{}
	if encodedAuth != "" {
		if err := json.NewDecoder(base64.NewDecoder(base64.URLEncoding, strings.NewReader(encodedAuth))).Decode(authConfig); err != nil {
			logrus.Warnf("invalid authconfig: %v", err)
		}
	}

	resp := &apitypes.ServiceCreateResponse{}

	// pin image by digest
	if os.Getenv("DOCKER_SERVICE_PREFER_OFFLINE_IMAGE") != "1" {
		digestImage, err := c.imageWithDigestString(ctx, ctnr.Image, authConfig)
		if err != nil {
			logrus.Warnf("unable to pin image %s to digest: %s", ctnr.Image, err.Error())
			resp.Warnings = append(resp.Warnings, fmt.Sprintf("unable to pin image %s to digest: %s", ctnr.Image, err.Error()))
		} else if ctnr.Image != digestImage {
			logrus.Debugf("pinning image %s by digest: %s", ctnr.Image, digestImage)
			ctnr.Image = digestImage
		} else {
			logrus.Debugf("creating service using supplied digest reference %s", ctnr.Image)
		}
	}

	r, err := state.controlClient.CreateService(ctx, &swarmapi.CreateServiceRequest{Spec: &serviceSpec})
	if err != nil {
		return nil, err
	}

	resp.ID = r.Service.ID
	return resp, nil
}

// GetService returns a service based on an ID or name.
func (c *Cluster) GetService(input string) (types.Service, error) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	if !state.IsActiveManager() {
		return types.Service{}, c.errNoManager(state)
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	service, err := getService(ctx, state.controlClient, input)
	if err != nil {
		return types.Service{}, err
	}
	return convert.ServiceFromGRPC(*service), nil
}

// UpdateService updates existing service to match new properties.
func (c *Cluster) UpdateService(serviceIDOrName string, version uint64, spec types.ServiceSpec, encodedAuth string, registryAuthFrom string) (*apitypes.ServiceUpdateResponse, error) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	if !state.IsActiveManager() {
		return nil, c.errNoManager(state)
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	err := c.populateNetworkID(ctx, state.controlClient, &spec)
	if err != nil {
		return nil, err
	}

	serviceSpec, err := convert.ServiceSpecToGRPC(spec)
	if err != nil {
		return nil, apierrors.NewBadRequestError(err)
	}

	currentService, err := getService(ctx, state.controlClient, serviceIDOrName)
	if err != nil {
		return nil, err
	}

	newCtnr := serviceSpec.Task.GetContainer()
	if newCtnr == nil {
		return nil, errors.New("service does not use container tasks")
	}

	if encodedAuth != "" {
		newCtnr.PullOptions = &swarmapi.ContainerSpec_PullOptions{RegistryAuth: encodedAuth}
	} else {
		// this is needed because if the encodedAuth isn't being updated then we
		// shouldn't lose it, and continue to use the one that was already present
		var ctnr *swarmapi.ContainerSpec
		switch registryAuthFrom {
		case apitypes.RegistryAuthFromSpec, "":
			ctnr = currentService.Spec.Task.GetContainer()
		case apitypes.RegistryAuthFromPreviousSpec:
			if currentService.PreviousSpec == nil {
				return nil, errors.New("service does not have a previous spec")
			}
			ctnr = currentService.PreviousSpec.Task.GetContainer()
		default:
			return nil, errors.New("unsupported registryAuthFrom value")
		}
		if ctnr == nil {
			return nil, errors.New("service does not use container tasks")
		}
		newCtnr.PullOptions = ctnr.PullOptions
		// update encodedAuth so it can be used to pin image by digest
		if ctnr.PullOptions != nil {
			encodedAuth = ctnr.PullOptions.RegistryAuth
		}
	}

	// retrieve auth config from encoded auth
	authConfig := &apitypes.AuthConfig{}
	if encodedAuth != "" {
		if err := json.NewDecoder(base64.NewDecoder(base64.URLEncoding, strings.NewReader(encodedAuth))).Decode(authConfig); err != nil {
			logrus.Warnf("invalid authconfig: %v", err)
		}
	}

	resp := &apitypes.ServiceUpdateResponse{}

	// pin image by digest
	if os.Getenv("DOCKER_SERVICE_PREFER_OFFLINE_IMAGE") != "1" {
		digestImage, err := c.imageWithDigestString(ctx, newCtnr.Image, authConfig)
		if err != nil {
			logrus.Warnf("unable to pin image %s to digest: %s", newCtnr.Image, err.Error())
			resp.Warnings = append(resp.Warnings, fmt.Sprintf("unable to pin image %s to digest: %s", newCtnr.Image, err.Error()))
		} else if newCtnr.Image != digestImage {
			logrus.Debugf("pinning image %s by digest: %s", newCtnr.Image, digestImage)
			newCtnr.Image = digestImage
		} else {
			logrus.Debugf("updating service using supplied digest reference %s", newCtnr.Image)
		}
	}

	_, err = state.controlClient.UpdateService(
		ctx,
		&swarmapi.UpdateServiceRequest{
			ServiceID: currentService.ID,
			Spec:      &serviceSpec,
			ServiceVersion: &swarmapi.Version{
				Index: version,
			},
		},
	)

	return resp, err
}

// RemoveService removes a service from a managed swarm cluster.
func (c *Cluster) RemoveService(input string) error {
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	if !state.IsActiveManager() {
		return c.errNoManager(state)
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	service, err := getService(ctx, state.controlClient, input)
	if err != nil {
		return err
	}

	_, err = state.controlClient.RemoveService(ctx, &swarmapi.RemoveServiceRequest{ServiceID: service.ID})
	return err
}

// ServiceLogs collects service logs and writes them back to `config.OutStream`
func (c *Cluster) ServiceLogs(ctx context.Context, input string, config *backend.ContainerLogsConfig, started chan struct{}) error {
	c.mu.RLock()
	state := c.currentNodeState()
	if !state.IsActiveManager() {
		c.mu.RUnlock()
		return c.errNoManager(state)
	}

	service, err := getService(ctx, state.controlClient, input)
	if err != nil {
		c.mu.RUnlock()
		return err
	}

	stream, err := state.logsClient.SubscribeLogs(ctx, &swarmapi.SubscribeLogsRequest{
		Selector: &swarmapi.LogSelector{
			ServiceIDs: []string{service.ID},
		},
		Options: &swarmapi.LogSubscriptionOptions{
			Follow: config.Follow,
		},
	})
	if err != nil {
		c.mu.RUnlock()
		return err
	}

	wf := ioutils.NewWriteFlusher(config.OutStream)
	defer wf.Close()
	close(started)
	wf.Flush()

	outStream := stdcopy.NewStdWriter(wf, stdcopy.Stdout)
	errStream := stdcopy.NewStdWriter(wf, stdcopy.Stderr)

	// Release the lock before starting the stream.
	c.mu.RUnlock()
	for {
		// Check the context before doing anything.
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}

		subscribeMsg, err := stream.Recv()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}

		for _, msg := range subscribeMsg.Messages {
			data := []byte{}

			if config.Timestamps {
				ts, err := ptypes.Timestamp(msg.Timestamp)
				if err != nil {
					return err
				}
				data = append(data, []byte(ts.Format(logger.TimeFormat)+" ")...)
			}

			data = append(data, []byte(fmt.Sprintf("%s.node.id=%s,%s.service.id=%s,%s.task.id=%s ",
				contextPrefix, msg.Context.NodeID,
				contextPrefix, msg.Context.ServiceID,
				contextPrefix, msg.Context.TaskID,
			))...)

			data = append(data, msg.Data...)

			switch msg.Stream {
			case swarmapi.LogStreamStdout:
				outStream.Write(data)
			case swarmapi.LogStreamStderr:
				errStream.Write(data)
			}
		}
	}
}

// GetNodes returns a list of all nodes known to a cluster.
func (c *Cluster) GetNodes(options apitypes.NodeListOptions) ([]types.Node, error) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	if !state.IsActiveManager() {
		return nil, c.errNoManager(state)
	}

	filters, err := newListNodesFilters(options.Filters)
	if err != nil {
		return nil, err
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	r, err := state.controlClient.ListNodes(
		ctx,
		&swarmapi.ListNodesRequest{Filters: filters})
	if err != nil {
		return nil, err
	}

	nodes := []types.Node{}

	for _, node := range r.Nodes {
		nodes = append(nodes, convert.NodeFromGRPC(*node))
	}
	return nodes, nil
}

// GetNode returns a node based on an ID.
func (c *Cluster) GetNode(input string) (types.Node, error) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	if !state.IsActiveManager() {
		return types.Node{}, c.errNoManager(state)
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	node, err := getNode(ctx, state.controlClient, input)
	if err != nil {
		return types.Node{}, err
	}
	return convert.NodeFromGRPC(*node), nil
}

// UpdateNode updates an existing node's properties.
func (c *Cluster) UpdateNode(input string, version uint64, spec types.NodeSpec) error {
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	if !state.IsActiveManager() {
		return c.errNoManager(state)
	}

	nodeSpec, err := convert.NodeSpecToGRPC(spec)
	if err != nil {
		return apierrors.NewBadRequestError(err)
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	currentNode, err := getNode(ctx, state.controlClient, input)
	if err != nil {
		return err
	}

	_, err = state.controlClient.UpdateNode(
		ctx,
		&swarmapi.UpdateNodeRequest{
			NodeID: currentNode.ID,
			Spec:   &nodeSpec,
			NodeVersion: &swarmapi.Version{
				Index: version,
			},
		},
	)
	return err
}

// RemoveNode removes a node from a cluster
func (c *Cluster) RemoveNode(input string, force bool) error {
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	if !state.IsActiveManager() {
		return c.errNoManager(state)
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	node, err := getNode(ctx, state.controlClient, input)
	if err != nil {
		return err
	}

	_, err = state.controlClient.RemoveNode(ctx, &swarmapi.RemoveNodeRequest{NodeID: node.ID, Force: force})
	return err
}

// GetTasks returns a list of tasks matching the filter options.
func (c *Cluster) GetTasks(options apitypes.TaskListOptions) ([]types.Task, error) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	if !state.IsActiveManager() {
		return nil, c.errNoManager(state)
	}

	byName := func(filter filters.Args) error {
		if filter.Include("service") {
			serviceFilters := filter.Get("service")
			for _, serviceFilter := range serviceFilters {
				service, err := c.GetService(serviceFilter)
				if err != nil {
					return err
				}
				filter.Del("service", serviceFilter)
				filter.Add("service", service.ID)
			}
		}
		if filter.Include("node") {
			nodeFilters := filter.Get("node")
			for _, nodeFilter := range nodeFilters {
				node, err := c.GetNode(nodeFilter)
				if err != nil {
					return err
				}
				filter.Del("node", nodeFilter)
				filter.Add("node", node.ID)
			}
		}
		return nil
	}

	filters, err := newListTasksFilters(options.Filters, byName)
	if err != nil {
		return nil, err
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	r, err := state.controlClient.ListTasks(
		ctx,
		&swarmapi.ListTasksRequest{Filters: filters})
	if err != nil {
		return nil, err
	}

	tasks := []types.Task{}

	for _, task := range r.Tasks {
		if task.Spec.GetContainer() != nil {
			tasks = append(tasks, convert.TaskFromGRPC(*task))
		}
	}
	return tasks, nil
}

// GetTask returns a task by an ID.
func (c *Cluster) GetTask(input string) (types.Task, error) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	if !state.IsActiveManager() {
		return types.Task{}, c.errNoManager(state)
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	task, err := getTask(ctx, state.controlClient, input)
	if err != nil {
		return types.Task{}, err
	}
	return convert.TaskFromGRPC(*task), nil
}

// GetNetwork returns a cluster network by an ID.
func (c *Cluster) GetNetwork(input string) (apitypes.NetworkResource, error) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	if !state.IsActiveManager() {
		return apitypes.NetworkResource{}, c.errNoManager(state)
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	network, err := getNetwork(ctx, state.controlClient, input)
	if err != nil {
		return apitypes.NetworkResource{}, err
	}
	return convert.BasicNetworkFromGRPC(*network), nil
}

func (c *Cluster) getNetworks(filters *swarmapi.ListNetworksRequest_Filters) ([]apitypes.NetworkResource, error) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	if !state.IsActiveManager() {
		return nil, c.errNoManager(state)
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	r, err := state.controlClient.ListNetworks(ctx, &swarmapi.ListNetworksRequest{Filters: filters})
	if err != nil {
		return nil, err
	}

	var networks []apitypes.NetworkResource

	for _, network := range r.Networks {
		networks = append(networks, convert.BasicNetworkFromGRPC(*network))
	}

	return networks, nil
}

// GetNetworks returns all current cluster managed networks.
func (c *Cluster) GetNetworks() ([]apitypes.NetworkResource, error) {
	return c.getNetworks(nil)
}

// GetNetworksByName returns cluster managed networks by name.
// It is ok to have multiple networks here. #18864
func (c *Cluster) GetNetworksByName(name string) ([]apitypes.NetworkResource, error) {
	// Note that swarmapi.GetNetworkRequest.Name is not functional.
	// So we cannot just use that with c.GetNetwork.
	return c.getNetworks(&swarmapi.ListNetworksRequest_Filters{
		Names: []string{name},
	})
}

func attacherKey(target, containerID string) string {
	return containerID + ":" + target
}

// UpdateAttachment signals the attachment config to the attachment
// waiter who is trying to start or attach the container to the
// network.
func (c *Cluster) UpdateAttachment(target, containerID string, config *network.NetworkingConfig) error {
	c.mu.RLock()
	attacher, ok := c.attachers[attacherKey(target, containerID)]
	c.mu.RUnlock()
	if !ok || attacher == nil {
		return fmt.Errorf("could not find attacher for container %s to network %s", containerID, target)
	}

	attacher.attachWaitCh <- config
	close(attacher.attachWaitCh)
	return nil
}

// WaitForDetachment waits for the container to stop or detach from
// the network.
func (c *Cluster) WaitForDetachment(ctx context.Context, networkName, networkID, taskID, containerID string) error {
	c.mu.RLock()
	attacher, ok := c.attachers[attacherKey(networkName, containerID)]
	if !ok {
		attacher, ok = c.attachers[attacherKey(networkID, containerID)]
	}
	state := c.currentNodeState()
	if state.swarmNode == nil || state.swarmNode.Agent() == nil {
		c.mu.RUnlock()
		return errors.New("invalid cluster node while waiting for detachment")
	}

	c.mu.RUnlock()
	agent := state.swarmNode.Agent()
	if ok && attacher != nil &&
		attacher.detachWaitCh != nil &&
		attacher.attachCompleteCh != nil {
		// Attachment may be in progress still so wait for
		// attachment to complete.
		select {
		case <-attacher.attachCompleteCh:
		case <-ctx.Done():
			return ctx.Err()
		}

		if attacher.taskID == taskID {
			select {
			case <-attacher.detachWaitCh:
			case <-ctx.Done():
				return ctx.Err()
			}
		}
	}

	return agent.ResourceAllocator().DetachNetwork(ctx, taskID)
}

// AttachNetwork generates an attachment request towards the manager.
func (c *Cluster) AttachNetwork(target string, containerID string, addresses []string) (*network.NetworkingConfig, error) {
	aKey := attacherKey(target, containerID)
	c.mu.Lock()
	state := c.currentNodeState()
	if state.swarmNode == nil || state.swarmNode.Agent() == nil {
		c.mu.Unlock()
		return nil, errors.New("invalid cluster node while attaching to network")
	}
	if attacher, ok := c.attachers[aKey]; ok {
		c.mu.Unlock()
		return attacher.config, nil
	}

	agent := state.swarmNode.Agent()
	attachWaitCh := make(chan *network.NetworkingConfig)
	detachWaitCh := make(chan struct{})
	attachCompleteCh := make(chan struct{})
	c.attachers[aKey] = &attacher{
		attachWaitCh:     attachWaitCh,
		attachCompleteCh: attachCompleteCh,
		detachWaitCh:     detachWaitCh,
	}
	c.mu.Unlock()

	ctx, cancel := c.getRequestContext()
	defer cancel()

	taskID, err := agent.ResourceAllocator().AttachNetwork(ctx, containerID, target, addresses)
	if err != nil {
		c.mu.Lock()
		delete(c.attachers, aKey)
		c.mu.Unlock()
		return nil, fmt.Errorf("Could not attach to network %s: %v", target, err)
	}

	c.mu.Lock()
	c.attachers[aKey].taskID = taskID
	close(attachCompleteCh)
	c.mu.Unlock()

	logrus.Debugf("Successfully attached to network %s with tid %s", target, taskID)

	var config *network.NetworkingConfig
	select {
	case config = <-attachWaitCh:
	case <-ctx.Done():
		return nil, fmt.Errorf("attaching to network failed, make sure your network options are correct and check manager logs: %v", ctx.Err())
	}

	c.mu.Lock()
	c.attachers[aKey].config = config
	c.mu.Unlock()
	return config, nil
}

// DetachNetwork unblocks the waiters waiting on WaitForDetachment so
// that a request to detach can be generated towards the manager.
func (c *Cluster) DetachNetwork(target string, containerID string) error {
	aKey := attacherKey(target, containerID)

	c.mu.Lock()
	attacher, ok := c.attachers[aKey]
	delete(c.attachers, aKey)
	c.mu.Unlock()

	if !ok {
		return fmt.Errorf("could not find network attachment for container %s to network %s", containerID, target)
	}

	close(attacher.detachWaitCh)
	return nil
}

// CreateNetwork creates a new cluster managed network.
func (c *Cluster) CreateNetwork(s apitypes.NetworkCreateRequest) (string, error) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	if !state.IsActiveManager() {
		return "", c.errNoManager(state)
	}

	if runconfig.IsPreDefinedNetwork(s.Name) {
		err := fmt.Errorf("%s is a pre-defined network and cannot be created", s.Name)
		return "", apierrors.NewRequestForbiddenError(err)
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	networkSpec := convert.BasicNetworkCreateToGRPC(s)
	r, err := state.controlClient.CreateNetwork(ctx, &swarmapi.CreateNetworkRequest{Spec: &networkSpec})
	if err != nil {
		return "", err
	}

	return r.Network.ID, nil
}

// RemoveNetwork removes a cluster network.
func (c *Cluster) RemoveNetwork(input string) error {
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	if !state.IsActiveManager() {
		return c.errNoManager(state)
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	network, err := getNetwork(ctx, state.controlClient, input)
	if err != nil {
		return err
	}

	_, err = state.controlClient.RemoveNetwork(ctx, &swarmapi.RemoveNetworkRequest{NetworkID: network.ID})
	return err
}

func (c *Cluster) populateNetworkID(ctx context.Context, client swarmapi.ControlClient, s *types.ServiceSpec) error {
	// Always prefer NetworkAttachmentConfigs from TaskTemplate,
	// but fall back to the service spec for backward compatibility.
	networks := s.TaskTemplate.Networks
	if len(networks) == 0 {
		networks = s.Networks
	}

	for i, n := range networks {
		apiNetwork, err := getNetwork(ctx, client, n.Target)
		if err != nil {
			if ln, _ := c.config.Backend.FindNetwork(n.Target); ln != nil && !ln.Info().Dynamic() {
				err = fmt.Errorf("The network %s cannot be used with services. Only networks scoped to the swarm can be used, such as those created with the overlay driver.", ln.Name())
				return apierrors.NewRequestForbiddenError(err)
			}
			return err
		}
		networks[i].Target = apiNetwork.ID
	}
	return nil
}

// Cleanup stops the active swarm node. This is run before daemon shutdown.
func (c *Cluster) Cleanup() {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()

	c.mu.Lock()
	node := c.nr
	if node == nil {
		c.mu.Unlock()
		return
	}
	defer c.mu.Unlock()
	state := c.currentNodeState()
	if state.IsActiveManager() {
		active, reachable, unreachable, err := managerStats(state.controlClient, state.NodeID())
		if err == nil {
			singlenode := active && isLastManager(reachable, unreachable)
			if active && !singlenode && removingManagerCausesLossOfQuorum(reachable, unreachable) {
				logrus.Errorf("Leaving cluster with %v managers left out of %v. Raft quorum will be lost.", reachable-1, reachable+unreachable)
			}
		}
	}
	if err := node.Stop(); err != nil {
		logrus.Errorf("failed to shut down cluster node: %v", err)
		signal.DumpStacks("")
	}
	c.nr = nil
}

func managerStats(client swarmapi.ControlClient, currentNodeID string) (current bool, reachable int, unreachable int, err error) {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	nodes, err := client.ListNodes(ctx, &swarmapi.ListNodesRequest{})
	if err != nil {
		return false, 0, 0, err
	}
	for _, n := range nodes.Nodes {
		if n.ManagerStatus != nil {
			if n.ManagerStatus.Reachability == swarmapi.RaftMemberStatus_REACHABLE {
				reachable++
				if n.ID == currentNodeID {
					current = true
				}
			}
			if n.ManagerStatus.Reachability == swarmapi.RaftMemberStatus_UNREACHABLE {
				unreachable++
			}
		}
	}
	return
}

func validateAndSanitizeInitRequest(req *types.InitRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}

	if req.Spec.Annotations.Name == "" {
		req.Spec.Annotations.Name = "default"
	} else if req.Spec.Annotations.Name != "default" {
		return errors.New(`swarm spec must be named "default"`)
	}

	return nil
}

func validateAndSanitizeJoinRequest(req *types.JoinRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}
	if len(req.RemoteAddrs) == 0 {
		return errors.New("at least 1 RemoteAddr is required to join")
	}
	for i := range req.RemoteAddrs {
		req.RemoteAddrs[i], err = validateAddr(req.RemoteAddrs[i])
		if err != nil {
			return fmt.Errorf("invalid remoteAddr %q: %v", req.RemoteAddrs[i], err)
		}
	}
	return nil
}

func validateAddr(addr string) (string, error) {
	if addr == "" {
		return addr, errors.New("invalid empty address")
	}
	newaddr, err := opts.ParseTCPAddr(addr, defaultAddr)
	if err != nil {
		return addr, nil
	}
	return strings.TrimPrefix(newaddr, "tcp://"), nil
}

func initClusterSpec(node *swarmnode.Node, spec types.Spec) error {
	ctx, _ := context.WithTimeout(context.Background(), 5*time.Second)
	for conn := range node.ListenControlSocket(ctx) {
		if ctx.Err() != nil {
			return ctx.Err()
		}
		if conn != nil {
			client := swarmapi.NewControlClient(conn)
			var cluster *swarmapi.Cluster
			for i := 0; ; i++ {
				lcr, err := client.ListClusters(ctx, &swarmapi.ListClustersRequest{})
				if err != nil {
					return fmt.Errorf("error on listing clusters: %v", err)
				}
				if len(lcr.Clusters) == 0 {
					if i < 10 {
						time.Sleep(200 * time.Millisecond)
						continue
					}
					return errors.New("empty list of clusters was returned")
				}
				cluster = lcr.Clusters[0]
				break
			}
			// In init, we take the initial default values from swarmkit, and merge
			// any non nil or 0 value from spec to GRPC spec. This will leave the
			// default value alone.
			// Note that this is different from Update(), as in Update() we expect
			// the user to specify the complete spec of the cluster (as they already
			// know the existing one and know which fields to update).
			clusterSpec, err := convert.MergeSwarmSpecToGRPC(spec, cluster.Spec)
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			_, err = client.UpdateCluster(ctx, &swarmapi.UpdateClusterRequest{
				ClusterID:      cluster.ID,
				ClusterVersion: &cluster.Meta.Version,
				Spec:           &clusterSpec,
			})
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			return nil
		}
	}
	return ctx.Err()
}

func detectLockedError(err error) error {
	if err == swarmnode.ErrInvalidUnlockKey {
		return errors.WithStack(errSwarmLocked)
	}
	return err
}