github.com/jwhonce/docker@v0.6.7-0.20190327063223-da823cf3a5a3/daemon/cluster/swarm.go

package cluster // import "github.com/docker/docker/daemon/cluster"

import (
	"context"
	"fmt"
	"net"
	"strings"
	"time"

	apitypes "github.com/docker/docker/api/types"
	"github.com/docker/docker/api/types/filters"
	types "github.com/docker/docker/api/types/swarm"
	"github.com/docker/docker/daemon/cluster/convert"
	"github.com/docker/docker/errdefs"
	"github.com/docker/docker/opts"
	"github.com/docker/docker/pkg/signal"
	swarmapi "github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/manager/encryption"
	swarmnode "github.com/docker/swarmkit/node"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
)

// Init initializes a new cluster from a user-provided request.
func (c *Cluster) Init(req types.InitRequest) (string, error) {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()
	if c.nr != nil {
		if req.ForceNewCluster {

			// Take c.mu temporarily to wait for presently running
			// API handlers to finish before shutting down the node.
			c.mu.Lock()
			if !c.nr.nodeState.IsManager() {
				c.mu.Unlock()
				return "", errSwarmNotManager
			}
			c.mu.Unlock()

			if err := c.nr.Stop(); err != nil {
				return "", err
			}
		} else {
			return "", errSwarmExists
		}
	}

	if err := validateAndSanitizeInitRequest(&req); err != nil {
		return "", errdefs.InvalidParameter(err)
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		return "", err
	}

	advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
	if err != nil {
		return "", err
	}

	dataPathAddr, err := resolveDataPathAddr(req.DataPathAddr)
	if err != nil {
		return "", err
	}

	localAddr := listenHost

	// If the local address is undetermined, the advertise address
	// will be used as the local address, if it belongs to this system.
	// If the advertise address is not local, then we try to find
	// a system address to use as the local address. If this fails,
	// we give up and ask the user to pass the listen address.
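	// For example (hypothetical values): with ListenAddr "0.0.0.0:2377" and
	// AdvertiseAddr "192.168.1.10:2377", localAddr starts out as the
	// unspecified "0.0.0.0" and is replaced below by "192.168.1.10",
	// provided that address is assigned to this host.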
	if net.ParseIP(localAddr).IsUnspecified() {
		advertiseIP := net.ParseIP(advertiseHost)

		found := false
		for _, systemIP := range listSystemIPs() {
			if systemIP.Equal(advertiseIP) {
				localAddr = advertiseIP.String()
				found = true
				break
			}
		}

		if !found {
			ip, err := c.resolveSystemAddr()
			if err != nil {
				logrus.Warnf("Could not find a local address: %v", err)
				return "", errMustSpecifyListenAddr
			}
			localAddr = ip.String()
		}
	}

	// Validate the default address pool input.
	if err := validateDefaultAddrPool(req.DefaultAddrPool, req.SubnetSize); err != nil {
		return "", err
	}

	port, err := getDataPathPort(req.DataPathPort)
	if err != nil {
		return "", err
	}

	nr, err := c.newNodeRunner(nodeStartConfig{
		forceNewCluster:    req.ForceNewCluster,
		autolock:           req.AutoLockManagers,
		LocalAddr:          localAddr,
		ListenAddr:         net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr:      net.JoinHostPort(advertiseHost, advertisePort),
		DataPathAddr:       dataPathAddr,
		DefaultAddressPool: req.DefaultAddrPool,
		SubnetSize:         req.SubnetSize,
		availability:       req.Availability,
		DataPathPort:       port,
	})
	if err != nil {
		return "", err
	}
	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	if err := <-nr.Ready(); err != nil {
		c.mu.Lock()
		c.nr = nil
		c.mu.Unlock()
		if !req.ForceNewCluster { // if the first attempt fails, don't keep state
			if err := clearPersistentState(c.root); err != nil {
				return "", err
			}
		}
		return "", err
	}
	state := nr.State()
	if state.swarmNode == nil { // should never happen, but protect against a panic
		return "", errors.New("invalid cluster state for spec initialization")
	}
	if err := initClusterSpec(state.swarmNode, req.Spec); err != nil {
		return "", err
	}
	return state.NodeID(), nil
}

// Join makes the current Cluster part of an existing swarm cluster.
func (c *Cluster) Join(req types.JoinRequest) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()
	c.mu.Lock()
	if c.nr != nil {
		c.mu.Unlock()
		return errors.WithStack(errSwarmExists)
	}
	c.mu.Unlock()

	if err := validateAndSanitizeJoinRequest(&req); err != nil {
		return errdefs.InvalidParameter(err)
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		return err
	}

	var advertiseAddr string
	if req.AdvertiseAddr != "" {
		advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
		// For joining, we don't need to provide an advertise address,
		// since the remote side can detect it.
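		// A resolution error is therefore not fatal here: advertiseAddr is
		// simply left empty and detection on the remote side applies.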
		if err == nil {
			advertiseAddr = net.JoinHostPort(advertiseHost, advertisePort)
		}
	}

	dataPathAddr, err := resolveDataPathAddr(req.DataPathAddr)
	if err != nil {
		return err
	}

	nr, err := c.newNodeRunner(nodeStartConfig{
		RemoteAddr:    req.RemoteAddrs[0],
		ListenAddr:    net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr: advertiseAddr,
		DataPathAddr:  dataPathAddr,
		joinAddr:      req.RemoteAddrs[0],
		joinToken:     req.JoinToken,
		availability:  req.Availability,
	})
	if err != nil {
		return err
	}

	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	select {
	case <-time.After(swarmConnectTimeout):
		return errSwarmJoinTimeoutReached
	case err := <-nr.Ready():
		if err != nil {
			c.mu.Lock()
			c.nr = nil
			c.mu.Unlock()
			if err := clearPersistentState(c.root); err != nil {
				return err
			}
		}
		return err
	}
}

// Inspect retrieves the configuration properties of a managed swarm cluster.
func (c *Cluster) Inspect() (types.Swarm, error) {
	var swarm types.Swarm
	if err := c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
		s, err := c.inspect(ctx, state)
		if err != nil {
			return err
		}
		swarm = s
		return nil
	}); err != nil {
		return types.Swarm{}, err
	}
	return swarm, nil
}

func (c *Cluster) inspect(ctx context.Context, state nodeState) (types.Swarm, error) {
	s, err := getSwarm(ctx, state.controlClient)
	if err != nil {
		return types.Swarm{}, err
	}
	return convert.SwarmFromGRPC(*s), nil
}

// Update updates the configuration of a managed swarm cluster.
func (c *Cluster) Update(version uint64, spec types.Spec, flags types.UpdateFlags) error {
	return c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
		swarm, err := getSwarm(ctx, state.controlClient)
		if err != nil {
			return err
		}

		// Validate the spec name.
		if spec.Annotations.Name == "" {
			spec.Annotations.Name = "default"
		} else if spec.Annotations.Name != "default" {
			return errdefs.InvalidParameter(errors.New(`swarm spec must be named "default"`))
		}

		// On update, the client should provide the complete spec of the swarm,
		// including Name and Labels. If a field is specified as 0 or nil, then
		// swarmkit applies its default value.
		clusterSpec, err := convert.SwarmSpecToGRPC(spec)
		if err != nil {
			return errdefs.InvalidParameter(err)
		}

		_, err = state.controlClient.UpdateCluster(
			ctx,
			&swarmapi.UpdateClusterRequest{
				ClusterID: swarm.ID,
				Spec:      &clusterSpec,
				ClusterVersion: &swarmapi.Version{
					Index: version,
				},
				Rotation: swarmapi.KeyRotation{
					WorkerJoinToken:  flags.RotateWorkerToken,
					ManagerJoinToken: flags.RotateManagerToken,
					ManagerUnlockKey: flags.RotateManagerUnlockKey,
				},
			},
		)
		return err
	})
}

// GetUnlockKey returns the unlock key for the swarm.
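// The key is returned in the human-readable form produced by
// encryption.HumanReadableKey; an empty string means no unlock key is set.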
func (c *Cluster) GetUnlockKey() (string, error) {
	var resp *swarmapi.GetUnlockKeyResponse
	if err := c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
		client := swarmapi.NewCAClient(state.grpcConn)

		r, err := client.GetUnlockKey(ctx, &swarmapi.GetUnlockKeyRequest{})
		if err != nil {
			return err
		}
		resp = r
		return nil
	}); err != nil {
		return "", err
	}
	if len(resp.UnlockKey) == 0 {
		// no key
		return "", nil
	}
	return encryption.HumanReadableKey(resp.UnlockKey), nil
}

// UnlockSwarm provides a key to decrypt data that is encrypted at rest.
func (c *Cluster) UnlockSwarm(req types.UnlockRequest) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()

	c.mu.RLock()
	state := c.currentNodeState()

	if !state.IsActiveManager() {
		// When the manager is not active, return an error unless the
		// failure is that the swarm is locked.
		if err := c.errNoManager(state); err != errSwarmLocked {
			c.mu.RUnlock()
			return err
		}
	} else {
		// When the manager is active, the swarm cannot be locked; return
		// a "not locked" error.
		c.mu.RUnlock()
		return notLockedError{}
	}

	// This point is only reached when the swarm is locked.
	nr := c.nr
	c.mu.RUnlock()

	key, err := encryption.ParseHumanReadableKey(req.UnlockKey)
	if err != nil {
		return errdefs.InvalidParameter(err)
	}

	config := nr.config
	config.lockKey = key
	if err := nr.Stop(); err != nil {
		return err
	}
	nr, err = c.newNodeRunner(config)
	if err != nil {
		return err
	}

	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	if err := <-nr.Ready(); err != nil {
		if errors.Cause(err) == errSwarmLocked {
			return invalidUnlockKey{}
		}
		return errors.Errorf("swarm component could not be started: %v", err)
	}
	return nil
}

// Leave shuts down the Cluster and removes the current state.
func (c *Cluster) Leave(force bool) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()

	c.mu.Lock()
	nr := c.nr
	if nr == nil {
		c.mu.Unlock()
		return errors.WithStack(errNoSwarm)
	}

	state := c.currentNodeState()

	c.mu.Unlock()

	if errors.Cause(state.err) == errSwarmLocked && !force {
		// Leaving a locked swarm without --force is not allowed.
		return errors.WithStack(notAvailableError("Swarm is encrypted and locked. Please unlock it first or use `--force` to ignore this message."))
	}

	if state.IsManager() && !force {
		msg := "You are attempting to leave the swarm on a node that is participating as a manager. "
		if state.IsActiveManager() {
			active, reachable, unreachable, err := managerStats(state.controlClient, state.NodeID())
			if err == nil {
				if active && removingManagerCausesLossOfQuorum(reachable, unreachable) {
					if isLastManager(reachable, unreachable) {
						msg += "Removing the last manager erases all current state of the swarm. Use `--force` to ignore this message. "
						return errors.WithStack(notAvailableError(msg))
					}
					msg += fmt.Sprintf("Removing this node leaves %v managers out of %v. Without a Raft quorum your swarm will be inaccessible. ", reachable-1, reachable+unreachable)
				}
			}
		} else {
			msg += "Doing so may cause your cluster to lose consensus. "
		}

		msg += "The only way to restore a swarm that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to suppress this message."
		return errors.WithStack(notAvailableError(msg))
	}
	// release readers in here
	if err := nr.Stop(); err != nil {
		logrus.Errorf("failed to shut down cluster node: %v", err)
		signal.DumpStacks("")
		return err
	}

	c.mu.Lock()
	c.nr = nil
	c.mu.Unlock()

	if nodeID := state.NodeID(); nodeID != "" {
		nodeContainers, err := c.listContainerForNode(nodeID)
		if err != nil {
			return err
		}
		for _, id := range nodeContainers {
			if err := c.config.Backend.ContainerRm(id, &apitypes.ContainerRmConfig{ForceRemove: true}); err != nil {
				logrus.Errorf("error removing %v: %v", id, err)
			}
		}
	}

	// TODO: should this cleanup be optional?
	if err := clearPersistentState(c.root); err != nil {
		return err
	}
	c.config.Backend.DaemonLeavesCluster()
	return nil
}

// Info returns information about the current cluster state.
func (c *Cluster) Info() types.Info {
	info := types.Info{
		NodeAddr: c.GetAdvertiseAddress(),
	}
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	info.LocalNodeState = state.status
	if state.err != nil {
		info.Error = state.err.Error()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	if state.IsActiveManager() {
		info.ControlAvailable = true
		swarm, err := c.inspect(ctx, state)
		if err != nil {
			info.Error = err.Error()
		}

		info.Cluster = &swarm.ClusterInfo

		if r, err := state.controlClient.ListNodes(ctx, &swarmapi.ListNodesRequest{}); err != nil {
			info.Error = err.Error()
		} else {
			info.Nodes = len(r.Nodes)
			for _, n := range r.Nodes {
				if n.ManagerStatus != nil {
					info.Managers = info.Managers + 1
				}
			}
		}

		switch info.LocalNodeState {
		case types.LocalNodeStateInactive, types.LocalNodeStateLocked, types.LocalNodeStateError:
			// nothing to do
		default:
			if info.Managers == 2 {
				const warn string = `WARNING: Running Swarm in a two-manager configuration. This configuration provides
no fault tolerance, and poses a high risk of losing control over the cluster.
Refer to https://docs.docker.com/engine/swarm/admin_guide/ to configure the
Swarm for fault-tolerance.`

				info.Warnings = append(info.Warnings, warn)
			}
		}
	}

	if state.swarmNode != nil {
		for _, r := range state.swarmNode.Remotes() {
			info.RemoteManagers = append(info.RemoteManagers, types.Peer{NodeID: r.NodeID, Addr: r.Addr})
		}
		info.NodeID = state.swarmNode.NodeID()
	}

	return info
}

func validateAndSanitizeInitRequest(req *types.InitRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}

	if req.Spec.Annotations.Name == "" {
		req.Spec.Annotations.Name = "default"
	} else if req.Spec.Annotations.Name != "default" {
		return errors.New(`swarm spec must be named "default"`)
	}

	return nil
}

func validateAndSanitizeJoinRequest(req *types.JoinRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}
	if len(req.RemoteAddrs) == 0 {
		return errors.New("at least 1 RemoteAddr is required to join")
	}
	for i := range req.RemoteAddrs {
		req.RemoteAddrs[i], err = validateAddr(req.RemoteAddrs[i])
		if err != nil {
			return fmt.Errorf("invalid remoteAddr %q: %v", req.RemoteAddrs[i], err)
		}
	}
	return nil
}

func validateAddr(addr string) (string, error) {
	if addr == "" {
		return addr, errors.New("invalid empty address")
	}
	newaddr, err := opts.ParseTCPAddr(addr, defaultAddr)
	if err != nil {
		// Note: the parse error is ignored here; the original address is
		// returned unchanged.
		return addr, nil
	}
	return strings.TrimPrefix(newaddr, "tcp://"), nil
}

func initClusterSpec(node *swarmnode.Node, spec types.Spec) error {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	for conn := range node.ListenControlSocket(ctx) {
		if ctx.Err() != nil {
			return ctx.Err()
		}
		if conn != nil {
			client := swarmapi.NewControlClient(conn)
			var cluster *swarmapi.Cluster
			for i := 0; ; i++ {
				lcr, err := client.ListClusters(ctx, &swarmapi.ListClustersRequest{})
				if err != nil {
					return fmt.Errorf("error on listing clusters: %v", err)
				}
				if len(lcr.Clusters) == 0 {
					if i < 10 {
						time.Sleep(200 * time.Millisecond)
						continue
					}
					return errors.New("empty list of clusters was returned")
				}
				cluster = lcr.Clusters[0]
				break
			}
			// On init, we take the initial default values from swarmkit and merge
			// any non-nil, non-zero value from spec into the GRPC spec. This leaves
			// the default values alone.
			// Note that this is different from Update(), where the user is expected
			// to specify the complete spec of the cluster (since they already know
			// the existing one and which fields to update).
			clusterSpec, err := convert.MergeSwarmSpecToGRPC(spec, cluster.Spec)
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			_, err = client.UpdateCluster(ctx, &swarmapi.UpdateClusterRequest{
				ClusterID:      cluster.ID,
				ClusterVersion: &cluster.Meta.Version,
				Spec:           &clusterSpec,
			})
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			return nil
		}
	}
	return ctx.Err()
}

func (c *Cluster) listContainerForNode(nodeID string) ([]string, error) {
	var ids []string
	filters := filters.NewArgs()
	filters.Add("label", fmt.Sprintf("com.docker.swarm.node.id=%s", nodeID))
	containers, err := c.config.Backend.Containers(&apitypes.ContainerListOptions{
		Filters: filters,
	})
	if err != nil {
		return []string{}, err
	}
	for _, c := range containers {
		ids = append(ids, c.ID)
	}
	return ids, nil
}
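// Illustrative sketch of the unlock-key round trip (hypothetical caller code,
// assuming a *Cluster named c): GetUnlockKey must be called while this node is
// an active manager, and UnlockSwarm applies after a restart leaves an
// auto-locked swarm in the locked state.
//
//	key, _ := c.GetUnlockKey()                             // human-readable form, e.g. "SWMKEY-1-..."
//	_ = c.UnlockSwarm(types.UnlockRequest{UnlockKey: key}) // parses the key and restarts the node runner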