package cluster // import "github.com/demonoid81/moby/daemon/cluster"

import (
	"context"
	"fmt"
	"net"
	"strings"
	"time"

	apitypes "github.com/demonoid81/moby/api/types"
	"github.com/demonoid81/moby/api/types/filters"
	types "github.com/demonoid81/moby/api/types/swarm"
	"github.com/demonoid81/moby/daemon/cluster/convert"
	"github.com/demonoid81/moby/errdefs"
	"github.com/demonoid81/moby/opts"
	"github.com/demonoid81/moby/pkg/signal"
	swarmapi "github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/manager/encryption"
	swarmnode "github.com/docker/swarmkit/node"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"google.golang.org/grpc"
)

// Init initializes a new cluster from a user-provided request.
func (c *Cluster) Init(req types.InitRequest) (string, error) {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()
	if c.nr != nil {
		if req.ForceNewCluster {

			// Take c.mu temporarily to wait for presently running
			// API handlers to finish before shutting down the node.
			c.mu.Lock()
			if !c.nr.nodeState.IsManager() {
				c.mu.Unlock()
				return "", errSwarmNotManager
			}
			c.mu.Unlock()

			if err := c.nr.Stop(); err != nil {
				return "", err
			}
		} else {
			return "", errSwarmExists
		}
	}

	if err := validateAndSanitizeInitRequest(&req); err != nil {
		return "", errdefs.InvalidParameter(err)
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		return "", err
	}

	advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
	if err != nil {
		return "", err
	}

	dataPathAddr, err := resolveDataPathAddr(req.DataPathAddr)
	if err != nil {
		return "", err
	}

	localAddr := listenHost

	// If the local address is undetermined, the advertise address
	// will be used as the local address, if it belongs to this system.
	// If the advertise address is not local, then we try to find
	// a system address to use as the local address. If this fails,
	// we give up and ask the user to pass the listen address.
	if net.ParseIP(localAddr).IsUnspecified() {
		advertiseIP := net.ParseIP(advertiseHost)

		found := false
		for _, systemIP := range listSystemIPs() {
			if systemIP.Equal(advertiseIP) {
				localAddr = advertiseIP.String()
				found = true
				break
			}
		}

		if !found {
			ip, err := c.resolveSystemAddr()
			if err != nil {
				logrus.Warnf("Could not find a local address: %v", err)
				return "", errMustSpecifyListenAddr
			}
			localAddr = ip.String()
		}
	}

	if err := validateDefaultAddrPool(req.DefaultAddrPool, req.SubnetSize); err != nil {
		return "", err
	}

	port, err := getDataPathPort(req.DataPathPort)
	if err != nil {
		return "", err
	}

	nr, err := c.newNodeRunner(nodeStartConfig{
		forceNewCluster:    req.ForceNewCluster,
		autolock:           req.AutoLockManagers,
		LocalAddr:          localAddr,
		ListenAddr:         net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr:      net.JoinHostPort(advertiseHost, advertisePort),
		DataPathAddr:       dataPathAddr,
		DefaultAddressPool: req.DefaultAddrPool,
		SubnetSize:         req.SubnetSize,
		availability:       req.Availability,
		DataPathPort:       port,
	})
	if err != nil {
		return "", err
	}
	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	if err := <-nr.Ready(); err != nil {
		c.mu.Lock()
		c.nr = nil
		c.mu.Unlock()
		if !req.ForceNewCluster { // if failure on first attempt don't keep state
			if err := clearPersistentState(c.root); err != nil {
				return "", err
			}
		}
		return "", err
	}
	state := nr.State()
	if state.swarmNode == nil { // should never happen but protect from panic
		return "", errors.New("invalid cluster state for spec initialization")
	}
	if err := initClusterSpec(state.swarmNode, req.Spec); err != nil {
		return "", err
	}
	return state.NodeID(), nil
}
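
// exampleInitSwarm is an illustrative sketch (not part of the original file)
// of a typical InitRequest as handled by Init above: a listen address for
// cluster management traffic plus an advertise address that other nodes use
// to reach this manager. The literal addresses and the function name are
// placeholder assumptions.
func exampleInitSwarm(c *Cluster) (string, error) {
	return c.Init(types.InitRequest{
		ListenAddr:    "0.0.0.0:2377",      // resolved by resolveListenAddr above
		AdvertiseAddr: "192.168.65.3:2377", // resolved by c.resolveAdvertiseAddr above
	})
}
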
// Join makes the current Cluster part of an existing swarm cluster.
func (c *Cluster) Join(req types.JoinRequest) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()
	c.mu.Lock()
	if c.nr != nil {
		c.mu.Unlock()
		return errors.WithStack(errSwarmExists)
	}
	c.mu.Unlock()

	if err := validateAndSanitizeJoinRequest(&req); err != nil {
		return errdefs.InvalidParameter(err)
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		return err
	}

	var advertiseAddr string
	if req.AdvertiseAddr != "" {
		advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
		// For joining, we don't need to provide an advertise address,
		// since the remote side can detect it.
		if err == nil {
			advertiseAddr = net.JoinHostPort(advertiseHost, advertisePort)
		}
	}

	dataPathAddr, err := resolveDataPathAddr(req.DataPathAddr)
	if err != nil {
		return err
	}

	nr, err := c.newNodeRunner(nodeStartConfig{
		RemoteAddr:    req.RemoteAddrs[0],
		ListenAddr:    net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr: advertiseAddr,
		DataPathAddr:  dataPathAddr,
		joinAddr:      req.RemoteAddrs[0],
		joinToken:     req.JoinToken,
		availability:  req.Availability,
	})
	if err != nil {
		return err
	}

	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	timeout := time.NewTimer(swarmConnectTimeout)
	defer timeout.Stop()

	select {
	case <-timeout.C:
		return errSwarmJoinTimeoutReached
	case err := <-nr.Ready():
		if err != nil {
			c.mu.Lock()
			c.nr = nil
			c.mu.Unlock()
			if err := clearPersistentState(c.root); err != nil {
				return err
			}
		}
		return err
	}
}

// Inspect retrieves the configuration properties of a managed swarm cluster.
func (c *Cluster) Inspect() (types.Swarm, error) {
	var swarm types.Swarm
	if err := c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
		s, err := c.inspect(ctx, state)
		if err != nil {
			return err
		}
		swarm = s
		return nil
	}); err != nil {
		return types.Swarm{}, err
	}
	return swarm, nil
}

func (c *Cluster) inspect(ctx context.Context, state nodeState) (types.Swarm, error) {
	s, err := getSwarm(ctx, state.controlClient)
	if err != nil {
		return types.Swarm{}, err
	}
	return convert.SwarmFromGRPC(*s), nil
}

// Update updates the configuration of a managed swarm cluster.
func (c *Cluster) Update(version uint64, spec types.Spec, flags types.UpdateFlags) error {
	return c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
		swarm, err := getSwarm(ctx, state.controlClient)
		if err != nil {
			return err
		}

		// Validate spec name.
		if spec.Annotations.Name == "" {
			spec.Annotations.Name = "default"
		} else if spec.Annotations.Name != "default" {
			return errdefs.InvalidParameter(errors.New(`swarm spec must be named "default"`))
		}

		// In Update, the client should provide the complete spec of the swarm,
		// including Name and Labels. If a field is specified as 0 or nil, the
		// default value will be used by swarmkit.
		clusterSpec, err := convert.SwarmSpecToGRPC(spec)
		if err != nil {
			return errdefs.InvalidParameter(err)
		}

		_, err = state.controlClient.UpdateCluster(
			ctx,
			&swarmapi.UpdateClusterRequest{
				ClusterID: swarm.ID,
				Spec:      &clusterSpec,
				ClusterVersion: &swarmapi.Version{
					Index: version,
				},
				Rotation: swarmapi.KeyRotation{
					WorkerJoinToken:  flags.RotateWorkerToken,
					ManagerJoinToken: flags.RotateManagerToken,
					ManagerUnlockKey: flags.RotateManagerUnlockKey,
				},
			},
		)
		return err
	})
}
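
// exampleRotateWorkerToken is an illustrative sketch (not part of the original
// file) of the Update contract described above: the caller reads the current
// swarm, passes the complete spec back unchanged, and uses UpdateFlags to
// request a worker join-token rotation. The function name is a placeholder.
func exampleRotateWorkerToken(c *Cluster) error {
	sw, err := c.Inspect()
	if err != nil {
		return err
	}
	// Reuse the existing spec wholesale; only the rotation flag changes.
	return c.Update(sw.Version.Index, sw.Spec, types.UpdateFlags{
		RotateWorkerToken: true,
	})
}
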
// GetUnlockKey returns the unlock key for the swarm.
func (c *Cluster) GetUnlockKey() (string, error) {
	var resp *swarmapi.GetUnlockKeyResponse
	if err := c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
		client := swarmapi.NewCAClient(state.grpcConn)

		r, err := client.GetUnlockKey(ctx, &swarmapi.GetUnlockKeyRequest{})
		if err != nil {
			return err
		}
		resp = r
		return nil
	}); err != nil {
		return "", err
	}
	if len(resp.UnlockKey) == 0 {
		// no key
		return "", nil
	}
	return encryption.HumanReadableKey(resp.UnlockKey), nil
}

// UnlockSwarm provides a key to decrypt data that is encrypted at rest.
func (c *Cluster) UnlockSwarm(req types.UnlockRequest) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()

	c.mu.RLock()
	state := c.currentNodeState()

	if !state.IsActiveManager() {
		// When the manager is not active, return the error unless the
		// reason is that the swarm is locked.
		if err := c.errNoManager(state); err != errSwarmLocked {
			c.mu.RUnlock()
			return err
		}
	} else {
		// When the manager is active, the swarm cannot be locked, so
		// return a "not locked" error.
		c.mu.RUnlock()
		return notLockedError{}
	}

	// We only reach this point when the swarm is locked.
	nr := c.nr
	c.mu.RUnlock()

	key, err := encryption.ParseHumanReadableKey(req.UnlockKey)
	if err != nil {
		return errdefs.InvalidParameter(err)
	}

	config := nr.config
	config.lockKey = key
	if err := nr.Stop(); err != nil {
		return err
	}
	nr, err = c.newNodeRunner(config)
	if err != nil {
		return err
	}

	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	if err := <-nr.Ready(); err != nil {
		if errors.Is(err, errSwarmLocked) {
			return invalidUnlockKey{}
		}
		return errors.Errorf("swarm component could not be started: %v", err)
	}
	return nil
}
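
// exampleUnlockFlow is an illustrative sketch (not part of the original file)
// of how GetUnlockKey and UnlockSwarm above fit together: an operator saves
// the human-readable key while the manager is up and presents it again once
// the swarm comes back in the locked state. The function name and the idea of
// calling both steps from one place are placeholder assumptions; in practice
// the two calls happen at different times.
func exampleUnlockFlow(c *Cluster) error {
	// While the manager is active, fetch the unlock key and store it somewhere safe.
	key, err := c.GetUnlockKey()
	if err != nil {
		return err
	}
	// Later, when the autolocked swarm starts up locked, the same key is
	// handed back to decrypt the at-rest state.
	return c.UnlockSwarm(types.UnlockRequest{UnlockKey: key})
}
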
// Leave shuts down the Cluster and removes the current state.
func (c *Cluster) Leave(force bool) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()

	c.mu.Lock()
	nr := c.nr
	if nr == nil {
		c.mu.Unlock()
		return errors.WithStack(errNoSwarm)
	}

	state := c.currentNodeState()

	c.mu.Unlock()

	if errors.Is(state.err, errSwarmLocked) && !force {
		// leaving a locked swarm without --force is not allowed
		return errors.WithStack(notAvailableError("Swarm is encrypted and locked. Please unlock it first or use `--force` to ignore this message."))
	}

	if state.IsManager() && !force {
		msg := "You are attempting to leave the swarm on a node that is participating as a manager. "
		if state.IsActiveManager() {
			active, reachable, unreachable, err := managerStats(state.controlClient, state.NodeID())
			if err == nil {
				if active && removingManagerCausesLossOfQuorum(reachable, unreachable) {
					if isLastManager(reachable, unreachable) {
						msg += "Removing the last manager erases all current state of the swarm. Use `--force` to ignore this message. "
						return errors.WithStack(notAvailableError(msg))
					}
					msg += fmt.Sprintf("Removing this node leaves %v managers out of %v. Without a Raft quorum your swarm will be inaccessible. ", reachable-1, reachable+unreachable)
				}
			}
		} else {
			msg += "Doing so may lose the consensus of your cluster. "
		}

		msg += "The only way to restore a swarm that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to suppress this message."
		return errors.WithStack(notAvailableError(msg))
	}
	// release readers in here
	if err := nr.Stop(); err != nil {
		logrus.Errorf("failed to shut down cluster node: %v", err)
		signal.DumpStacks("")
		return err
	}

	c.mu.Lock()
	c.nr = nil
	c.mu.Unlock()

	if nodeID := state.NodeID(); nodeID != "" {
		nodeContainers, err := c.listContainerForNode(nodeID)
		if err != nil {
			return err
		}
		for _, id := range nodeContainers {
			if err := c.config.Backend.ContainerRm(id, &apitypes.ContainerRmConfig{ForceRemove: true}); err != nil {
				logrus.Errorf("error removing %v: %v", id, err)
			}
		}
	}

	// todo: cleanup optional?
	if err := clearPersistentState(c.root); err != nil {
		return err
	}
	c.config.Backend.DaemonLeavesCluster()
	return nil
}

// Info returns information about the current cluster state.
func (c *Cluster) Info() types.Info {
	info := types.Info{
		NodeAddr: c.GetAdvertiseAddress(),
	}
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	info.LocalNodeState = state.status
	if state.err != nil {
		info.Error = state.err.Error()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	if state.IsActiveManager() {
		info.ControlAvailable = true
		swarm, err := c.inspect(ctx, state)
		if err != nil {
			info.Error = err.Error()
		}

		info.Cluster = &swarm.ClusterInfo

		if r, err := state.controlClient.ListNodes(
			ctx, &swarmapi.ListNodesRequest{},
			grpc.MaxCallRecvMsgSize(defaultRecvSizeForListResponse),
		); err != nil {
			info.Error = err.Error()
		} else {
			info.Nodes = len(r.Nodes)
			for _, n := range r.Nodes {
				if n.ManagerStatus != nil {
					info.Managers = info.Managers + 1
				}
			}
		}

		switch info.LocalNodeState {
		case types.LocalNodeStateInactive, types.LocalNodeStateLocked, types.LocalNodeStateError:
			// nothing to do
		default:
			if info.Managers == 2 {
				const warn string = `WARNING: Running Swarm in a two-manager configuration. This configuration provides
no fault tolerance, and poses a high risk to lose control over the cluster.
Refer to https://docs.docker.com/engine/swarm/admin_guide/ to configure the
Swarm for fault-tolerance.`

				info.Warnings = append(info.Warnings, warn)
			}
		}
	}

	if state.swarmNode != nil {
		for _, r := range state.swarmNode.Remotes() {
			info.RemoteManagers = append(info.RemoteManagers, types.Peer{NodeID: r.NodeID, Addr: r.Addr})
		}
		info.NodeID = state.swarmNode.NodeID()
	}

	return info
}

func validateAndSanitizeInitRequest(req *types.InitRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}

	if req.Spec.Annotations.Name == "" {
		req.Spec.Annotations.Name = "default"
	} else if req.Spec.Annotations.Name != "default" {
		return errors.New(`swarm spec must be named "default"`)
	}

	return nil
}

func validateAndSanitizeJoinRequest(req *types.JoinRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}
	if len(req.RemoteAddrs) == 0 {
		return errors.New("at least 1 RemoteAddr is required to join")
	}
	for i := range req.RemoteAddrs {
		req.RemoteAddrs[i], err = validateAddr(req.RemoteAddrs[i])
		if err != nil {
			return fmt.Errorf("invalid remoteAddr %q: %v", req.RemoteAddrs[i], err)
		}
	}
	return nil
}

func validateAddr(addr string) (string, error) {
	if addr == "" {
		return addr, errors.New("invalid empty address")
	}
	newaddr, err := opts.ParseTCPAddr(addr, defaultAddr)
	if err != nil {
		return addr, nil
	}
	return strings.TrimPrefix(newaddr, "tcp://"), nil
}
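
// exampleNormalizeAddr is an illustrative sketch (not part of the original
// file) of the address handling above: validateAddr lets opts.ParseTCPAddr
// fill in any missing host or port from defaultAddr (defined elsewhere in
// this package) and strips the "tcp://" prefix from the result, while input
// that fails to parse is passed through unchanged rather than rejected. The
// sample address is a placeholder assumption.
func exampleNormalizeAddr() (string, error) {
	// A bare IP is expected to come back as "host:port", with the port
	// taken from defaultAddr.
	return validateAddr("192.168.65.2")
}
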
func initClusterSpec(node *swarmnode.Node, spec types.Spec) error {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	for conn := range node.ListenControlSocket(ctx) {
		if ctx.Err() != nil {
			return ctx.Err()
		}
		if conn != nil {
			client := swarmapi.NewControlClient(conn)
			var cluster *swarmapi.Cluster
			for i := 0; ; i++ {
				lcr, err := client.ListClusters(ctx, &swarmapi.ListClustersRequest{})
				if err != nil {
					return fmt.Errorf("error on listing clusters: %v", err)
				}
				if len(lcr.Clusters) == 0 {
					if i < 10 {
						time.Sleep(200 * time.Millisecond)
						continue
					}
					return errors.New("empty list of clusters was returned")
				}
				cluster = lcr.Clusters[0]
				break
			}
			// In Init, we take the initial default values from swarmkit and merge
			// every non-nil, non-zero value from spec into the GRPC spec. This
			// leaves the remaining defaults alone.
			// Note that this is different from Update(), where we expect the user
			// to specify the complete spec of the cluster (as they already know
			// the existing one and which fields to update).
			clusterSpec, err := convert.MergeSwarmSpecToGRPC(spec, cluster.Spec)
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			_, err = client.UpdateCluster(ctx, &swarmapi.UpdateClusterRequest{
				ClusterID:      cluster.ID,
				ClusterVersion: &cluster.Meta.Version,
				Spec:           &clusterSpec,
			})
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			return nil
		}
	}
	return ctx.Err()
}

func (c *Cluster) listContainerForNode(nodeID string) ([]string, error) {
	var ids []string
	filters := filters.NewArgs()
	filters.Add("label", fmt.Sprintf("com.docker.swarm.node.id=%s", nodeID))
	containers, err := c.config.Backend.Containers(&apitypes.ContainerListOptions{
		Filters: filters,
	})
	if err != nil {
		return []string{}, err
	}
	for _, c := range containers {
		ids = append(ids, c.ID)
	}
	return ids, nil
}
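
// exampleJoinAsWorker is an illustrative sketch (not part of the original
// file) of the minimal JoinRequest accepted by Join earlier in this file:
// a listen address, at least one remote manager address (enforced by
// validateAndSanitizeJoinRequest above), and a join token. All literal
// values are placeholder assumptions.
func exampleJoinAsWorker(c *Cluster) error {
	return c.Join(types.JoinRequest{
		ListenAddr:  "0.0.0.0:2377",                 // bind address for cluster management traffic
		RemoteAddrs: []string{"192.168.65.2:2377"},  // existing manager to contact
		JoinToken:   "SWMTKN-1-<token-placeholder>", // placeholder worker join token
	})
}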