github.com/Prakhar-Agarwal-byte/moby@v0.0.0-20231027092010-a14e3e8ab87e/daemon/cluster/swarm.go

package cluster // import "github.com/Prakhar-Agarwal-byte/moby/daemon/cluster"

import (
	"context"
	"fmt"
	"net"
	"strings"
	"time"

	apitypes "github.com/Prakhar-Agarwal-byte/moby/api/types"
	"github.com/Prakhar-Agarwal-byte/moby/api/types/container"
	"github.com/Prakhar-Agarwal-byte/moby/api/types/filters"
	types "github.com/Prakhar-Agarwal-byte/moby/api/types/swarm"
	"github.com/Prakhar-Agarwal-byte/moby/daemon/cluster/convert"
	"github.com/Prakhar-Agarwal-byte/moby/errdefs"
	"github.com/Prakhar-Agarwal-byte/moby/opts"
	"github.com/Prakhar-Agarwal-byte/moby/pkg/stack"
	"github.com/containerd/log"
	swarmapi "github.com/moby/swarmkit/v2/api"
	"github.com/moby/swarmkit/v2/manager/encryption"
	swarmnode "github.com/moby/swarmkit/v2/node"
	"github.com/pkg/errors"
	"google.golang.org/grpc"
)

// Init initializes a new cluster from the user-provided request.
func (c *Cluster) Init(req types.InitRequest) (string, error) {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()
	if c.nr != nil {
		if req.ForceNewCluster {

			// Take c.mu temporarily to wait for presently running
			// API handlers to finish before shutting down the node.
			c.mu.Lock()
			if !c.nr.nodeState.IsManager() {
				c.mu.Unlock()
				return "", errSwarmNotManager
			}
			c.mu.Unlock()

			if err := c.nr.Stop(); err != nil {
				return "", err
			}
		} else {
			return "", errSwarmExists
		}
	}

	if err := validateAndSanitizeInitRequest(&req); err != nil {
		return "", errdefs.InvalidParameter(err)
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		return "", errdefs.InvalidParameter(err)
	}

	advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
	if err != nil {
		return "", err
	}

	dataPathAddr, err := resolveDataPathAddr(req.DataPathAddr)
	if err != nil {
		return "", err
	}

	localAddr := listenHost

	// If the local address is undetermined, the advertise address
	// will be used as the local address, if it belongs to this system.
	// If the advertise address is not local, then we try to find
	// a system address to use as the local address. If this fails,
	// we give up and ask the user to pass the listen address.
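	//
	// As a concrete sketch (hypothetical values): with ListenAddr "0.0.0.0:2377"
	// the listen host is unspecified, so if AdvertiseAddr resolves to
	// "192.0.2.10" and that IP is reported by listSystemIPs(), localAddr becomes
	// "192.0.2.10"; otherwise resolveSystemAddr() is consulted, and if that also
	// fails, Init returns errMustSpecifyListenAddr.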
	if net.ParseIP(localAddr).IsUnspecified() {
		advertiseIP := net.ParseIP(advertiseHost)

		found := false
		for _, systemIP := range listSystemIPs() {
			if systemIP.Equal(advertiseIP) {
				localAddr = advertiseIP.String()
				found = true
				break
			}
		}

		if !found {
			ip, err := c.resolveSystemAddr()
			if err != nil {
				log.G(context.TODO()).Warnf("Could not find a local address: %v", err)
				return "", errMustSpecifyListenAddr
			}
			localAddr = ip.String()
		}
	}

	if err := validateDefaultAddrPool(req.DefaultAddrPool, req.SubnetSize); err != nil {
		return "", err
	}

	port, err := getDataPathPort(req.DataPathPort)
	if err != nil {
		return "", err
	}

	nr, err := c.newNodeRunner(nodeStartConfig{
		forceNewCluster:    req.ForceNewCluster,
		autolock:           req.AutoLockManagers,
		LocalAddr:          localAddr,
		ListenAddr:         net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr:      net.JoinHostPort(advertiseHost, advertisePort),
		DataPathAddr:       dataPathAddr,
		DefaultAddressPool: req.DefaultAddrPool,
		SubnetSize:         req.SubnetSize,
		availability:       req.Availability,
		DataPathPort:       port,
	})
	if err != nil {
		return "", err
	}
	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	if err := <-nr.Ready(); err != nil {
		c.mu.Lock()
		c.nr = nil
		c.mu.Unlock()
		if !req.ForceNewCluster { // if the first attempt fails, don't keep state
			if err := clearPersistentState(c.root); err != nil {
				return "", err
			}
		}
		return "", err
	}
	state := nr.State()
	if state.swarmNode == nil { // should never happen, but protect from panic
		return "", errors.New("invalid cluster state for spec initialization")
	}
	if err := initClusterSpec(state.swarmNode, req.Spec); err != nil {
		return "", err
	}
	return state.NodeID(), nil
}
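
// A minimal caller-side sketch of Init (hypothetical values, not part of this
// package; the swarm API types are imported in this file as "types"):
//
//	nodeID, err := c.Init(types.InitRequest{
//		ListenAddr:    "0.0.0.0:2377",
//		AdvertiseAddr: "192.0.2.10:2377",
//	})
//	if err != nil {
//		// handle e.g. an invalid parameter or an already-existing swarm
//	}
//	_ = nodeID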

// Join makes the current Cluster part of an existing swarm cluster.
func (c *Cluster) Join(req types.JoinRequest) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()
	c.mu.Lock()
	if c.nr != nil {
		c.mu.Unlock()
		return errors.WithStack(errSwarmExists)
	}
	c.mu.Unlock()

	if err := validateAndSanitizeJoinRequest(&req); err != nil {
		return errdefs.InvalidParameter(err)
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		return err
	}

	var advertiseAddr string
	if req.AdvertiseAddr != "" {
		advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
		// For joining, we don't need to provide an advertise address,
		// since the remote side can detect it.
		if err == nil {
			advertiseAddr = net.JoinHostPort(advertiseHost, advertisePort)
		}
	}

	dataPathAddr, err := resolveDataPathAddr(req.DataPathAddr)
	if err != nil {
		return err
	}

	nr, err := c.newNodeRunner(nodeStartConfig{
		RemoteAddr:    req.RemoteAddrs[0],
		ListenAddr:    net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr: advertiseAddr,
		DataPathAddr:  dataPathAddr,
		joinAddr:      req.RemoteAddrs[0],
		joinToken:     req.JoinToken,
		availability:  req.Availability,
	})
	if err != nil {
		return err
	}

	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	timeout := time.NewTimer(swarmConnectTimeout)
	defer timeout.Stop()

	select {
	case <-timeout.C:
		return errSwarmJoinTimeoutReached
	case err := <-nr.Ready():
		if err != nil {
			c.mu.Lock()
			c.nr = nil
			c.mu.Unlock()
			if err := clearPersistentState(c.root); err != nil {
				return err
			}
		}
		return err
	}
}

// Inspect retrieves the configuration properties of a managed swarm cluster.
func (c *Cluster) Inspect() (types.Swarm, error) {
	var swarm types.Swarm
	if err := c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
		s, err := c.inspect(ctx, state)
		if err != nil {
			return err
		}
		swarm = s
		return nil
	}); err != nil {
		return types.Swarm{}, err
	}
	return swarm, nil
}

func (c *Cluster) inspect(ctx context.Context, state nodeState) (types.Swarm, error) {
	s, err := getSwarm(ctx, state.controlClient)
	if err != nil {
		return types.Swarm{}, err
	}
	return convert.SwarmFromGRPC(*s), nil
}

// Update updates the configuration of a managed swarm cluster.
func (c *Cluster) Update(version uint64, spec types.Spec, flags types.UpdateFlags) error {
	return c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
		swarm, err := getSwarm(ctx, state.controlClient)
		if err != nil {
			return err
		}

		// Validate the spec name.
		if spec.Annotations.Name == "" {
			spec.Annotations.Name = "default"
		} else if spec.Annotations.Name != "default" {
			return errdefs.InvalidParameter(errors.New(`swarm spec must be named "default"`))
		}

		// In Update, the client should provide the complete spec of the swarm,
		// including Name and Labels. If a field is specified as 0 or nil, the
		// swarmkit default value will be used.
		clusterSpec, err := convert.SwarmSpecToGRPC(spec)
		if err != nil {
			return errdefs.InvalidParameter(err)
		}

		_, err = state.controlClient.UpdateCluster(
			ctx,
			&swarmapi.UpdateClusterRequest{
				ClusterID: swarm.ID,
				Spec:      &clusterSpec,
				ClusterVersion: &swarmapi.Version{
					Index: version,
				},
				Rotation: swarmapi.KeyRotation{
					WorkerJoinToken:  flags.RotateWorkerToken,
					ManagerJoinToken: flags.RotateManagerToken,
					ManagerUnlockKey: flags.RotateManagerUnlockKey,
				},
			},
		)
		return err
	})
}
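
// A caller-side sketch of rotating the manager unlock key via Update
// (hypothetical; field names follow the api/types/swarm package imported
// in this file as "types"):
//
//	sw, err := c.Inspect()
//	if err == nil {
//		err = c.Update(sw.Version.Index, sw.Spec, types.UpdateFlags{
//			RotateManagerUnlockKey: true,
//		})
//	}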

// GetUnlockKey returns the unlock key for the swarm.
func (c *Cluster) GetUnlockKey() (string, error) {
	var resp *swarmapi.GetUnlockKeyResponse
	if err := c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
		client := swarmapi.NewCAClient(state.grpcConn)

		r, err := client.GetUnlockKey(ctx, &swarmapi.GetUnlockKeyRequest{})
		if err != nil {
			return err
		}
		resp = r
		return nil
	}); err != nil {
		return "", err
	}
	if len(resp.UnlockKey) == 0 {
		// no key
		return "", nil
	}
	return encryption.HumanReadableKey(resp.UnlockKey), nil
}

// UnlockSwarm provides a key to decrypt data that is encrypted at rest.
func (c *Cluster) UnlockSwarm(req types.UnlockRequest) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()

	c.mu.RLock()
	state := c.currentNodeState()

	if !state.IsActiveManager() {
		// When the manager is not active, return the error unless it is
		// errSwarmLocked; a locked swarm is the expected state here.
		if err := c.errNoManager(state); err != errSwarmLocked {
			c.mu.RUnlock()
			return err
		}
	} else {
		// When the manager is active, the swarm is not locked, so return
		// a "not locked" error.
		c.mu.RUnlock()
		return notLockedError{}
	}

	// Execution only reaches this point when the swarm is locked.
	nr := c.nr
	c.mu.RUnlock()

	key, err := encryption.ParseHumanReadableKey(req.UnlockKey)
	if err != nil {
		return errdefs.InvalidParameter(err)
	}

	config := nr.config
	config.lockKey = key
	if err := nr.Stop(); err != nil {
		return err
	}
	nr, err = c.newNodeRunner(config)
	if err != nil {
		return err
	}

	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	if err := <-nr.Ready(); err != nil {
		if errors.Is(err, errSwarmLocked) {
			return invalidUnlockKey{}
		}
		return errors.Errorf("swarm component could not be started: %v", err)
	}
	return nil
}
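
// A caller-side sketch of the lock/unlock flow (hypothetical; UnlockRequest is
// from the api/types/swarm package imported in this file as "types"):
//
//	key, err := c.GetUnlockKey() // run on an unlocked manager; store the key safely
//
//	// later, after restarting a manager whose state is locked:
//	err = c.UnlockSwarm(types.UnlockRequest{UnlockKey: key})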

// Leave shuts down the Cluster and removes the current state.
func (c *Cluster) Leave(ctx context.Context, force bool) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()

	c.mu.Lock()
	nr := c.nr
	if nr == nil {
		c.mu.Unlock()
		return errors.WithStack(errNoSwarm)
	}

	state := c.currentNodeState()

	c.mu.Unlock()

	if errors.Is(state.err, errSwarmLocked) && !force {
		// leaving a locked swarm without --force is not allowed
		return errors.WithStack(notAvailableError("Swarm is encrypted and locked. Please unlock it first or use `--force` to ignore this message."))
	}

	if state.IsManager() && !force {
		msg := "You are attempting to leave the swarm on a node that is participating as a manager. "
		if state.IsActiveManager() {
			active, reachable, unreachable, err := managerStats(state.controlClient, state.NodeID())
			if err == nil {
				if active && removingManagerCausesLossOfQuorum(reachable, unreachable) {
					if isLastManager(reachable, unreachable) {
						msg += "Removing the last manager erases all current state of the swarm. Use `--force` to ignore this message. "
						return errors.WithStack(notAvailableError(msg))
					}
					msg += fmt.Sprintf("Removing this node leaves %v managers out of %v. Without a Raft quorum your swarm will be inaccessible. ", reachable-1, reachable+unreachable)
				}
			}
		} else {
			msg += "Doing so may lose the consensus of your cluster. "
		}

		msg += "The only way to restore a swarm that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to suppress this message."
		return errors.WithStack(notAvailableError(msg))
	}
	// release readers in here
	if err := nr.Stop(); err != nil {
		log.G(ctx).Errorf("failed to shut down cluster node: %v", err)
		stack.Dump()
		return err
	}

	c.mu.Lock()
	c.nr = nil
	c.mu.Unlock()

	if nodeID := state.NodeID(); nodeID != "" {
		nodeContainers, err := c.listContainerForNode(ctx, nodeID)
		if err != nil {
			return err
		}
		for _, id := range nodeContainers {
			if err := c.config.Backend.ContainerRm(id, &apitypes.ContainerRmConfig{ForceRemove: true}); err != nil {
				log.G(ctx).Errorf("error removing %v: %v", id, err)
			}
		}
	}

	// todo: cleanup optional?
	if err := clearPersistentState(c.root); err != nil {
		return err
	}
	c.config.Backend.DaemonLeavesCluster()
	return nil
}
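
// A caller-side sketch of Leave (hypothetical): leaving while the swarm is
// locked, or as a manager whose removal would cost quorum, is refused unless
// force is set.
//
//	if err := c.Leave(ctx, false); err != nil {
//		// surface the message to the operator, who may deliberately retry
//		// with c.Leave(ctx, true) to force the node out
//	}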

// Info returns information about the current cluster state.
func (c *Cluster) Info() types.Info {
	info := types.Info{
		NodeAddr: c.GetAdvertiseAddress(),
	}
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	info.LocalNodeState = state.status
	if state.err != nil {
		info.Error = state.err.Error()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	if state.IsActiveManager() {
		info.ControlAvailable = true
		swarm, err := c.inspect(ctx, state)
		if err != nil {
			info.Error = err.Error()
		}

		info.Cluster = &swarm.ClusterInfo

		if r, err := state.controlClient.ListNodes(
			ctx, &swarmapi.ListNodesRequest{},
			grpc.MaxCallRecvMsgSize(defaultRecvSizeForListResponse),
		); err != nil {
			info.Error = err.Error()
		} else {
			info.Nodes = len(r.Nodes)
			for _, n := range r.Nodes {
				if n.ManagerStatus != nil {
					info.Managers = info.Managers + 1
				}
			}
		}

		switch info.LocalNodeState {
		case types.LocalNodeStateInactive, types.LocalNodeStateLocked, types.LocalNodeStateError:
			// nothing to do
		default:
			if info.Managers == 2 {
				const warn string = `WARNING: Running Swarm in a two-manager configuration. This configuration provides
no fault tolerance, and poses a high risk to lose control over the cluster.
Refer to https://docs.docker.com/engine/swarm/admin_guide/ to configure the
Swarm for fault-tolerance.`

				info.Warnings = append(info.Warnings, warn)
			}
		}
	}

	if state.swarmNode != nil {
		for _, r := range state.swarmNode.Remotes() {
			info.RemoteManagers = append(info.RemoteManagers, types.Peer{NodeID: r.NodeID, Addr: r.Addr})
		}
		info.NodeID = state.swarmNode.NodeID()
	}

	return info
}

// Status returns a textual representation of the node's swarm status and role (manager/worker)
func (c *Cluster) Status() string {
	c.mu.RLock()
	s := c.currentNodeState()
	c.mu.RUnlock()

	state := string(s.status)
	if s.status == types.LocalNodeStateActive {
		if s.IsActiveManager() || s.IsManager() {
			state += "/manager"
		} else {
			state += "/worker"
		}
	}
	return state
}

func validateAndSanitizeInitRequest(req *types.InitRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}

	if req.Spec.Annotations.Name == "" {
		req.Spec.Annotations.Name = "default"
	} else if req.Spec.Annotations.Name != "default" {
		return errors.New(`swarm spec must be named "default"`)
	}

	return nil
}

func validateAndSanitizeJoinRequest(req *types.JoinRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}
	if len(req.RemoteAddrs) == 0 {
		return errors.New("at least 1 RemoteAddr is required to join")
	}
	for i := range req.RemoteAddrs {
		req.RemoteAddrs[i], err = validateAddr(req.RemoteAddrs[i])
		if err != nil {
			return fmt.Errorf("invalid remoteAddr %q: %v", req.RemoteAddrs[i], err)
		}
	}
	return nil
}

func validateAddr(addr string) (string, error) {
	if addr == "" {
		return addr, errors.New("invalid empty address")
	}
	newaddr, err := opts.ParseTCPAddr(addr, defaultAddr)
	if err != nil {
		// TODO(thaJeztah) why are we ignoring the error here? Is this to allow "non-tcp" addresses?
		return addr, nil
	}
	return strings.TrimPrefix(newaddr, "tcp://"), nil
}
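
// A worked example of validateAddr (a sketch; the default host and port come
// from defaultAddr, assumed here to carry the standard swarm port 2377):
//
//	validateAddr("192.0.2.10") // -> "192.0.2.10:2377": ParseTCPAddr fills in the port, then "tcp://" is trimmed
//	validateAddr("")           // -> error: "invalid empty address"
//	// an address that opts.ParseTCPAddr cannot parse is returned unchanged (see the TODO above)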

func initClusterSpec(node *swarmnode.Node, spec types.Spec) error {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	for conn := range node.ListenControlSocket(ctx) {
		if ctx.Err() != nil {
			return ctx.Err()
		}
		if conn != nil {
			client := swarmapi.NewControlClient(conn)
			var cluster *swarmapi.Cluster
			for i := 0; ; i++ {
				lcr, err := client.ListClusters(ctx, &swarmapi.ListClustersRequest{})
				if err != nil {
					return fmt.Errorf("error on listing clusters: %v", err)
				}
				if len(lcr.Clusters) == 0 {
					if i < 10 {
						time.Sleep(200 * time.Millisecond)
						continue
					}
					return errors.New("empty list of clusters was returned")
				}
				cluster = lcr.Clusters[0]
				break
			}
			// In init, we take the initial default values from swarmkit and merge
			// any non-nil, non-zero value from the spec into the GRPC spec. This
			// leaves the default values alone.
			// Note that this is different from Update(), where we expect the user
			// to specify the complete spec of the cluster (as they already know
			// the existing one and which fields to update).
			clusterSpec, err := convert.MergeSwarmSpecToGRPC(spec, cluster.Spec)
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			_, err = client.UpdateCluster(ctx, &swarmapi.UpdateClusterRequest{
				ClusterID:      cluster.ID,
				ClusterVersion: &cluster.Meta.Version,
				Spec:           &clusterSpec,
			})
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			return nil
		}
	}
	return ctx.Err()
}

func (c *Cluster) listContainerForNode(ctx context.Context, nodeID string) ([]string, error) {
	var ids []string
	containers, err := c.config.Backend.Containers(ctx, &container.ListOptions{
		Filters: filters.NewArgs(filters.Arg("label", "com.docker.swarm.node.id="+nodeID)),
	})
	if err != nil {
		return []string{}, err
	}
	for _, c := range containers {
		ids = append(ids, c.ID)
	}
	return ids, nil
}
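
// A caller-side sketch of Join on a second node (hypothetical values, not part
// of this package): RemoteAddrs must name at least one existing member, and the
// join token determines whether the node joins as a worker or a manager.
//
//	err := c.Join(types.JoinRequest{
//		ListenAddr:  "0.0.0.0:2377",
//		RemoteAddrs: []string{"192.0.2.10:2377"},
//		JoinToken:   "<join token obtained from the existing swarm>",
//	})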