github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/tools/node.go

// Package tools provides common tools and utilities for all unit and integration tests
/*
 * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
 */
package tools

import (
	"context"
	"errors"
	"fmt"
	"math/rand"
	"net"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"strings"
	"syscall"
	"testing"
	"time"

	"github.com/NVIDIA/aistore/api"
	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/api/env"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/fname"
	"github.com/NVIDIA/aistore/cmn/jsp"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/tools/docker"
	"github.com/NVIDIA/aistore/tools/tassert"
	"github.com/NVIDIA/aistore/tools/tlog"
	"github.com/NVIDIA/aistore/xact"
)

const (
	maxNodeRetry = 10 // max retries to get health
)

const resilverTimeout = time.Minute

type (
	nodesCnt int

	WaitRetryOpts struct {
		MaxRetries int
		Interval   time.Duration
	}
)

var ErrTimedOutStabilize = errors.New("timed out waiting for the cluster to stabilize")

func (n nodesCnt) satisfied(actual int) bool {
	if n == 0 {
		return true
	}
	return int(n) == actual
}

// as the name implies
func WaitNodePubAddrNotInUse(si *meta.Snode, timeout time.Duration) error {
	var (
		addr     = si.PubNet.TCPEndpoint()
		interval = controlPlaneSleep
	)
	tlog.Logf("Waiting for %s to shutdown (and stop listening)\n", si.StringEx())
	time.Sleep(interval) // not immediate
	for elapsed := time.Duration(0); elapsed < timeout; elapsed += interval {
		conn, err := net.DialTimeout("tcp4", addr, interval)
		if err != nil {
			time.Sleep(interval)
			return nil
		}
		conn.Close() // the address is still being listened on
		time.Sleep(interval)
	}
	return errors.New("timeout")
}

// Add an alive node that is not in the Smap to the cluster.
// Use it to add a new node to the cluster or to get back a node removed by `RemoveNodeUnsafe`.
func JoinCluster(proxyURL string, node *meta.Snode) (string, error) {
	return _joinCluster(proxyURL, node, registerTimeout)
}
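
// exampleShutdownAndWait is a hypothetical usage sketch (not part of the original file):
// it shows how WaitNodePubAddrNotInUse is typically paired with a node shutdown, so that
// a subsequent restart does not race with the terminating process that may still hold
// its public port. The one-minute timeout is an arbitrary illustration value.
func exampleShutdownAndWait(t *testing.T, proxyURL string, target *meta.Snode) RestoreCmd {
	_, cmd, _, err := ShutdownNode(t, BaseAPIParams(proxyURL), target)
	tassert.CheckFatal(t, err)
	err = WaitNodePubAddrNotInUse(target, time.Minute)
	tassert.CheckFatal(t, err)
	return cmd
}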

// Restore a node put into maintenance: in this case the node is still in the Smap,
// and canceling maintenance gets it back.
func RestoreTarget(t *testing.T, proxyURL string, target *meta.Snode) (rebID string, newSmap *meta.Smap) {
	smap := GetClusterMap(t, proxyURL)
	tlog.Logf("Joining target %s (current %s)\n", target.StringEx(), smap.StringEx())
	val := &apc.ActValRmNode{DaemonID: target.ID()}
	rebID, err := api.StopMaintenance(BaseAPIParams(proxyURL), val)
	tassert.CheckFatal(t, err)
	newSmap, err = WaitForClusterState(
		proxyURL,
		"join target",
		smap.Version,
		smap.CountActivePs(),
		smap.CountActiveTs()+1,
	)
	tassert.CheckFatal(t, err)
	return rebID, newSmap
}

func ClearMaintenance(bp api.BaseParams, tsi *meta.Snode) {
	val := &apc.ActValRmNode{DaemonID: tsi.ID(), SkipRebalance: true}
	// this can fail if the node is not under maintenance, which is fine
	_, _ = api.StopMaintenance(bp, val)
}

func RandomProxyURL(ts ...*testing.T) (url string) {
	var (
		bp        = BaseAPIParams(proxyURLReadOnly)
		smap, err = waitForStartup(bp)
		retries   = 3
	)
	if err == nil {
		return getRandomProxyURL(smap)
	}
	for _, node := range pmapReadOnly {
		url := node.URL(cmn.NetPublic)
		if url == proxyURLReadOnly {
			continue
		}
		if retries == 0 {
			return ""
		}
		bp = BaseAPIParams(url)
		if smap, err = waitForStartup(bp); err == nil {
			return getRandomProxyURL(smap)
		}
		retries--
	}
	if len(ts) > 0 {
		tassert.CheckFatal(ts[0], err)
	}

	return ""
}

func getRandomProxyURL(smap *meta.Smap) string {
	proxies := smap.Pmap.ActiveNodes()
	return proxies[rand.Intn(len(proxies))].URL(cmn.NetPublic)
}

// GetICProxy returns the first proxy from the smap that is an IC member.
// The primary proxy takes priority.
func GetICProxy(t testing.TB, smap *meta.Smap, ignoreID string) *meta.Snode {
	if smap.IsIC(smap.Primary) {
		return smap.Primary
	}
	for _, proxy := range smap.Pmap {
		if ignoreID != "" && proxy.ID() == ignoreID {
			continue
		}
		if !smap.IsIC(proxy) {
			continue
		}
		return proxy
	}
	t.Fatal("failed to find an IC member")
	return nil
}

// WaitForClusterStateActual waits until the cluster reaches the specified state, meaning:
// - the Smap has a version larger than origVersion
// - the number of proxies in the Smap equals proxyCnt, unless proxyCnt == 0
// - the number of targets in the Smap equals targetCnt, unless targetCnt == 0.
//
// It returns the Smap that satisfies those requirements.
func WaitForClusterStateActual(proxyURL, reason string, origVersion int64, proxyCnt, targetCnt int,
	syncIgnoreIDs ...string) (*meta.Smap, error) {
	for {
		smap, err := WaitForClusterState(proxyURL, reason, origVersion, proxyCnt, targetCnt, syncIgnoreIDs...)
		if err != nil {
			return nil, err
		}
		if smap.CountTargets() == targetCnt && smap.CountProxies() == proxyCnt {
			return smap, nil
		}
		tlog.Logf("Smap changed from v%d to v%d, but the expected number of proxies (%d/%d) and/or targets (%d/%d) has not been reached\n",
			origVersion, smap.Version, proxyCnt, smap.CountProxies(), targetCnt, smap.CountTargets())
		origVersion = smap.Version
	}
}
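
// exampleMaintenanceRoundTrip is a hypothetical usage sketch (not part of the original
// file): after a test has put a target into maintenance, RestoreTarget cancels the
// maintenance and waits for the target to become active again; WaitForRebalAndResil
// (defined elsewhere in this package) then lets the triggered rebalance finish.
func exampleMaintenanceRoundTrip(t *testing.T, proxyURL string, target *meta.Snode) {
	rebID, smap := RestoreTarget(t, proxyURL, target)
	tlog.Logf("rebalance %s triggered, cluster: %s\n", rebID, smap.StringEx())
	WaitForRebalAndResil(t, BaseAPIParams(proxyURL))
}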

// WaitForClusterState waits until the cluster reaches the specified state, meaning:
// - the Smap has a version larger than origVer
// - the number of active proxies equals pcnt, unless pcnt == 0
// - the number of active targets equals tcnt, unless tcnt == 0.
//
// It returns the Smap that satisfies those requirements.
// NOTE: upon successful return from this function the cluster state may have already changed.
func WaitForClusterState(proxyURL, reason string, origVer int64, pcnt, tcnt int, ignoreIDs ...string) (*meta.Smap, error) {
	const (
		maxSleep = 7 * time.Second
		maxWait  = 2 * time.Minute
	)
	var (
		expPrx  = nodesCnt(pcnt)
		expTgt  = nodesCnt(tcnt)
		bp      = BaseAPIParams(proxyURL)
		lastVer int64
		iter    int
	)
	if expPrx == 0 && expTgt == 0 {
		if origVer > 0 {
			tlog.Logf("Waiting for %q (Smap > v%d)\n", reason, origVer)
		} else {
			tlog.Logf("Waiting for %q\n", reason)
		}
	} else {
		if origVer > 0 {
			tlog.Logf("Waiting for %q (p%d, t%d, Smap > v%d)\n", reason, expPrx, expTgt, origVer)
		} else {
			tlog.Logf("Waiting for %q (p%d, t%d)\n", reason, expPrx, expTgt)
		}
	}
	started := time.Now()
	deadline := started.Add(maxWait)
	opDeadline := started.Add(2 * maxWait)
	for {
		var (
			smap, err = api.GetClusterMap(bp)
			ok        bool
		)
		if err != nil {
			if !cos.IsRetriableConnErr(err) {
				return nil, err
			}
			tlog.Logf("%v\n", err)
			goto next
		}
		ok = expTgt.satisfied(smap.CountActiveTs()) && expPrx.satisfied(smap.CountActivePs()) &&
			smap.Version > origVer
		if ok && time.Since(started) < time.Second {
			time.Sleep(time.Second)
			lastVer = smap.Version
			continue
		}
		if !ok {
			if time.Since(started) > maxSleep {
				pid := pidFromURL(smap, proxyURL)
				if expPrx == 0 && expTgt == 0 {
					tlog.Logf("Polling %s(%s) for (Smap > v%d)\n", meta.Pname(pid), smap.StringEx(), origVer)
				} else {
					tlog.Logf("Polling %s(%s) for (t=%d, p=%d, Smap > v%d)\n",
						meta.Pname(pid), smap.StringEx(), expTgt, expPrx, origVer)
				}
			}
		}
		if smap.Version != lastVer && lastVer != 0 {
			deadline = cos.MinTime(time.Now().Add(maxWait), opDeadline)
		}
		// if the primary's map has changed to the state we want, wait for the change
		// to propagate to the rest of the cluster
		if ok {
			syncedSmap := &meta.Smap{}
			cos.CopyStruct(syncedSmap, smap)

			// skip the primary proxy and mock targets
			proxyID := pidFromURL(smap, proxyURL)
			idsToIgnore := cos.NewStrSet(MockDaemonID, proxyID)
			idsToIgnore.Add(ignoreIDs...)
			err = waitSmapSync(bp, deadline, syncedSmap, origVer, idsToIgnore)
			if err != nil {
				tlog.Logf("Failed waiting for cluster state condition: %v (%s, %s, %v, %v)\n",
					err, smap, syncedSmap, origVer, idsToIgnore)
				return nil, err
			}
			if !expTgt.satisfied(smap.CountActiveTs()) || !expPrx.satisfied(smap.CountActivePs()) {
				return nil, fmt.Errorf("%s updated and does not satisfy the state condition anymore", smap.StringEx())
			}
			return smap, nil
		}
		lastVer = smap.Version
		iter++
	next:
		if time.Now().After(deadline) {
			break
		}
		time.Sleep(min(time.Second*time.Duration(iter), maxSleep))
	}

	return nil, ErrTimedOutStabilize
}

func pidFromURL(smap *meta.Smap, proxyURL string) string {
	for _, p := range smap.Pmap {
		if p.PubNet.URL == proxyURL {
			return p.ID()
		}
	}
	return ""
}

func WaitForNewSmap(proxyURL string, prevVersion int64) (newSmap *meta.Smap, err error) {
	return WaitForClusterState(proxyURL, "new smap version", prevVersion, 0, 0)
}

func WaitForResilvering(t *testing.T, bp api.BaseParams, target *meta.Snode) {
	args := xact.ArgsMsg{Kind: apc.ActResilver, Timeout: resilverTimeout}
	if target != nil {
		args.DaemonID = target.ID()
		time.Sleep(controlPlaneSleep)
	} else {
		time.Sleep(2 * controlPlaneSleep)
	}
	allFinished := func(snaps xact.MultiSnap) (done, resetProbeFreq bool) {
		tid, xsnap, err := snaps.RunningTarget("")
		tassert.CheckFatal(t, err)
		if tid != "" {
			tlog.Logf("t[%s]: x-%s[%s] is running\n", tid, xsnap.Kind, xsnap.ID)
			return false, false
		}
		return true, true
	}
	err := api.WaitForXactionNode(bp, &args, allFinished)
	tassert.CheckFatal(t, err)
}

func GetTargetsMountpaths(t *testing.T, smap *meta.Smap, params api.BaseParams) map[*meta.Snode][]string {
	mpathsByTarget := make(map[*meta.Snode][]string, smap.CountTargets())
	for _, target := range smap.Tmap {
		mpl, err := api.GetMountpaths(params, target)
		tassert.CheckFatal(t, err)
		mpathsByTarget[target] = mpl.Available
	}

	return mpathsByTarget
}

func KillNode(node *meta.Snode) (cmd RestoreCmd, err error) {
	restoreNodesOnce.Do(func() {
		initNodeCmd()
	})

	var (
		daemonID = node.ID()
		port     = node.PubNet.Port
		pid      int
	)
	cmd.Node = node
	if docker.IsRunning() {
		tlog.Logf("Stopping container %s\n", daemonID)
		err := docker.Stop(daemonID)
		return cmd, err
	}

	pid, cmd.Cmd, cmd.Args, err = getProcess(port)
	if err != nil {
		return
	}

	if err = syscall.Kill(pid, syscall.SIGINT); err != nil {
		return
	}
	// wait for the process to actually disappear
	to := time.Now().Add(time.Second * 30)
	for {
		if _, _, _, errPs := getProcess(port); errPs != nil {
			break
		}
		if time.Now().After(to) {
			err = fmt.Errorf("failed to 'kill -2' process (pid: %d, port: %s)", pid, port)
			break
		}
		time.Sleep(time.Second)
	}

	syscall.Kill(pid, syscall.SIGKILL)
	time.Sleep(time.Second)

	if err != nil {
		if _, _, _, errPs := getProcess(port); errPs != nil {
			err = nil
		} else {
			err = fmt.Errorf("failed to 'kill -9' process (pid: %d, port: %s)", pid, port)
		}
	}
	return
}
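
// exampleKillRestore is a hypothetical usage sketch (not part of the original file):
// kill a target's process, restore it from the captured RestoreCmd, and then wait for
// a newer Smap in which the original numbers of active proxies and targets are back.
func exampleKillRestore(t *testing.T, proxyURL string, target *meta.Snode) {
	smap := GetClusterMap(t, proxyURL)
	cmd, err := KillNode(target)
	tassert.CheckFatal(t, err)

	err = RestoreNode(cmd, false /*asPrimary*/, "target")
	tassert.CheckFatal(t, err)

	_, err = WaitForClusterState(proxyURL, "target restored", smap.Version,
		smap.CountActivePs(), smap.CountActiveTs())
	tassert.CheckFatal(t, err)
}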

func ShutdownNode(_ *testing.T, bp api.BaseParams, node *meta.Snode) (pid int, cmd RestoreCmd, rebID string, err error) {
	restoreNodesOnce.Do(func() {
		initNodeCmd()
	})

	var (
		daemonID = node.ID()
		port     = node.PubNet.Port
	)
	tlog.Logf("Shutting down %s\n", node.StringEx())
	cmd.Node = node
	if docker.IsRunning() {
		tlog.Logf("Stopping container %s\n", daemonID)
		err = docker.Stop(daemonID)
		return
	}

	pid, cmd.Cmd, cmd.Args, err = getProcess(port)
	if err != nil {
		return
	}

	actValue := &apc.ActValRmNode{DaemonID: daemonID}
	rebID, err = api.ShutdownNode(bp, actValue)
	return
}

func RestoreNode(cmd RestoreCmd, asPrimary bool, tag string) error {
	if docker.IsRunning() {
		tlog.Logf("Restarting %s container %s\n", tag, cmd)
		return docker.Restart(cmd.Node.ID())
	}

	if !cos.AnyHasPrefixInSlice("-daemon_id", cmd.Args) {
		cmd.Args = append(cmd.Args, "-daemon_id="+cmd.Node.ID())
	}

	tlog.Logf("Restoring %s: %s %+v\n", tag, cmd.Cmd, cmd.Args)
	pid, err := startNode(&cmd, asPrimary)
	if err == nil && pid <= 0 {
		err = fmt.Errorf("RestoreNode: invalid process ID %d", pid)
	}
	return err
}

// When a test is interrupted with Ctrl-C, the children (restored daemons) must not be
// killed along with it.
// (see: https://groups.google.com/forum/#!topic/golang-nuts/shST-SDqIp4)
func startNode(cmd *RestoreCmd, asPrimary bool) (int, error) {
	ncmd := exec.Command(cmd.Cmd, cmd.Args...) //nolint:gosec // used only in tests
	ncmd.SysProcAttr = &syscall.SysProcAttr{
		Setpgid: true,
	}
	if asPrimary {
		// set the environment variable so that the node starts as the primary
		environ := os.Environ()
		environ = append(environ, env.AIS.PrimaryEP+"="+cmd.Node.PubNet.URL)
		ncmd.Env = environ
	}
	if err := ncmd.Start(); err != nil {
		return 0, err
	}
	pid := ncmd.Process.Pid
	err := ncmd.Process.Release()
	return pid, err
}

func DeployNode(t *testing.T, node *meta.Snode, conf *cmn.Config, localConf *cmn.LocalConfig) int {
	conf.ConfigDir = t.TempDir()
	conf.LogDir = t.TempDir()
	conf.TestFSP.Root = t.TempDir()
	conf.TestFSP.Instance = 42

	if localConf == nil {
		localConf = &cmn.LocalConfig{}
		localConf.ConfigDir = conf.ConfigDir
		localConf.HostNet.Port = conf.HostNet.Port
		localConf.HostNet.PortIntraControl = conf.HostNet.PortIntraControl
		localConf.HostNet.PortIntraData = conf.HostNet.PortIntraData
	}

	localConfFile := filepath.Join(conf.ConfigDir, fname.PlainLocalConfig)
	err := jsp.SaveMeta(localConfFile, localConf, nil)
	tassert.CheckFatal(t, err)

	configFile := filepath.Join(conf.ConfigDir, "ais.json")
	err = jsp.SaveMeta(configFile, &conf.ClusterConfig, nil)
	tassert.CheckFatal(t, err)

	var cmd = RestoreCmd{
		Args: []string{
			"-role=" + node.Type(),
			"-daemon_id=" + node.ID(),
			"-config=" + configFile,
			"-local_config=" + localConfFile,
		},
		Node: node,
		Cmd:  getAISNodeCmd(t),
	}

	pid, err := startNode(&cmd, false)
	tassert.CheckFatal(t, err)
	tassert.Fatalf(t, pid > 0, "invalid process ID %d", pid)
	return pid
}
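
// exampleDeployCleanup is a hypothetical usage sketch (not part of the original file):
// deploy a throwaway ais node with a per-test config (nil localConf lets DeployNode
// derive one from conf) and make sure its process is killed when the test ends.
func exampleDeployCleanup(t *testing.T, node *meta.Snode, conf *cmn.Config) {
	pid := DeployNode(t, node, conf, nil)
	t.Cleanup(func() { CleanupNode(t, pid) })
}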

// CleanupNode kills the process.
func CleanupNode(t *testing.T, pid int) {
	err := syscall.Kill(pid, syscall.SIGKILL)
	// ignore the error if the process is not found
	if errors.Is(err, syscall.ESRCH) {
		return
	}
	tassert.CheckError(t, err)
}

// getAISNodeCmd finds the command for deploying an AIS node
func getAISNodeCmd(t *testing.T) string {
	// get the command from the cached restore CMDs when available
	if len(restoreNodes) != 0 {
		for _, cmd := range restoreNodes {
			return cmd.Cmd
		}
	}

	// if there is no cached command, use a random proxy to get it
	proxyURL := RandomProxyURL()
	proxy, err := GetPrimaryProxy(proxyURL)
	tassert.CheckFatal(t, err)
	rcmd := GetRestoreCmd(proxy)
	return rcmd.Cmd
}

// getPID uses 'lsof' to find the pid of the ais process listening on a port
func getPID(port string) (int, error) {
	var (
		sport = ":" + port
		cmd   = exec.Command("lsof", "-sTCP:LISTEN", "-i", sport)
	)
	output, err := cmd.CombinedOutput()
	if err != nil {
		return 0, fmt.Errorf("error executing LSOF command: %v", err)
	}

	// skip lines before the first appearance of "COMMAND"
	lines := strings.Split(string(output), "\n")
	i := 0
	for ; ; i++ {
		if strings.HasPrefix(lines[i], "COMMAND") {
			break
		}
	}

	// the second column is the pid
	pid := strings.Fields(lines[i+1])[1]
	return strconv.Atoi(pid)
}

// getProcess finds the ais process listening on the given port via 'lsof', then uses
// 'ps' to recover the process's original command line so that it can later be
// restarted (restored).
func getProcess(port string) (pid int, cmd string, args []string, err error) {
	pid, err = getPID(port)
	if err != nil {
		return 0, "", nil, fmt.Errorf("error getting pid on port: %v", err)
	}
	var (
		spid = strconv.Itoa(pid)
		ps   = exec.Command("ps", "-p", spid, "-o", "command")
	)
	output, err := ps.CombinedOutput()
	if err != nil {
		return 0, "", nil, fmt.Errorf("error executing PS command: %v", err)
	}

	line := strings.Split(string(output), "\n")[1]
	fields := strings.Fields(line)
	if len(fields) == 0 {
		return 0, "", nil, errors.New("no returned fields")
	}

	return pid, fields[0], fields[1:], nil
}

func WaitForPID(pid int) error {
	const retryInterval = time.Second

	process, err := os.FindProcess(pid)
	if err != nil {
		return nil // the process has already terminated (TODO: confirm)
	}

	var (
		cancel context.CancelFunc
		ctx    = context.Background()
		done   = make(chan error)
	)
	tlog.Logf("Waiting for PID=%d to terminate\n", pid)

	deadline := time.Minute / 2
	ctx, cancel = context.WithTimeout(ctx, deadline)
	defer cancel()

	go func() {
		_, erw := process.Wait() // NOTE: w/ no timeout
		done <- erw
	}()
	time.Sleep(10 * time.Millisecond)
	for {
		select {
		case <-done:
			return nil
		case <-ctx.Done():
			return ctx.Err()
		default:
		}
		time.Sleep(retryInterval)
	}
}

func GetRestoreCmd(si *meta.Snode) RestoreCmd {
	var (
		err error
		cmd = RestoreCmd{Node: si}
	)
	if docker.IsRunning() {
		return cmd
	}
	cmd.PID, cmd.Cmd, cmd.Args, err = getProcess(si.PubNet.Port)
	cos.AssertNoErr(err)
	return cmd
}
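
// exampleCaptureRestoreCmd is a hypothetical usage sketch (not part of the original file):
// capture a node's restore command while its process is still running, shut the node down
// through the API, and use WaitForPID to block until the old process is gone before
// restarting it from the captured command.
func exampleCaptureRestoreCmd(t *testing.T, bp api.BaseParams, node *meta.Snode) {
	cmd := GetRestoreCmd(node)
	pid, _, _, err := ShutdownNode(t, bp, node)
	tassert.CheckFatal(t, err)
	tassert.CheckFatal(t, WaitForPID(pid))
	tassert.CheckFatal(t, RestoreNode(cmd, false /*asPrimary*/, node.Type()))
}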

// EnsureOrigClusterState verifies that the cluster has the same nodes after the tests.
// If a node was killed, it restores the node.
func EnsureOrigClusterState(t *testing.T) {
	if len(restoreNodes) == 0 {
		return
	}
	var (
		proxyURL       = RandomProxyURL()
		smap           = GetClusterMap(t, proxyURL)
		baseParam      = BaseAPIParams(proxyURL)
		afterProxyCnt  = smap.CountActivePs()
		afterTargetCnt = smap.CountActiveTs()
		tgtCnt         int
		proxyCnt       int
		updated        bool
		retried        bool
	)
retry:
	for _, cmd := range restoreNodes {
		node := smap.GetNode(cmd.Node.ID())
		if node == nil && !retried {
			tlog.Logf("Warning: %s %s not found in %s - retrying...\n", cmd.Node.Type(), cmd.Node.ID(), smap.StringEx())
			time.Sleep(2 * controlPlaneSleep)
			smap = GetClusterMap(t, proxyURL)
			// alternatively, could simply compare versions
			retried = true
			goto retry
		}
	}
	// restore
	for _, cmd := range restoreNodes {
		if cmd.Node.IsProxy() {
			proxyCnt++
		} else {
			tgtCnt++
		}
		node := smap.GetNode(cmd.Node.ID())
		if node != nil {
			tassert.Errorf(t, node.Eq(cmd.Node),
				"%s %s changed, before %+v, after %+v", cmd.Node.Type(), node.ID(), cmd.Node, node)
		} else {
			tassert.Errorf(t, false, "%s %s not found in %s", cmd.Node.Type(), cmd.Node.ID(), smap.StringEx())
		}
		if docker.IsRunning() {
			if node == nil {
				err := RestoreNode(cmd, false, cmd.Node.Type())
				tassert.CheckError(t, err)
				updated = true
			}
			continue
		}

		_, err := getPID(cmd.Node.PubNet.Port)
		if err != nil {
			tassert.CheckError(t, err)
			if err = RestoreNode(cmd, false, cmd.Node.Type()); err == nil {
				_, err := WaitNodeAdded(baseParam, cmd.Node.ID())
				tassert.CheckError(t, err)
			}
			tassert.CheckError(t, err)
			updated = true
		}
	}

	tassert.Errorf(t, afterProxyCnt == proxyCnt, "Some proxies crashed(?): expected %d, have %d", proxyCnt, afterProxyCnt)
	tassert.Errorf(t, tgtCnt == afterTargetCnt, "Some targets crashed(?): expected %d, have %d", tgtCnt, afterTargetCnt)

	if !updated {
		return
	}

	_, err := WaitForClusterState(proxyURL, "cluster to stabilize", smap.Version, proxyCnt, tgtCnt)
	tassert.CheckFatal(t, err)

	if tgtCnt != afterTargetCnt {
		WaitForRebalAndResil(t, BaseAPIParams(proxyURL))
	}
}

func WaitNodeAdded(bp api.BaseParams, nodeID string) (*meta.Smap, error) {
	var i int
retry:
	smap, err := api.GetClusterMap(bp)
	if err != nil {
		return nil, err
	}
	node := smap.GetNode(nodeID)
	if node != nil {
		return smap, WaitNodeReady(node.URL(cmn.NetPublic))
	}
	time.Sleep(controlPlaneSleep)
	i++
	if i > maxNodeRetry {
		return nil, fmt.Errorf("max retry (%d) exceeded - node not in smap", maxNodeRetry)
	}

	goto retry
}

func WaitNodeReady(url string, opts ...*WaitRetryOpts) (err error) {
	var (
		bp            = BaseAPIParams(url)
		retries       = maxNodeRetry
		retryInterval = controlPlaneSleep
		i             int
	)
	if len(opts) > 0 && opts[0] != nil {
		retries = opts[0].MaxRetries
		retryInterval = opts[0].Interval
	}
while503:
	err = api.Health(bp)
	if err == nil {
		return nil
	}
	if !cmn.IsStatusServiceUnavailable(err) && !cos.IsRetriableConnErr(err) {
		return
	}
	time.Sleep(retryInterval)
	i++
	if i > retries {
		return fmt.Errorf("node start failed - max retries (%d) exceeded", retries)
	}
	goto while503
}
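
// exampleWaitNodeReady is a hypothetical usage sketch (not part of the original file):
// poll a restarted node's health endpoint with a custom retry budget instead of the
// package defaults (maxNodeRetry retries every controlPlaneSleep).
func exampleWaitNodeReady(t *testing.T, node *meta.Snode) {
	opts := &WaitRetryOpts{MaxRetries: 20, Interval: 2 * time.Second}
	err := WaitNodeReady(node.URL(cmn.NetPublic), opts)
	tassert.CheckFatal(t, err)
}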

func _joinCluster(proxyURL string, node *meta.Snode, timeout time.Duration) (rebID string, err error) {
	bp := api.BaseParams{Client: gctx.Client, URL: proxyURL, Token: LoggedUserToken}
	smap, err := api.GetClusterMap(bp)
	if err != nil {
		return "", err
	}
	if rebID, _, err = api.JoinCluster(bp, node); err != nil {
		return
	}

	// If the node is already in the cluster, we should not wait for the map version
	// to sync because no update will be scheduled.
	if node := smap.GetNode(node.ID()); node == nil {
		err = waitSmapSync(bp, time.Now().Add(timeout), smap, smap.Version, cos.NewStrSet())
		return
	}
	return
}

func _nextNode(smap *meta.Smap, idsToIgnore cos.StrSet) (sid string, isproxy, exists bool) {
	for _, d := range smap.Pmap {
		if !idsToIgnore.Contains(d.ID()) {
			sid = d.ID()
			isproxy, exists = true, true
			return
		}
	}
	for _, d := range smap.Tmap {
		if !idsToIgnore.Contains(d.ID()) {
			sid = d.ID()
			isproxy, exists = false, true
			return
		}
	}
	return
}

func waitSmapSync(bp api.BaseParams, timeout time.Time, smap *meta.Smap, ver int64, ignore cos.StrSet) error {
	var (
		prevSid string
		orig    = ignore.Clone()
	)
	for {
		sid, isproxy, exists := _nextNode(smap, ignore)
		if !exists {
			break
		}
		if sid == prevSid {
			time.Sleep(time.Second)
		}
		sname := meta.Tname(sid)
		if isproxy {
			sname = meta.Pname(sid)
		}
		newSmap, err := api.GetNodeClusterMap(bp, sid)
		if err != nil && !cos.IsRetriableConnErr(err) &&
			!cmn.IsStatusServiceUnavailable(err) && !cmn.IsStatusBadGateway(err) /* retry as well */ {
			return err
		}
		if err == nil && newSmap.Version > ver {
			ignore.Add(sid)
			if newSmap.Version > smap.Version {
				gctx.Log("Updating %s to %s from %s\n", smap, newSmap.StringEx(), sname)
				cos.CopyStruct(smap, newSmap)
			}
			if newSmap.Version > ver+1 {
				// reset
				if ver <= 0 {
					gctx.Log("Received %s from %s\n", newSmap.StringEx(), sname)
				} else {
					gctx.Log("Received newer %s from %s, updated wait-for condition (%d => %d)\n",
						newSmap.StringEx(), sname, ver, newSmap.Version)
				}
				ver = newSmap.Version - 1
				ignore = orig.Clone()
				ignore.Add(sid)
			}
			continue
		}
		if time.Now().After(timeout) {
			return fmt.Errorf("timed out waiting for %s to sync Smap > v%d", sname, ver)
		}
		if newSmap != nil {
			if snode := newSmap.GetNode(sid); snode != nil {
				gctx.Log("Waiting for %s(%s) to sync Smap > v%d\n", snode.StringEx(), newSmap, ver)
			} else {
				gctx.Log("Waiting for %s(%s) to sync Smap > v%d\n", sname, newSmap, ver)
				gctx.Log("(Warning: %s hasn't joined yet - not present)\n", sname)
			}
		}
		prevSid = sid
	}
	if currSmap == nil || currSmap.Version < smap.Version {
		currSmap = smap
	}
	return nil
}

// _removeNodeFromSmap removes a node from the cluster map (unsafe).
func _removeNodeFromSmap(proxyURL, sid string, timeout time.Duration) error {
	bp := api.BaseParams{Client: gctx.Client, URL: proxyURL, Token: LoggedUserToken}
	smap, err := api.GetClusterMap(bp)
	if err != nil {
		return fmt.Errorf("api.GetClusterMap failed, err: %v", err)
	}
	node := smap.GetNode(sid)
	if node != nil && smap.IsPrimary(node) {
		return errors.New("unregistering primary proxy is not allowed")
	}
	tlog.Logf("Remove %s from %s\n", node.StringEx(), smap)

	err = api.RemoveNodeUnsafe(bp, sid)
	if err != nil {
		return err
	}

	// If the node does not exist in the cluster, we should not wait for the map version
	// to sync because no update will be scheduled.
	if node != nil {
		return waitSmapSync(bp, time.Now().Add(timeout), smap, smap.Version, cos.NewStrSet(node.ID()))
	}
	return nil
}
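
// exampleRemoveAndRejoin is a hypothetical usage sketch (not part of the original file):
// temporarily drop a target from the Smap via _removeNodeFromSmap and then join it back,
// waiting until the node shows up in the cluster map again. The 30-second timeout is an
// arbitrary illustration value.
func exampleRemoveAndRejoin(t *testing.T, proxyURL string, target *meta.Snode) {
	err := _removeNodeFromSmap(proxyURL, target.ID(), 30*time.Second)
	tassert.CheckFatal(t, err)

	_, err = JoinCluster(proxyURL, target)
	tassert.CheckFatal(t, err)

	_, err = WaitNodeAdded(BaseAPIParams(proxyURL), target.ID())
	tassert.CheckFatal(t, err)
}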