// Package integration_test.
/*
 * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
 */
package integration_test

import (
	"context"
	"errors"
	"fmt"
	"math"
	"net/http"
	"net/url"
	"path/filepath"
	"reflect"
	"sync"
	"testing"
	"time"

	"github.com/NVIDIA/aistore/ais"
	"github.com/NVIDIA/aistore/api"
	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/fname"
	"github.com/NVIDIA/aistore/cmn/jsp"
	"github.com/NVIDIA/aistore/core/meta"
	"github.com/NVIDIA/aistore/reb"
	"github.com/NVIDIA/aistore/tools"
	"github.com/NVIDIA/aistore/tools/docker"
	"github.com/NVIDIA/aistore/tools/readers"
	"github.com/NVIDIA/aistore/tools/tassert"
	"github.com/NVIDIA/aistore/tools/tlog"
	"github.com/NVIDIA/aistore/tools/trand"
	"github.com/NVIDIA/aistore/xact"
	"github.com/OneOfOne/xxhash"
	jsoniter "github.com/json-iterator/go"
)

const (
	localBucketDir  = "multipleproxy"
	defaultChanSize = 10
)

var (
	voteTests = []Test{
		{"PrimaryCrash", primaryCrashElectRestart},
		{"NodeCrashRestoreDifferentIP", nodeCrashRestoreDifferentIP},
		{"ProxyCrash", proxyCrash},
		{"PrimaryAndTargetCrash", primaryAndTargetCrash},
		{"PrimaryAndProxyCrash", primaryAndProxyCrash},
		{"CrashAndFastRestore", crashAndFastRestore},
		{"TargetRejoin", targetRejoin},
		{"JoinWhileVoteInProgress", joinWhileVoteInProgress},
		{"MinorityTargetMapVersionMismatch", minorityTargetMapVersionMismatch},
		{"MajorityTargetMapVersionMismatch", majorityTargetMapVersionMismatch},
		{"ConcurrentPutGetDel", concurrentPutGetDel},
		{"ProxyStress", proxyStress},
		{"NetworkFailure", networkFailure},
		{"PrimaryAndNextCrash", primaryAndNextCrash},
		{"DiscoveryAndOriginalPrimaryCrash", discoveryAndOrigPrimaryProxiesCrash},
		{"AddNodeDuplicateIP", addNodeDuplicateIP},
		{"AddNodeDuplicateDaemonID", addNodeDuplicateDaemonID},
	}

	icTests = []Test{
		{"ICMemberLeaveAndRejoin", icMemberLeaveAndRejoin},
		{"ICKillAndRestorePrimary", icKillAndRestorePrimary},
		{"ICSyncOwnTbl", icSyncOwnershipTable},
		{"ICSinglePrimaryRevamp", icSinglePrimaryRevamp},
		{"ICStressMonitorXactMultiICFail", icStressMonitorXactMultiICFail},
	}
)

func TestMultiProxy(t *testing.T) {
	tools.CheckSkip(t, &tools.SkipTestArgs{
		Long:               true,
		RequiredDeployment: tools.ClusterTypeLocal,
		MinProxies:         3,
		MinTargets:         1,
	})

	defer tools.EnsureOrigClusterState(t)
	for _, test := range voteTests {
		t.Run(test.name, test.method)
		if t.Failed() {
			t.FailNow()
		}
	}
}

// primaryCrashElectRestart kills the current primary proxy, waits for a new primary to be
// elected and verifies it, then restores the original primary as a non-primary proxy.
func primaryCrashElectRestart(t *testing.T) {
	proxyURL := tools.RandomProxyURL(t)
	killRestorePrimary(t, proxyURL, false, nil)
}

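// killRestorePrimary kills the current primary, waits for a new primary to be elected,
// optionally runs the postKill callback, restarts the old primary as a non-primary proxy,
// and - if restoreAsPrimary is true - promotes it back to primary. Returns the resulting Smap.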
func killRestorePrimary(t *testing.T, proxyURL string, restoreAsPrimary bool,
	postKill func(smap *meta.Smap, newPrimary, oldPrimary *meta.Snode)) *meta.Smap {
	var (
		smap          = tools.GetClusterMap(t, proxyURL)
		proxyCount    = smap.CountActivePs()
		oldPrimary    = smap.Primary
		oldPrimaryURL = smap.Primary.URL(cmn.NetPublic)
		oldPrimaryID  = smap.Primary.ID()
	)

	tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), proxyCount)
	newPrimaryID, newPrimaryURL, err := chooseNextProxy(smap)
	tassert.CheckFatal(t, err)
	newPrimary := smap.GetProxy(newPrimaryID)

	tlog.Logf("New primary: %s --> %s\n", newPrimaryID, newPrimaryURL)
	tlog.Logf("Killing primary: %s --> %s\n", oldPrimaryURL, oldPrimaryID)

	// cmd and args contain the original command line that was used to start the proxy
	cmd, err := tools.KillNode(smap.Primary)
	tassert.CheckFatal(t, err)

	smap, err = tools.WaitForClusterState(newPrimaryURL, "new primary elected", smap.Version,
		smap.CountActivePs()-1, smap.CountActiveTs())
	tassert.CheckFatal(t, err)
	tlog.Logf("New primary elected: %s\n", newPrimaryID)

	tassert.Errorf(t, smap.Primary.ID() == newPrimaryID, "Wrong primary proxy: %s, expecting: %s", smap.Primary.ID(), newPrimaryID)

	if postKill != nil {
		postKill(smap, newPrimary, oldPrimary)
	}

	// re-construct the command line to start the original proxy, adding the current primary proxy to the args
	err = tools.RestoreNode(cmd, false, "proxy (prev primary)")
	tassert.CheckFatal(t, err)

	smap, err = tools.WaitForClusterState(newPrimaryURL, "restore", smap.Version, proxyCount, 0)
	tassert.CheckFatal(t, err)
	if _, ok := smap.Pmap[oldPrimaryID]; !ok {
		t.Fatalf("Previous primary proxy did not rejoin the cluster")
	}
	checkSmaps(t, newPrimaryURL)

	if restoreAsPrimary {
		return setPrimaryTo(t, oldPrimaryURL, smap, "", oldPrimaryID)
	}
	return smap
}

func nodeCrashRestoreDifferentIP(t *testing.T) {
	for _, ty := range []string{apc.Proxy, apc.Target} {
		t.Run(ty, func(t *testing.T) {
			killRestoreDiffIP(t, ty)
		})
	}
}

func killRestoreDiffIP(t *testing.T, nodeType string) {
	// NOTE: This function requires local deployment as it changes node config
	tools.CheckSkip(t, &tools.SkipTestArgs{RequiredDeployment: tools.ClusterTypeLocal})

	var (
		proxyURL                      = tools.GetPrimaryURL()
		smap                          = tools.GetClusterMap(t, proxyURL)
		origProxyCnt, origTargetCount = smap.CountActivePs(), smap.CountActiveTs()
		portInc                       = 100
		node                          *meta.Snode
		err                           error
		pdc, tdc                      int
		restore                       bool
	)

	if nodeType == apc.Proxy {
		node, err = smap.GetRandProxy(true /*exclude primary*/)
		pdc = 1
	} else {
		node, err = smap.GetRandTarget()
		tdc = 1
	}
	tassert.CheckFatal(t, err)

killRestore:
	cfg := tools.GetDaemonConfig(t, node)
	tlog.Logf("Killing %s\n", node.StringEx())
	cmd, err := tools.KillNode(node)
	tassert.CheckFatal(t, err)

	smap, err = tools.WaitForClusterState(proxyURL, "cluster to stabilize", smap.Version, origProxyCnt-pdc, origTargetCount-tdc)
	tassert.CheckFatal(t, err)

	// Update local config ports.
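	// Bumping all three ports makes the restarted node come back with a different network
	// identity, which emulates crashing and restoring the node under a new IP:port.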
	localConfPath := filepath.Join(cfg.ConfigDir, fname.PlainLocalConfig)
	localConf := &cmn.LocalConfig{}
	_, err = jsp.LoadMeta(localConfPath, localConf)
	tassert.CheckFatal(t, err)
	localConf.HostNet.Port = cfg.HostNet.Port + portInc
	localConf.HostNet.PortIntraControl = cfg.HostNet.PortIntraControl + portInc
	localConf.HostNet.PortIntraData = cfg.HostNet.PortIntraData + portInc
	err = jsp.SaveMeta(localConfPath, localConf, nil)
	tassert.CheckFatal(t, err)

	err = tools.RestoreNode(cmd, false, nodeType)
	tassert.CheckFatal(t, err)

	smap, err = tools.WaitForClusterState(proxyURL, "restore with changed config", smap.Version, origProxyCnt, 0)
	tassert.CheckFatal(t, err)

	// Health check with old public URL should fail
	err = api.Health(tools.BaseAPIParams(node.URL(cmn.NetPublic)))
	tassert.Errorf(t, err != nil, "health check with old IP information should fail %v", err)

	newNode := smap.GetNode(node.ID())
	err = tools.WaitNodeReady(newNode.URL(cmn.NetPublic))
	tassert.CheckError(t, err)
	if !restore {
		// Revert port changes
		restore = true
		node = newNode
		portInc = -portInc
		goto killRestore
	}

	if nodeType == apc.Target {
		tools.WaitForRebalAndResil(t, tools.BaseAPIParams(proxyURL))
	}
}

// primaryAndTargetCrash kills the primary proxy and one random target, verifies that the
// next-in-line proxy becomes the new primary, then restores both the target and the
// (previous) primary proxy.
func primaryAndTargetCrash(t *testing.T) {
	if docker.IsRunning() {
		t.Skip("Skipped because setting new primary URL in command line for docker is not supported")
	}

	proxyURL := tools.RandomProxyURL(t)
	smap := tools.GetClusterMap(t, proxyURL)
	tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), smap.CountActivePs())

	newPrimaryID, newPrimaryURL, err := chooseNextProxy(smap)
	tassert.CheckFatal(t, err)

	oldPrimaryURL := smap.Primary.URL(cmn.NetPublic)
	tlog.Logf("Killing proxy %s - %s\n", oldPrimaryURL, smap.Primary.ID())
	cmd, err := tools.KillNode(smap.Primary)
	tassert.CheckFatal(t, err)

	// Select a random target
	var (
		targetURL       string
		targetID        string
		targetNode      *meta.Snode
		origTargetCount = smap.CountActiveTs()
		origProxyCount  = smap.CountActivePs()
	)

	targetNode, _ = smap.GetRandTarget()
	targetURL = targetNode.URL(cmn.NetPublic)
	targetID = targetNode.ID()

	tlog.Logf("Killing target: %s - %s\n", targetURL, targetID)
	tcmd, err := tools.KillNode(targetNode)
	tassert.CheckFatal(t, err)

	smap, err = tools.WaitForClusterState(newPrimaryURL, "new primary elected",
		smap.Version, origProxyCount-1, origTargetCount-1)
	tassert.CheckFatal(t, err)

	if smap.Primary.ID() != newPrimaryID {
		t.Fatalf("Wrong primary proxy: %s, expecting: %s", smap.Primary.ID(), newPrimaryID)
	}

	err = tools.RestoreNode(tcmd, false, "target")
	tassert.CheckFatal(t, err)

	err = tools.RestoreNode(cmd, false, "proxy (prev primary)")
	tassert.CheckFatal(t, err)

	_, err = tools.WaitForClusterState(newPrimaryURL, "restore proxy and target",
		smap.Version, origProxyCount, origTargetCount)
	tassert.CheckFatal(t, err)
	tools.WaitForRebalAndResil(t, tools.BaseAPIParams(newPrimaryURL))
}

// A very simple test that checks whether the primary proxy detects that a non-primary proxy
// has died and then updates and syncs the Smap accordingly.
func proxyCrash(t *testing.T) {
	proxyURL := tools.RandomProxyURL(t)
	smap := tools.GetClusterMap(t, proxyURL)
	tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), smap.CountActivePs())

	primaryURL := smap.Primary.URL(cmn.NetPublic)
	tlog.Logf("Primary: %s\n", smap.Primary.StringEx())

	origProxyCount := smap.CountActivePs()
	secondNode, err := smap.GetRandProxy(true /*exclude primary*/)
	tassert.CheckFatal(t, err)

	tlog.Logf("Killing non-primary %s\n", secondNode.StringEx())
	secondCmd, err := tools.KillNode(secondNode)
	tassert.CheckFatal(t, err)

	smap, err = tools.WaitForClusterState(primaryURL, "proxy removed", smap.Version, origProxyCount-1, 0)
	tassert.CheckFatal(t, err)

	err = tools.RestoreNode(secondCmd, false, "proxy")
	tassert.CheckFatal(t, err)

	smap, err = tools.WaitForClusterState(primaryURL, "proxy restored", smap.Version, origProxyCount, 0)
	tassert.CheckFatal(t, err)

	if _, ok := smap.Pmap[secondNode.ID()]; !ok {
		t.Fatalf("Non-primary proxy did not rejoin the cluster.")
	}
}

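// addNodeDuplicateDaemonID runs the duplicate-DaemonID scenario for both node types (proxy and target).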
func addNodeDuplicateDaemonID(t *testing.T) {
	for _, ty := range []string{apc.Proxy, apc.Target} {
		t.Run(ty, func(t *testing.T) {
			_addNodeDuplicateDaemonID(t, ty)
		})
	}
}

// 1. Select a random proxy/target node based on the `nodeType` param
// 2. Try deploying a new node with the same DaemonID as the randomly chosen node
// 3. Wait for the newly deployed daemon to terminate after failing to join the cluster
// NOTE: Test assumes that the randomly chosen node is healthy (i.e., doesn't terminate or restart)
// TODO: add test for a target that tries to join with a duplicate DaemonID and contains user data
func _addNodeDuplicateDaemonID(t *testing.T, nodeType string) {
	// NOTE: This function requires local deployment as it changes node config
	tools.CheckSkip(t, &tools.SkipTestArgs{RequiredDeployment: tools.ClusterTypeLocal})

	var (
		proxyURL = tools.GetPrimaryURL()
		smap     = tools.GetClusterMap(t, proxyURL)
		node     *meta.Snode
		err      error

		// node configs
		portInc = 100
	)

	if nodeType == apc.Proxy {
		node, err = smap.GetRandProxy(true)
	} else {
		node, err = smap.GetRandTarget()
	}
	tassert.CheckFatal(t, err)
	conf := tools.GetDaemonConfig(t, node)

	// Create local config for daemon.
	localConf := &cmn.LocalConfig{}
	localConf.ConfigDir = conf.ConfigDir
	localConf.HostNet.Port = conf.HostNet.Port + portInc
	localConf.HostNet.PortIntraControl = conf.HostNet.PortIntraControl + portInc
	localConf.HostNet.PortIntraData = conf.HostNet.PortIntraData + portInc

	// start with different config but same daemon ID
	pid := tools.DeployNode(t, node, conf, localConf)
	t.Cleanup(func() {
		tools.CleanupNode(t, pid)
	})

	err = tools.WaitForPID(pid)
	tassert.CheckFatal(t, err)
}

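// addNodeDuplicateIP runs the duplicate-IP scenario for both node types (proxy and target).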
func addNodeDuplicateIP(t *testing.T) {
	for _, ty := range []string{apc.Proxy, apc.Target} {
		t.Run(ty, func(t *testing.T) {
			_addNodeDuplicateIP(t, ty)
		})
	}
}

// 1. Select a random proxy/target node based on the `nodeType` param
// 2. Try deploying a new node with the same IP configuration as the randomly chosen node
// 3. Wait for the newly deployed daemon to terminate after failing to join the cluster
// NOTE: Test assumes that the randomly chosen node is healthy (i.e., doesn't terminate or restart)
func _addNodeDuplicateIP(t *testing.T, nodeType string) {
	// NOTE: This function requires local deployment as it changes node config
	tools.CheckSkip(t, &tools.SkipTestArgs{RequiredDeployment: tools.ClusterTypeLocal})

	var (
		proxyURL = tools.GetPrimaryURL()
		smap     = tools.GetClusterMap(t, proxyURL)
		node     *meta.Snode
		err      error
	)

	if nodeType == apc.Proxy {
		node, err = smap.GetRandProxy(true)
	} else {
		node, err = smap.GetRandTarget()
	}
	tassert.CheckFatal(t, err)
	conf := tools.GetDaemonConfig(t, node)

	// Make sure that the `DaemonID` is different.
	node.DaeID = "testing_" + trand.String(10)

	pid := tools.DeployNode(t, node, conf, nil)
	t.Cleanup(func() {
		tools.CleanupNode(t, pid)
	})

	err = tools.WaitForPID(pid)
	tassert.CheckFatal(t, err)
}

// primaryAndProxyCrash kills the primary proxy and one other proxy (not the next-in-line primary),
// then restores both afterwards.
func primaryAndProxyCrash(t *testing.T) {
	var (
		proxyURL                    = tools.RandomProxyURL(t)
		smap                        = tools.GetClusterMap(t, proxyURL)
		origProxyCount              = smap.CountActivePs()
		oldPrimaryURL, oldPrimaryID = smap.Primary.URL(cmn.NetPublic), smap.Primary.ID()
		secondNode                  *meta.Snode
		secondID                    string
	)
	tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), smap.CountActivePs())

	newPrimaryID, newPrimaryURL, err := chooseNextProxy(smap)
	tassert.CheckFatal(t, err)

	tlog.Logf("Killing primary: %s - %s\n", oldPrimaryURL, oldPrimaryID)
	cmd, err := tools.KillNode(smap.Primary)
	tassert.CheckFatal(t, err)

	// Do not choose the next primary in line, or the current primary proxy.
	// This is because the system currently cannot recover if the next proxy in line is
	// also killed (TODO)
	for k, v := range smap.Pmap {
		if k != newPrimaryID && k != oldPrimaryID {
			secondNode = v
			secondID = secondNode.ID()
			break
		}
	}
	tassert.Errorf(t, secondID != "", "not enough proxies (%d)", origProxyCount)
	n := cos.NowRand().Intn(20)
	time.Sleep(time.Duration(n+1) * time.Second)

	tlog.Logf("Killing non-primary: %s\n", secondNode.StringEx())
	secondCmd, err := tools.KillNode(secondNode)
	tassert.CheckFatal(t, err)

	smap, err = tools.WaitForClusterState(newPrimaryURL, "elect new primary",
		smap.Version, origProxyCount-2, 0)
	tassert.CheckFatal(t, err)

	err = tools.RestoreNode(cmd, true, "previous primary "+oldPrimaryID)
	tassert.CheckFatal(t, err)

	smap, err = tools.WaitForClusterState(newPrimaryURL, "join back previous primary "+oldPrimaryID,
		smap.Version, origProxyCount-1, 0)
	tassert.CheckFatal(t, err)

	err = tools.RestoreNode(secondCmd, false, "proxy")
	tassert.CheckFatal(t, err)

	smap, err = tools.WaitForClusterState(newPrimaryURL, "join back non-primary "+secondID,
		smap.Version, origProxyCount, 0)
	tassert.CheckFatal(t, err)

	if smap.Primary.ID() != newPrimaryID {
		t.Fatalf("Wrong primary proxy: %s, expecting: %s", smap.Primary.ID(), newPrimaryID)
	}

	if _, ok := smap.Pmap[oldPrimaryID]; !ok {
		t.Fatalf("Previous primary proxy %s did not rejoin the cluster", oldPrimaryID)
	}

	if _, ok := smap.Pmap[secondID]; !ok {
		t.Fatalf("Second proxy %s did not rejoin the cluster", secondID)
	}
}

// targetRejoin kills a randomly selected target, restores it, waits for it to rejoin, and verifies the result.
func targetRejoin(t *testing.T) {
	var (
		id       string
		proxyURL = tools.RandomProxyURL(t)
	)

	smap := tools.GetClusterMap(t, proxyURL)
	tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), smap.CountActivePs())

	node, err := smap.GetRandTarget()
	if err != nil {
		tlog.Logf("Warning: %v\n", err)
		tlog.Logln("Retrying...")
		// retry once
		time.Sleep(10 * time.Second)
		smap = tools.GetClusterMap(t, proxyURL)
		node, err = smap.GetRandTarget()
		tassert.CheckFatal(t, err)
	}
	id = node.ID()

	cmd, err := tools.KillNode(node)
	tassert.CheckFatal(t, err)
	smap, err = tools.WaitForClusterState(proxyURL, "target crashed", smap.Version, smap.CountActivePs(), smap.CountActiveTs()-1)
	tassert.CheckFatal(t, err)

	if _, ok := smap.Tmap[id]; ok {
		t.Fatalf("Killed target was not removed from the Smap: %v", id)
	}

	err = tools.RestoreNode(cmd, false, "target")
	tassert.CheckFatal(t, err)

	smap, err = tools.WaitForClusterState(proxyURL, "target rejoined",
		smap.Version, smap.CountActivePs(), smap.CountActiveTs()+1)
	tassert.CheckFatal(t, err)

	if _, ok := smap.Tmap[id]; !ok {
		t.Fatalf("Restarted target %s did not rejoin the cluster", id)
	}
	tools.WaitForRebalAndResil(t, tools.BaseAPIParams(proxyURL))
}

// crashAndFastRestore kills the primary and restores it before a new leader is elected
func crashAndFastRestore(t *testing.T) {
	var err error
	proxyURL := tools.RandomProxyURL(t)
	smap := tools.GetClusterMap(t, proxyURL)
	tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), smap.CountActivePs())

	// Make sure proxyURL is not the primary URL.
	_, proxyURL, err = chooseNextProxy(smap)
	tassert.CheckFatal(t, err)
	oldPrimaryID := smap.Primary.ID()
	tlog.Logf("The current primary %s, Smap version %d\n", oldPrimaryID, smap.Version)

	cmd, err := tools.KillNode(smap.Primary)
	tassert.CheckFatal(t, err)

	// quick crash and recover
	time.Sleep(2 * time.Second)
	err = tools.RestoreNode(cmd, true, "proxy (primary)")
	tassert.CheckFatal(t, err)

	tlog.Logf("The %s is currently restarting\n", oldPrimaryID)

	// NOTE: using (version - 1) because the primary will restart with its old version and
	// there will be no version change for this restore; force the starting version to be 1 less
	// than the original version in order to use WaitForClusterState.
	smap, err = tools.WaitForClusterState(proxyURL, "restore", smap.Version-1, 0, 0)
	tassert.CheckFatal(t, err)

	if smap.Primary.ID() != oldPrimaryID {
		t.Fatalf("Wrong primary proxy: %s, expecting: %s", smap.Primary.ID(), oldPrimaryID)
	}
}

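// joinWhileVoteInProgress starts a mock target that reports a vote in progress, kills and
// restores the primary while that mock target is joined, and then verifies that the cluster
// returns to its original proxy/target counts once the mock target is removed.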
func joinWhileVoteInProgress(t *testing.T) {
	if docker.IsRunning() {
		t.Skipf("skipping %s (docker is not supported)", t.Name())
	}
	var (
		smap         = tools.GetClusterMap(t, proxyURL)
		oldTargetCnt = smap.CountActiveTs()
		oldProxyCnt  = smap.CountActivePs()
		stopch       = make(chan struct{})
		errCh        = make(chan error, 10)
		mocktgt      = &voteRetryMockTarget{
			voteInProgress: true,
			errCh:          errCh,
		}
	)
	tlog.Logf("targets: %d, proxies: %d\n", oldTargetCnt, oldProxyCnt)

	proxy, err := smap.GetRandProxy(true /*exclude primary*/)
	tassert.CheckFatal(t, err)
	proxyURL := proxy.URL(cmn.NetPublic)
	wg := &sync.WaitGroup{}

	wg.Add(1)
	go runMockTarget(t, proxyURL, mocktgt, stopch, smap, wg)

	_, err = tools.WaitForClusterState(proxyURL, "mock target joined", smap.Version, oldProxyCnt, oldTargetCnt+1)
	tassert.CheckFatal(t, err)

	smap = killRestorePrimary(t, proxyURL, false, nil)
	//
	// FIXME: election is in progress if and only when xaction(apc.ActElection) is running -
	// simulating the scenario via mocktgt.voteInProgress = true is incorrect
	//
	// if _, ok := smap.Pmap[oldPrimaryID]; ok {
	//	t.Fatalf("Previous primary proxy rejoined the cluster during a vote")
	// }
	mocktgt.voteInProgress = false
	// smap, err = tools.WaitForClusterState(newPrimaryURL, "synchronize new Smap",
	//	smap.Version, testing.Verbose(), oldProxyCnt, oldTargetCnt+1)
	// tassert.CheckFatal(t, err)
	//
	// end of FIXME

	// time to kill the mock target, job well done
	var v struct{}
	stopch <- v
	close(stopch)
	select {
	case err := <-errCh:
		t.Errorf("Mock Target Error: %v", err)
	default:
	}

	wg.Wait()

	_, err = tools.WaitForClusterState(smap.Primary.URL(cmn.NetPublic),
		"cluster to stabilize", smap.Version, oldProxyCnt, oldTargetCnt)
	tassert.CheckFatal(t, err)
}

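// The next two tests run targetMapVersionMismatch with the number of out-of-sync targets set to
// roughly a quarter and roughly half of the cluster's node count, respectively.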
func minorityTargetMapVersionMismatch(t *testing.T) {
	proxyURL := tools.RandomProxyURL(t)
	targetMapVersionMismatch(
		func(i int) int {
			return i/4 + 1
		}, t, proxyURL)
}

func majorityTargetMapVersionMismatch(t *testing.T) {
	proxyURL := tools.RandomProxyURL(t)
	targetMapVersionMismatch(
		func(i int) int {
			return i/2 + 1
		}, t, proxyURL)
}

// targetMapVersionMismatch updates the map version of a few targets, kills the primary proxy,
// and waits for the new leader to come online.
func targetMapVersionMismatch(getNum func(int) int, t *testing.T, proxyURL string) {
	smap := tools.GetClusterMap(t, proxyURL)
	tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), smap.CountActivePs())

	smap.Version++
	jsonMap, err := jsoniter.Marshal(smap)
	tassert.CheckFatal(t, err)

	n := getNum(smap.CountActiveTs() + smap.CountActivePs() - 1)
	for _, v := range smap.Tmap {
		if n == 0 {
			break
		}
		baseParams := tools.BaseAPIParams(v.URL(cmn.NetPublic))
		baseParams.Method = http.MethodPut
		reqParams := &api.ReqParams{
			BaseParams: baseParams,
			Path:       apc.URLPathDae.Join(apc.SyncSmap),
			Body:       jsonMap,
			Header:     http.Header{cos.HdrContentType: []string{cos.ContentJSON}},
		}
		err = reqParams.DoRequest()
		tassert.CheckFatal(t, err)
		n--
	}
	killRestorePrimary(t, proxyURL, false, nil)
}

// concurrentPutGetDel runs a put/get/del sequence against all proxies concurrently
func concurrentPutGetDel(t *testing.T) {
	_ = tools.RandomProxyURL(t)
	runProviderTests(t, func(t *testing.T, bck *meta.Bck) {
		proxyURL := tools.RandomProxyURL(t)
		smap := tools.GetClusterMap(t, proxyURL)
		tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), smap.CountActivePs())

		var (
			wg        = &sync.WaitGroup{}
			errCh     = make(chan error, smap.CountActivePs())
			cksumType = bck.Props.Cksum.Type
		)

		// cid = a goroutine ID to make filenames unique
		// otherwise it is easy to run into trouble when 2 goroutines do:
		//   1PUT 2PUT 1DEL 2DEL
		// and the second goroutine fails with the error "object does not exist"
		for _, v := range smap.Pmap {
			wg.Add(1)
			go func(url string) {
				defer wg.Done()
				errCh <- proxyPutGetDelete(100, url, bck.Clone(), cksumType)
			}(v.URL(cmn.NetPublic))
		}

		wg.Wait()
		close(errCh)

		for err := range errCh {
			tassert.CheckFatal(t, err)
		}
	})
}

// proxyPutGetDelete repeats put/get/del N times, with all requests going to the same proxy
func proxyPutGetDelete(count int, proxyURL string, bck cmn.Bck, cksumType string) error {
	baseParams := tools.BaseAPIParams(proxyURL)
	for range count {
		reader, err := readers.NewRand(fileSize, cksumType)
		if err != nil {
			return fmt.Errorf("error creating reader: %v", err)
		}
		fname := trand.String(20)
		keyname := fmt.Sprintf("%s/%s", localBucketDir, fname)
		putArgs := api.PutArgs{
			BaseParams: baseParams,
			Bck:        bck,
			ObjName:    keyname,
			Cksum:      reader.Cksum(),
			Reader:     reader,
		}
		if _, err = api.PutObject(&putArgs); err != nil {
			return fmt.Errorf("error executing put: %v", err)
		}
		if _, err = api.GetObject(baseParams, bck, keyname, nil); err != nil {
			return fmt.Errorf("error executing get: %v", err)
		}
		if err = tools.Del(proxyURL, bck, keyname, nil /* wg */, nil /* errCh */, true /* silent */); err != nil {
			return fmt.Errorf("error executing del: %v", err)
		}
	}

	return nil
}

// putGetDelWorker does put/get/del in sequence; if a primary proxy change happens, it drains the
// failed-delete channel and routes those deletes to the new primary proxy.
// It stops when told to do so via the stop channel.
func putGetDelWorker(proxyURL string, stopCh <-chan struct{}, proxyURLCh <-chan string, errCh chan error,
	wg *sync.WaitGroup) {
	defer wg.Done()

	missedDeleteCh := make(chan string, 100)
	baseParams := tools.BaseAPIParams(proxyURL)

	bck := cmn.Bck{
		Name:     testBucketName,
		Provider: apc.AIS,
	}
	cksumType := bck.DefaultProps(initialClusterConfig).Cksum.Type
loop:
	for {
		select {
		case <-stopCh:
			close(errCh)
			break loop

		case url := <-proxyURLCh:
			// send failed deletes to the new primary proxy
		deleteLoop:
			for {
				select {
				case objName := <-missedDeleteCh:
					err := tools.Del(url, bck, objName, nil, errCh, true)
					if err != nil {
						missedDeleteCh <- objName
					}

				default:
					break deleteLoop
				}
			}

		default:
		}

		reader, err := readers.NewRand(fileSize, cksumType)
		if err != nil {
			errCh <- err
			continue
		}

		fname := trand.String(20)
		objName := fmt.Sprintf("%s/%s", localBucketDir, fname)
		putArgs := api.PutArgs{
			BaseParams: baseParams,
			Bck:        bck,
			ObjName:    objName,
			Cksum:      reader.Cksum(),
			Reader:     reader,
		}
		_, err = api.PutObject(&putArgs)
		if err != nil {
			errCh <- err
			continue
		}
		_, err = api.GetObject(baseParams, bck, objName, nil)
		if err != nil {
			errCh <- err
		}

		err = tools.Del(proxyURL, bck, objName, nil, errCh, true)
		if err != nil {
			missedDeleteCh <- objName
		}
	}

	// process leftover objects that were not deleted
	close(missedDeleteCh)
	for n := range missedDeleteCh {
		tools.Del(proxyURL, bck, n, nil, nil, true)
	}
}

// primaryKiller kills the primary proxy, notifies all workers, and restores it.
func primaryKiller(t *testing.T, proxyURL string, stopch <-chan struct{}, proxyurlchs []chan string,
	errCh chan error, wg *sync.WaitGroup) {
	defer wg.Done()

loop:
	for {
		select {
		case <-stopch:
			close(errCh)
			for _, ch := range proxyurlchs {
				close(ch)
			}

			break loop

		default:
		}

		postKill := func(_ *meta.Smap, newPrimary, _ *meta.Snode) {
			// let the workers go to the dying primary for a little while longer to generate errored requests
			time.Sleep(time.Second)
			for _, ch := range proxyurlchs {
				ch <- newPrimary.URL(cmn.NetPublic)
			}
		}
		killRestorePrimary(t, proxyURL, false, postKill)
	}
}

// Tests whether a node is able to restart when the discovery and original primary nodes provided in the config are not available
// 1. Set the primary as the original primary from the config
// 2. Kill the discovery node provided in the config, a random proxy, and a target
// 3. Try restoring the killed nodes one at a time
func discoveryAndOrigPrimaryProxiesCrash(t *testing.T) {
	var (
		config       = tools.GetClusterConfig(t)
		restoreCmd   = make([]tools.RestoreCmd, 0, 3)
		proxyURL     string
		pcnt, tcnt   int
		randomKilled bool
	)

	// Make sure the current primary differs from the original one in the config
	smap := primarySetToRand(t)
	origProxyCnt := smap.CountActivePs()
	origTargetCnt := smap.CountActiveTs()

	for _, si := range smap.Pmap {
		if smap.IsPrimary(si) {
			continue
		}
		if si.HasURL(config.Proxy.DiscoveryURL) {
			pcnt++
			tlog.Logf("Kill #%d: %s\n", pcnt+tcnt, si.StringEx())
			cmd, err := tools.KillNode(si)
			tassert.CheckFatal(t, err)
			restoreCmd = append(restoreCmd, cmd)
			continue
		}
		if randomKilled {
			// Set proxyURL - used to get the latest smap
			proxyURL = si.URL(cmn.NetPublic)
			continue
		}

		// Kill a random non-primary proxy
		pcnt++
		tlog.Logf("Kill #%d: %s\n", pcnt+tcnt, si.StringEx())
		cmd, err := tools.KillNode(si)
		tassert.CheckFatal(t, err)
		restoreCmd = append(restoreCmd, cmd)
		randomKilled = true
	}

	// Kill a random target
	target, err := smap.GetRandTarget()
	tassert.CheckFatal(t, err)
	tcnt++
	tlog.Logf("Kill #%d: %s\n", pcnt+tcnt, target.StringEx())
	cmd, err := tools.KillNode(target)
	tassert.CheckFatal(t, err)
	restoreCmd = append(restoreCmd, cmd)

	// Kill the original primary
	pcnt++
	tlog.Logf("Kill #%d: %s\n", pcnt+tcnt, smap.Primary.StringEx())
	cmd, err = tools.KillNode(smap.Primary)
	tassert.CheckFatal(t, err)
	restoreCmd = append(restoreCmd, cmd)

	proxyCnt, targetCnt := origProxyCnt-pcnt, origTargetCnt-tcnt
	smap, err = tools.WaitForClusterState(proxyURL, "kill proxies and target", smap.Version, proxyCnt, targetCnt)
	tassert.CheckFatal(t, err)

	// Restore all killed nodes
	for _, cmd := range restoreCmd {
		if cmd.Node.IsProxy() {
			proxyCnt++
		}
		if cmd.Node.IsTarget() {
			targetCnt++
		}
		tools.RestoreNode(cmd, false, cmd.Node.Type())
		_, err = tools.WaitForClusterState(proxyURL, "restore "+cmd.Node.ID(), smap.Version,
			proxyCnt, targetCnt)
		tassert.CheckError(t, err)
	}

	tools.WaitForRebalAndResil(t, tools.BaseAPIParams(proxyURL))
}

// proxyStress starts a group of workers doing put/get/del in sequence against the primary proxy;
// while those operations are ongoing, a separate goroutine kills the primary proxy, notifies all
// workers about the primary change, and restarts the killed proxy as a non-primary proxy.
// The process is repeated until a predefined time duration is reached.
func proxyStress(t *testing.T) {
	var (
		wg          sync.WaitGroup
		errChs      = make([]chan error, workerCnt+1)
		stopChs     = make([]chan struct{}, workerCnt+1)
		proxyURLChs = make([]chan string, workerCnt)
		bck         = cmn.Bck{
			Name:     testBucketName,
			Provider: apc.AIS,
		}
		proxyURL = tools.RandomProxyURL(t)
	)

	tools.CreateBucket(t, proxyURL, bck, nil, true /*cleanup*/)
	defer func() {
		err := tools.WaitNodeReady(proxyURL)
		tassert.CheckFatal(t, err)
	}()

	// start all workers
	for i := range workerCnt {
		errChs[i] = make(chan error, defaultChanSize)
		stopChs[i] = make(chan struct{}, defaultChanSize)
		proxyURLChs[i] = make(chan string, defaultChanSize)

		wg.Add(1)
		go putGetDelWorker(proxyURL, stopChs[i], proxyURLChs[i], errChs[i], &wg)

		// stagger the workers so they don't always do the same operation at the same time
		n := cos.NowRand().Intn(999)
		time.Sleep(time.Duration(n+1) * time.Millisecond)
	}

	errChs[workerCnt] = make(chan error, defaultChanSize)
	stopChs[workerCnt] = make(chan struct{}, defaultChanSize)
	wg.Add(1)
	go primaryKiller(t, proxyURL, stopChs[workerCnt], proxyURLChs, errChs[workerCnt], &wg)

	timer := time.After(tools.MultiProxyTestTimeout)
loop:
	for {
		for _, ch := range errChs {
			select {
			case <-timer:
				break loop
			case <-ch:
				// Read errors and throw them away; this is needed to unblock the workers.
			default:
			}
		}
	}

	// stop all workers
	for _, stopCh := range stopChs {
		stopCh <- struct{}{}
		close(stopCh)
	}

	wg.Wait()
}

// smap - the current Smap
// directURL - URL of the proxy that we send the request to (not necessarily the current primary)
// toID - DaemonID of the proxy that must become the new primary
func setPrimaryTo(t *testing.T, proxyURL string, smap *meta.Smap, directURL, toID string) (newSmap *meta.Smap) {
	if directURL == "" {
		directURL = smap.Primary.URL(cmn.NetPublic)
	}

	baseParams := tools.BaseAPIParams(directURL)
	tlog.Logf("Setting primary from %s to %s\n", smap.Primary.ID(), toID)
	err := api.SetPrimaryProxy(baseParams, toID, false /*force*/)
	tassert.CheckFatal(t, err)

	newSmap, err = tools.WaitForNewSmap(proxyURL, smap.Version)
	tassert.CheckFatal(t, err)
	if newSmap.Primary.ID() != toID {
		t.Fatalf("Expected primary=%s, got %s", toID, newSmap.Primary.ID())
	}
	checkSmaps(t, newSmap.Primary.URL(cmn.NetPublic))
	return
}

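// chooseNextProxy applies the same HRW selection as the cluster (see hrwProxyTest below) to
// predict which proxy would be elected if the current primary went down, and returns that
// proxy's DaemonID and public URL.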
func chooseNextProxy(smap *meta.Smap) (proxyid, proxyURL string, err error) {
	pid, err := hrwProxyTest(smap, smap.Primary.ID())
	pi := smap.Pmap[pid]
	if err != nil {
		return
	}

	return pi.ID(), pi.URL(cmn.NetPublic), nil
}

// For each proxy: compare its Smap vs the primary's(*) and fail the test if they differ
func checkSmaps(t *testing.T, proxyURL string) {
	var (
		smap1      = tools.GetClusterMap(t, proxyURL)
		primary    = smap1.Primary // primary according to the `proxyURL`(*)
		smapDiffer bool
	)
	for _, psi := range smap1.Pmap {
		smap2 := tools.GetClusterMap(t, psi.URL(cmn.NetPublic))
		uuid, sameOrigin, sameVersion, eq := smap1.Compare(smap2)
		if eq {
			continue
		}
		err := fmt.Errorf("(%s %s, primary=%s) != (%s %s, primary=%s): (uuid=%s, same-orig=%t, same-ver=%t)",
			proxyURL, smap1, primary, psi.URL(cmn.NetPublic), smap2, smap2.Primary, uuid, sameOrigin, sameVersion)
		t.Error(err)
		smapDiffer = true
	}
	if !smapDiffer {
		tlog.Logln("all Smap copies are identical: " + smap1.StringEx())
	}
}

func primarySetToRand(t *testing.T) *meta.Smap {
	var (
		proxyURL = tools.GetPrimaryURL()
		smap     = tools.GetClusterMap(t, proxyURL)
		currURL  = smap.Primary.URL(cmn.NetPublic)
	)
	if currURL != proxyURL {
		t.Fatalf("Err in the test itself: expecting currURL %s == proxyurl %s", currURL, proxyURL)
	}

	psi, err := smap.GetRandProxy(true /*exclude primary*/)
	tassert.CheckFatal(t, err)
	return setPrimaryTo(t, proxyURL, smap, "", psi.ID())
}

// This is duplicated in the tests because the `idDigest` of `daemonInfo` is not
// exported. As a result of this, ais.HrwProxy will not return the correct
// proxy since the `idDigest` will be initialized to 0. To avoid this, we
// compute the checksum directly in this method.
func hrwProxyTest(smap *meta.Smap, idToSkip string) (pi string, err error) {
	if smap.CountActivePs() == 0 {
		err = errors.New("AIStore cluster map is empty: no proxies")
		return
	}
	var (
		maxH    uint64
		skipped int
	)
	for id, snode := range smap.Pmap {
		if id == idToSkip {
			skipped++
			continue
		}
		if smap.NonElectable(snode) {
			skipped++
			continue
		}

		if smap.InMaintOrDecomm(snode) {
			skipped++
			continue
		}

		cs := xxhash.Checksum64S(cos.UnsafeB(snode.ID()), cos.MLCG32)
		if cs > maxH {
			maxH = cs
			pi = id
		}
	}
	if pi == "" {
		err = fmt.Errorf("cannot HRW-select proxy: current count=%d, skipped=%d",
			smap.CountActivePs(), skipped)
	}
	return
}

func networkFailureTarget(t *testing.T) {
	proxyURL := tools.RandomProxyURL(t)
	smap := tools.GetClusterMap(t, proxyURL)
	proxyCount, targetCount := smap.CountActivePs(), smap.CountActiveTs()

	tassert.Fatalf(t, targetCount > 0, "At least 1 target required")
	target, _ := smap.GetRandTarget()
	targetID := target.ID()

	tlog.Logf("Disconnecting target: %s\n", targetID)
	oldNetworks, err := docker.Disconnect(targetID)
	tassert.CheckFatal(t, err)

	smap, err = tools.WaitForClusterState(
		proxyURL,
		"target is down",
		smap.Version,
		proxyCount,
		targetCount-1,
	)
	tassert.CheckFatal(t, err)

	tlog.Logf("Connecting target %s to networks again\n", targetID)
	err = docker.Connect(targetID, oldNetworks)
	tassert.CheckFatal(t, err)

	_, err = tools.WaitForClusterState(
		proxyURL,
		"to check cluster state",
		smap.Version,
		proxyCount,
		targetCount,
	)
	tassert.CheckFatal(t, err)
}

func networkFailureProxy(t *testing.T) {
	proxyURL := tools.RandomProxyURL(t)
	smap := tools.GetClusterMap(t, proxyURL)
	proxyCount, targetCount := smap.CountActivePs(), smap.CountActiveTs()
	tassert.Fatalf(t, proxyCount > 1, "At least 2 proxy required (has: %d)", proxyCount)

	oldPrimaryID := smap.Primary.ID()
	proxyID, _, err := chooseNextProxy(smap)
	tassert.CheckFatal(t, err)

	tlog.Logf("Disconnecting proxy: %s\n", proxyID)
	oldNetworks, err := docker.Disconnect(proxyID)
	tassert.CheckFatal(t, err)

	smap, err = tools.WaitForClusterState(
		proxyURL,
		"proxy is down",
		smap.Version,
		proxyCount-1,
		targetCount,
	)
	tassert.CheckFatal(t, err)

	tlog.Logf("Connecting proxy %s to networks again\n", proxyID)
	err = docker.Connect(proxyID, oldNetworks)
	tassert.CheckFatal(t, err)

	smap, err = tools.WaitForClusterState(
		proxyURL,
		"to check cluster state",
		smap.Version,
		proxyCount,
		targetCount,
	)
	tassert.CheckFatal(t, err)

	if oldPrimaryID != smap.Primary.ID() {
		t.Fatalf("Primary proxy changed from %s to %s",
			oldPrimaryID, smap.Primary.ID())
	}
}

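// networkFailurePrimary disconnects the primary proxy from the docker networks, waits for a new
// primary to be elected, reconnects the old primary, and then forces it to join the new primary.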
func networkFailurePrimary(t *testing.T) {
	proxyURL := tools.RandomProxyURL(t)
	smap := tools.GetClusterMap(t, proxyURL)
	if smap.CountActivePs() < 2 {
		t.Fatal("At least 2 proxy required")
	}

	proxyCount, targetCount := smap.CountActivePs(), smap.CountActiveTs()
	oldPrimaryID, oldPrimaryURL := smap.Primary.ID(), smap.Primary.URL(cmn.NetPublic)
	newPrimaryID, newPrimaryURL, err := chooseNextProxy(smap)
	tassert.CheckFatal(t, err)

	// Disconnect primary
	tlog.Logf("Disconnecting primary %s from all networks\n", oldPrimaryID)
	oldNetworks, err := docker.Disconnect(oldPrimaryID)
	tassert.CheckFatal(t, err)

	// Check smap
	smap, err = tools.WaitForClusterState(
		newPrimaryURL,
		"original primary gone",
		smap.Version,
		proxyCount-1,
		targetCount,
	)
	tassert.CheckFatal(t, err)

	if smap.Primary.ID() != newPrimaryID {
		t.Fatalf("wrong primary proxy: %s, expecting: %s after disconnecting",
			smap.Primary.ID(), newPrimaryID)
	}

	// Connect again
	tlog.Logf("Connecting primary %s to networks again\n", oldPrimaryID)
	err = docker.Connect(oldPrimaryID, oldNetworks)
	tassert.CheckFatal(t, err)

	// give the original primary a little time, so it picks up the network
	// connections and starts talking to its neighbors
	_, err = tools.WaitForClusterState(
		oldPrimaryID,
		"original primary is restored",
		smap.Version,
		proxyCount,
		targetCount,
	)
	tassert.CheckFatal(t, err)

	oldSmap := tools.GetClusterMap(t, oldPrimaryURL)
	// the original primary still thinks that it is the primary, so its smap
	// should not change after the network is back
	if oldSmap.Primary.ID() != oldPrimaryID {
		tlog.Logf("Old primary changed its smap. Its current primary: %s (expected %s - self)\n",
			oldSmap.Primary.ID(), oldPrimaryID)
	}

	// Forcefully set the new primary for the original one
	baseParams := tools.BaseAPIParams(oldPrimaryURL)
	baseParams.Method = http.MethodPut
	reqParams := &api.ReqParams{
		BaseParams: baseParams,
		Path:       apc.URLPathDaeProxy.Join(newPrimaryID),
		Query: url.Values{
			apc.QparamForce:            {"true"},
			apc.QparamPrimaryCandidate: {newPrimaryURL},
		},
	}
	err = reqParams.DoRequest()
	tassert.CheckFatal(t, err)

	smap, err = tools.WaitForClusterState(
		newPrimaryURL,
		"original primary joined the new primary",
		smap.Version,
		proxyCount,
		targetCount,
	)
	tassert.CheckFatal(t, err)

	if smap.Primary.ID() != newPrimaryID {
		t.Fatalf("expected primary=%s, got %s after connecting again", newPrimaryID, smap.Primary.ID())
	}
}

func networkFailure(t *testing.T) {
	tools.CheckSkip(t, &tools.SkipTestArgs{RequiredDeployment: tools.ClusterTypeDocker})

	t.Run("Target network disconnect", networkFailureTarget)
	t.Run("Secondary proxy network disconnect", networkFailureProxy)
	t.Run("Primary proxy network disconnect", networkFailurePrimary)
}

// primaryAndNextCrash kills the primary proxy and the proxy that would be elected after the
// current primary dies, verifies that the second-in-line proxy becomes the new primary,
// and then restores all proxies.
func primaryAndNextCrash(t *testing.T) {
	proxyURL := tools.RandomProxyURL(t)
	smap := tools.GetClusterMap(t, proxyURL)
	origProxyCount := smap.CountActivePs()

	if origProxyCount < 4 {
		t.Skip("The test requires at least 4 proxies, found only ", origProxyCount)
	}

	// get next primary
	firstPrimaryID, firstPrimaryURL, err := chooseNextProxy(smap)
	tassert.CheckFatal(t, err)
	// The cluster map is re-read to get a clone of the original Smap that the test
	// can modify in any way it needs, because the original Smap must be preserved.
	smapNext := tools.GetClusterMap(t, proxyURL)
	// get next-next primary
	firstPrimary := smapNext.Pmap[firstPrimaryID]
	delete(smapNext.Pmap, firstPrimaryID)
	finalPrimaryID, finalPrimaryURL, err := chooseNextProxy(smapNext)
	tassert.CheckFatal(t, err)

	// kill the current primary
	oldPrimaryURL, oldPrimaryID := smap.Primary.URL(cmn.NetPublic), smap.Primary.ID()
	tlog.Logf("Killing primary proxy: %s - %s\n", oldPrimaryURL, oldPrimaryID)
	cmdFirst, err := tools.KillNode(smap.Primary)
	tassert.CheckFatal(t, err)

	// kill the next primary
	tlog.Logf("Killing next to primary proxy: %s - %s\n", firstPrimaryID, firstPrimaryURL)
	cmdSecond, errSecond := tools.KillNode(firstPrimary)
	// if the kill fails it does not make sense to wait for the cluster to stabilize
	if errSecond == nil {
		// the cluster should vote, so the smap version should be increased at
		// least by 100, that is why +99
		smap, err = tools.WaitForClusterState(finalPrimaryURL, "new primary elected",
			smap.Version+99, origProxyCount-2, 0)
		tassert.CheckFatal(t, err)
	}

	tlog.Logf("Checking current primary, %s\n", smap.StringEx())
	if smap.Primary.ID() != finalPrimaryID {
		t.Errorf("Expected primary %s but real primary is %s", finalPrimaryID, smap.Primary.ID())
	}

	// restore the next and prev primaries in reversed order
	err = tools.RestoreNode(cmdSecond, false, "proxy (next primary)")
	tassert.CheckFatal(t, err)
	smap, err = tools.WaitForClusterState(finalPrimaryURL, "restore next primary",
		smap.Version, origProxyCount-1, 0)
	tassert.CheckFatal(t, err)

	err = tools.RestoreNode(cmdFirst, false, "proxy (prev primary)")
	tassert.CheckFatal(t, err)
	_, err = tools.WaitForClusterState(finalPrimaryURL, "restore prev primary",
		smap.Version, origProxyCount, 0)
	tassert.CheckFatal(t, err)
}

func TestIC(t *testing.T) {
	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true, RequiredDeployment: tools.ClusterTypeLocal})

	proxyURL := tools.RandomProxyURL(t)
	smap := tools.GetClusterMap(t, proxyURL)
	if cnt := smap.CountActivePs(); cnt < 4 {
		t.Fatalf("Not enough proxies (%d) to run tests (must be at least 4)", cnt)
	}

	defer tools.EnsureOrigClusterState(t)
	for _, test := range icTests {
		t.Run(test.name, test.method)
		if t.Failed() {
			t.FailNow()
		}
	}
	time.Sleep(time.Second)
}

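// killRandNonPrimaryIC kills a random IC (information center) member that is not the primary,
// waits for the updated Smap to propagate, and returns the restore command along with the new Smap.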
func killRandNonPrimaryIC(t testing.TB, smap *meta.Smap) (tools.RestoreCmd, *meta.Smap) {
	origProxyCount := smap.CountActivePs()
	primary := smap.Primary
	var killNode *meta.Snode
	for _, psi := range smap.Pmap {
		if smap.IsIC(psi) && !psi.Eq(primary) {
			killNode = psi
			break
		}
	}
	cmd, err := tools.KillNode(killNode)
	tassert.CheckFatal(t, err)

	smap, err = tools.WaitForClusterState(primary.URL(cmn.NetPublic), "propagate new Smap",
		smap.Version, origProxyCount-1, 0)
	tassert.CheckError(t, err)
	return cmd, smap
}

func icFromSmap(smap *meta.Smap) cos.StrSet {
	lst := make(cos.StrSet, meta.DfltCountIC)
	for pid, psi := range smap.Pmap {
		if smap.IsIC(psi) {
			lst.Add(pid)
		}
	}
	return lst
}

func icMemberLeaveAndRejoin(t *testing.T) {
	smap := tools.GetClusterMap(t, proxyURL)
	primary := smap.Primary
	tassert.Fatalf(t, smap.ICCount() == meta.DfltCountIC,
		"should have %d members in IC, has %d", meta.DfltCountIC, smap.ICCount())

	// Primary must be an IC member
	tassert.Fatalf(t, smap.IsIC(primary), "primary (%s) should be a IC member, (were: %s)", primary, smap.StrIC(primary))

	// Killing an IC member should cause a new IC member to be added.
	// Select an IC member that is not the primary, and kill it.
	origIC := icFromSmap(smap)
	cmd, smap := killRandNonPrimaryIC(t, smap)
	delete(origIC, cmd.Node.ID())

	tassert.Errorf(t, !smap.IsIC(cmd.Node), "Killed daemon (%s) must be removed from IC", cmd.Node.ID())

	// should have remaining IC nodes
	for sid := range origIC {
		tassert.Errorf(t, smap.IsIC(smap.GetProxy(sid)), "Should not remove existing IC members (%s)", sid)
	}
	tassert.Errorf(t, smap.ICCount() == meta.DfltCountIC, "should have %d members in IC, has %d",
		meta.DfltCountIC, smap.ICCount())

	err := tools.RestoreNode(cmd, false, "proxy")
	tassert.CheckFatal(t, err)

	updatedICs := icFromSmap(smap)
	smap, err = tools.WaitNodeAdded(tools.BaseAPIParams(primary.URL(cmn.NetPublic)), cmd.Node.ID())
	tassert.CheckFatal(t, err)

	// Adding a new node shouldn't change IC members.
	newIC := icFromSmap(smap)
	tassert.Errorf(t, reflect.DeepEqual(updatedICs, newIC), "shouldn't update existing IC members")
}

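// icKillAndRestorePrimary kills the primary (an IC member), verifies that the IC membership is
// updated accordingly, and then restores the old primary as primary again.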
func icKillAndRestorePrimary(t *testing.T) {
	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})
	var (
		proxyURL   = tools.RandomProxyURL(t)
		smap       = tools.GetClusterMap(t, proxyURL)
		oldIC      = icFromSmap(smap)
		oldPrimary = smap.Primary
	)

	icCheck := func(smap *meta.Smap, newPrimary, oldPrimary *meta.Snode) {
		// Old primary shouldn't be in IC.
		tassert.Errorf(t, !smap.IsIC(oldPrimary), "killed primary (%s) must be removed from IC", oldPrimary)

		// New primary should be part of IC.
		tassert.Errorf(t, smap.IsIC(newPrimary), "new primary (%s) must be part of IC", newPrimary)

		// Remaining IC members should be unchanged.
		for sid := range oldIC {
			if sid != oldPrimary.ID() {
				tassert.Errorf(t, smap.IsIC(smap.GetProxy(sid)), "should not remove existing IC members (%s)", sid)
			}
		}
	}

	smap = killRestorePrimary(t, proxyURL, true, icCheck)

	// When a node is added as primary, it should add itself to the IC.
	tassert.Fatalf(t, smap.IsIC(oldPrimary),
		"primary (%s) should be a IC member, (were: %s)", oldPrimary, smap.StrIC(oldPrimary))
	tassert.Errorf(t, smap.ICCount() == meta.DfltCountIC,
		"should have %d members in IC, has %d", meta.DfltCountIC, smap.ICCount())
}

func icSyncOwnershipTable(t *testing.T) {
	var (
		proxyURL   = tools.RandomProxyURL(t)
		baseParams = tools.BaseAPIParams(proxyURL)
		smap       = tools.GetClusterMap(t, proxyURL)
		primary    = smap.Primary

		src = cmn.Bck{
			Name:     testBucketName,
			Provider: apc.AIS,
		}

		dstBck = cmn.Bck{
			Name:     testBucketName + "_new",
			Provider: apc.AIS,
		}
	)

	tools.CreateBucket(t, proxyURL, src, nil, true /*cleanup*/)

	// Start any xaction and get its ID.
	xid, err := api.CopyBucket(baseParams, src, dstBck, nil)
	tassert.CheckFatal(t, err)
	t.Cleanup(func() {
		tools.DestroyBucket(t, proxyURL, dstBck)
	})

	// Killing an IC member should cause a new IC member to be added.
	// Select an IC member that is not the primary, and kill it.
	origIC := icFromSmap(smap)
	cmd, smap := killRandNonPrimaryIC(t, smap)

	// Try getting the xaction status from the new IC member.
	updatedIC := icFromSmap(smap)
	newICMemID := getNewICMember(t, origIC, updatedIC)

	newICNode := smap.GetProxy(newICMemID)

	baseParams = tools.BaseAPIParams(newICNode.URL(cmn.NetPublic))
	xargs := xact.ArgsMsg{ID: xid, Kind: apc.ActCopyBck}
	_, err = api.GetOneXactionStatus(baseParams, &xargs)
	tassert.CheckError(t, err)

	err = tools.RestoreNode(cmd, false, "proxy")
	tassert.CheckFatal(t, err)

	smap, err = tools.WaitNodeAdded(baseParams, cmd.Node.ID())
	tassert.CheckFatal(t, err)
	tassert.Fatalf(t, !smap.IsIC(cmd.Node), "newly joined node shouldn't be in IC (%s)", cmd.Node)

	// The ownership table should be synced when a non-IC member becomes primary.
	smap = setPrimaryTo(t, primary.URL(cmn.NetPublic), smap, "", cmd.Node.ID())
	tassert.Fatalf(t, smap.IsIC(cmd.Node), "primary (%s) should be a IC member, (were: %s)", primary, smap.StrIC(primary))

	baseParams = tools.BaseAPIParams(cmd.Node.URL(cmn.NetPublic))
	_, err = api.GetOneXactionStatus(baseParams, &xargs)
	tassert.CheckError(t, err)
}

func icSinglePrimaryRevamp(t *testing.T) {
	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})

	var (
		proxyURL       = tools.RandomProxyURL(t)
		smap           = tools.GetClusterMap(t, proxyURL)
		origProxyCount = smap.CountActivePs()

		src = cmn.Bck{
			Name:     testBucketName,
			Provider: apc.AIS,
		}

		dstBck = cmn.Bck{
			Name:     testBucketName + "_new",
			Provider: apc.AIS,
		}
	)

	nodesToRestore := make([]tools.RestoreCmd, 0, origProxyCount-1)

	// Kill all nodes except the primary.
	for i := origProxyCount; i > 1; i-- {
		var cmd tools.RestoreCmd
		cmd, smap = killRandNonPrimaryIC(t, smap)
		nodesToRestore = append(nodesToRestore, cmd)
	}

	proxyURL = smap.Primary.URL(cmn.NetPublic)
	baseParams = tools.BaseAPIParams(proxyURL)
	tools.CreateBucket(t, proxyURL, src, nil, true /*cleanup*/)

	// Start any xaction and get its ID.
	xid, err := api.CopyBucket(baseParams, src, dstBck, nil)
	xargs := xact.ArgsMsg{ID: xid, Kind: apc.ActCopyBck}

	tassert.CheckFatal(t, err)
	t.Cleanup(func() {
		tools.DestroyBucket(t, proxyURL, dstBck)
	})

	// Restart all killed nodes and check for xaction status.
	for _, cmd := range nodesToRestore {
		err = tools.RestoreNode(cmd, false, "proxy")
		tassert.CheckError(t, err)

		smap, err = tools.WaitForClusterState(proxyURL,
			"restore node "+cmd.Node.ID(), smap.Version,
			smap.CountActivePs()+1, smap.CountTargets())
		tassert.CheckFatal(t, err)

		baseParams = tools.BaseAPIParams(cmd.Node.URL(cmn.NetPublic))
		_, err = api.GetOneXactionStatus(baseParams, &xargs)
		tassert.CheckError(t, err)
	}
}

func icStressMonitorXactMultiICFail(t *testing.T) {
	var (
		proxyURL = tools.GetPrimaryURL()
		smap     = tools.GetClusterMap(t, proxyURL)

		m = ioContext{
			t:        t,
			num:      1000,
			fileSize: 50 * cos.KiB,
		}
		numCopyXacts = 20
	)

	// 1. Populate a bucket required for copy xactions
	m.init(true /*cleanup*/)
	tools.CreateBucket(t, proxyURL, m.bck, nil, true /*cleanup*/)
	m.puts()

	// 2. Kill and restore random IC members in the background
	stopCh := &cos.StopCh{}
	stopCh.Init()
	krWg := &sync.WaitGroup{}
	krWg.Add(1)
	go killRestoreIC(t, smap, stopCh, krWg)
	defer func() {
		// Stop the background kill-and-restore task
		stopCh.Close()
		krWg.Wait()
	}()

	// 3. Start multiple xactions and poll a random proxy for status till each xaction is complete
	wg := startCPBckAndWait(t, m.bck, numCopyXacts)
	wg.Wait()
}

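// startCPBckAndWait kicks off `count` concurrent copy-bucket xactions from srcBck and returns a
// WaitGroup that completes once every copy has been waited on (via the IC) and its destination
// bucket destroyed.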
func startCPBckAndWait(t testing.TB, srcBck cmn.Bck, count int) *sync.WaitGroup {
	var (
		proxyURL   = tools.GetPrimaryURL()
		baseParams = tools.BaseAPIParams(proxyURL)
		wg         = &sync.WaitGroup{}
	)
	for i := range count {
		wg.Add(1)
		go func(idx int) {
			dstBck := cmn.Bck{
				Name:     fmt.Sprintf("%s_dst_par_%d", testBucketName, idx),
				Provider: apc.AIS,
			}
			xid, err := api.CopyBucket(baseParams, srcBck, dstBck, nil)
			tassert.CheckError(t, err)
			defer func() {
				tools.DestroyBucket(t, proxyURL, dstBck)
				wg.Done()
			}()
			xargs := xact.ArgsMsg{ID: xid, Timeout: tools.RebalanceTimeout}
			_, err = api.WaitForXactionIC(baseParams, &xargs)
			tassert.CheckError(t, err)
		}(i)
	}
	return wg
}

// Continuously kill and restore IC nodes
func killRestoreIC(t *testing.T, smap *meta.Smap, stopCh *cos.StopCh, wg *sync.WaitGroup) {
	var (
		cmd      tools.RestoreCmd
		proxyURL = smap.Primary.URL(cmn.NetPublic)
	)
	defer wg.Done()

	for {
		cmd, smap = killRandNonPrimaryIC(t, smap)
		err := tools.RestoreNode(cmd, false, "proxy")
		tassert.CheckFatal(t, err)

		smap, err = tools.WaitForClusterState(proxyURL, "restore", smap.Version, 0, 0)
		tassert.CheckFatal(t, err)
		time.Sleep(2 * time.Second)

		select {
		case <-stopCh.Listen():
			return
		default:
			break
		}
	}
}

// misc

func getNewICMember(t testing.TB, oldMap, newMap cos.StrSet) (daeID string) {
	for sid := range newMap {
		if _, ok := oldMap[sid]; !ok {
			tassert.Errorf(t, daeID == "", "should change only one IC member")
			daeID = sid
		}
	}
	tassert.Fatalf(t, daeID != "", "should change at least one IC member")
	return
}

//
// mock target
//

const (
	mockTargetPort = "8079"
)

type targetMocker interface {
	filehdlr(w http.ResponseWriter, r *http.Request)
	daemonhdlr(w http.ResponseWriter, r *http.Request)
	votehdlr(w http.ResponseWriter, r *http.Request)
	healthdlr(w http.ResponseWriter, r *http.Request)
}

type MockRegRequest struct {
	SI *meta.Snode `json:"si"`
}

func runMockTarget(t *testing.T, proxyURL string, mocktgt targetMocker, stopch chan struct{}, smap *meta.Smap, wg *sync.WaitGroup) {
	defer wg.Done()
	mux := http.NewServeMux()

	mux.HandleFunc(apc.URLPathBuckets.S, mocktgt.filehdlr)
	mux.HandleFunc(apc.URLPathObjects.S, mocktgt.filehdlr)
	mux.HandleFunc(apc.URLPathDae.S, mocktgt.daemonhdlr)
	mux.HandleFunc(apc.URLPathVote.S, mocktgt.votehdlr)
	mux.HandleFunc(apc.URLPathHealth.S, mocktgt.healthdlr)

	target, _ := smap.GetRandTarget()
	ip := target.PubNet.Hostname

	s := &http.Server{
		Addr:              ip + ":" + mockTargetPort,
		Handler:           mux,
		ReadHeaderTimeout: 10 * time.Second,
	}
	go s.ListenAndServe()

	err := registerMockTarget(proxyURL, smap)
	if err != nil {
		t.Errorf("failed to register mock target: %v", err)
		return
	}
	tlog.Logf("t[%s] is up\n", tools.MockDaemonID)

	<-stopch

	tlog.Logf("started unsafe removal of t[%s]\n", tools.MockDaemonID)
	err = tools.RemoveNodeUnsafe(proxyURL, tools.MockDaemonID)
	if err != nil {
		tlog.Logf("Error: failed to unsafely remove t[%s]: %v\n", tools.MockDaemonID, err)
	}
	s.Shutdown(context.Background())
}

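// registerMockTarget borrows the network info of an existing target (substituting the mock port
// and daemon ID) and POSTs the resulting registration request to the cluster's auto-registration endpoint.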
func registerMockTarget(proxyURL string, smap *meta.Smap) error {
	var (
		jsonDaemonInfo []byte
		err            error
	)

	// borrow a random target's IP but use a different port to register the mock target
	for _, v := range smap.Tmap {
		v.DaeID = tools.MockDaemonID
		v.PubNet = meta.NetInfo{
			Hostname: v.PubNet.Hostname,
			Port:     mockTargetPort,
			URL:      "http://" + v.PubNet.Hostname + ":" + mockTargetPort,
		}
		v.ControlNet = v.PubNet
		v.DataNet = v.PubNet
		regReq := MockRegRequest{SI: v}
		jsonDaemonInfo, err = jsoniter.Marshal(regReq)
		if err != nil {
			return err
		}
		break
	}
	baseParams := tools.BaseAPIParams(proxyURL)
	baseParams.Method = http.MethodPost
	reqParams := &api.ReqParams{
		BaseParams: baseParams,
		Path:       apc.URLPathCluAutoReg.S,
		Body:       jsonDaemonInfo,
		Header:     http.Header{cos.HdrContentType: []string{cos.ContentJSON}},
	}
	return reqParams.DoRequest()
}

type voteRetryMockTarget struct {
	voteInProgress bool
	errCh          chan error
}

type cluMetaRedux struct {
	Smap           *meta.Smap
	VoteInProgress bool `json:"voting"`
}

func newVoteMsg(inp bool) cluMetaRedux {
	return cluMetaRedux{VoteInProgress: inp, Smap: &meta.Smap{Version: 1}}
}

func (*voteRetryMockTarget) filehdlr(http.ResponseWriter, *http.Request) {
	// Ignore all file requests
}

func (p *voteRetryMockTarget) daemonhdlr(w http.ResponseWriter, r *http.Request) {
	switch r.Method {
	case http.MethodGet:
		msg := newVoteMsg(p.voteInProgress) // treat all GET requests as requests for a VoteMsg
		jsbytes, err := jsoniter.Marshal(msg)
		if err == nil {
			_, err = w.Write(jsbytes)
		}
		if err != nil {
			p.errCh <- fmt.Errorf("error writing vote message: %v", err)
		}
	default:
	}
}

func (*voteRetryMockTarget) votehdlr(w http.ResponseWriter, _ *http.Request) {
	// Always vote yes.
	w.Write([]byte(ais.VoteYes))
}

func (p *voteRetryMockTarget) healthdlr(w http.ResponseWriter, r *http.Request) {
	query := r.URL.Query()
	getRebStatus := cos.IsParseBool(query.Get(apc.QparamRebStatus))
	if getRebStatus {
		status := &reb.Status{}
		status.RebID = math.MaxInt64 // to abort t[MOCK] join-triggered rebalance
		body := cos.MustMarshal(status)
		_, err := w.Write(body)
		if err != nil {
			p.errCh <- fmt.Errorf("error writing reb-status: %v", err)
		}
	}
}