github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/ais/test/multiproxy_test.go (about)

     1  // Package integration_test.
     2  /*
     3   * Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
     4   */
     5  package integration_test
     6  
     7  import (
     8  	"context"
     9  	"errors"
    10  	"fmt"
    11  	"math"
    12  	"net/http"
    13  	"net/url"
    14  	"path/filepath"
    15  	"reflect"
    16  	"sync"
    17  	"testing"
    18  	"time"
    19  
    20  	"github.com/NVIDIA/aistore/ais"
    21  	"github.com/NVIDIA/aistore/api"
    22  	"github.com/NVIDIA/aistore/api/apc"
    23  	"github.com/NVIDIA/aistore/cmn"
    24  	"github.com/NVIDIA/aistore/cmn/cos"
    25  	"github.com/NVIDIA/aistore/cmn/fname"
    26  	"github.com/NVIDIA/aistore/cmn/jsp"
    27  	"github.com/NVIDIA/aistore/core/meta"
    28  	"github.com/NVIDIA/aistore/reb"
    29  	"github.com/NVIDIA/aistore/tools"
    30  	"github.com/NVIDIA/aistore/tools/docker"
    31  	"github.com/NVIDIA/aistore/tools/readers"
    32  	"github.com/NVIDIA/aistore/tools/tassert"
    33  	"github.com/NVIDIA/aistore/tools/tlog"
    34  	"github.com/NVIDIA/aistore/tools/trand"
    35  	"github.com/NVIDIA/aistore/xact"
    36  	"github.com/OneOfOne/xxhash"
    37  	jsoniter "github.com/json-iterator/go"
    38  )
    39  
    40  const (
    41  	localBucketDir  = "multipleproxy"
    42  	defaultChanSize = 10
    43  )
    44  
    45  var (
    46  	voteTests = []Test{
    47  		{"PrimaryCrash", primaryCrashElectRestart},
    48  		{"NodeCrashRestoreDifferentIP", nodeCrashRestoreDifferentIP},
    49  		{"ProxyCrash", proxyCrash},
    50  		{"PrimaryAndTargetCrash", primaryAndTargetCrash},
    51  		{"PrimaryAndProxyCrash", primaryAndProxyCrash},
    52  		{"CrashAndFastRestore", crashAndFastRestore},
    53  		{"TargetRejoin", targetRejoin},
    54  		{"JoinWhileVoteInProgress", joinWhileVoteInProgress},
    55  		{"MinorityTargetMapVersionMismatch", minorityTargetMapVersionMismatch},
    56  		{"MajorityTargetMapVersionMismatch", majorityTargetMapVersionMismatch},
    57  		{"ConcurrentPutGetDel", concurrentPutGetDel},
    58  		{"ProxyStress", proxyStress},
    59  		{"NetworkFailure", networkFailure},
    60  		{"PrimaryAndNextCrash", primaryAndNextCrash},
    61  		{"DiscoveryAndOriginalPrimaryCrash", discoveryAndOrigPrimaryProxiesCrash},
    62  		{"AddNodeDuplicateIP", addNodeDuplicateIP},
    63  		{"AddNodeDuplicateDaemonID", addNodeDuplicateDaemonID},
    64  	}
    65  
    66  	icTests = []Test{
    67  		{"ICMemberLeaveAndRejoin", icMemberLeaveAndRejoin},
    68  		{"ICKillAndRestorePrimary", icKillAndRestorePrimary},
    69  		{"ICSyncOwnTbl", icSyncOwnershipTable},
    70  		{"ICSinglePrimaryRevamp", icSinglePrimaryRevamp},
    71  		{"ICStressMonitorXactMultiICFail", icStressMonitorXactMultiICFail},
    72  	}
    73  )
    74  
    75  func TestMultiProxy(t *testing.T) {
    76  	tools.CheckSkip(t, &tools.SkipTestArgs{
    77  		Long:               true,
    78  		RequiredDeployment: tools.ClusterTypeLocal,
    79  		MinProxies:         3,
    80  		MinTargets:         1,
    81  	})
    82  
    83  	defer tools.EnsureOrigClusterState(t)
    84  	for _, test := range voteTests {
    85  		t.Run(test.name, test.method)
    86  		if t.Failed() {
    87  			t.FailNow()
    88  		}
    89  	}
    90  }
    91  
     92  // primaryCrashElectRestart kills the current primary proxy, waits for a new primary to be elected and verifies it,
     93  // and then restores the original primary as a non-primary proxy
    94  func primaryCrashElectRestart(t *testing.T) {
    95  	proxyURL := tools.RandomProxyURL(t)
    96  	killRestorePrimary(t, proxyURL, false, nil)
    97  }
    98  
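         // killRestorePrimary kills the current primary, waits for a new primary to get
         // elected, invokes the optional postKill callback, and then restores the old
         // primary as a non-primary proxy (or, with restoreAsPrimary, as the primary
         // again). Returns the resulting Smap.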
    99  func killRestorePrimary(t *testing.T, proxyURL string, restoreAsPrimary bool,
   100  	postKill func(smap *meta.Smap, newPrimary, oldPrimary *meta.Snode)) *meta.Smap {
   101  	var (
   102  		smap          = tools.GetClusterMap(t, proxyURL)
   103  		proxyCount    = smap.CountActivePs()
   104  		oldPrimary    = smap.Primary
   105  		oldPrimaryURL = smap.Primary.URL(cmn.NetPublic)
   106  		oldPrimaryID  = smap.Primary.ID()
   107  	)
   108  
   109  	tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), proxyCount)
   110  	newPrimaryID, newPrimaryURL, err := chooseNextProxy(smap)
   111  	tassert.CheckFatal(t, err)
   112  	newPrimary := smap.GetProxy(newPrimaryID)
   113  
   114  	tlog.Logf("New primary: %s --> %s\n", newPrimaryID, newPrimaryURL)
   115  	tlog.Logf("Killing primary: %s --> %s\n", oldPrimaryURL, oldPrimaryID)
   116  
    117  	// cmd holds the original command line that was used to start this proxy
   118  	cmd, err := tools.KillNode(smap.Primary)
   119  	tassert.CheckFatal(t, err)
   120  
   121  	smap, err = tools.WaitForClusterState(newPrimaryURL, "new primary elected", smap.Version,
   122  		smap.CountActivePs()-1, smap.CountActiveTs())
   123  	tassert.CheckFatal(t, err)
   124  	tlog.Logf("New primary elected: %s\n", newPrimaryID)
   125  
   126  	tassert.Errorf(t, smap.Primary.ID() == newPrimaryID, "Wrong primary proxy: %s, expecting: %s", smap.Primary.ID(), newPrimaryID)
   127  
   128  	if postKill != nil {
   129  		postKill(smap, newPrimary, oldPrimary)
   130  	}
   131  
    132  	// reconstruct the command line to restart the original proxy, adding the current primary to the args
   133  	err = tools.RestoreNode(cmd, false, "proxy (prev primary)")
   134  	tassert.CheckFatal(t, err)
   135  
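         	// NOTE: here and throughout, a zero proxy or target count passed to
         	// WaitForClusterState is taken to mean "don't check that count" -
         	// an assumption inferred from usage in this file, not a documented contract.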
   136  	smap, err = tools.WaitForClusterState(newPrimaryURL, "restore", smap.Version, proxyCount, 0)
   137  	tassert.CheckFatal(t, err)
   138  	if _, ok := smap.Pmap[oldPrimaryID]; !ok {
   139  		t.Fatalf("Previous primary proxy did not rejoin the cluster")
   140  	}
   141  	checkSmaps(t, newPrimaryURL)
   142  
   143  	if restoreAsPrimary {
   144  		return setPrimaryTo(t, oldPrimaryURL, smap, "", oldPrimaryID)
   145  	}
   146  	return smap
   147  }
   148  
   149  func nodeCrashRestoreDifferentIP(t *testing.T) {
   150  	for _, ty := range []string{apc.Proxy, apc.Target} {
   151  		t.Run(ty, func(t *testing.T) {
   152  			killRestoreDiffIP(t, ty)
   153  		})
   154  	}
   155  }
   156  
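         // killRestoreDiffIP kills a node, bumps the ports in its local config, and
         // restarts it, verifying that the cluster picks up the new network config;
         // it then repeats the cycle once more to revert the port changes.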
   157  func killRestoreDiffIP(t *testing.T, nodeType string) {
   158  	// NOTE: This function requires local deployment as it changes node config
   159  	tools.CheckSkip(t, &tools.SkipTestArgs{RequiredDeployment: tools.ClusterTypeLocal})
   160  
   161  	var (
   162  		proxyURL                      = tools.GetPrimaryURL()
   163  		smap                          = tools.GetClusterMap(t, proxyURL)
   164  		origProxyCnt, origTargetCount = smap.CountActivePs(), smap.CountActiveTs()
   165  		portInc                       = 100
   166  		node                          *meta.Snode
   167  		err                           error
   168  		pdc, tdc                      int
   169  		restore                       bool
   170  	)
   171  
   172  	if nodeType == apc.Proxy {
   173  		node, err = smap.GetRandProxy(true /*exclude primary*/)
   174  		pdc = 1
   175  	} else {
   176  		node, err = smap.GetRandTarget()
   177  		tdc = 1
   178  	}
   179  	tassert.CheckFatal(t, err)
   180  
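         	// Executed twice: the second pass (via the goto below) reverts the port changes.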
   181  killRestore:
   182  	cfg := tools.GetDaemonConfig(t, node)
   183  	tlog.Logf("Killing %s\n", node.StringEx())
   184  	cmd, err := tools.KillNode(node)
   185  	tassert.CheckFatal(t, err)
   186  
   187  	smap, err = tools.WaitForClusterState(proxyURL, "cluster to stabilize", smap.Version, origProxyCnt-pdc, origTargetCount-tdc)
   188  	tassert.CheckFatal(t, err)
   189  
   190  	// Update local config ports.
   191  	localConfPath := filepath.Join(cfg.ConfigDir, fname.PlainLocalConfig)
   192  	localConf := &cmn.LocalConfig{}
   193  	_, err = jsp.LoadMeta(localConfPath, localConf)
   194  	tassert.CheckFatal(t, err)
   195  	localConf.HostNet.Port = cfg.HostNet.Port + portInc
   196  	localConf.HostNet.PortIntraControl = cfg.HostNet.PortIntraControl + portInc
   197  	localConf.HostNet.PortIntraData = cfg.HostNet.PortIntraData + portInc
   198  	err = jsp.SaveMeta(localConfPath, localConf, nil)
   199  	tassert.CheckFatal(t, err)
   200  
   201  	err = tools.RestoreNode(cmd, false, nodeType)
   202  	tassert.CheckFatal(t, err)
   203  
   204  	smap, err = tools.WaitForClusterState(proxyURL, "restore with changed config", smap.Version, origProxyCnt, 0)
   205  	tassert.CheckFatal(t, err)
   206  
   207  	// Health check with old public URL should fail
   208  	err = api.Health(tools.BaseAPIParams(node.URL(cmn.NetPublic)))
    209  	tassert.Errorf(t, err != nil, "health check with old IP information should fail (got: %v)", err)
   210  
   211  	newNode := smap.GetNode(node.ID())
   212  	err = tools.WaitNodeReady(newNode.URL(cmn.NetPublic))
   213  	tassert.CheckError(t, err)
   214  	if !restore {
   215  		// Revert port changes
   216  		restore = true
   217  		node = newNode
   218  		portInc = -portInc
   219  		goto killRestore
   220  	}
   221  
   222  	if nodeType == apc.Target {
   223  		tools.WaitForRebalAndResil(t, tools.BaseAPIParams(proxyURL))
   224  	}
   225  }
   226  
    227  // primaryAndTargetCrash kills the primary proxy and one random target, verifies that the next-in-line
    228  // proxy becomes the new primary, then restores the target and the original primary (as non-primary).
   229  func primaryAndTargetCrash(t *testing.T) {
   230  	if docker.IsRunning() {
   231  		t.Skip("Skipped because setting new primary URL in command line for docker is not supported")
   232  	}
   233  
   234  	proxyURL := tools.RandomProxyURL(t)
   235  	smap := tools.GetClusterMap(t, proxyURL)
   236  	tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), smap.CountActivePs())
   237  
   238  	newPrimaryID, newPrimaryURL, err := chooseNextProxy(smap)
   239  	tassert.CheckFatal(t, err)
   240  
   241  	oldPrimaryURL := smap.Primary.URL(cmn.NetPublic)
   242  	tlog.Logf("Killing proxy %s - %s\n", oldPrimaryURL, smap.Primary.ID())
   243  	cmd, err := tools.KillNode(smap.Primary)
   244  	tassert.CheckFatal(t, err)
   245  
   246  	// Select a random target
   247  	var (
   248  		targetURL       string
   249  		targetID        string
   250  		targetNode      *meta.Snode
   251  		origTargetCount = smap.CountActiveTs()
   252  		origProxyCount  = smap.CountActivePs()
   253  	)
   254  
   255  	targetNode, _ = smap.GetRandTarget()
   256  	targetURL = targetNode.URL(cmn.NetPublic)
   257  	targetID = targetNode.ID()
   258  
   259  	tlog.Logf("Killing target: %s - %s\n", targetURL, targetID)
   260  	tcmd, err := tools.KillNode(targetNode)
   261  	tassert.CheckFatal(t, err)
   262  
   263  	smap, err = tools.WaitForClusterState(newPrimaryURL, "new primary elected",
   264  		smap.Version, origProxyCount-1, origTargetCount-1)
   265  	tassert.CheckFatal(t, err)
   266  
   267  	if smap.Primary.ID() != newPrimaryID {
   268  		t.Fatalf("Wrong primary proxy: %s, expecting: %s", smap.Primary.ID(), newPrimaryID)
   269  	}
   270  
   271  	err = tools.RestoreNode(tcmd, false, "target")
   272  	tassert.CheckFatal(t, err)
   273  
   274  	err = tools.RestoreNode(cmd, false, "proxy (prev primary)")
   275  	tassert.CheckFatal(t, err)
   276  
   277  	_, err = tools.WaitForClusterState(newPrimaryURL, "restore proxy and target",
   278  		smap.Version, origProxyCount, origTargetCount)
   279  	tassert.CheckFatal(t, err)
   280  	tools.WaitForRebalAndResil(t, tools.BaseAPIParams(newPrimaryURL))
   281  }
   282  
    283  // A very simple test to check that the primary proxy detects when a non-primary proxy
    284  // dies, and then updates and syncs the Smap
   285  func proxyCrash(t *testing.T) {
   286  	proxyURL := tools.RandomProxyURL(t)
   287  	smap := tools.GetClusterMap(t, proxyURL)
   288  	tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), smap.CountActivePs())
   289  
   290  	primaryURL := smap.Primary.URL(cmn.NetPublic)
   291  	tlog.Logf("Primary: %s\n", smap.Primary.StringEx())
   292  
   293  	origProxyCount := smap.CountActivePs()
   294  	secondNode, err := smap.GetRandProxy(true /*exclude primary*/)
   295  	tassert.CheckFatal(t, err)
   296  
   297  	tlog.Logf("Killing non-primary %s\n", secondNode.StringEx())
   298  	secondCmd, err := tools.KillNode(secondNode)
   299  	tassert.CheckFatal(t, err)
   300  
   301  	smap, err = tools.WaitForClusterState(primaryURL, "proxy removed", smap.Version, origProxyCount-1, 0)
   302  	tassert.CheckFatal(t, err)
   303  
   304  	err = tools.RestoreNode(secondCmd, false, "proxy")
   305  	tassert.CheckFatal(t, err)
   306  
    307  	smap, err = tools.WaitForClusterState(primaryURL, "proxy restored", smap.Version, origProxyCount, 0)
   308  	tassert.CheckFatal(t, err)
   309  
   310  	if _, ok := smap.Pmap[secondNode.ID()]; !ok {
   311  		t.Fatalf("Non-primary proxy did not rejoin the cluster.")
   312  	}
   313  }
   314  
   315  func addNodeDuplicateDaemonID(t *testing.T) {
   316  	for _, ty := range []string{apc.Proxy, apc.Target} {
   317  		t.Run(ty, func(t *testing.T) {
   318  			_addNodeDuplicateDaemonID(t, ty)
   319  		})
   320  	}
   321  }
   322  
   323  // 1. Select a random proxy/target node based on `nodeType` param
    324  // 2. Try deploying a new node using the same DaemonID as the randomly chosen node
    325  // 3. Wait for the newly deployed daemon to terminate after failing to join the cluster
   326  // NOTE: Test assumes that the randomly chosen node is healthy (i.e. doesn't terminate or restart)
   327  // TODO: add test for target that tries to join with duplicate DaemonID and contains user-data
   328  func _addNodeDuplicateDaemonID(t *testing.T, nodeType string) {
   329  	// NOTE: This function requires local deployment as it changes node config
   330  	tools.CheckSkip(t, &tools.SkipTestArgs{RequiredDeployment: tools.ClusterTypeLocal})
   331  
   332  	var (
   333  		proxyURL = tools.GetPrimaryURL()
   334  		smap     = tools.GetClusterMap(t, proxyURL)
   335  		node     *meta.Snode
   336  		err      error
   337  
   338  		// node configs
   339  		portInc = 100
   340  	)
   341  
   342  	if nodeType == apc.Proxy {
   343  		node, err = smap.GetRandProxy(true)
   344  	} else {
   345  		node, err = smap.GetRandTarget()
   346  	}
   347  	tassert.CheckFatal(t, err)
   348  	conf := tools.GetDaemonConfig(t, node)
   349  
   350  	// Create local config for daemon.
   351  	localConf := &cmn.LocalConfig{}
   352  	localConf.ConfigDir = conf.ConfigDir
   353  	localConf.HostNet.Port = conf.HostNet.Port + portInc
   354  	localConf.HostNet.PortIntraControl = conf.HostNet.PortIntraControl + portInc
   355  	localConf.HostNet.PortIntraData = conf.HostNet.PortIntraData + portInc
   356  
   357  	// start with different config but same daemon ID
   358  	pid := tools.DeployNode(t, node, conf, localConf)
   359  	t.Cleanup(func() {
   360  		tools.CleanupNode(t, pid)
   361  	})
   362  
   363  	err = tools.WaitForPID(pid)
   364  	tassert.CheckFatal(t, err)
   365  }
   366  
   367  func addNodeDuplicateIP(t *testing.T) {
   368  	for _, ty := range []string{apc.Proxy, apc.Target} {
   369  		t.Run(ty, func(t *testing.T) {
   370  			_addNodeDuplicateIP(t, ty)
   371  		})
   372  	}
   373  }
   374  
   375  // 1. Select a random proxy/target node based on `nodeType` param
    376  // 2. Try deploying a new node using the same IP configuration as the randomly chosen node
    377  // 3. Wait for the newly deployed daemon to terminate after failing to join the cluster
   378  // NOTE: Test assumes that the randomly chosen node is healthy (i.e. doesn't terminate or restart)
   379  func _addNodeDuplicateIP(t *testing.T, nodeType string) {
   380  	// NOTE: This function requires local deployment as it changes node config
   381  	tools.CheckSkip(t, &tools.SkipTestArgs{RequiredDeployment: tools.ClusterTypeLocal})
   382  
   383  	var (
   384  		proxyURL = tools.GetPrimaryURL()
   385  		smap     = tools.GetClusterMap(t, proxyURL)
   386  		node     *meta.Snode
   387  		err      error
   388  	)
   389  
   390  	if nodeType == apc.Proxy {
   391  		node, err = smap.GetRandProxy(true)
   392  	} else {
   393  		node, err = smap.GetRandTarget()
   394  	}
   395  	tassert.CheckFatal(t, err)
   396  	conf := tools.GetDaemonConfig(t, node)
   397  
   398  	// Make sure that the `DaemonID` is different.
   399  	node.DaeID = "testing_" + trand.String(10)
   400  
   401  	pid := tools.DeployNode(t, node, conf, nil)
   402  	t.Cleanup(func() {
   403  		tools.CleanupNode(t, pid)
   404  	})
   405  
   406  	err = tools.WaitForPID(pid)
   407  	tassert.CheckFatal(t, err)
   408  }
   409  
    410  // primaryAndProxyCrash kills the primary proxy and one other proxy (not the next-in-line primary),
    411  // and restores them afterwards
   412  func primaryAndProxyCrash(t *testing.T) {
   413  	var (
   414  		proxyURL                    = tools.RandomProxyURL(t)
   415  		smap                        = tools.GetClusterMap(t, proxyURL)
   416  		origProxyCount              = smap.CountActivePs()
   417  		oldPrimaryURL, oldPrimaryID = smap.Primary.URL(cmn.NetPublic), smap.Primary.ID()
   418  		secondNode                  *meta.Snode
   419  		secondID                    string
   420  	)
   421  	tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), smap.CountActivePs())
   422  
   423  	newPrimaryID, newPrimaryURL, err := chooseNextProxy(smap)
   424  	tassert.CheckFatal(t, err)
   425  
   426  	tlog.Logf("Killing primary: %s - %s\n", oldPrimaryURL, oldPrimaryID)
   427  	cmd, err := tools.KillNode(smap.Primary)
   428  	tassert.CheckFatal(t, err)
   429  
    430  	// Do not choose the next-in-line primary or the current primary proxy:
    431  	// the system currently cannot recover if the next proxy in line is
    432  	// also killed (TODO)
   433  	for k, v := range smap.Pmap {
   434  		if k != newPrimaryID && k != oldPrimaryID {
   435  			secondNode = v
   436  			secondID = secondNode.ID()
   437  			break
   438  		}
   439  	}
    440  	tassert.Fatalf(t, secondID != "", "not enough proxies (%d)", origProxyCount)
   441  	n := cos.NowRand().Intn(20)
   442  	time.Sleep(time.Duration(n+1) * time.Second)
   443  
   444  	tlog.Logf("Killing non-primary: %s\n", secondNode.StringEx())
   445  	secondCmd, err := tools.KillNode(secondNode)
   446  	tassert.CheckFatal(t, err)
   447  
   448  	smap, err = tools.WaitForClusterState(newPrimaryURL, "elect new primary",
   449  		smap.Version, origProxyCount-2, 0)
   450  	tassert.CheckFatal(t, err)
   451  
   452  	err = tools.RestoreNode(cmd, true, "previous primary "+oldPrimaryID)
   453  	tassert.CheckFatal(t, err)
   454  
   455  	smap, err = tools.WaitForClusterState(newPrimaryURL, "join back previous primary "+oldPrimaryID,
   456  		smap.Version, origProxyCount-1, 0)
   457  	tassert.CheckFatal(t, err)
   458  
   459  	err = tools.RestoreNode(secondCmd, false, "proxy")
   460  	tassert.CheckFatal(t, err)
   461  
   462  	smap, err = tools.WaitForClusterState(newPrimaryURL, "join back non-primary "+secondID,
   463  		smap.Version, origProxyCount, 0)
   464  	tassert.CheckFatal(t, err)
   465  
   466  	if smap.Primary.ID() != newPrimaryID {
   467  		t.Fatalf("Wrong primary proxy: %s, expecting: %s", smap.Primary.ID(), newPrimaryID)
   468  	}
   469  
   470  	if _, ok := smap.Pmap[oldPrimaryID]; !ok {
   471  		t.Fatalf("Previous primary proxy %s did not rejoin the cluster", oldPrimaryID)
   472  	}
   473  
   474  	if _, ok := smap.Pmap[secondID]; !ok {
   475  		t.Fatalf("Second proxy %s did not rejoin the cluster", secondID)
   476  	}
   477  }
   478  
    479  // targetRejoin kills a randomly selected target, waits for it to rejoin, and verifies the rejoin
   480  func targetRejoin(t *testing.T) {
   481  	var (
   482  		id       string
   483  		proxyURL = tools.RandomProxyURL(t)
   484  	)
   485  
   486  	smap := tools.GetClusterMap(t, proxyURL)
   487  	tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), smap.CountActivePs())
   488  
   489  	node, err := smap.GetRandTarget()
   490  	if err != nil {
   491  		tlog.Logf("Warning: %v\n", err)
   492  		tlog.Logln("Retrying...")
   493  		// retry once
   494  		time.Sleep(10 * time.Second)
   495  		smap = tools.GetClusterMap(t, proxyURL)
   496  		node, err = smap.GetRandTarget()
   497  		tassert.CheckFatal(t, err)
   498  	}
   499  	id = node.ID()
   500  
   501  	cmd, err := tools.KillNode(node)
   502  	tassert.CheckFatal(t, err)
   503  	smap, err = tools.WaitForClusterState(proxyURL, "target crashed", smap.Version, smap.CountActivePs(), smap.CountActiveTs()-1)
   504  	tassert.CheckFatal(t, err)
   505  
   506  	if _, ok := smap.Tmap[id]; ok {
   507  		t.Fatalf("Killed target was not removed from the Smap: %v", id)
   508  	}
   509  
   510  	err = tools.RestoreNode(cmd, false, "target")
   511  	tassert.CheckFatal(t, err)
   512  
   513  	smap, err = tools.WaitForClusterState(proxyURL, "target rejoined",
   514  		smap.Version, smap.CountActivePs(), smap.CountActiveTs()+1)
   515  	tassert.CheckFatal(t, err)
   516  
   517  	if _, ok := smap.Tmap[id]; !ok {
   518  		t.Fatalf("Restarted target %s did not rejoin the cluster", id)
   519  	}
   520  	tools.WaitForRebalAndResil(t, tools.BaseAPIParams(proxyURL))
   521  }
   522  
   523  // crashAndFastRestore kills the primary and restores it before a new leader is elected
   524  func crashAndFastRestore(t *testing.T) {
   525  	var err error
   526  	proxyURL := tools.RandomProxyURL(t)
   527  	smap := tools.GetClusterMap(t, proxyURL)
   528  	tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), smap.CountActivePs())
   529  
   530  	// Make sure proxyURL is not primary URL.
   531  	_, proxyURL, err = chooseNextProxy(smap)
   532  	tassert.CheckFatal(t, err)
   533  	oldPrimaryID := smap.Primary.ID()
   534  	tlog.Logf("The current primary %s, Smap version %d\n", oldPrimaryID, smap.Version)
   535  
   536  	cmd, err := tools.KillNode(smap.Primary)
   537  	tassert.CheckFatal(t, err)
   538  
   539  	// quick crash and recover
   540  	time.Sleep(2 * time.Second)
   541  	err = tools.RestoreNode(cmd, true, "proxy (primary)")
   542  	tassert.CheckFatal(t, err)
   543  
    544  	tlog.Logf("Primary %s is currently restarting\n", oldPrimaryID)
   545  
   546  	// NOTE: using (version - 1) because the primary will restart with its old version,
   547  	//       there will be no version change for this restore, so force beginning version to 1 less
   548  	//       than the original version in order to use WaitForClusterState.
   549  	smap, err = tools.WaitForClusterState(proxyURL, "restore", smap.Version-1, 0, 0)
   550  	tassert.CheckFatal(t, err)
   551  
   552  	if smap.Primary.ID() != oldPrimaryID {
   553  		t.Fatalf("Wrong primary proxy: %s, expecting: %s", smap.Primary.ID(), oldPrimaryID)
   554  	}
   555  }
   556  
   557  func joinWhileVoteInProgress(t *testing.T) {
   558  	if docker.IsRunning() {
   559  		t.Skipf("skipping %s (docker is not supported)", t.Name())
   560  	}
   561  	var (
   562  		smap         = tools.GetClusterMap(t, proxyURL)
   563  		oldTargetCnt = smap.CountActiveTs()
   564  		oldProxyCnt  = smap.CountActivePs()
   565  		stopch       = make(chan struct{})
   566  		errCh        = make(chan error, 10)
   567  		mocktgt      = &voteRetryMockTarget{
   568  			voteInProgress: true,
   569  			errCh:          errCh,
   570  		}
   571  	)
   572  	tlog.Logf("targets: %d, proxies: %d\n", oldTargetCnt, oldProxyCnt)
   573  
   574  	proxy, err := smap.GetRandProxy(true /*exclude primary*/)
   575  	tassert.CheckFatal(t, err)
   576  	proxyURL := proxy.URL(cmn.NetPublic)
   577  	wg := &sync.WaitGroup{}
   578  
   579  	wg.Add(1)
   580  	go runMockTarget(t, proxyURL, mocktgt, stopch, smap, wg)
   581  
   582  	_, err = tools.WaitForClusterState(proxyURL, "mock target joined", smap.Version, oldProxyCnt, oldTargetCnt+1)
   583  	tassert.CheckFatal(t, err)
   584  
   585  	smap = killRestorePrimary(t, proxyURL, false, nil)
   586  	//
    587  	// FIXME: election is in progress if and only if xaction(apc.ActElection) is running -
   588  	//        simulating the scenario via mocktgt.voteInProgress = true is incorrect
   589  	//
   590  	// if _, ok := smap.Pmap[oldPrimaryID]; ok {
   591  	//	t.Fatalf("Previous primary proxy rejoined the cluster during a vote")
   592  	// }
   593  	mocktgt.voteInProgress = false
   594  	// smap, err = tools.WaitForClusterState(newPrimaryURL, "synchronize new Smap",
   595  	// smap.Version, testing.Verbose(), oldProxyCnt, oldTargetCnt+1)
   596  	// tassert.CheckFatal(t, err)
   597  	//
   598  	// end of FIXME
   599  
   600  	// time to kill the mock target, job well done
    601  	// signal the mock target's runner to stop
    602  	stopch <- struct{}{}
   603  	close(stopch)
   604  	select {
   605  	case err := <-errCh:
   606  		t.Errorf("Mock Target Error: %v", err)
   607  	default:
   608  	}
   609  
   610  	wg.Wait()
   611  
   612  	_, err = tools.WaitForClusterState(smap.Primary.URL(cmn.NetPublic),
   613  		"cluster to stabilize", smap.Version, oldProxyCnt, oldTargetCnt)
   614  	tassert.CheckFatal(t, err)
   615  }
   616  
   617  func minorityTargetMapVersionMismatch(t *testing.T) {
   618  	proxyURL := tools.RandomProxyURL(t)
   619  	targetMapVersionMismatch(
   620  		func(i int) int {
   621  			return i/4 + 1
   622  		}, t, proxyURL)
   623  }
   624  
   625  func majorityTargetMapVersionMismatch(t *testing.T) {
   626  	proxyURL := tools.RandomProxyURL(t)
   627  	targetMapVersionMismatch(
   628  		func(i int) int {
   629  			return i/2 + 1
   630  		}, t, proxyURL)
   631  }
   632  
    633  // targetMapVersionMismatch bumps the Smap version of a few targets, kills the primary proxy,
    634  // and waits for the new leader to come online
   635  func targetMapVersionMismatch(getNum func(int) int, t *testing.T, proxyURL string) {
   636  	smap := tools.GetClusterMap(t, proxyURL)
   637  	tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), smap.CountActivePs())
   638  
   639  	smap.Version++
   640  	jsonMap, err := jsoniter.Marshal(smap)
   641  	tassert.CheckFatal(t, err)
   642  
   643  	n := getNum(smap.CountActiveTs() + smap.CountActivePs() - 1)
   644  	for _, v := range smap.Tmap {
   645  		if n == 0 {
   646  			break
   647  		}
   648  		baseParams := tools.BaseAPIParams(v.URL(cmn.NetPublic))
   649  		baseParams.Method = http.MethodPut
   650  		reqParams := &api.ReqParams{
   651  			BaseParams: baseParams,
   652  			Path:       apc.URLPathDae.Join(apc.SyncSmap),
   653  			Body:       jsonMap,
   654  			Header:     http.Header{cos.HdrContentType: []string{cos.ContentJSON}},
   655  		}
   656  		err = reqParams.DoRequest()
   657  		tassert.CheckFatal(t, err)
   658  		n--
   659  	}
   660  	killRestorePrimary(t, proxyURL, false, nil)
   661  }
   662  
   663  // concurrentPutGetDel does put/get/del sequence against all proxies concurrently
   664  func concurrentPutGetDel(t *testing.T) {
   665  	_ = tools.RandomProxyURL(t)
   666  	runProviderTests(t, func(t *testing.T, bck *meta.Bck) {
   667  		proxyURL := tools.RandomProxyURL(t)
   668  		smap := tools.GetClusterMap(t, proxyURL)
   669  		tlog.Logf("targets: %d, proxies: %d\n", smap.CountActiveTs(), smap.CountActivePs())
   670  
   671  		var (
   672  			wg        = &sync.WaitGroup{}
   673  			errCh     = make(chan error, smap.CountActivePs())
   674  			cksumType = bck.Props.Cksum.Type
   675  		)
   676  
    677  		// Random object names keep the workers from colliding;
    678  		// otherwise it is easy to run into trouble when 2 goroutines do:
    679  		//   1PUT 2PUT 1DEL 2DEL
    680  		// and the second goroutine fails with "object does not exist"
   681  		for _, v := range smap.Pmap {
   682  			wg.Add(1)
   683  			go func(url string) {
   684  				defer wg.Done()
   685  				errCh <- proxyPutGetDelete(100, url, bck.Clone(), cksumType)
   686  			}(v.URL(cmn.NetPublic))
   687  		}
   688  
   689  		wg.Wait()
   690  		close(errCh)
   691  
   692  		for err := range errCh {
   693  			tassert.CheckFatal(t, err)
   694  		}
   695  	})
   696  }
   697  
    698  // proxyPutGetDelete repeats put/get/del N times; all requests go to the same proxy
   699  func proxyPutGetDelete(count int, proxyURL string, bck cmn.Bck, cksumType string) error {
   700  	baseParams := tools.BaseAPIParams(proxyURL)
   701  	for range count {
   702  		reader, err := readers.NewRand(fileSize, cksumType)
   703  		if err != nil {
   704  			return fmt.Errorf("error creating reader: %v", err)
   705  		}
   706  		fname := trand.String(20)
   707  		keyname := fmt.Sprintf("%s/%s", localBucketDir, fname)
   708  		putArgs := api.PutArgs{
   709  			BaseParams: baseParams,
   710  			Bck:        bck,
   711  			ObjName:    keyname,
   712  			Cksum:      reader.Cksum(),
   713  			Reader:     reader,
   714  		}
   715  		if _, err = api.PutObject(&putArgs); err != nil {
   716  			return fmt.Errorf("error executing put: %v", err)
   717  		}
   718  		if _, err = api.GetObject(baseParams, bck, keyname, nil); err != nil {
   719  			return fmt.Errorf("error executing get: %v", err)
   720  		}
   721  		if err = tools.Del(proxyURL, bck, keyname, nil /* wg */, nil /* errCh */, true /* silent */); err != nil {
   722  			return fmt.Errorf("error executing del: %v", err)
   723  		}
   724  	}
   725  
   726  	return nil
   727  }
   728  
    729  // putGetDelWorker does put/get/del in sequence; when the primary proxy changes, it drains the failed-delete
    730  // channel and routes those deletes to the new primary proxy.
    731  // It stops when told to do so via the stop channel.
   732  func putGetDelWorker(proxyURL string, stopCh <-chan struct{}, proxyURLCh <-chan string, errCh chan error,
   733  	wg *sync.WaitGroup) {
   734  	defer wg.Done()
   735  
   736  	missedDeleteCh := make(chan string, 100)
   737  	baseParams := tools.BaseAPIParams(proxyURL)
   738  
   739  	bck := cmn.Bck{
   740  		Name:     testBucketName,
   741  		Provider: apc.AIS,
   742  	}
   743  	cksumType := bck.DefaultProps(initialClusterConfig).Cksum.Type
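         // Main loop: handle stop and new-primary notifications first, then do one
         // PUT/GET/DEL round, queueing failed deletes for retry against the new primary.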
   744  loop:
   745  	for {
   746  		select {
   747  		case <-stopCh:
   748  			close(errCh)
   749  			break loop
   750  
   751  		case url := <-proxyURLCh:
   752  			// send failed deletes to the new primary proxy
   753  		deleteLoop:
   754  			for {
   755  				select {
   756  				case objName := <-missedDeleteCh:
   757  					err := tools.Del(url, bck, objName, nil, errCh, true)
   758  					if err != nil {
   759  						missedDeleteCh <- objName
   760  					}
   761  
   762  				default:
   763  					break deleteLoop
   764  				}
   765  			}
   766  
   767  		default:
   768  		}
   769  
   770  		reader, err := readers.NewRand(fileSize, cksumType)
   771  		if err != nil {
   772  			errCh <- err
   773  			continue
   774  		}
   775  
   776  		fname := trand.String(20)
   777  		objName := fmt.Sprintf("%s/%s", localBucketDir, fname)
   778  		putArgs := api.PutArgs{
   779  			BaseParams: baseParams,
   780  			Bck:        bck,
   781  			ObjName:    objName,
   782  			Cksum:      reader.Cksum(),
   783  			Reader:     reader,
   784  		}
   785  		_, err = api.PutObject(&putArgs)
   786  		if err != nil {
   787  			errCh <- err
   788  			continue
   789  		}
   790  		_, err = api.GetObject(baseParams, bck, objName, nil)
   791  		if err != nil {
   792  			errCh <- err
   793  		}
   794  
   795  		err = tools.Del(proxyURL, bck, objName, nil, errCh, true)
   796  		if err != nil {
   797  			missedDeleteCh <- objName
   798  		}
   799  	}
   800  
    801  	// process leftover objects that were not deleted
   802  	close(missedDeleteCh)
   803  	for n := range missedDeleteCh {
   804  		tools.Del(proxyURL, bck, n, nil, nil, true)
   805  	}
   806  }
   807  
    808  // primaryKiller repeatedly kills the primary proxy, notifies all workers, and restores it as non-primary.
   809  func primaryKiller(t *testing.T, proxyURL string, stopch <-chan struct{}, proxyurlchs []chan string,
   810  	errCh chan error, wg *sync.WaitGroup) {
   811  	defer wg.Done()
   812  
   813  loop:
   814  	for {
   815  		select {
   816  		case <-stopch:
   817  			close(errCh)
   818  			for _, ch := range proxyurlchs {
   819  				close(ch)
   820  			}
   821  
   822  			break loop
   823  
   824  		default:
   825  		}
   826  
   827  		postKill := func(_ *meta.Smap, newPrimary, _ *meta.Snode) {
   828  			// let the workers go to the dying primary for a little while longer to generate errored requests
   829  			time.Sleep(time.Second)
   830  			for _, ch := range proxyurlchs {
   831  				ch <- newPrimary.URL(cmn.NetPublic)
   832  			}
   833  		}
   834  		killRestorePrimary(t, proxyURL, false, postKill)
   835  	}
   836  }
   837  
    838  // Tests if nodes are able to restart when the discovery and original primary proxies provided in the config are not available
    839  // 1. Move primaryship to a random proxy
    840  // 2. Kill the discovery node provided in the config, a random proxy, and a target
    841  // 3. Try restoring the killed nodes one at a time
   842  func discoveryAndOrigPrimaryProxiesCrash(t *testing.T) {
   843  	var (
   844  		config       = tools.GetClusterConfig(t)
   845  		restoreCmd   = make([]tools.RestoreCmd, 0, 3)
   846  		proxyURL     string
   847  		pcnt, tcnt   int
   848  		randomKilled bool
   849  	)
   850  
    851  	// Hand off primaryship to a random proxy (see primarySetToRand)
   852  	smap := primarySetToRand(t)
   853  	origProxyCnt := smap.CountActivePs()
   854  	origTargetCnt := smap.CountActiveTs()
   855  
   856  	for _, si := range smap.Pmap {
   857  		if smap.IsPrimary(si) {
   858  			continue
   859  		}
   860  		if si.HasURL(config.Proxy.DiscoveryURL) {
   861  			pcnt++
   862  			tlog.Logf("Kill #%d: %s\n", pcnt+tcnt, si.StringEx())
   863  			cmd, err := tools.KillNode(si)
   864  			tassert.CheckFatal(t, err)
   865  			restoreCmd = append(restoreCmd, cmd)
   866  			continue
   867  		}
   868  		if randomKilled {
   869  			// Set proxyURL - used to get latest smap
   870  			proxyURL = si.URL(cmn.NetPublic)
   871  			continue
   872  		}
   873  
    874  		// Kill a random non-primary proxy
   875  		pcnt++
   876  		tlog.Logf("Kill #%d: %s\n", pcnt+tcnt, si.StringEx())
   877  		cmd, err := tools.KillNode(si)
   878  		tassert.CheckFatal(t, err)
   879  		restoreCmd = append(restoreCmd, cmd)
   880  		randomKilled = true
   881  	}
   882  
   883  	// Kill a random target
   884  	target, err := smap.GetRandTarget()
   885  	tassert.CheckFatal(t, err)
   886  	tcnt++
   887  	tlog.Logf("Kill #%d: %s\n", pcnt+tcnt, target.StringEx())
   888  	cmd, err := tools.KillNode(target)
   889  	tassert.CheckFatal(t, err)
   890  	restoreCmd = append(restoreCmd, cmd)
   891  
    892  	// Kill the current primary
   893  	pcnt++
   894  	tlog.Logf("Kill #%d: %s\n", pcnt+tcnt, smap.Primary.StringEx())
   895  	cmd, err = tools.KillNode(smap.Primary)
   896  	tassert.CheckFatal(t, err)
   897  	restoreCmd = append(restoreCmd, cmd)
   898  
   899  	proxyCnt, targetCnt := origProxyCnt-pcnt, origTargetCnt-tcnt
   900  	smap, err = tools.WaitForClusterState(proxyURL, "kill proxies and target", smap.Version, proxyCnt, targetCnt)
   901  	tassert.CheckFatal(t, err)
   902  
   903  	// Restore all killed nodes
   904  	for _, cmd := range restoreCmd {
   905  		if cmd.Node.IsProxy() {
   906  			proxyCnt++
   907  		}
   908  		if cmd.Node.IsTarget() {
   909  			targetCnt++
   910  		}
   911  		tools.RestoreNode(cmd, false, cmd.Node.Type())
   912  		_, err = tools.WaitForClusterState(proxyURL, "restore "+cmd.Node.ID(), smap.Version,
   913  			proxyCnt, targetCnt)
   914  		tassert.CheckError(t, err)
   915  	}
   916  
   917  	tools.WaitForRebalAndResil(t, tools.BaseAPIParams(proxyURL))
   918  }
   919  
    920  // proxyStress starts a group of workers doing put/get/del in sequence against the primary proxy.
    921  // While the operations are ongoing, a separate goroutine kills the primary proxy, notifies all
    922  // workers about the change, and restarts the killed proxy as a non-primary.
    923  // The process repeats until a predefined time duration is reached.
   924  func proxyStress(t *testing.T) {
   925  	var (
   926  		wg          sync.WaitGroup
   927  		errChs      = make([]chan error, workerCnt+1)
   928  		stopChs     = make([]chan struct{}, workerCnt+1)
   929  		proxyURLChs = make([]chan string, workerCnt)
   930  		bck         = cmn.Bck{
   931  			Name:     testBucketName,
   932  			Provider: apc.AIS,
   933  		}
   934  		proxyURL = tools.RandomProxyURL(t)
   935  	)
   936  
   937  	tools.CreateBucket(t, proxyURL, bck, nil, true /*cleanup*/)
   938  	defer func() {
   939  		err := tools.WaitNodeReady(proxyURL)
   940  		tassert.CheckFatal(t, err)
   941  	}()
   942  
   943  	// start all workers
   944  	for i := range workerCnt {
   945  		errChs[i] = make(chan error, defaultChanSize)
   946  		stopChs[i] = make(chan struct{}, defaultChanSize)
   947  		proxyURLChs[i] = make(chan string, defaultChanSize)
   948  
   949  		wg.Add(1)
   950  		go putGetDelWorker(proxyURL, stopChs[i], proxyURLChs[i], errChs[i], &wg)
   951  
   952  		// stagger the workers so they don't always do the same operation at the same time
   953  		n := cos.NowRand().Intn(999)
   954  		time.Sleep(time.Duration(n+1) * time.Millisecond)
   955  	}
   956  
   957  	errChs[workerCnt] = make(chan error, defaultChanSize)
   958  	stopChs[workerCnt] = make(chan struct{}, defaultChanSize)
   959  	wg.Add(1)
   960  	go primaryKiller(t, proxyURL, stopChs[workerCnt], proxyURLChs, errChs[workerCnt], &wg)
   961  
   962  	timer := time.After(tools.MultiProxyTestTimeout)
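         	// Drain worker errors (discarding them) until the test timeout fires.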
   963  loop:
   964  	for {
   965  		for _, ch := range errChs {
   966  			select {
   967  			case <-timer:
   968  				break loop
   969  			case <-ch:
    970  				// Read and discard errors - this is needed to unblock the workers.
   971  			default:
   972  			}
   973  		}
   974  	}
   975  
   976  	// stop all workers
   977  	for _, stopCh := range stopChs {
   978  		stopCh <- struct{}{}
   979  		close(stopCh)
   980  	}
   981  
   982  	wg.Wait()
   983  }
   984  
   985  // smap 	- current Smap
   986  // directURL	- URL of the proxy that we send the request to (not necessarily the current primary)
    987  // toID 	- DaemonID of the proxy that must become the new primary
   988  func setPrimaryTo(t *testing.T, proxyURL string, smap *meta.Smap, directURL, toID string) (newSmap *meta.Smap) {
   989  	if directURL == "" {
   990  		directURL = smap.Primary.URL(cmn.NetPublic)
   991  	}
   992  
   993  	baseParams := tools.BaseAPIParams(directURL)
   994  	tlog.Logf("Setting primary from %s to %s\n", smap.Primary.ID(), toID)
   995  	err := api.SetPrimaryProxy(baseParams, toID, false /*force*/)
   996  	tassert.CheckFatal(t, err)
   997  
   998  	newSmap, err = tools.WaitForNewSmap(proxyURL, smap.Version)
   999  	tassert.CheckFatal(t, err)
  1000  	if newSmap.Primary.ID() != toID {
  1001  		t.Fatalf("Expected primary=%s, got %s", toID, newSmap.Primary.ID())
  1002  	}
  1003  	checkSmaps(t, newSmap.Primary.URL(cmn.NetPublic))
  1004  	return
  1005  }
  1006  
  1007  func chooseNextProxy(smap *meta.Smap) (proxyid, proxyURL string, err error) {
  1008  	pid, err := hrwProxyTest(smap, smap.Primary.ID())
   1009  	if err != nil {
   1010  		return
   1011  	}
   1012  	pi := smap.Pmap[pid]
  1013  
  1014  	return pi.ID(), pi.URL(cmn.NetPublic), nil
  1015  }
  1016  
   1017  // For each proxy: compare its Smap vs the primary's(*) and flag an error if they differ
  1018  func checkSmaps(t *testing.T, proxyURL string) {
  1019  	var (
  1020  		smap1      = tools.GetClusterMap(t, proxyURL)
  1021  		primary    = smap1.Primary // primary according to the `proxyURL`(*)
  1022  		smapDiffer bool
  1023  	)
  1024  	for _, psi := range smap1.Pmap {
  1025  		smap2 := tools.GetClusterMap(t, psi.URL(cmn.NetPublic))
  1026  		uuid, sameOrigin, sameVersion, eq := smap1.Compare(smap2)
  1027  		if eq {
  1028  			continue
  1029  		}
  1030  		err := fmt.Errorf("(%s %s, primary=%s) != (%s %s, primary=%s): (uuid=%s, same-orig=%t, same-ver=%t)",
  1031  			proxyURL, smap1, primary, psi.URL(cmn.NetPublic), smap2, smap2.Primary, uuid, sameOrigin, sameVersion)
  1032  		t.Error(err)
  1033  		smapDiffer = true
  1034  	}
  1035  	if !smapDiffer {
  1036  		tlog.Logln("all Smap copies are identical: " + smap1.StringEx())
  1037  	}
  1038  }
  1039  
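         // primarySetToRand hands off primaryship to a randomly selected non-primary
         // proxy and returns the updated Smap.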
  1040  func primarySetToRand(t *testing.T) *meta.Smap {
  1041  	var (
  1042  		proxyURL = tools.GetPrimaryURL()
  1043  		smap     = tools.GetClusterMap(t, proxyURL)
  1044  		currURL  = smap.Primary.URL(cmn.NetPublic)
  1045  	)
  1046  	if currURL != proxyURL {
   1047  		t.Fatalf("Err in the test itself: expecting currURL %s == proxyURL %s", currURL, proxyURL)
  1048  	}
  1049  
  1050  	psi, err := smap.GetRandProxy(true /*exclude primary*/)
  1051  	tassert.CheckFatal(t, err)
  1052  	return setPrimaryTo(t, proxyURL, smap, "", psi.ID())
  1053  }
  1054  
  1055  // This is duplicated in the tests because the `idDigest` of `daemonInfo` is not
  1056  // exported. As a result of this, ais.HrwProxy will not return the correct
  1057  // proxy since the `idDigest` will be initialized to 0. To avoid this, we
  1058  // compute the checksum directly in this method.
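         // In short: skip the given ID as well as non-electable and in-maintenance
         // proxies, and pick the proxy whose ID hashes highest (highest random weight).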
  1059  func hrwProxyTest(smap *meta.Smap, idToSkip string) (pi string, err error) {
  1060  	if smap.CountActivePs() == 0 {
  1061  		err = errors.New("AIStore cluster map is empty: no proxies")
  1062  		return
  1063  	}
  1064  	var (
  1065  		maxH    uint64
  1066  		skipped int
  1067  	)
  1068  	for id, snode := range smap.Pmap {
  1069  		if id == idToSkip {
  1070  			skipped++
  1071  			continue
  1072  		}
  1073  		if smap.NonElectable(snode) {
  1074  			skipped++
  1075  			continue
  1076  		}
  1077  
  1078  		if smap.InMaintOrDecomm(snode) {
  1079  			skipped++
  1080  			continue
  1081  		}
  1082  
  1083  		cs := xxhash.Checksum64S(cos.UnsafeB(snode.ID()), cos.MLCG32)
  1084  		if cs > maxH {
  1085  			maxH = cs
  1086  			pi = id
  1087  		}
  1088  	}
  1089  	if pi == "" {
  1090  		err = fmt.Errorf("cannot HRW-select proxy: current count=%d, skipped=%d",
  1091  			smap.CountActivePs(), skipped)
  1092  	}
  1093  	return
  1094  }
  1095  
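         // networkFailureTarget disconnects a random target from its docker network(s),
         // waits for the cluster to register the loss, then reconnects the target and
         // waits for the cluster state to recover.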
  1096  func networkFailureTarget(t *testing.T) {
  1097  	proxyURL := tools.RandomProxyURL(t)
  1098  	smap := tools.GetClusterMap(t, proxyURL)
  1099  	proxyCount, targetCount := smap.CountActivePs(), smap.CountActiveTs()
  1100  
  1101  	tassert.Fatalf(t, targetCount > 0, "At least 1 target required")
  1102  	target, _ := smap.GetRandTarget()
  1103  	targetID := target.ID()
  1104  
  1105  	tlog.Logf("Disconnecting target: %s\n", targetID)
  1106  	oldNetworks, err := docker.Disconnect(targetID)
  1107  	tassert.CheckFatal(t, err)
  1108  
  1109  	smap, err = tools.WaitForClusterState(
  1110  		proxyURL,
  1111  		"target is down",
  1112  		smap.Version,
  1113  		proxyCount,
  1114  		targetCount-1,
  1115  	)
  1116  	tassert.CheckFatal(t, err)
  1117  
  1118  	tlog.Logf("Connecting target %s to networks again\n", targetID)
  1119  	err = docker.Connect(targetID, oldNetworks)
  1120  	tassert.CheckFatal(t, err)
  1121  
  1122  	_, err = tools.WaitForClusterState(
  1123  		proxyURL,
  1124  		"to check cluster state",
  1125  		smap.Version,
  1126  		proxyCount,
  1127  		targetCount,
  1128  	)
  1129  	tassert.CheckFatal(t, err)
  1130  }
  1131  
  1132  func networkFailureProxy(t *testing.T) {
  1133  	proxyURL := tools.RandomProxyURL(t)
  1134  	smap := tools.GetClusterMap(t, proxyURL)
  1135  	proxyCount, targetCount := smap.CountActivePs(), smap.CountActiveTs()
   1136  	tassert.Fatalf(t, proxyCount > 1, "At least 2 proxies required (have: %d)", proxyCount)
  1137  
  1138  	oldPrimaryID := smap.Primary.ID()
  1139  	proxyID, _, err := chooseNextProxy(smap)
  1140  	tassert.CheckFatal(t, err)
  1141  
  1142  	tlog.Logf("Disconnecting proxy: %s\n", proxyID)
  1143  	oldNetworks, err := docker.Disconnect(proxyID)
  1144  	tassert.CheckFatal(t, err)
  1145  
  1146  	smap, err = tools.WaitForClusterState(
  1147  		proxyURL,
  1148  		"proxy is down",
  1149  		smap.Version,
  1150  		proxyCount-1,
  1151  		targetCount,
  1152  	)
  1153  	tassert.CheckFatal(t, err)
  1154  
  1155  	tlog.Logf("Connecting proxy %s to networks again\n", proxyID)
  1156  	err = docker.Connect(proxyID, oldNetworks)
  1157  	tassert.CheckFatal(t, err)
  1158  
  1159  	smap, err = tools.WaitForClusterState(
  1160  		proxyURL,
  1161  		"to check cluster state",
  1162  		smap.Version,
  1163  		proxyCount,
  1164  		targetCount,
  1165  	)
  1166  	tassert.CheckFatal(t, err)
  1167  
  1168  	if oldPrimaryID != smap.Primary.ID() {
  1169  		t.Fatalf("Primary proxy changed from %s to %s",
  1170  			oldPrimaryID, smap.Primary.ID())
  1171  	}
  1172  }
  1173  
  1174  func networkFailurePrimary(t *testing.T) {
  1175  	proxyURL := tools.RandomProxyURL(t)
  1176  	smap := tools.GetClusterMap(t, proxyURL)
  1177  	if smap.CountActivePs() < 2 {
   1178  		t.Fatal("At least 2 proxies required")
  1179  	}
  1180  
  1181  	proxyCount, targetCount := smap.CountActivePs(), smap.CountActiveTs()
  1182  	oldPrimaryID, oldPrimaryURL := smap.Primary.ID(), smap.Primary.URL(cmn.NetPublic)
  1183  	newPrimaryID, newPrimaryURL, err := chooseNextProxy(smap)
  1184  	tassert.CheckFatal(t, err)
  1185  
  1186  	// Disconnect primary
  1187  	tlog.Logf("Disconnecting primary %s from all networks\n", oldPrimaryID)
  1188  	oldNetworks, err := docker.Disconnect(oldPrimaryID)
  1189  	tassert.CheckFatal(t, err)
  1190  
  1191  	// Check smap
  1192  	smap, err = tools.WaitForClusterState(
  1193  		newPrimaryURL,
  1194  		"original primary gone",
  1195  		smap.Version,
  1196  		proxyCount-1,
  1197  		targetCount,
  1198  	)
  1199  	tassert.CheckFatal(t, err)
  1200  
  1201  	if smap.Primary.ID() != newPrimaryID {
  1202  		t.Fatalf("wrong primary proxy: %s, expecting: %s after disconnecting",
  1203  			smap.Primary.ID(), newPrimaryID)
  1204  	}
  1205  
  1206  	// Connect again
  1207  	tlog.Logf("Connecting primary %s to networks again\n", oldPrimaryID)
  1208  	err = docker.Connect(oldPrimaryID, oldNetworks)
  1209  	tassert.CheckFatal(t, err)
  1210  
   1211  	// give the original primary a little time to pick up the network
   1212  	// connections and start talking to its neighbors
  1213  	_, err = tools.WaitForClusterState(
   1214  		oldPrimaryURL,
  1215  		"original primary is restored",
  1216  		smap.Version,
  1217  		proxyCount,
  1218  		targetCount,
  1219  	)
  1220  	tassert.CheckFatal(t, err)
  1221  
  1222  	oldSmap := tools.GetClusterMap(t, oldPrimaryURL)
  1223  	// the original primary still thinks that it is the primary, so its smap
  1224  	// should not change after the network is back
  1225  	if oldSmap.Primary.ID() != oldPrimaryID {
  1226  		tlog.Logf("Old primary changed its smap. Its current primary: %s (expected %s - self)\n",
  1227  			oldSmap.Primary.ID(), oldPrimaryID)
  1228  	}
  1229  
   1230  	// Forcefully point the original primary at the new one (force + primary-candidate query params)
  1231  	baseParams := tools.BaseAPIParams(oldPrimaryURL)
  1232  	baseParams.Method = http.MethodPut
  1233  	reqParams := &api.ReqParams{
  1234  		BaseParams: baseParams,
  1235  		Path:       apc.URLPathDaeProxy.Join(newPrimaryID),
  1236  		Query: url.Values{
  1237  			apc.QparamForce:            {"true"},
  1238  			apc.QparamPrimaryCandidate: {newPrimaryURL},
  1239  		},
  1240  	}
  1241  	err = reqParams.DoRequest()
  1242  	tassert.CheckFatal(t, err)
  1243  
  1244  	smap, err = tools.WaitForClusterState(
  1245  		newPrimaryURL,
  1246  		"original primary joined the new primary",
  1247  		smap.Version,
  1248  		proxyCount,
  1249  		targetCount,
  1250  	)
  1251  	tassert.CheckFatal(t, err)
  1252  
  1253  	if smap.Primary.ID() != newPrimaryID {
  1254  		t.Fatalf("expected primary=%s, got %s after connecting again", newPrimaryID, smap.Primary.ID())
  1255  	}
  1256  }
  1257  
  1258  func networkFailure(t *testing.T) {
  1259  	tools.CheckSkip(t, &tools.SkipTestArgs{RequiredDeployment: tools.ClusterTypeDocker})
  1260  
  1261  	t.Run("Target network disconnect", networkFailureTarget)
  1262  	t.Run("Secondary proxy network disconnect", networkFailureProxy)
  1263  	t.Run("Primary proxy network disconnect", networkFailurePrimary)
  1264  }
  1265  
   1266  // primaryAndNextCrash kills the primary proxy and the proxy that would be elected
   1267  // after the current primary dies, verifies that the second-in-line proxy becomes
   1268  // the new primary, and restores all proxies
  1269  func primaryAndNextCrash(t *testing.T) {
  1270  	proxyURL := tools.RandomProxyURL(t)
  1271  	smap := tools.GetClusterMap(t, proxyURL)
  1272  	origProxyCount := smap.CountActivePs()
  1273  
  1274  	if origProxyCount < 4 {
  1275  		t.Skip("The test requires at least 4 proxies, found only ", origProxyCount)
  1276  	}
  1277  
  1278  	// get next primary
  1279  	firstPrimaryID, firstPrimaryURL, err := chooseNextProxy(smap)
  1280  	tassert.CheckFatal(t, err)
   1281  	// The cluster map is re-read to obtain a clone of the original Smap that the test
   1282  	// can modify as needed, while the original Smap is preserved
  1283  	smapNext := tools.GetClusterMap(t, proxyURL)
  1284  	// get next next primary
  1285  	firstPrimary := smapNext.Pmap[firstPrimaryID]
  1286  	delete(smapNext.Pmap, firstPrimaryID)
  1287  	finalPrimaryID, finalPrimaryURL, err := chooseNextProxy(smapNext)
  1288  	tassert.CheckFatal(t, err)
  1289  
  1290  	// kill the current primary
  1291  	oldPrimaryURL, oldPrimaryID := smap.Primary.URL(cmn.NetPublic), smap.Primary.ID()
  1292  	tlog.Logf("Killing primary proxy: %s - %s\n", oldPrimaryURL, oldPrimaryID)
  1293  	cmdFirst, err := tools.KillNode(smap.Primary)
  1294  	tassert.CheckFatal(t, err)
  1295  
  1296  	// kill the next primary
   1297  	tlog.Logf("Killing the next-in-line primary: %s - %s\n", firstPrimaryID, firstPrimaryURL)
  1298  	cmdSecond, errSecond := tools.KillNode(firstPrimary)
   1299  	// if the kill fails, it does not make sense to wait for the cluster to stabilize
  1300  	if errSecond == nil {
   1301  		// the cluster should vote, so the Smap version should increase by at
   1302  		// least 100 - hence the +99
  1303  		smap, err = tools.WaitForClusterState(finalPrimaryURL, "new primary elected",
  1304  			smap.Version+99, origProxyCount-2, 0)
  1305  		tassert.CheckFatal(t, err)
  1306  	}
  1307  
  1308  	tlog.Logf("Checking current primary, %s\n", smap.StringEx())
  1309  	if smap.Primary.ID() != finalPrimaryID {
  1310  		t.Errorf("Expected primary %s but real primary is %s", finalPrimaryID, smap.Primary.ID())
  1311  	}
  1312  
   1313  	// restore the next and prev primaries in reverse order
  1314  	err = tools.RestoreNode(cmdSecond, false, "proxy (next primary)")
  1315  	tassert.CheckFatal(t, err)
  1316  	smap, err = tools.WaitForClusterState(finalPrimaryURL, "restore next primary",
  1317  		smap.Version, origProxyCount-1, 0)
  1318  	tassert.CheckFatal(t, err)
  1319  
  1320  	err = tools.RestoreNode(cmdFirst, false, "proxy (prev primary)")
  1321  	tassert.CheckFatal(t, err)
  1322  	_, err = tools.WaitForClusterState(finalPrimaryURL, "restore prev primary",
  1323  		smap.Version, origProxyCount, 0)
  1324  	tassert.CheckFatal(t, err)
  1325  }
  1326  
  1327  func TestIC(t *testing.T) {
  1328  	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true, RequiredDeployment: tools.ClusterTypeLocal})
  1329  
  1330  	proxyURL := tools.RandomProxyURL(t)
  1331  	smap := tools.GetClusterMap(t, proxyURL)
  1332  	if cnt := smap.CountActivePs(); cnt < 4 {
  1333  		t.Fatalf("Not enough proxies (%d) to run tests (must be at least 4)", cnt)
  1334  	}
  1335  
  1336  	defer tools.EnsureOrigClusterState(t)
  1337  	for _, test := range icTests {
  1338  		t.Run(test.name, test.method)
  1339  		if t.Failed() {
  1340  			t.FailNow()
  1341  		}
  1342  	}
  1343  	time.Sleep(time.Second)
  1344  }
  1345  
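         // killRandNonPrimaryIC kills a random IC member that is not the primary, waits
         // for the updated Smap to propagate, and returns the restore command along with
         // the new Smap.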
  1346  func killRandNonPrimaryIC(t testing.TB, smap *meta.Smap) (tools.RestoreCmd, *meta.Smap) {
  1347  	origProxyCount := smap.CountActivePs()
  1348  	primary := smap.Primary
  1349  	var killNode *meta.Snode
  1350  	for _, psi := range smap.Pmap {
  1351  		if smap.IsIC(psi) && !psi.Eq(primary) {
  1352  			killNode = psi
  1353  			break
  1354  		}
  1355  	}
  1356  	cmd, err := tools.KillNode(killNode)
  1357  	tassert.CheckFatal(t, err)
  1358  
  1359  	smap, err = tools.WaitForClusterState(primary.URL(cmn.NetPublic), "propagate new Smap",
  1360  		smap.Version, origProxyCount-1, 0)
  1361  	tassert.CheckError(t, err)
  1362  	return cmd, smap
  1363  }
  1364  
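         // icFromSmap returns the set of proxy IDs that are IC members in the given Smap.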
  1365  func icFromSmap(smap *meta.Smap) cos.StrSet {
  1366  	lst := make(cos.StrSet, meta.DfltCountIC)
  1367  	for pid, psi := range smap.Pmap {
  1368  		if smap.IsIC(psi) {
  1369  			lst.Add(pid)
  1370  		}
  1371  	}
  1372  	return lst
  1373  }
  1374  
  1375  func icMemberLeaveAndRejoin(t *testing.T) {
  1376  	smap := tools.GetClusterMap(t, proxyURL)
  1377  	primary := smap.Primary
  1378  	tassert.Fatalf(t, smap.ICCount() == meta.DfltCountIC,
  1379  		"should have %d members in IC, has %d", meta.DfltCountIC, smap.ICCount())
  1380  
  1381  	// Primary must be an IC member
   1382  	tassert.Fatalf(t, smap.IsIC(primary), "primary (%s) should be an IC member (current IC: %s)", primary, smap.StrIC(primary))
  1383  
   1384  	// killing an IC member should cause a new member to be added to the IC;
   1385  	// select an IC member that is not the primary, and kill it
  1386  	origIC := icFromSmap(smap)
  1387  	cmd, smap := killRandNonPrimaryIC(t, smap)
  1388  	delete(origIC, cmd.Node.ID())
  1389  
  1390  	tassert.Errorf(t, !smap.IsIC(cmd.Node), "Killed daemon (%s) must be removed from IC", cmd.Node.ID())
  1391  
   1392  	// the remaining original IC members should still be in the IC
  1393  	for sid := range origIC {
  1394  		tassert.Errorf(t, smap.IsIC(smap.GetProxy(sid)), "Should not remove existing IC members (%s)", sid)
  1395  	}
  1396  	tassert.Errorf(t, smap.ICCount() == meta.DfltCountIC, "should have %d members in IC, has %d",
  1397  		meta.DfltCountIC, smap.ICCount())
  1398  
  1399  	err := tools.RestoreNode(cmd, false, "proxy")
  1400  	tassert.CheckFatal(t, err)
  1401  
  1402  	updatedICs := icFromSmap(smap)
  1403  	smap, err = tools.WaitNodeAdded(tools.BaseAPIParams(primary.URL(cmn.NetPublic)), cmd.Node.ID())
  1404  	tassert.CheckFatal(t, err)
  1405  
  1406  	// Adding a new node shouldn't change IC members.
  1407  	newIC := icFromSmap(smap)
  1408  	tassert.Errorf(t, reflect.DeepEqual(updatedICs, newIC), "shouldn't update existing IC members")
  1409  }
  1410  
  1411  func icKillAndRestorePrimary(t *testing.T) {
  1412  	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})
  1413  	var (
  1414  		proxyURL   = tools.RandomProxyURL(t)
  1415  		smap       = tools.GetClusterMap(t, proxyURL)
  1416  		oldIC      = icFromSmap(smap)
  1417  		oldPrimary = smap.Primary
  1418  	)
  1419  
  1420  	icCheck := func(smap *meta.Smap, newPrimary, oldPrimary *meta.Snode) {
  1421  		// Old primary shouldn't be in IC.
  1422  		tassert.Errorf(t, !smap.IsIC(oldPrimary), "killed primary (%s) must be removed from IC", oldPrimary)
  1423  
  1424  		// New primary should be part of IC.
  1425  		tassert.Errorf(t, smap.IsIC(newPrimary), "new primary (%s) must be part of IC", newPrimary)
  1426  
   1427  		// Remaining IC members should be unchanged.
  1428  		for sid := range oldIC {
  1429  			if sid != oldPrimary.ID() {
  1430  				tassert.Errorf(t, smap.IsIC(smap.GetProxy(sid)), "should not remove existing IC members (%s)", sid)
  1431  			}
  1432  		}
  1433  	}
  1434  
  1435  	smap = killRestorePrimary(t, proxyURL, true, icCheck)
  1436  
   1437  	// When a node is added as primary, it should add itself to the IC.
  1438  	tassert.Fatalf(t, smap.IsIC(oldPrimary),
   1439  		"primary (%s) should be an IC member (current IC: %s)", oldPrimary, smap.StrIC(oldPrimary))
  1440  	tassert.Errorf(t, smap.ICCount() == meta.DfltCountIC,
  1441  		"should have %d members in IC, has %d", meta.DfltCountIC, smap.ICCount())
  1442  }
  1443  
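         // icSyncOwnershipTable verifies that the IC ownership table gets synced to a
         // proxy that was not an IC member once that proxy becomes the new primary.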
  1444  func icSyncOwnershipTable(t *testing.T) {
  1445  	var (
  1446  		proxyURL   = tools.RandomProxyURL(t)
  1447  		baseParams = tools.BaseAPIParams(proxyURL)
  1448  		smap       = tools.GetClusterMap(t, proxyURL)
  1449  		primary    = smap.Primary
  1450  
  1451  		src = cmn.Bck{
  1452  			Name:     testBucketName,
  1453  			Provider: apc.AIS,
  1454  		}
  1455  
  1456  		dstBck = cmn.Bck{
  1457  			Name:     testBucketName + "_new",
  1458  			Provider: apc.AIS,
  1459  		}
  1460  	)
  1461  
  1462  	tools.CreateBucket(t, proxyURL, src, nil, true /*cleanup*/)
  1463  
  1464  	// Start an xaction (any kind will do) and get its ID.
  1465  	xid, err := api.CopyBucket(baseParams, src, dstBck, nil)
  1466  	tassert.CheckFatal(t, err)
  1467  	t.Cleanup(func() {
  1468  		tools.DestroyBucket(t, proxyURL, dstBck)
  1469  	})
  1470  
  1471  	// Killing an IC member should cause a new member to be added in its place.
  1472  	// Select an IC member that is not the primary and kill it.
  1473  	origIC := icFromSmap(smap)
  1474  	cmd, smap := killRandNonPrimaryIC(t, smap)
  1475  
  1476  	// Try getting xaction status from new IC member.
  1477  	updatedIC := icFromSmap(smap)
  1478  	newICMemID := getNewICMember(t, origIC, updatedIC)
  1479  
  1480  	newICNode := smap.GetProxy(newICMemID)
  1481  
  1482  	baseParams = tools.BaseAPIParams(newICNode.URL(cmn.NetPublic))
  1483  	xargs := xact.ArgsMsg{ID: xid, Kind: apc.ActCopyBck}
  1484  	_, err = api.GetOneXactionStatus(baseParams, &xargs)
  1485  	tassert.CheckError(t, err)
  1486  
  1487  	err = tools.RestoreNode(cmd, false, "proxy")
  1488  	tassert.CheckFatal(t, err)
  1489  
  1490  	smap, err = tools.WaitNodeAdded(baseParams, cmd.Node.ID())
  1491  	tassert.CheckFatal(t, err)
  1492  	tassert.Fatalf(t, !smap.IsIC(cmd.Node), "newly joined node shouldn't be in IC (%s)", cmd.Node)
  1493  
  1494  	// The ownership table should be synced when a non-IC member becomes primary.
  1495  	smap = setPrimaryTo(t, primary.URL(cmn.NetPublic), smap, "", cmd.Node.ID())
  1496  	tassert.Fatalf(t, smap.IsIC(cmd.Node), "new primary (%s) should be an IC member (IC: %s)", cmd.Node, smap.StrIC(cmd.Node))
  1497  
  1498  	baseParams = tools.BaseAPIParams(cmd.Node.URL(cmn.NetPublic))
  1499  	_, err = api.GetOneXactionStatus(baseParams, &xargs)
  1500  	tassert.CheckError(t, err)
  1501  }
  1502  
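        // icSinglePrimaryRevamp kills every proxy except the primary, starts a
        // copy-bucket xaction, and then restores the killed proxies one by one,
        // verifying that each restored node can answer a status query for the
        // running xaction.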
  1503  func icSinglePrimaryRevamp(t *testing.T) {
  1504  	tools.CheckSkip(t, &tools.SkipTestArgs{Long: true})
  1505  
  1506  	var (
  1507  		proxyURL       = tools.RandomProxyURL(t)
  1508  		smap           = tools.GetClusterMap(t, proxyURL)
  1509  		origProxyCount = smap.CountActivePs()
  1510  
  1511  		src = cmn.Bck{
  1512  			Name:     testBucketName,
  1513  			Provider: apc.AIS,
  1514  		}
  1515  
  1516  		dstBck = cmn.Bck{
  1517  			Name:     testBucketName + "_new",
  1518  			Provider: apc.AIS,
  1519  		}
  1520  	)
  1521  
  1522  	nodesToRestore := make([]tools.RestoreCmd, 0, origProxyCount-1)
  1523  
  1524  	// Kill all proxies except the primary.
  1525  	for i := origProxyCount; i > 1; i-- {
  1526  		var cmd tools.RestoreCmd
  1527  		cmd, smap = killRandNonPrimaryIC(t, smap)
  1528  		nodesToRestore = append(nodesToRestore, cmd)
  1529  	}
  1530  
  1531  	proxyURL = smap.Primary.URL(cmn.NetPublic)
  1532  	baseParams = tools.BaseAPIParams(proxyURL)
  1533  	tools.CreateBucket(t, proxyURL, src, nil, true /*cleanup*/)
  1534  
  1535  	// Start an xaction (any kind will do) and get its ID.
  1536  	xid, err := api.CopyBucket(baseParams, src, dstBck, nil)
  1537  	xargs := xact.ArgsMsg{ID: xid, Kind: apc.ActCopyBck}
  1538  
  1539  	tassert.CheckFatal(t, err)
  1540  	t.Cleanup(func() {
  1541  		tools.DestroyBucket(t, proxyURL, dstBck)
  1542  	})
  1543  
  1544  	// Restore the killed proxies one at a time and verify that each can report the xaction status.
  1545  	for _, cmd := range nodesToRestore {
  1546  		err = tools.RestoreNode(cmd, false, "proxy")
  1547  		tassert.CheckError(t, err)
  1548  
  1549  		smap, err = tools.WaitForClusterState(proxyURL,
  1550  			"restore node "+cmd.Node.ID(), smap.Version,
  1551  			smap.CountActivePs()+1, smap.CountTargets())
  1552  		tassert.CheckFatal(t, err)
  1553  
  1554  		baseParams = tools.BaseAPIParams(cmd.Node.URL(cmn.NetPublic))
  1555  		_, err = api.GetOneXactionStatus(baseParams, &xargs)
  1556  		tassert.CheckError(t, err)
  1557  	}
  1558  }
  1559  
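        // icStressMonitorXactMultiICFail stresses IC fail-over: while a background
        // goroutine continuously kills and restores random non-primary IC members,
        // multiple concurrent copy-bucket xactions run and are monitored through
        // the IC until completion.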
  1560  func icStressMonitorXactMultiICFail(t *testing.T) {
  1561  	var (
  1562  		proxyURL = tools.GetPrimaryURL()
  1563  		smap     = tools.GetClusterMap(t, proxyURL)
  1564  
  1565  		m = ioContext{
  1566  			t:        t,
  1567  			num:      1000,
  1568  			fileSize: 50 * cos.KiB,
  1569  		}
  1570  		numCopyXacts = 20
  1571  	)
  1572  
  1573  	// 1. Populate a bucket required for copy xactions
  1574  	m.init(true /*cleanup*/)
  1575  	tools.CreateBucket(t, proxyURL, m.bck, nil, true /*cleanup*/)
  1576  	m.puts()
  1577  
  1578  	// 2. Kill and restore random IC members in the background
  1579  	stopCh := &cos.StopCh{}
  1580  	stopCh.Init()
  1581  	krWg := &sync.WaitGroup{}
  1582  	krWg.Add(1)
  1583  	go killRestoreIC(t, smap, stopCh, krWg)
  1584  	defer func() {
  1585  		// Stop the background kill and restore task
  1586  		stopCh.Close()
  1587  		krWg.Wait()
  1588  	}()
  1589  
  1590  	// 3. Start multiple copy xactions and poll for status until each completes
  1591  	wg := startCPBckAndWait(t, m.bck, numCopyXacts)
  1592  	wg.Wait()
  1593  }
  1594  
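        // startCPBckAndWait fires off `count` concurrent copy-bucket xactions from
        // srcBck into per-goroutine destination buckets, waits for each of them via
        // the IC, and destroys the destination buckets afterwards. The caller is
        // expected to Wait() on the returned WaitGroup.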
  1595  func startCPBckAndWait(t testing.TB, srcBck cmn.Bck, count int) *sync.WaitGroup {
  1596  	var (
  1597  		proxyURL   = tools.GetPrimaryURL()
  1598  		baseParams = tools.BaseAPIParams(proxyURL)
  1599  		wg         = &sync.WaitGroup{}
  1600  	)
  1601  	for i := range count {
  1602  		wg.Add(1)
  1603  		go func(idx int) {
  1604  			dstBck := cmn.Bck{
  1605  				Name:     fmt.Sprintf("%s_dst_par_%d", testBucketName, idx),
  1606  				Provider: apc.AIS,
  1607  			}
  1608  			xid, err := api.CopyBucket(baseParams, srcBck, dstBck, nil)
  1609  			tassert.CheckError(t, err)
  1610  			defer func() {
  1611  				tools.DestroyBucket(t, proxyURL, dstBck)
  1612  				wg.Done()
  1613  			}()
  1614  			xargs := xact.ArgsMsg{ID: xid, Timeout: tools.RebalanceTimeout}
  1615  			_, err = api.WaitForXactionIC(baseParams, &xargs)
  1616  			tassert.CheckError(t, err)
  1617  		}(i)
  1618  	}
  1619  	return wg
  1620  }
  1621  
  1622  // Continuously kill and restore IC nodes
  1623  func killRestoreIC(t *testing.T, smap *meta.Smap, stopCh *cos.StopCh, wg *sync.WaitGroup) {
  1624  	var (
  1625  		cmd      tools.RestoreCmd
  1626  		proxyURL = smap.Primary.URL(cmn.NetPublic)
  1627  	)
  1628  	defer wg.Done()
  1629  
  1630  	for {
  1631  		cmd, smap = killRandNonPrimaryIC(t, smap)
  1632  		err := tools.RestoreNode(cmd, false, "proxy")
  1633  		tassert.CheckFatal(t, err)
  1634  
  1635  		smap, err = tools.WaitForClusterState(proxyURL, "restore", smap.Version, 0, 0)
  1636  		tassert.CheckFatal(t, err)
  1637  		time.Sleep(2 * time.Second)
  1638  
  1639  		select {
  1640  		case <-stopCh.Listen():
  1641  			return
  1642  		default:
  1643  			// nothing to do; continue the kill/restore loop
  1644  		}
  1645  	}
  1646  }
  1647  
  1648  // misc
  1649  
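        // getNewICMember returns the daemon ID that is present in newMap but not in
        // oldMap, asserting that exactly one IC member changed between the two
        // snapshots.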
  1650  func getNewICMember(t testing.TB, oldMap, newMap cos.StrSet) (daeID string) {
  1651  	for sid := range newMap {
  1652  		if _, ok := oldMap[sid]; !ok {
  1653  			tassert.Errorf(t, daeID == "", "should change only one IC member")
  1654  			daeID = sid
  1655  		}
  1656  	}
  1657  	tassert.Fatalf(t, daeID != "", "should change at least one IC member")
  1658  	return
  1659  }
  1660  
  1661  //
  1662  // mock target
  1663  //
  1664  
  1665  const (
  1666  	mockTargetPort = "8079"
  1667  )
  1668  
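        // targetMocker defines the HTTP handlers a mock target must implement to
        // stand in for a real target during vote tests. For illustration only, a
        // minimal always-vote-yes implementation (hypothetical, not used by the
        // tests; voteRetryMockTarget below is the real one) might look like:
        //
        //	type noopTarget struct{}
        //
        //	func (*noopTarget) filehdlr(http.ResponseWriter, *http.Request)   {}
        //	func (*noopTarget) daemonhdlr(http.ResponseWriter, *http.Request) {}
        //	func (*noopTarget) votehdlr(w http.ResponseWriter, _ *http.Request) {
        //		w.Write([]byte(ais.VoteYes))
        //	}
        //	func (*noopTarget) healthdlr(http.ResponseWriter, *http.Request) {}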
  1669  type targetMocker interface {
  1670  	filehdlr(w http.ResponseWriter, r *http.Request)
  1671  	daemonhdlr(w http.ResponseWriter, r *http.Request)
  1672  	votehdlr(w http.ResponseWriter, r *http.Request)
  1673  	healthdlr(w http.ResponseWriter, r *http.Request)
  1674  }
  1675  
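        // MockRegRequest is the self-registration payload that the mock target
        // POSTs to the cluster's auto-registration endpoint.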
  1676  type MockRegRequest struct {
  1677  	SI *meta.Snode `json:"si"`
  1678  }
  1679  
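        // runMockTarget serves the given targetMocker on a random real target's IP
        // at mockTargetPort, registers it with the cluster, and, once stopch is
        // closed, unregisters it and shuts the HTTP server down. A usage sketch
        // (assuming the caller owns t, proxyURL, and smap):
        //
        //	errCh := make(chan error, defaultChanSize)
        //	stopch := make(chan struct{})
        //	wg := &sync.WaitGroup{}
        //	wg.Add(1)
        //	go runMockTarget(t, proxyURL, &voteRetryMockTarget{errCh: errCh}, stopch, smap, wg)
        //	// ... exercise the cluster ...
        //	close(stopch)
        //	wg.Wait()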
  1680  func runMockTarget(t *testing.T, proxyURL string, mocktgt targetMocker, stopch chan struct{}, smap *meta.Smap, wg *sync.WaitGroup) {
  1681  	defer wg.Done()
  1682  	mux := http.NewServeMux()
  1683  
  1684  	mux.HandleFunc(apc.URLPathBuckets.S, mocktgt.filehdlr)
  1685  	mux.HandleFunc(apc.URLPathObjects.S, mocktgt.filehdlr)
  1686  	mux.HandleFunc(apc.URLPathDae.S, mocktgt.daemonhdlr)
  1687  	mux.HandleFunc(apc.URLPathVote.S, mocktgt.votehdlr)
  1688  	mux.HandleFunc(apc.URLPathHealth.S, mocktgt.healthdlr)
  1689  
  1690  	target, _ := smap.GetRandTarget()
  1691  	ip := target.PubNet.Hostname
  1692  
  1693  	s := &http.Server{
  1694  		Addr:              ip + ":" + mockTargetPort,
  1695  		Handler:           mux,
  1696  		ReadHeaderTimeout: 10 * time.Second,
  1697  	}
  1698  	go s.ListenAndServe()
  1699  
  1700  	err := registerMockTarget(proxyURL, smap)
  1701  	if err != nil {
  1702  		t.Errorf("failed to register mock target: %v", err)
  1703  		return
  1704  	}
  1705  	tlog.Logf("t[%s] is up\n", tools.MockDaemonID)
  1706  
  1707  	<-stopch
  1708  
  1709  	tlog.Logf("started unsafe removal of t[%s]\n", tools.MockDaemonID)
  1710  	err = tools.RemoveNodeUnsafe(proxyURL, tools.MockDaemonID)
  1711  	if err != nil {
  1712  		tlog.Logf("Error: failed to unsafely remove t[%s]: %v\n", tools.MockDaemonID, err)
  1713  	}
  1714  	s.Shutdown(context.Background())
  1715  }
  1716  
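        // registerMockTarget borrows a random real target's Snode, overwrites its
        // daemon ID and network endpoints with tools.MockDaemonID and
        // mockTargetPort, and POSTs the result to the cluster's auto-registration
        // endpoint.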
  1717  func registerMockTarget(proxyURL string, smap *meta.Smap) error {
  1718  	var (
  1719  		jsonDaemonInfo []byte
  1720  		err            error
  1721  	)
  1722  
  1723  	// Borrow a random target's IP but use a different port to register the mock target.
  1724  	for _, v := range smap.Tmap {
  1725  		v.DaeID = tools.MockDaemonID
  1726  		v.PubNet = meta.NetInfo{
  1727  			Hostname: v.PubNet.Hostname,
  1728  			Port:     mockTargetPort,
  1729  			URL:      "http://" + v.PubNet.Hostname + ":" + mockTargetPort,
  1730  		}
  1731  		v.ControlNet = v.PubNet
  1732  		v.DataNet = v.PubNet
  1733  		regReq := MockRegRequest{SI: v}
  1734  		jsonDaemonInfo, err = jsoniter.Marshal(regReq)
  1735  		if err != nil {
  1736  			return err
  1737  		}
  1738  		break
  1739  	}
  1740  	baseParams := tools.BaseAPIParams(proxyURL)
  1741  	baseParams.Method = http.MethodPost
  1742  	reqParams := &api.ReqParams{
  1743  		BaseParams: baseParams,
  1744  		Path:       apc.URLPathCluAutoReg.S,
  1745  		Body:       jsonDaemonInfo,
  1746  		Header:     http.Header{cos.HdrContentType: []string{cos.ContentJSON}},
  1747  	}
  1748  	return reqParams.DoRequest()
  1749  }
  1750  
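        // voteRetryMockTarget is the targetMocker used by the vote tests: it always
        // votes yes, serves its vote-in-progress flag from the daemon handler, and
        // fakes rebalance status from the health handler; write failures are
        // reported on errCh.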
  1751  type voteRetryMockTarget struct {
  1752  	voteInProgress bool
  1753  	errCh          chan error
  1754  }
  1755  
  1756  type cluMetaRedux struct {
  1757  	Smap           *meta.Smap
  1758  	VoteInProgress bool `json:"voting"`
  1759  }
  1760  
  1761  func newVoteMsg(inp bool) cluMetaRedux {
  1762  	return cluMetaRedux{VoteInProgress: inp, Smap: &meta.Smap{Version: 1}}
  1763  }
  1764  
  1765  func (*voteRetryMockTarget) filehdlr(http.ResponseWriter, *http.Request) {
  1766  	// Ignore all file requests
  1767  }
  1768  
  1769  func (p *voteRetryMockTarget) daemonhdlr(w http.ResponseWriter, r *http.Request) {
  1770  	switch r.Method {
  1771  	case http.MethodGet:
  1772  		msg := newVoteMsg(p.voteInProgress) // treat every GET request as a request for a VoteMsg
  1773  		jsbytes, err := jsoniter.Marshal(msg)
  1774  		if err == nil {
  1775  			_, err = w.Write(jsbytes)
  1776  		}
  1777  		if err != nil {
  1778  			p.errCh <- fmt.Errorf("error writing vote message: %v", err)
  1779  		}
  1780  	default:
  1781  	}
  1782  }
  1783  
  1784  func (*voteRetryMockTarget) votehdlr(w http.ResponseWriter, _ *http.Request) {
  1785  	// Always vote yes.
  1786  	w.Write([]byte(ais.VoteYes))
  1787  }
  1788  
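        // healthdlr answers rebalance-status queries with RebID set to
        // math.MaxInt64, which aborts the rebalance triggered by the mock
        // target's join.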
  1789  func (p *voteRetryMockTarget) healthdlr(w http.ResponseWriter, r *http.Request) {
  1790  	query := r.URL.Query()
  1791  	getRebStatus := cos.IsParseBool(query.Get(apc.QparamRebStatus))
  1792  	if getRebStatus {
  1793  		status := &reb.Status{}
  1794  		status.RebID = math.MaxInt64 // to abort the rebalance triggered by the t[MOCK] join
  1795  		body := cos.MustMarshal(status)
  1796  		_, err := w.Write(body)
  1797  		if err != nil {
  1798  			p.errCh <- fmt.Errorf("error writing reb-status: %v", err)
  1799  		}
  1800  	}
  1801  }