github.com/matrixorigin/matrixone@v0.7.0/pkg/tests/service/service.go (about)

     1  // Copyright 2021 - 2022 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package service
    16  
    17  import (
    18  	"context"
    19  	"os"
    20  	"path/filepath"
    21  	"sync"
    22  	"testing"
    23  	"time"
    24  
    25  	"github.com/google/uuid"
    26  	"github.com/matrixorigin/matrixone/pkg/cnservice"
    27  	"github.com/matrixorigin/matrixone/pkg/common/moerr"
    28  	"github.com/matrixorigin/matrixone/pkg/common/morpc"
    29  	"github.com/matrixorigin/matrixone/pkg/common/runtime"
    30  	"github.com/matrixorigin/matrixone/pkg/common/stopper"
    31  	"github.com/matrixorigin/matrixone/pkg/defines"
    32  	"github.com/matrixorigin/matrixone/pkg/dnservice"
    33  	"github.com/matrixorigin/matrixone/pkg/fileservice"
    34  	"github.com/matrixorigin/matrixone/pkg/hakeeper"
    35  	"github.com/matrixorigin/matrixone/pkg/hakeeper/checkers/syshealth"
    36  	"github.com/matrixorigin/matrixone/pkg/logservice"
    37  	"github.com/matrixorigin/matrixone/pkg/logutil"
    38  	logpb "github.com/matrixorigin/matrixone/pkg/pb/logservice"
    39  	"github.com/matrixorigin/matrixone/pkg/pb/metadata"
    40  	"github.com/matrixorigin/matrixone/pkg/testutil"
    41  	"github.com/matrixorigin/matrixone/pkg/txn/clock"
    42  	"github.com/stretchr/testify/assert"
    43  	"github.com/stretchr/testify/require"
    44  	"go.uber.org/zap"
    45  )
    46  
// Timing defaults used by the test cluster.
var (
	// defaultWaitInterval is the pause between two consecutive polls in the
	// Wait* helpers of testCluster.
	defaultWaitInterval = 100 * time.Millisecond
	// defaultTestTimeout is the deadline applied to the context that Start
	// uses while bringing up the services.
	defaultTestTimeout  = 3 * time.Minute
)
    51  
// Cluster describes the behavior of the test framework. Besides lifecycle
// control it embeds the operation, awareness, state and wait-state
// interfaces declared below.
type Cluster interface {
	// Start starts svcs sequentially, after start, system init is completed.
	Start() error
	// Close stops svcs sequentially.
	Close() error
	// Options returns the adjusted options.
	Options() Options

	ClusterOperation
	ClusterAwareness
	ClusterState
	ClusterWaitState
}
    66  
// ClusterOperation groups the operations that start or close individual
// services (addressed by uuid or by index) and that control the network
// partition feature.
type ClusterOperation interface {
	// CloseDNService closes dn service by uuid.
	CloseDNService(uuid string) error
	// StartDNService starts dn service by uuid.
	StartDNService(uuid string) error

	// CloseDNServiceIndexed closes dn service by its index.
	CloseDNServiceIndexed(index int) error
	// StartDNServiceIndexed starts dn service by its index.
	StartDNServiceIndexed(index int) error

	// CloseLogService closes log service by uuid.
	CloseLogService(uuid string) error
	// StartLogService starts log service by uuid.
	StartLogService(uuid string) error

	// CloseLogServiceIndexed closes log service by its index.
	CloseLogServiceIndexed(index int) error
	// StartLogServiceIndexed starts log service by its index.
	StartLogServiceIndexed(index int) error

	// CloseCNService closes cn service by uuid.
	CloseCNService(uuid string) error
	// StartCNService starts cn service by uuid.
	StartCNService(uuid string) error

	// CloseCNServiceIndexed closes cn service by its index.
	CloseCNServiceIndexed(index int) error
	// StartCNServiceIndexed starts cn service by its index.
	StartCNServiceIndexed(index int) error

	// NewNetworkPartition constructs a network partition from the indexes of
	// the dn, log and cn services it should contain.
	NewNetworkPartition(dnIndexes, logIndexes, cnIndexes []uint32) NetworkPartition
	// RemainingNetworkPartition returns the partition for the remaining
	// services, i.e. those not covered by the given partitions.
	RemainingNetworkPartition(partitions ...NetworkPartition) NetworkPartition
	// StartNetworkPartition enables the network partition feature with the
	// given partitions.
	StartNetworkPartition(partitions ...NetworkPartition)
	// CloseNetworkPartition disables the network partition feature.
	CloseNetworkPartition()
}
   108  
// ClusterAwareness provides cluster awareness information: which services
// exist and how to obtain a handle on each of them.
type ClusterAwareness interface {
	// ListDNServices lists uuid of all dn services.
	ListDNServices() []string
	// ListLogServices lists uuid of all log services.
	ListLogServices() []string
	// ListCnServices lists uuid of all cn services.
	ListCnServices() []string
	// ListHAKeeperServices lists all hakeeper log services.
	ListHAKeeperServices() []LogService

	// GetDNService fetches dn service instance by uuid.
	GetDNService(uuid string) (DNService, error)
	// GetLogService fetches log service instance by uuid.
	GetLogService(uuid string) (LogService, error)
	// GetDNServiceIndexed fetches dn service instance by index.
	GetDNServiceIndexed(index int) (DNService, error)
	// GetLogServiceIndexed fetches log service instance by index.
	GetLogServiceIndexed(index int) (LogService, error)
	// GetCNService fetches cn service instance by uuid.
	GetCNService(uuid string) (CNService, error)
	// GetCNServiceIndexed fetches cn service instance by index.
	GetCNServiceIndexed(index int) (CNService, error)

	// GetClusterState fetches current cluster state.
	GetClusterState(ctx context.Context) (*logpb.CheckerState, error)
}
   136  
// ClusterState provides cluster running state: shard listings, per-store
// information, hakeeper state/configuration, store expiry checks and an
// overall health check.
type ClusterState interface {
	// ListDNShards lists all dn shards within the cluster.
	ListDNShards(ctx context.Context) ([]metadata.DNShardRecord, error)
	// ListLogShards lists all log shards within the cluster.
	ListLogShards(ctx context.Context) ([]metadata.LogShardRecord, error)

	// GetDNStoreInfo gets dn store information by uuid.
	GetDNStoreInfo(ctx context.Context, uuid string) (logpb.DNStoreInfo, error)
	// GetDNStoreInfoIndexed gets dn store information by index.
	GetDNStoreInfoIndexed(ctx context.Context, index int) (logpb.DNStoreInfo, error)

	// GetLogStoreInfo gets log store information by uuid.
	GetLogStoreInfo(ctx context.Context, uuid string) (logpb.LogStoreInfo, error)
	// GetLogStoreInfoIndexed gets log store information by index.
	GetLogStoreInfoIndexed(ctx context.Context, index int) (logpb.LogStoreInfo, error)

	// GetCNStoreInfo gets cn store information by uuid.
	GetCNStoreInfo(ctx context.Context, uuid string) (logpb.CNStoreInfo, error)
	// GetCNStoreInfoIndexed gets cn store information by index.
	GetCNStoreInfoIndexed(ctx context.Context, index int) (logpb.CNStoreInfo, error)

	// GetHAKeeperState returns hakeeper state from running hakeeper.
	GetHAKeeperState() logpb.HAKeeperState
	// GetHAKeeperConfig returns hakeeper configuration.
	GetHAKeeperConfig() hakeeper.Config

	// DNStoreExpired checks whether the dn store with the given uuid is
	// expired.
	DNStoreExpired(uuid string) (bool, error)
	// DNStoreExpiredIndexed checks whether the dn store at the given index is
	// expired.
	DNStoreExpiredIndexed(index int) (bool, error)
	// LogStoreExpired checks whether the log store with the given uuid is
	// expired.
	LogStoreExpired(uuid string) (bool, error)
	// LogStoreExpiredIndexed checks whether the log store at the given index
	// is expired.
	LogStoreExpiredIndexed(index int) (bool, error)
	// CNStoreExpired checks whether the cn store with the given uuid is
	// expired.
	CNStoreExpired(uuid string) (bool, error)
	// CNStoreExpiredIndexed checks whether the cn store at the given index is
	// expired.
	CNStoreExpiredIndexed(index int) (bool, error)

	// IsClusterHealthy checks whether cluster is healthy or not.
	IsClusterHealthy() bool
}
   180  
// ClusterWaitState waits for the cluster to reach a given state. Every
// method takes a context that bounds how long the wait may last.
type ClusterWaitState interface {
	// WaitHAKeeperLeader waits until a hakeeper leader is elected and returns
	// it.
	WaitHAKeeperLeader(ctx context.Context) LogService
	// WaitHAKeeperState waits for the specific hakeeper state.
	WaitHAKeeperState(ctx context.Context, expected logpb.HAKeeperState)

	// WaitDNShardsReported waits until the expected count of dn shards is
	// reported.
	WaitDNShardsReported(ctx context.Context)
	// WaitLogShardsReported waits until the expected count of log shards is
	// reported.
	WaitLogShardsReported(ctx context.Context)
	// WaitDNReplicaReported waits until the replica of the dn shard is
	// reported.
	WaitDNReplicaReported(ctx context.Context, shardID uint64)
	// WaitLogReplicaReported waits until the replicas of the log shard are
	// reported.
	WaitLogReplicaReported(ctx context.Context, shardID uint64)

	// WaitDNStoreTimeout waits dn store timeout by uuid.
	WaitDNStoreTimeout(ctx context.Context, uuid string)
	// WaitDNStoreTimeoutIndexed waits dn store timeout by index.
	WaitDNStoreTimeoutIndexed(ctx context.Context, index int)
	// WaitDNStoreReported waits dn store reported by uuid.
	WaitDNStoreReported(ctx context.Context, uuid string)
	// WaitDNStoreReportedIndexed waits dn store reported by index.
	WaitDNStoreReportedIndexed(ctx context.Context, index int)
	// WaitDNStoreTaskServiceCreated waits dn store task service started by uuid.
	WaitDNStoreTaskServiceCreated(ctx context.Context, uuid string)
	// WaitDNStoreTaskServiceCreatedIndexed waits dn store task service started by index.
	WaitDNStoreTaskServiceCreatedIndexed(ctx context.Context, index int)
	// WaitCNStoreReported waits cn store reported by uuid.
	WaitCNStoreReported(ctx context.Context, uuid string)
	// WaitCNStoreReportedIndexed waits cn store reported by index.
	WaitCNStoreReportedIndexed(ctx context.Context, index int)
	// WaitCNStoreTaskServiceCreated waits cn store task service started by uuid.
	WaitCNStoreTaskServiceCreated(ctx context.Context, uuid string)
	// WaitCNStoreTaskServiceCreatedIndexed waits cn store task service started by index.
	WaitCNStoreTaskServiceCreatedIndexed(ctx context.Context, index int)
	// WaitLogStoreTaskServiceCreated waits log store task service started by uuid.
	WaitLogStoreTaskServiceCreated(ctx context.Context, uuid string)
	// WaitLogStoreTaskServiceCreatedIndexed waits log store task service started by index.
	WaitLogStoreTaskServiceCreatedIndexed(ctx context.Context, index int)

	// WaitLogStoreTimeout waits log store timeout by uuid.
	WaitLogStoreTimeout(ctx context.Context, uuid string)
	// WaitLogStoreTimeoutIndexed waits log store timeout by index.
	WaitLogStoreTimeoutIndexed(ctx context.Context, index int)
	// WaitLogStoreReported waits log store reported by uuid.
	WaitLogStoreReported(ctx context.Context, uuid string)
	// WaitLogStoreReportedIndexed waits log store reported by index.
	WaitLogStoreReportedIndexed(ctx context.Context, index int)
}
   231  
   232  // ----------------------------------------------------
// The following are the implementations of interface `Cluster`.
   234  // ----------------------------------------------------
   235  
// testCluster simulates a cluster with dn, log and cn services. It is the
// Cluster implementation returned by NewCluster.
type testCluster struct {
	t       *testing.T       // test handle used by the require/assert calls
	testID  string           // random UUID generated in NewCluster
	opt     Options          // options as adjusted in NewCluster
	logger  *zap.Logger      // logger tagged with "testcase" and "test-id"
	stopper *stopper.Stopper // stopped in Close; also handed to the clock
	clock   clock.Clock      // HLC clock created in NewCluster

	// dn holds the configurations, options and instances of the dn services.
	// NOTE(review): the embedded mutex presumably guards the slices — confirm
	// against the helpers that access them.
	dn struct {
		sync.Mutex
		cfgs []*dnservice.Config
		opts []dnOptions
		svcs []DNService
	}

	// log holds the configurations, options and instances of the log
	// services.
	log struct {
		// NOTE(review): once is not used in the visible part of the file;
		// presumably it guards a one-time initialization — confirm.
		once sync.Once

		sync.Mutex
		cfgs []logservice.Config
		opts []logOptions
		svcs []LogService
	}

	// cn holds the configurations, options and instances of the cn services.
	cn struct {
		sync.Mutex
		cfgs []*cnservice.Config
		opts []cnOptions
		svcs []CNService
	}

	// network holds the addresses built for all services in NewCluster.
	network struct {
		addresses serviceAddresses

		sync.RWMutex
		// NOTE(review): addressSets presumably describes the active network
		// partitions — confirm against StartNetworkPartition.
		addressSets []addressSet
	}

	// fileservices holds the FileService instances built in NewCluster.
	fileservices *fileServices

	// mu guards running, which Start sets and Close clears.
	mu struct {
		sync.Mutex
		running bool
	}
}
   282  
   283  // NewCluster construct a cluster for integration test.
   284  func NewCluster(t *testing.T, opt Options) (Cluster, error) {
   285  	logutil.SetupMOLogger(&logutil.LogConfig{
   286  		Level:  "debug",
   287  		Format: "console",
   288  	})
   289  	opt.validate()
   290  
   291  	c := &testCluster{
   292  		t:       t,
   293  		testID:  uuid.New().String(),
   294  		opt:     opt,
   295  		stopper: stopper.NewStopper("test-cluster"),
   296  	}
   297  	c.logger = logutil.Adjust(opt.logger).With(zap.String("testcase", t.Name())).With(zap.String("test-id", c.testID))
   298  	c.opt.rootDataDir = filepath.Join(c.opt.rootDataDir, c.testID, t.Name())
   299  	if c.clock == nil {
   300  		c.clock = clock.NewUnixNanoHLCClockWithStopper(c.stopper, 0)
   301  	}
   302  
   303  	// TODO: CN and LOG use process level runtime
   304  	runtime.SetupProcessLevelRuntime(c.newRuntime())
   305  
   306  	// build addresses for all services
   307  	c.network.addresses = c.buildServiceAddresses()
   308  	// build log service configurations
   309  	c.log.cfgs, c.log.opts = c.buildLogConfigs(c.network.addresses)
   310  	// build dn service configurations
   311  	c.dn.cfgs, c.dn.opts = c.buildDNConfigs(c.network.addresses)
   312  	// build cn service configurations
   313  	c.cn.cfgs, c.cn.opts = c.buildCNConfigs(c.network.addresses)
   314  	// build FileService instances
   315  	c.fileservices = c.buildFileServices()
   316  
   317  	return c, nil
   318  }
   319  
   320  func (c *testCluster) Start() error {
   321  	c.mu.Lock()
   322  	defer c.mu.Unlock()
   323  
   324  	if c.mu.running {
   325  		return nil
   326  	}
   327  
   328  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   329  	defer cancel()
   330  
   331  	c.mu.running = true
   332  	// start log services first
   333  	if err := c.startLogServices(ctx); err != nil {
   334  		return err
   335  	}
   336  
   337  	// start dn services
   338  	if err := c.startDNServices(ctx); err != nil {
   339  		return err
   340  	}
   341  
   342  	// start cn services
   343  	if err := c.startCNServices(ctx); err != nil {
   344  		return err
   345  	}
   346  
   347  	return nil
   348  }
   349  
   350  func (c *testCluster) Options() Options {
   351  	return c.opt
   352  }
   353  
   354  func (c *testCluster) Close() error {
   355  	defer logutil.LogClose(c.logger, "tests-framework")()
   356  	c.logger.Info("closing testCluster")
   357  
   358  	c.mu.Lock()
   359  	defer c.mu.Unlock()
   360  
   361  	if !c.mu.running {
   362  		return nil
   363  	}
   364  
   365  	// close all cn services first
   366  	if err := c.closeCNServices(); err != nil {
   367  		return err
   368  	}
   369  
   370  	// close all dn services
   371  	if err := c.closeDNServices(); err != nil {
   372  		return err
   373  	}
   374  
   375  	// close all log services
   376  	if err := c.closeLogServices(); err != nil {
   377  		return err
   378  	}
   379  
   380  	c.mu.running = false
   381  	c.stopper.Stop()
   382  
   383  	if !c.opt.keepData {
   384  		if err := os.RemoveAll(c.opt.rootDataDir); err != nil {
   385  			return err
   386  		}
   387  	}
   388  	return nil
   389  }
   390  
   391  // ----------------------------------------------------------
   392  // The following are implements for interface `ClusterState`.
   393  // ----------------------------------------------------------
   394  func (c *testCluster) ListDNShards(
   395  	ctx context.Context,
   396  ) ([]metadata.DNShardRecord, error) {
   397  	state, err := c.GetClusterState(ctx)
   398  	if err != nil {
   399  		return nil, err
   400  	}
   401  	return state.ClusterInfo.DNShards, nil
   402  }
   403  
   404  func (c *testCluster) ListLogShards(
   405  	ctx context.Context,
   406  ) ([]metadata.LogShardRecord, error) {
   407  	state, err := c.GetClusterState(ctx)
   408  	if err != nil {
   409  		return nil, err
   410  	}
   411  	return state.ClusterInfo.LogShards, nil
   412  }
   413  
   414  func (c *testCluster) GetDNStoreInfo(
   415  	ctx context.Context, uuid string,
   416  ) (logpb.DNStoreInfo, error) {
   417  	state, err := c.GetClusterState(ctx)
   418  	if err != nil {
   419  		return logpb.DNStoreInfo{}, err
   420  	}
   421  	stores := state.DNState.Stores
   422  	if storeInfo, ok := stores[uuid]; ok {
   423  		return storeInfo, nil
   424  	}
   425  	return logpb.DNStoreInfo{}, moerr.NewNoService(ctx, uuid)
   426  }
   427  
   428  func (c *testCluster) GetDNStoreInfoIndexed(
   429  	ctx context.Context, index int,
   430  ) (logpb.DNStoreInfo, error) {
   431  	ds, err := c.GetDNServiceIndexed(index)
   432  	if err != nil {
   433  		return logpb.DNStoreInfo{}, err
   434  	}
   435  	return c.GetDNStoreInfo(ctx, ds.ID())
   436  }
   437  
   438  func (c *testCluster) GetLogStoreInfo(
   439  	ctx context.Context, uuid string,
   440  ) (logpb.LogStoreInfo, error) {
   441  	state, err := c.GetClusterState(ctx)
   442  	if err != nil {
   443  		return logpb.LogStoreInfo{}, err
   444  	}
   445  	stores := state.LogState.Stores
   446  	if storeInfo, ok := stores[uuid]; ok {
   447  		return storeInfo, nil
   448  	}
   449  	return logpb.LogStoreInfo{}, moerr.NewNoService(ctx, uuid)
   450  }
   451  
   452  func (c *testCluster) GetLogStoreInfoIndexed(
   453  	ctx context.Context, index int,
   454  ) (logpb.LogStoreInfo, error) {
   455  	ls, err := c.GetLogServiceIndexed(index)
   456  	if err != nil {
   457  		return logpb.LogStoreInfo{}, err
   458  	}
   459  	return c.GetLogStoreInfo(ctx, ls.ID())
   460  }
   461  
   462  func (c *testCluster) GetCNStoreInfo(ctx context.Context, uuid string) (logpb.CNStoreInfo, error) {
   463  	state, err := c.GetClusterState(ctx)
   464  	if err != nil {
   465  		return logpb.CNStoreInfo{}, err
   466  	}
   467  	stores := state.CNState.Stores
   468  	if storeInfo, ok := stores[uuid]; ok {
   469  		return storeInfo, nil
   470  	}
   471  	return logpb.CNStoreInfo{}, moerr.NewNoService(ctx, uuid)
   472  }
   473  
   474  func (c *testCluster) GetCNStoreInfoIndexed(ctx context.Context, index int) (logpb.CNStoreInfo, error) {
   475  	ls, err := c.GetCNServiceIndexed(index)
   476  	if err != nil {
   477  		return logpb.CNStoreInfo{}, err
   478  	}
   479  	return c.GetCNStoreInfo(ctx, ls.ID())
   480  }
   481  
   482  func (c *testCluster) GetHAKeeperState() logpb.HAKeeperState {
   483  	state := c.getClusterState()
   484  	require.NotNil(c.t, state)
   485  	return state.State
   486  }
   487  
   488  func (c *testCluster) GetHAKeeperConfig() hakeeper.Config {
   489  	return c.opt.BuildHAKeeperConfig()
   490  }
   491  
   492  func (c *testCluster) DNStoreExpired(uuid string) (bool, error) {
   493  	state := c.getClusterState()
   494  	require.NotNil(c.t, state)
   495  
   496  	dnStore, ok := state.DNState.Stores[uuid]
   497  	if !ok {
   498  		return false, moerr.NewShardNotReportedNoCtx(uuid, 0xDEADBEEF)
   499  	}
   500  
   501  	hkcfg := c.GetHAKeeperConfig()
   502  	expired := hkcfg.DNStoreExpired(dnStore.Tick, state.Tick)
   503  
   504  	c.logger.Info(
   505  		"check dn store expired or not",
   506  		zap.Any("hakeeper config", hkcfg),
   507  		zap.Uint64("dn store tick", dnStore.Tick),
   508  		zap.Uint64("current tick", state.Tick),
   509  		zap.Bool("expired", expired),
   510  	)
   511  
   512  	return expired, nil
   513  }
   514  
   515  func (c *testCluster) DNStoreExpiredIndexed(index int) (bool, error) {
   516  	ds, err := c.GetDNServiceIndexed(index)
   517  	if err != nil {
   518  		return false, err
   519  	}
   520  	return c.DNStoreExpired(ds.ID())
   521  }
   522  
   523  func (c *testCluster) LogStoreExpired(uuid string) (bool, error) {
   524  	state := c.getClusterState()
   525  	require.NotNil(c.t, state)
   526  
   527  	logStore, ok := state.LogState.Stores[uuid]
   528  	if !ok {
   529  		return false, moerr.NewShardNotReportedNoCtx(uuid, 0xDEADBEEF)
   530  	}
   531  
   532  	hkcfg := c.GetHAKeeperConfig()
   533  	expired := hkcfg.LogStoreExpired(logStore.Tick, state.Tick)
   534  
   535  	c.logger.Info(
   536  		"check log store expired or not",
   537  		zap.Any("hakeeper config", hkcfg),
   538  		zap.Uint64("log store tick", logStore.Tick),
   539  		zap.Uint64("current tick", state.Tick),
   540  		zap.Bool("expired", expired),
   541  	)
   542  
   543  	return expired, nil
   544  }
   545  
   546  func (c *testCluster) LogStoreExpiredIndexed(index int) (bool, error) {
   547  	ls, err := c.GetLogServiceIndexed(index)
   548  	if err != nil {
   549  		return false, err
   550  	}
   551  	return c.LogStoreExpired(ls.ID())
   552  }
   553  
   554  func (c *testCluster) CNStoreExpired(uuid string) (bool, error) {
   555  	state := c.getClusterState()
   556  	require.NotNil(c.t, state)
   557  
   558  	cnStore, ok := state.CNState.Stores[uuid]
   559  	if !ok {
   560  		return false, moerr.NewShardNotReportedNoCtx(uuid, 0)
   561  	}
   562  
   563  	hkcfg := c.GetHAKeeperConfig()
   564  	expired := hkcfg.CNStoreExpired(cnStore.Tick, state.Tick)
   565  
   566  	c.logger.Info(
   567  		"check cn store expired or not",
   568  		zap.Any("hakeeper config", hkcfg),
   569  		zap.Uint64("cn store tick", cnStore.Tick),
   570  		zap.Uint64("current tick", state.Tick),
   571  		zap.Bool("expired", expired),
   572  	)
   573  
   574  	return expired, nil
   575  }
   576  
   577  func (c *testCluster) CNStoreExpiredIndexed(index int) (bool, error) {
   578  	cs, err := c.GetCNServiceIndexed(index)
   579  	if err != nil {
   580  		return false, err
   581  	}
   582  	return c.CNStoreExpired(cs.ID())
   583  }
   584  
   585  func (c *testCluster) IsClusterHealthy() bool {
   586  	hkcfg := c.GetHAKeeperConfig()
   587  	state := c.getClusterState()
   588  	_, healthy := syshealth.Check(
   589  		hkcfg,
   590  		state.GetClusterInfo(),
   591  		state.GetDNState(),
   592  		state.GetLogState(),
   593  		state.GetTick(),
   594  	)
   595  	return healthy
   596  }
   597  
   598  // --------------------------------------------------------------
   599  // The following are implements for interface `ClusterWaitState`.
   600  // --------------------------------------------------------------
   601  func (c *testCluster) WaitHAKeeperLeader(ctx context.Context) LogService {
   602  	for {
   603  		select {
   604  		case <-ctx.Done():
   605  			assert.FailNow(
   606  				c.t,
   607  				"terminated when waiting for hakeeper leader",
   608  				"error: %s", ctx.Err(),
   609  			)
   610  		default:
   611  			time.Sleep(defaultWaitInterval)
   612  
   613  			leader := c.getHAKeeperLeader()
   614  			if leader != nil {
   615  				return leader
   616  			}
   617  		}
   618  	}
   619  }
   620  
   621  func (c *testCluster) WaitHAKeeperState(
   622  	ctx context.Context, expected logpb.HAKeeperState,
   623  ) {
   624  	for {
   625  		select {
   626  		case <-ctx.Done():
   627  			assert.FailNow(
   628  				c.t,
   629  				"terminated when waiting for hakeeper state",
   630  				"error: %s", ctx.Err(),
   631  			)
   632  		default:
   633  			time.Sleep(defaultWaitInterval)
   634  
   635  			state := c.getClusterState()
   636  			if state == nil {
   637  				continue
   638  			}
   639  			if state.State == expected {
   640  				return
   641  			}
   642  		}
   643  	}
   644  }
   645  
   646  func (c *testCluster) WaitDNShardsReported(ctx context.Context) {
   647  	for {
   648  		select {
   649  		case <-ctx.Done():
   650  			assert.FailNow(
   651  				c.t,
   652  				"terminated when waiting for all dn shards reported",
   653  				"error: %s", ctx.Err(),
   654  			)
   655  		default:
   656  			time.Sleep(defaultWaitInterval)
   657  
   658  			state := c.getClusterState()
   659  			if state == nil {
   660  				continue
   661  			}
   662  
   663  			expected := ParseExpectedDNShardCount(state.ClusterInfo)
   664  			reported := ParseReportedDNShardCount(
   665  				state.DNState, c.GetHAKeeperConfig(), state.Tick,
   666  			)
   667  
   668  			// FIXME: what about reported larger than expected
   669  			if reported >= expected {
   670  				return
   671  			}
   672  		}
   673  	}
   674  }
   675  
   676  func (c *testCluster) WaitLogShardsReported(ctx context.Context) {
   677  	for {
   678  		select {
   679  		case <-ctx.Done():
   680  			assert.FailNow(
   681  				c.t,
   682  				"terminated when waiting for all log shards reported",
   683  				"error: %s", ctx.Err(),
   684  			)
   685  		default:
   686  			time.Sleep(defaultWaitInterval)
   687  
   688  			state := c.getClusterState()
   689  			if state == nil {
   690  				continue
   691  			}
   692  
   693  			expected := ParseExpectedLogShardCount(state.ClusterInfo)
   694  			reported := ParseReportedLogShardCount(
   695  				state.LogState, c.GetHAKeeperConfig(), state.Tick,
   696  			)
   697  			// FIXME: what about reported larger than expected
   698  			if reported >= expected {
   699  				return
   700  			}
   701  		}
   702  	}
   703  }
   704  
   705  func (c *testCluster) WaitDNReplicaReported(ctx context.Context, shardID uint64) {
   706  	for {
   707  		select {
   708  		case <-ctx.Done():
   709  			assert.FailNow(
   710  				c.t,
   711  				"terminated when waiting replica of dn shard reported",
   712  				"shard %d, error: %s", shardID, ctx.Err(),
   713  			)
   714  		default:
   715  			time.Sleep(defaultWaitInterval)
   716  
   717  			state := c.getClusterState()
   718  			if state == nil {
   719  				continue
   720  			}
   721  
   722  			reported := ParseDNShardReportedSize(
   723  				shardID, state.DNState, c.GetHAKeeperConfig(), state.Tick,
   724  			)
   725  			if reported >= DNShardExpectedSize {
   726  				return
   727  			}
   728  		}
   729  	}
   730  }
   731  
   732  func (c *testCluster) WaitLogReplicaReported(ctx context.Context, shardID uint64) {
   733  	for {
   734  		select {
   735  		case <-ctx.Done():
   736  			assert.FailNow(
   737  				c.t,
   738  				"terminated when waiting replica of log shard reported",
   739  				"shard %d, error: %s", shardID, ctx.Err(),
   740  			)
   741  		default:
   742  			time.Sleep(defaultWaitInterval)
   743  
   744  			state := c.getClusterState()
   745  			if state == nil {
   746  				continue
   747  			}
   748  
   749  			expected := ParseLogShardExpectedSize(shardID, state.ClusterInfo)
   750  			reported := ParseLogShardReportedSize(
   751  				shardID, state.LogState, c.GetHAKeeperConfig(), state.Tick,
   752  			)
   753  			if reported >= expected {
   754  				return
   755  			}
   756  		}
   757  	}
   758  }
   759  
   760  func (c *testCluster) WaitDNStoreTimeout(ctx context.Context, uuid string) {
   761  	for {
   762  		select {
   763  		case <-ctx.Done():
   764  			assert.FailNow(
   765  				c.t,
   766  				"terminated when waiting dn store timeout",
   767  				"dn store %s, error: %s", uuid, ctx.Err(),
   768  			)
   769  		default:
   770  			time.Sleep(defaultWaitInterval)
   771  
   772  			expired, err := c.DNStoreExpired(uuid)
   773  			if err != nil {
   774  				c.logger.Error("fail to check dn store expired or not",
   775  					zap.Error(err),
   776  					zap.String("uuid", uuid),
   777  				)
   778  				continue
   779  			}
   780  
   781  			if expired {
   782  				return
   783  			}
   784  		}
   785  	}
   786  }
   787  
   788  func (c *testCluster) WaitDNStoreTimeoutIndexed(ctx context.Context, index int) {
   789  	ds, err := c.GetDNServiceIndexed(index)
   790  	require.NoError(c.t, err)
   791  
   792  	c.WaitDNStoreTimeout(ctx, ds.ID())
   793  }
   794  
   795  func (c *testCluster) WaitDNStoreReported(ctx context.Context, uuid string) {
   796  	for {
   797  		select {
   798  		case <-ctx.Done():
   799  			assert.FailNow(
   800  				c.t,
   801  				"terminated when waiting dn store reported",
   802  				"dn store %s, error: %s", uuid, ctx.Err(),
   803  			)
   804  		default:
   805  			time.Sleep(defaultWaitInterval)
   806  
   807  			expired, err := c.DNStoreExpired(uuid)
   808  			if err != nil {
   809  				c.logger.Error("fail to check dn store expired or not",
   810  					zap.Error(err),
   811  					zap.String("uuid", uuid),
   812  				)
   813  				continue
   814  			}
   815  
   816  			if !expired {
   817  				return
   818  			}
   819  		}
   820  	}
   821  }
   822  
   823  func (c *testCluster) WaitDNStoreReportedIndexed(ctx context.Context, index int) {
   824  	ds, err := c.GetDNServiceIndexed(index)
   825  	require.NoError(c.t, err)
   826  
   827  	c.WaitDNStoreReported(ctx, ds.ID())
   828  }
   829  
   830  func (c *testCluster) WaitCNStoreReported(ctx context.Context, uuid string) {
   831  	for {
   832  		select {
   833  		case <-ctx.Done():
   834  			assert.FailNow(
   835  				c.t,
   836  				"terminated when waiting cn store reported",
   837  				"cn store %s, error: %s", uuid, ctx.Err(),
   838  			)
   839  		default:
   840  			time.Sleep(defaultWaitInterval)
   841  
   842  			expired, err := c.CNStoreExpired(uuid)
   843  			if err != nil {
   844  				c.logger.Error("fail to check cn store expired or not",
   845  					zap.Error(err),
   846  					zap.String("uuid", uuid),
   847  				)
   848  				continue
   849  			}
   850  
   851  			if !expired {
   852  				return
   853  			}
   854  		}
   855  	}
   856  }
   857  
   858  func (c *testCluster) WaitCNStoreReportedIndexed(ctx context.Context, index int) {
   859  	ds, err := c.GetCNServiceIndexed(index)
   860  	require.NoError(c.t, err)
   861  
   862  	c.WaitCNStoreReported(ctx, ds.ID())
   863  }
   864  
   865  func (c *testCluster) WaitCNStoreTaskServiceCreated(ctx context.Context, uuid string) {
   866  	ds, err := c.GetCNService(uuid)
   867  	require.NoError(c.t, err)
   868  
   869  	for {
   870  		select {
   871  		case <-ctx.Done():
   872  			assert.FailNow(
   873  				c.t,
   874  				"terminated when waiting task service created on cn store",
   875  				"cn store %s, error: %s", uuid, ctx.Err(),
   876  			)
   877  		default:
   878  			_, ok := ds.GetTaskService()
   879  			if ok {
   880  				return
   881  			}
   882  			time.Sleep(defaultWaitInterval)
   883  		}
   884  	}
   885  }
   886  
   887  func (c *testCluster) WaitCNStoreTaskServiceCreatedIndexed(ctx context.Context, index int) {
   888  	ds, err := c.GetCNServiceIndexed(index)
   889  	require.NoError(c.t, err)
   890  	c.WaitCNStoreTaskServiceCreated(ctx, ds.ID())
   891  }
   892  
   893  func (c *testCluster) WaitDNStoreTaskServiceCreated(ctx context.Context, uuid string) {
   894  	ds, err := c.GetDNService(uuid)
   895  	require.NoError(c.t, err)
   896  
   897  	for {
   898  		select {
   899  		case <-ctx.Done():
   900  			assert.FailNow(
   901  				c.t,
   902  				"terminated when waiting task service created on dn store",
   903  				"dn store %s, error: %s", uuid, ctx.Err(),
   904  			)
   905  		default:
   906  			_, ok := ds.GetTaskService()
   907  			if ok {
   908  				return
   909  			}
   910  			time.Sleep(defaultWaitInterval)
   911  		}
   912  	}
   913  }
   914  
   915  func (c *testCluster) WaitDNStoreTaskServiceCreatedIndexed(ctx context.Context, index int) {
   916  	ds, err := c.GetDNServiceIndexed(index)
   917  	require.NoError(c.t, err)
   918  	c.WaitDNStoreTaskServiceCreated(ctx, ds.ID())
   919  }
   920  
   921  func (c *testCluster) WaitLogStoreTaskServiceCreated(ctx context.Context, uuid string) {
   922  	ls, err := c.GetLogService(uuid)
   923  	require.NoError(c.t, err)
   924  
   925  	for {
   926  		select {
   927  		case <-ctx.Done():
   928  			assert.FailNow(
   929  				c.t,
   930  				"terminated when waiting task service created on log store",
   931  				"log store %s, error: %s", uuid, ctx.Err(),
   932  			)
   933  		default:
   934  			_, ok := ls.GetTaskService()
   935  			if ok {
   936  				return
   937  			}
   938  			time.Sleep(defaultWaitInterval)
   939  		}
   940  	}
   941  }
   942  
   943  func (c *testCluster) WaitLogStoreTaskServiceCreatedIndexed(ctx context.Context, index int) {
   944  	ds, err := c.GetLogServiceIndexed(index)
   945  	require.NoError(c.t, err)
   946  	c.WaitLogStoreTaskServiceCreated(ctx, ds.ID())
   947  }
   948  
   949  func (c *testCluster) WaitLogStoreTimeout(ctx context.Context, uuid string) {
   950  	for {
   951  		select {
   952  		case <-ctx.Done():
   953  			assert.FailNow(
   954  				c.t,
   955  				"terminated when waiting log store timeout",
   956  				"log store %s, error: %s", uuid, ctx.Err(),
   957  			)
   958  		default:
   959  			time.Sleep(defaultWaitInterval)
   960  
   961  			expired, err := c.LogStoreExpired(uuid)
   962  			if err != nil {
   963  				c.logger.Error("fail to check log store expired or not",
   964  					zap.Error(err),
   965  					zap.String("uuid", uuid),
   966  				)
   967  				continue
   968  			}
   969  
   970  			if expired {
   971  				return
   972  			}
   973  		}
   974  	}
   975  }
   976  
   977  func (c *testCluster) WaitLogStoreTimeoutIndexed(ctx context.Context, index int) {
   978  	ls, err := c.GetLogServiceIndexed(index)
   979  	require.NoError(c.t, err)
   980  
   981  	c.WaitLogStoreTimeout(ctx, ls.ID())
   982  }
   983  
   984  func (c *testCluster) WaitLogStoreReported(ctx context.Context, uuid string) {
   985  	for {
   986  		select {
   987  		case <-ctx.Done():
   988  			assert.FailNow(
   989  				c.t,
   990  				"terminated when waiting log store reported",
   991  				"log store %s, error: %s", uuid, ctx.Err(),
   992  			)
   993  		default:
   994  			time.Sleep(defaultWaitInterval)
   995  
   996  			expired, err := c.LogStoreExpired(uuid)
   997  			if err != nil {
   998  				c.logger.Error("fail to check log store expired or not",
   999  					zap.Error(err),
  1000  					zap.String("uuid", uuid),
  1001  				)
  1002  				continue
  1003  			}
  1004  
  1005  			if !expired {
  1006  				return
  1007  			}
  1008  		}
  1009  	}
  1010  }
  1011  
  1012  func (c *testCluster) WaitLogStoreReportedIndexed(ctx context.Context, index int) {
  1013  	ls, err := c.GetLogServiceIndexed(index)
  1014  	require.NoError(c.t, err)
  1015  
  1016  	c.WaitLogStoreReported(ctx, ls.ID())
  1017  }
  1018  
  1019  // --------------------------------------------------------------
// The following are implementations of interface `ClusterAwareness`.
  1021  // --------------------------------------------------------------
  1022  func (c *testCluster) ListDNServices() []string {
  1023  	ids := make([]string, 0, len(c.dn.svcs))
  1024  	for _, cfg := range c.dn.cfgs {
  1025  		ids = append(ids, cfg.UUID)
  1026  	}
  1027  	return ids
  1028  }
  1029  
  1030  func (c *testCluster) ListLogServices() []string {
  1031  	ids := make([]string, 0, len(c.log.svcs))
  1032  	for _, svc := range c.log.svcs {
  1033  		ids = append(ids, svc.ID())
  1034  	}
  1035  	return ids
  1036  }
  1037  
  1038  func (c *testCluster) ListCnServices() []string {
  1039  	ids := make([]string, 0, len(c.cn.svcs))
  1040  	for _, svc := range c.cn.svcs {
  1041  		ids = append(ids, svc.ID())
  1042  	}
  1043  	return ids
  1044  }
  1045  
  1046  func (c *testCluster) ListHAKeeperServices() []LogService {
  1047  	return c.selectHAkeeperServices()
  1048  }
  1049  
  1050  func (c *testCluster) GetDNService(uuid string) (DNService, error) {
  1051  	c.dn.Lock()
  1052  	defer c.dn.Unlock()
  1053  
  1054  	for i, cfg := range c.dn.cfgs {
  1055  		if cfg.UUID == uuid {
  1056  			return c.dn.svcs[i], nil
  1057  		}
  1058  	}
  1059  	return nil, moerr.NewNoServiceNoCtx(uuid)
  1060  }
  1061  
  1062  func (c *testCluster) GetLogService(uuid string) (LogService, error) {
  1063  	c.log.Lock()
  1064  	defer c.log.Unlock()
  1065  
  1066  	for _, svc := range c.log.svcs {
  1067  		if svc.ID() == uuid {
  1068  			return svc, nil
  1069  		}
  1070  	}
  1071  	return nil, moerr.NewNoServiceNoCtx(uuid)
  1072  }
  1073  
  1074  func (c *testCluster) GetCNService(uuid string) (CNService, error) {
  1075  	c.log.Lock()
  1076  	defer c.log.Unlock()
  1077  
  1078  	for _, svc := range c.cn.svcs {
  1079  		if svc.ID() == uuid {
  1080  			return svc, nil
  1081  		}
  1082  	}
  1083  	return nil, moerr.NewNoServiceNoCtx(uuid)
  1084  }
  1085  
  1086  func (c *testCluster) GetDNServiceIndexed(index int) (DNService, error) {
  1087  	c.dn.Lock()
  1088  	defer c.dn.Unlock()
  1089  
  1090  	if index >= len(c.dn.svcs) || index < 0 {
  1091  		return nil, moerr.NewInvalidServiceIndexNoCtx(index)
  1092  	}
  1093  	return c.dn.svcs[index], nil
  1094  }
  1095  
  1096  func (c *testCluster) GetLogServiceIndexed(index int) (LogService, error) {
  1097  	c.log.Lock()
  1098  	defer c.log.Unlock()
  1099  
  1100  	if index >= len(c.log.svcs) || index < 0 {
  1101  		return nil, moerr.NewInvalidServiceIndexNoCtx(index)
  1102  	}
  1103  	return c.log.svcs[index], nil
  1104  }
  1105  
  1106  func (c *testCluster) GetCNServiceIndexed(index int) (CNService, error) {
  1107  	c.log.Lock()
  1108  	defer c.log.Unlock()
  1109  
  1110  	if index >= len(c.cn.svcs) || index < 0 {
  1111  		return nil, moerr.NewInvalidServiceIndexNoCtx(index)
  1112  	}
  1113  	return c.cn.svcs[index], nil
  1114  }
  1115  
  1116  // NB: we could also fetch cluster state from non-leader hakeeper.
  1117  func (c *testCluster) GetClusterState(
  1118  	ctx context.Context,
  1119  ) (*logpb.CheckerState, error) {
  1120  	c.WaitHAKeeperState(ctx, logpb.HAKeeperRunning)
  1121  	leader := c.WaitHAKeeperLeader(ctx)
  1122  	return leader.GetClusterState()
  1123  }
  1124  
  1125  // --------------------------------------------------------------
// The following are implementations of interface `ClusterOperation`.
  1127  // --------------------------------------------------------------
  1128  func (c *testCluster) CloseDNService(uuid string) error {
  1129  	ds, err := c.GetDNService(uuid)
  1130  	if err != nil {
  1131  		return err
  1132  	}
  1133  	return ds.Close()
  1134  }
  1135  
  1136  func (c *testCluster) StartDNService(uuid string) error {
  1137  	ds, err := c.GetDNService(uuid)
  1138  	if err != nil {
  1139  		return err
  1140  	}
  1141  	return ds.Start()
  1142  }
  1143  
  1144  func (c *testCluster) CloseDNServiceIndexed(index int) error {
  1145  	ds, err := c.GetDNServiceIndexed(index)
  1146  	if err != nil {
  1147  		return err
  1148  	}
  1149  	return ds.Close()
  1150  }
  1151  
  1152  func (c *testCluster) StartDNServiceIndexed(index int) error {
  1153  	ds, err := c.GetDNServiceIndexed(index)
  1154  	if err != nil {
  1155  		return err
  1156  	}
  1157  	return ds.Start()
  1158  }
  1159  
  1160  func (c *testCluster) CloseLogService(uuid string) error {
  1161  	ls, err := c.GetLogService(uuid)
  1162  	if err != nil {
  1163  		return err
  1164  	}
  1165  	return ls.Close()
  1166  }
  1167  
  1168  func (c *testCluster) StartLogService(uuid string) error {
  1169  	ls, err := c.GetLogService(uuid)
  1170  	if err != nil {
  1171  		return err
  1172  	}
  1173  	return ls.Start()
  1174  }
  1175  
  1176  func (c *testCluster) CloseLogServiceIndexed(index int) error {
  1177  	ls, err := c.GetLogServiceIndexed(index)
  1178  	if err != nil {
  1179  		return err
  1180  	}
  1181  	return ls.Close()
  1182  }
  1183  
  1184  func (c *testCluster) StartLogServiceIndexed(index int) error {
  1185  	ls, err := c.GetLogServiceIndexed(index)
  1186  	if err != nil {
  1187  		return err
  1188  	}
  1189  	return ls.Start()
  1190  }
  1191  
  1192  func (c *testCluster) CloseCNService(uuid string) error {
  1193  	cs, err := c.GetCNService(uuid)
  1194  	if err != nil {
  1195  		return err
  1196  	}
  1197  	return cs.Close()
  1198  }
  1199  
  1200  func (c *testCluster) StartCNService(uuid string) error {
  1201  	cs, err := c.GetCNService(uuid)
  1202  	if err != nil {
  1203  		return err
  1204  	}
  1205  	return cs.Start()
  1206  }
  1207  
  1208  func (c *testCluster) CloseCNServiceIndexed(index int) error {
  1209  	cs, err := c.GetCNServiceIndexed(index)
  1210  	if err != nil {
  1211  		return err
  1212  	}
  1213  	return cs.Close()
  1214  }
  1215  
  1216  func (c *testCluster) StartCNServiceIndexed(index int) error {
  1217  	cs, err := c.GetCNServiceIndexed(index)
  1218  	if err != nil {
  1219  		return err
  1220  	}
  1221  	return cs.Start()
  1222  }
  1223  
  1224  func (c *testCluster) NewNetworkPartition(
  1225  	dnIndexes, logIndexes, cnIndexes []uint32,
  1226  ) NetworkPartition {
  1227  	return newNetworkPartition(
  1228  		c.opt.initial.logServiceNum, logIndexes,
  1229  		c.opt.initial.dnServiceNum, dnIndexes,
  1230  		c.opt.initial.cnServiceNum, cnIndexes,
  1231  	)
  1232  }
  1233  
  1234  func (c *testCluster) RemainingNetworkPartition(
  1235  	partitions ...NetworkPartition,
  1236  ) NetworkPartition {
  1237  	return remainingNetworkPartition(c.opt.initial.logServiceNum, c.opt.initial.dnServiceNum, 0, partitions...)
  1238  }
  1239  
  1240  func (c *testCluster) StartNetworkPartition(parts ...NetworkPartition) {
  1241  	c.network.Lock()
  1242  	defer c.network.Unlock()
  1243  
  1244  	addressSets := c.network.addresses.buildPartitionAddressSets(parts...)
  1245  	c.network.addressSets = addressSets
  1246  }
  1247  
  1248  func (c *testCluster) CloseNetworkPartition() {
  1249  	c.network.Lock()
  1250  	defer c.network.Unlock()
  1251  
  1252  	c.network.addressSets = nil
  1253  }
  1254  
  1255  // ------------------------------------------------------
  1256  // The following are private utilities for `testCluster`.
  1257  // ------------------------------------------------------
  1258  
  1259  // buildServiceAddresses builds addresses for all services.
  1260  func (c *testCluster) buildServiceAddresses() serviceAddresses {
  1261  	return newServiceAddresses(c.t, c.opt.initial.logServiceNum,
  1262  		c.opt.initial.dnServiceNum, c.opt.initial.cnServiceNum, c.opt.hostAddr)
  1263  }
  1264  
  1265  // buildDNConfigs builds configurations for all dn services.
  1266  func (c *testCluster) buildDNConfigs(
  1267  	address serviceAddresses,
  1268  ) ([]*dnservice.Config, []dnOptions) {
  1269  	batch := c.opt.initial.dnServiceNum
  1270  
  1271  	cfgs := make([]*dnservice.Config, 0, batch)
  1272  	opts := make([]dnOptions, 0, batch)
  1273  	for i := 0; i < batch; i++ {
  1274  		cfg := buildDNConfig(i, c.opt, address)
  1275  		cfgs = append(cfgs, cfg)
  1276  
  1277  		localAddr := cfg.ListenAddress
  1278  		opt := buildDNOptions(cfg, c.backendFilterFactory(localAddr))
  1279  		opts = append(opts, opt)
  1280  	}
  1281  	return cfgs, opts
  1282  }
  1283  
  1284  // buildLogConfigs builds configurations for all log services.
  1285  func (c *testCluster) buildLogConfigs(
  1286  	address serviceAddresses,
  1287  ) ([]logservice.Config, []logOptions) {
  1288  	batch := c.opt.initial.logServiceNum
  1289  
  1290  	cfgs := make([]logservice.Config, 0, batch)
  1291  	opts := make([]logOptions, 0, batch)
  1292  	for i := 0; i < batch; i++ {
  1293  		cfg := buildLogConfig(i, c.opt, address)
  1294  		cfgs = append(cfgs, cfg)
  1295  
  1296  		localAddr := cfg.ServiceAddress
  1297  		opt := buildLogOptions(cfg, c.backendFilterFactory(localAddr))
  1298  		opts = append(opts, opt)
  1299  	}
  1300  	return cfgs, opts
  1301  }
  1302  
  1303  func (c *testCluster) buildCNConfigs(
  1304  	address serviceAddresses,
  1305  ) ([]*cnservice.Config, []cnOptions) {
  1306  	batch := c.opt.initial.cnServiceNum
  1307  
  1308  	cfgs := make([]*cnservice.Config, 0, batch)
  1309  	opts := make([]cnOptions, 0, batch)
  1310  	for i := 0; i < batch; i++ {
  1311  		cfg := buildCNConfig(i, c.opt, address)
  1312  		cfgs = append(cfgs, cfg)
  1313  
  1314  		opt := buildCNOptions()
  1315  		opt = append(opt, cnservice.WithLogger(c.logger))
  1316  		opts = append(opts, opt)
  1317  	}
  1318  	return cfgs, opts
  1319  }
  1320  
  1321  // initDNServices builds all dn services.
  1322  //
  1323  // Before initializing dn service, log service must be started already.
  1324  func (c *testCluster) initDNServices(fileservices *fileServices) []DNService {
  1325  	batch := c.opt.initial.dnServiceNum
  1326  
  1327  	c.logger.Info("initialize dn services", zap.Int("batch", batch))
  1328  
  1329  	svcs := make([]DNService, 0, batch)
  1330  	for i := 0; i < batch; i++ {
  1331  		cfg := c.dn.cfgs[i]
  1332  		opt := c.dn.opts[i]
  1333  		fs, err := fileservice.NewFileServices(
  1334  			defines.LocalFileServiceName,
  1335  			fileservices.getDNLocalFileService(i),
  1336  			fileservices.getS3FileService(),
  1337  		)
  1338  		if err != nil {
  1339  			panic(err)
  1340  		}
  1341  		ds, err := newDNService(
  1342  			cfg,
  1343  			c.newRuntime(),
  1344  			fs,
  1345  			opt)
  1346  		require.NoError(c.t, err)
  1347  
  1348  		c.logger.Info(
  1349  			"dn service initialized",
  1350  			zap.Int("index", i),
  1351  			zap.Any("config", cfg),
  1352  		)
  1353  
  1354  		svcs = append(svcs, ds)
  1355  	}
  1356  
  1357  	return svcs
  1358  }
  1359  
  1360  // initLogServices builds all log services.
  1361  func (c *testCluster) initLogServices() []LogService {
  1362  	batch := c.opt.initial.logServiceNum
  1363  
  1364  	c.logger.Info("initialize log services", zap.Int("batch", batch))
  1365  
  1366  	svcs := make([]LogService, 0, batch)
  1367  	for i := 0; i < batch; i++ {
  1368  		cfg := c.log.cfgs[i]
  1369  		opt := c.log.opts[i]
  1370  		ls, err := newLogService(cfg, testutil.NewFS(), opt)
  1371  		require.NoError(c.t, err)
  1372  
  1373  		c.logger.Info(
  1374  			"log service initialized",
  1375  			zap.Int("index", i),
  1376  			zap.Any("config", cfg),
  1377  		)
  1378  
  1379  		svcs = append(svcs, ls)
  1380  	}
  1381  	return svcs
  1382  }
  1383  
  1384  func (c *testCluster) initCNServices(fileservices *fileServices) []CNService {
  1385  	batch := c.opt.initial.cnServiceNum
  1386  
  1387  	c.logger.Info("initialize cn services", zap.Int("batch", batch))
  1388  
  1389  	svcs := make([]CNService, 0, batch)
  1390  	for i := 0; i < batch; i++ {
  1391  		cfg := c.cn.cfgs[i]
  1392  		opt := c.cn.opts[i]
  1393  		fs, err := fileservice.NewFileServices(
  1394  			defines.LocalFileServiceName,
  1395  			fileservices.getCNLocalFileService(i),
  1396  			fileservices.getS3FileService(),
  1397  		)
  1398  		if err != nil {
  1399  			panic(err)
  1400  		}
  1401  		ctx, cancel := context.WithCancel(context.Background())
  1402  		cs, err := newCNService(cfg, ctx, fs, opt)
  1403  		if err != nil {
  1404  			panic(err)
  1405  		}
  1406  		cs.SetCancel(cancel)
  1407  
  1408  		c.logger.Info(
  1409  			"cn service initialized",
  1410  			zap.Int("index", i),
  1411  			zap.Any("config", cfg),
  1412  		)
  1413  
  1414  		svcs = append(svcs, cs)
  1415  	}
  1416  	return svcs
  1417  }
  1418  
  1419  // startDNServices initializes and starts all dn services.
  1420  func (c *testCluster) startDNServices(ctx context.Context) error {
  1421  	// initialize all dn services
  1422  	c.dn.svcs = c.initDNServices(c.fileservices)
  1423  
  1424  	// start dn services
  1425  	for _, ds := range c.dn.svcs {
  1426  		if err := ds.Start(); err != nil {
  1427  			return err
  1428  		}
  1429  	}
  1430  
  1431  	c.WaitDNShardsReported(ctx)
  1432  	return nil
  1433  }
  1434  
  1435  // startLogServices initializes and starts all log services.
  1436  func (c *testCluster) startLogServices(ctx context.Context) error {
  1437  	// initialize all log service
  1438  	c.log.svcs = c.initLogServices()
  1439  
  1440  	// start log services
  1441  	for _, ls := range c.log.svcs {
  1442  		if err := ls.Start(); err != nil {
  1443  			return err
  1444  		}
  1445  	}
  1446  
  1447  	// start hakeeper replicas
  1448  	if err := c.startHAKeeperReplica(); err != nil {
  1449  		return err
  1450  	}
  1451  
  1452  	// initialize cluster information
  1453  	if err := c.setInitialClusterInfo(); err != nil {
  1454  		return err
  1455  	}
  1456  
  1457  	c.WaitHAKeeperState(ctx, logpb.HAKeeperRunning)
  1458  	return nil
  1459  }
  1460  
  1461  func (c *testCluster) startCNServices(ctx context.Context) error {
  1462  	c.cn.svcs = c.initCNServices(c.fileservices)
  1463  
  1464  	for _, cs := range c.cn.svcs {
  1465  		if err := cs.Start(); err != nil {
  1466  			return err
  1467  		}
  1468  	}
  1469  
  1470  	if err := c.waitSystemInitCompleted(ctx); err != nil {
  1471  		return err
  1472  	}
  1473  	return nil
  1474  }
  1475  
  1476  // closeDNServices closes all dn services.
  1477  func (c *testCluster) closeDNServices() error {
  1478  	c.logger.Info("start to close dn services")
  1479  
  1480  	for i, ds := range c.dn.svcs {
  1481  		c.logger.Info("close dn service", zap.Int("index", i))
  1482  		if err := ds.Close(); err != nil {
  1483  			return err
  1484  		}
  1485  		c.logger.Info("dn service closed", zap.Int("index", i))
  1486  	}
  1487  
  1488  	return nil
  1489  }
  1490  
  1491  // closeLogServices closes all log services.
  1492  func (c *testCluster) closeLogServices() error {
  1493  	defer logutil.LogClose(c.logger, "tests-framework/logservices")()
  1494  
  1495  	for i, ls := range c.log.svcs {
  1496  		c.logger.Info("close log service", zap.Int("index", i))
  1497  		if err := ls.Close(); err != nil {
  1498  			return err
  1499  		}
  1500  		c.logger.Info("log service closed", zap.Int("index", i))
  1501  	}
  1502  
  1503  	return nil
  1504  }
  1505  
  1506  func (c *testCluster) closeCNServices() error {
  1507  	defer logutil.LogClose(c.logger, "tests-framework/cnservices")()
  1508  
  1509  	for i, cs := range c.cn.svcs {
  1510  		c.logger.Info("close cn service", zap.Int("index", i))
  1511  		if err := cs.Close(); err != nil {
  1512  			return err
  1513  		}
  1514  		c.logger.Info("cn service closed", zap.Int("index", i))
  1515  	}
  1516  
  1517  	return nil
  1518  }
  1519  
  1520  // getClusterState fetches cluster state from arbitrary hakeeper.
  1521  //
  1522  // NB: it's possible that getClusterState returns nil value.
  1523  func (c *testCluster) getClusterState() *logpb.CheckerState {
  1524  	var state *logpb.CheckerState
  1525  	fn := func(index int, svc LogService) bool {
  1526  		s, err := svc.GetClusterState()
  1527  		if err != nil {
  1528  			c.logger.Error(
  1529  				"fail to get cluster state",
  1530  				zap.Error(err),
  1531  				zap.Int("index", index),
  1532  			)
  1533  			return false
  1534  		}
  1535  		state = s
  1536  		// XXX MPOOL
  1537  		// Too much logging can break CI.
  1538  		// c.logger.Info("current cluster state", zap.Any("state", s))
  1539  		return true
  1540  	}
  1541  	c.rangeHAKeeperService(fn)
  1542  	return state
  1543  }
  1544  
  1545  // getHAKeeperLeader gets log service which is hakeeper leader.
  1546  func (c *testCluster) getHAKeeperLeader() LogService {
  1547  	var leader LogService
  1548  	fn := func(index int, svc LogService) bool {
  1549  		isLeader, err := svc.IsLeaderHakeeper()
  1550  		if err != nil {
  1551  			c.logger.Error(
  1552  				"fail to check hakeeper",
  1553  				zap.Error(err),
  1554  				zap.Int("index", index),
  1555  			)
  1556  			return false
  1557  		}
  1558  		c.logger.Info(
  1559  			"hakeeper state",
  1560  			zap.Bool("isLeader", isLeader),
  1561  			zap.Int("index", index),
  1562  		)
  1563  
  1564  		if isLeader {
  1565  			leader = svc
  1566  			return true
  1567  		}
  1568  
  1569  		return false
  1570  	}
  1571  	c.rangeHAKeeperService(fn)
  1572  	return leader
  1573  }
  1574  
  1575  // rangeHAKeeperService iterates all hakeeper service until `fn` returns true.
  1576  func (c *testCluster) rangeHAKeeperService(
  1577  	fn func(index int, svc LogService) bool,
  1578  ) {
  1579  	for i, svc := range c.selectHAkeeperServices() {
  1580  		index := i
  1581  
  1582  		if svc.Status() != ServiceStarted {
  1583  			c.logger.Warn(
  1584  				"hakeeper service not started",
  1585  				zap.Int("index", index),
  1586  			)
  1587  			continue
  1588  		}
  1589  
  1590  		if fn(index, svc) {
  1591  			break
  1592  		}
  1593  	}
  1594  }
  1595  
  1596  func (c *testCluster) waitSystemInitCompleted(ctx context.Context) error {
  1597  	log, err := c.GetLogServiceIndexed(0)
  1598  	if err != nil {
  1599  		return err
  1600  	}
  1601  	if err := log.CreateInitTasks(); err != nil {
  1602  		return err
  1603  	}
  1604  
  1605  	c.WaitCNStoreTaskServiceCreatedIndexed(ctx, 0)
  1606  	cn, err := c.GetCNServiceIndexed(0)
  1607  	if err != nil {
  1608  		return err
  1609  	}
  1610  	if err := cn.WaitSystemInitCompleted(ctx); err != nil {
  1611  		return err
  1612  	}
  1613  	return nil
  1614  }
  1615  
  1616  func (c *testCluster) newRuntime() runtime.Runtime {
  1617  	return runtime.NewRuntime(metadata.ServiceType_CN, "", c.logger, runtime.WithClock(c.clock))
  1618  }
  1619  
  1620  // FilterFunc returns true if traffic was allowed.
  1621  type FilterFunc func(morpc.Message, string) bool
  1622  
  1623  // backendFilterFactory constructs a closure with the type of FilterFunc.
  1624  func (c *testCluster) backendFilterFactory(localAddr string) FilterFunc {
  1625  	return func(_ morpc.Message, backendAddr string) bool {
  1626  		// NB: it's possible that partition takes effect once more after disabled.
  1627  		c.network.RLock()
  1628  		addressSets := c.network.addressSets
  1629  		c.network.RUnlock()
  1630  
  1631  		if len(addressSets) == 0 {
  1632  			return true
  1633  		}
  1634  
  1635  		for _, addrSet := range addressSets {
  1636  			if addrSet.contains(localAddr) &&
  1637  				addrSet.contains(backendAddr) {
  1638  				return true
  1639  			}
  1640  		}
  1641  
  1642  		c.logger.Info(
  1643  			"traffic not allowed",
  1644  			zap.String("local", localAddr),
  1645  			zap.String("backend", backendAddr),
  1646  		)
  1647  
  1648  		return false
  1649  	}
  1650  }