github.com/matrixorigin/matrixone@v1.2.0/pkg/tests/service/service.go (about)

     1  // Copyright 2021 - 2022 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package service
    16  
    17  import (
    18  	"context"
    19  	"os"
    20  	"path/filepath"
    21  	"sync"
    22  	"testing"
    23  	"time"
    24  
    25  	"github.com/google/uuid"
    26  	"github.com/matrixorigin/matrixone/pkg/cnservice"
    27  	"github.com/matrixorigin/matrixone/pkg/common/moerr"
    28  	"github.com/matrixorigin/matrixone/pkg/common/morpc"
    29  	"github.com/matrixorigin/matrixone/pkg/common/runtime"
    30  	"github.com/matrixorigin/matrixone/pkg/common/stopper"
    31  	"github.com/matrixorigin/matrixone/pkg/defines"
    32  	"github.com/matrixorigin/matrixone/pkg/fileservice"
    33  	"github.com/matrixorigin/matrixone/pkg/hakeeper"
    34  	"github.com/matrixorigin/matrixone/pkg/hakeeper/checkers/syshealth"
    35  	"github.com/matrixorigin/matrixone/pkg/logservice"
    36  	"github.com/matrixorigin/matrixone/pkg/logutil"
    37  	logpb "github.com/matrixorigin/matrixone/pkg/pb/logservice"
    38  	"github.com/matrixorigin/matrixone/pkg/pb/metadata"
    39  	"github.com/matrixorigin/matrixone/pkg/testutil"
    40  	"github.com/matrixorigin/matrixone/pkg/tnservice"
    41  	"github.com/matrixorigin/matrixone/pkg/txn/clock"
    42  	"github.com/stretchr/testify/assert"
    43  	"github.com/stretchr/testify/require"
    44  	"go.uber.org/zap"
    45  )
    46  
// Polling cadence and overall deadline shared by the Wait* helpers and Start.
var (
	defaultWaitInterval = 100 * time.Millisecond
	defaultTestTimeout  = 3 * time.Minute
)
    51  
// Cluster describes behavior of test framework.
type Cluster interface {
	// Start starts svcs sequentially; after start, system init is completed.
	Start() error
	// Close stops svcs sequentially.
	Close() error
	// Options returns the adjusted options.
	Options() Options
	// Clock gets the cluster clock.
	Clock() clock.Clock

	ClusterOperation
	ClusterAwareness
	ClusterState
	ClusterWaitState
}
    68  
// ClusterOperation supports kinds of cluster operations.
type ClusterOperation interface {
	// CloseTNService closes tn service by uuid.
	CloseTNService(uuid string) error
	// StartTNService starts tn service by uuid.
	StartTNService(uuid string) error

	// CloseTNServiceIndexed closes tn service by its index.
	CloseTNServiceIndexed(index int) error
	// StartTNServiceIndexed starts tn service by its index.
	StartTNServiceIndexed(index int) error

	// CloseLogService closes log service by uuid.
	CloseLogService(uuid string) error
	// StartLogService starts log service by uuid.
	StartLogService(uuid string) error

	// CloseLogServiceIndexed closes log service by its index.
	CloseLogServiceIndexed(index int) error
	// StartLogServiceIndexed starts log service by its index.
	StartLogServiceIndexed(index int) error

	// CloseCNService closes cn service by uuid.
	CloseCNService(uuid string) error
	// StartCNService starts cn service by uuid.
	StartCNService(uuid string) error

	// CloseCNServiceIndexed closes cn service by its index.
	CloseCNServiceIndexed(index int) error
	// StartCNServiceIndexed starts cn service by its index.
	StartCNServiceIndexed(index int) error

	// StartCNServices starts the given number of cn services.
	StartCNServices(n int) error

	// NewNetworkPartition constructs network partition from service indexes.
	NewNetworkPartition(tnIndexes, logIndexes, cnIndexes []uint32) NetworkPartition
	// RemainingNetworkPartition returns partition for the remaining services.
	RemainingNetworkPartition(partitions ...NetworkPartition) NetworkPartition
	// StartNetworkPartition enables network partition feature.
	StartNetworkPartition(partitions ...NetworkPartition)
	// CloseNetworkPartition disables network partition feature.
	CloseNetworkPartition()
}
   113  
// ClusterAwareness provides cluster awareness information.
type ClusterAwareness interface {
	// ListTNServices lists uuid of all tn services.
	ListTNServices() []string
	// ListLogServices lists uuid of all log services.
	ListLogServices() []string
	// ListCnServices lists uuid of all cn services.
	ListCnServices() []string
	// ListHAKeeperServices lists all hakeeper log services.
	ListHAKeeperServices() []LogService

	// GetTNService fetches tn service instance by uuid.
	GetTNService(uuid string) (TNService, error)
	// GetLogService fetches log service instance by uuid.
	GetLogService(uuid string) (LogService, error)
	// GetTNServiceIndexed fetches tn service instance by index.
	GetTNServiceIndexed(index int) (TNService, error)
	// GetLogServiceIndexed fetches log service instance by index.
	GetLogServiceIndexed(index int) (LogService, error)
	// GetCNService fetches cn service instance by uuid.
	GetCNService(uuid string) (CNService, error)
	// GetCNServiceIndexed fetches cn service instance by index.
	GetCNServiceIndexed(index int) (CNService, error)

	// GetClusterState fetches current cluster state.
	GetClusterState(ctx context.Context) (*logpb.CheckerState, error)
}
   141  
// ClusterState provides cluster running state.
type ClusterState interface {
	// ListTNShards lists all tn shards within the cluster.
	ListTNShards(ctx context.Context) ([]metadata.TNShardRecord, error)
	// ListLogShards lists all log shards within the cluster.
	ListLogShards(ctx context.Context) ([]metadata.LogShardRecord, error)

	// GetTNStoreInfo gets tn store information by uuid.
	GetTNStoreInfo(ctx context.Context, uuid string) (logpb.TNStoreInfo, error)
	// GetTNStoreInfoIndexed gets tn store information by index.
	GetTNStoreInfoIndexed(ctx context.Context, index int) (logpb.TNStoreInfo, error)

	// GetLogStoreInfo gets log store information by uuid.
	GetLogStoreInfo(ctx context.Context, uuid string) (logpb.LogStoreInfo, error)
	// GetLogStoreInfoIndexed gets log store information by index.
	GetLogStoreInfoIndexed(ctx context.Context, index int) (logpb.LogStoreInfo, error)

	// GetCNStoreInfo gets cn store information by uuid.
	GetCNStoreInfo(ctx context.Context, uuid string) (logpb.CNStoreInfo, error)
	// GetCNStoreInfoIndexed gets cn store information by index.
	GetCNStoreInfoIndexed(ctx context.Context, index int) (logpb.CNStoreInfo, error)

	// GetHAKeeperState returns hakeeper state from running hakeeper.
	GetHAKeeperState() logpb.HAKeeperState
	// GetHAKeeperConfig returns hakeeper configuration.
	GetHAKeeperConfig() hakeeper.Config

	// TNStoreExpired checks tn store expired or not by uuid.
	TNStoreExpired(uuid string) (bool, error)
	// TNStoreExpiredIndexed checks tn store expired or not by index.
	TNStoreExpiredIndexed(index int) (bool, error)
	// LogStoreExpired checks log store expired or not by uuid.
	LogStoreExpired(uuid string) (bool, error)
	// LogStoreExpiredIndexed checks log store expired or not by index.
	LogStoreExpiredIndexed(index int) (bool, error)
	// CNStoreExpired checks cn store expired or not by uuid.
	CNStoreExpired(uuid string) (bool, error)
	// CNStoreExpiredIndexed checks cn store expired or not by index.
	CNStoreExpiredIndexed(index int) (bool, error)

	// IsClusterHealthy checks whether cluster is healthy or not.
	IsClusterHealthy() bool
}
   185  
// ClusterWaitState waits cluster state until timeout.
type ClusterWaitState interface {
	// WaitHAKeeperLeader waits hakeeper leader elected and returns it.
	WaitHAKeeperLeader(ctx context.Context) LogService
	// WaitHAKeeperState waits the specific hakeeper state.
	WaitHAKeeperState(ctx context.Context, expected logpb.HAKeeperState)

	// WaitTNShardsReported waits the expected count of tn shards reported.
	WaitTNShardsReported(ctx context.Context)
	// WaitLogShardsReported waits the expected count of log shards reported.
	WaitLogShardsReported(ctx context.Context)
	// WaitTNReplicaReported waits tn replica reported.
	WaitTNReplicaReported(ctx context.Context, shardID uint64)
	// WaitLogReplicaReported waits log replicas reported.
	WaitLogReplicaReported(ctx context.Context, shardID uint64)

	// WaitTNStoreTimeout waits tn store timeout by uuid.
	WaitTNStoreTimeout(ctx context.Context, uuid string)
	// WaitTNStoreTimeoutIndexed waits tn store timeout by index.
	WaitTNStoreTimeoutIndexed(ctx context.Context, index int)
	// WaitTNStoreReported waits tn store reported by uuid.
	WaitTNStoreReported(ctx context.Context, uuid string)
	// WaitTNStoreReportedIndexed waits tn store reported by index.
	WaitTNStoreReportedIndexed(ctx context.Context, index int)
	// WaitTNStoreTaskServiceCreated waits tn store task service started by uuid.
	WaitTNStoreTaskServiceCreated(ctx context.Context, uuid string)
	// WaitTNStoreTaskServiceCreatedIndexed waits tn store task service started by index.
	WaitTNStoreTaskServiceCreatedIndexed(ctx context.Context, index int)
	// WaitCNStoreReported waits cn store reported by uuid.
	WaitCNStoreReported(ctx context.Context, uuid string)
	// WaitCNStoreReportedIndexed waits cn store reported by index.
	WaitCNStoreReportedIndexed(ctx context.Context, index int)
	// WaitCNStoreTaskServiceCreated waits cn store task service started by uuid.
	WaitCNStoreTaskServiceCreated(ctx context.Context, uuid string)
	// WaitCNStoreTaskServiceCreatedIndexed waits cn store task service started by index.
	WaitCNStoreTaskServiceCreatedIndexed(ctx context.Context, index int)
	// WaitLogStoreTaskServiceCreated waits log store task service started by uuid.
	WaitLogStoreTaskServiceCreated(ctx context.Context, uuid string)
	// WaitLogStoreTaskServiceCreatedIndexed waits log store task service started by index.
	WaitLogStoreTaskServiceCreatedIndexed(ctx context.Context, index int)

	// WaitLogStoreTimeout waits log store timeout by uuid.
	WaitLogStoreTimeout(ctx context.Context, uuid string)
	// WaitLogStoreTimeoutIndexed waits log store timeout by index.
	WaitLogStoreTimeoutIndexed(ctx context.Context, index int)
	// WaitLogStoreReported waits log store reported by uuid.
	WaitLogStoreReported(ctx context.Context, uuid string)
	// WaitLogStoreReportedIndexed waits log store reported by index.
	WaitLogStoreReportedIndexed(ctx context.Context, index int)
}
   236  
   237  // ----------------------------------------------------
   238  // The following are implements for interface `Cluster`.
   239  // ----------------------------------------------------
   240  
// testCluster simulates a cluster with tn and log service.
type testCluster struct {
	t       *testing.T
	testID  string // unique id of this cluster instance; part of rootDataDir
	opt     Options
	logger  *zap.Logger
	stopper *stopper.Stopper
	clock   clock.Clock

	// tn holds configs, options and running instances of tn services,
	// guarded by the embedded mutex.
	tn struct {
		sync.Mutex
		cfgs []*tnservice.Config
		opts []tnOptions
		svcs []TNService
	}

	// log holds configs, options and running instances of log services.
	log struct {
		// once guards one-time log-service initialization.
		once sync.Once

		sync.Mutex
		cfgs []logservice.Config
		opts []logOptions
		svcs []LogService
	}

	// cn holds configs, options and running instances of cn services.
	cn struct {
		sync.Mutex
		cfgs []*cnservice.Config
		opts []cnOptions
		svcs []CNService
	}

	// network holds the generated service addresses and the address sets
	// used for simulated network partitions.
	network struct {
		addresses *serviceAddresses

		sync.RWMutex
		addressSets []addressSet
	}

	fileservices *fileServices

	// mu protects the cluster running flag for Start/Close.
	mu struct {
		sync.Mutex
		running bool
	}
}
   287  
   288  // NewCluster construct a cluster for integration test.
   289  func NewCluster(ctx context.Context, t *testing.T, opt Options) (Cluster, error) {
   290  	logutil.SetupMOLogger(&logutil.LogConfig{
   291  		Level:  "fatal",
   292  		Format: "console",
   293  	})
   294  	opt.validate()
   295  
   296  	uid, _ := uuid.NewV7()
   297  	c := &testCluster{
   298  		t:       t,
   299  		testID:  uid.String(),
   300  		opt:     opt,
   301  		stopper: stopper.NewStopper("test-cluster"),
   302  	}
   303  	c.logger = logutil.Adjust(opt.logger).With(zap.String("testcase", t.Name())).With(zap.String("test-id", c.testID))
   304  	c.opt.rootDataDir = filepath.Join(c.opt.rootDataDir, c.testID, t.Name())
   305  	if c.clock == nil {
   306  		c.clock = clock.NewUnixNanoHLCClockWithStopper(c.stopper, 0)
   307  	}
   308  
   309  	// TODO: CN and LOG use process level runtime
   310  	runtime.SetupProcessLevelRuntime(c.newRuntime())
   311  
   312  	// build addresses for all services
   313  	c.network.addresses = c.buildServiceAddresses()
   314  	// build log service configurations
   315  	c.log.cfgs, c.log.opts = c.buildLogConfigs()
   316  	// build tn service configurations
   317  	c.tn.cfgs, c.tn.opts = c.buildTNConfigs()
   318  
   319  	// build FileService instances
   320  	c.fileservices = c.buildFileServices(ctx)
   321  
   322  	// build cn service configurations
   323  	c.buildCNConfigs(c.opt.initial.cnServiceNum)
   324  	return c, nil
   325  }
   326  
// Start boots all services in dependency order: log services first, then tn
// services, then cn services. Calling Start on an already-running cluster is
// a no-op.
func (c *testCluster) Start() error {
	c.mu.Lock()
	defer c.mu.Unlock()

	if c.mu.running {
		return nil
	}

	// Bound the whole start sequence by the default test timeout.
	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
	defer cancel()

	// NOTE(review): running is set before the services actually start, so a
	// failed Start leaves the cluster marked running — presumably so Close can
	// still tear down partially-started services; confirm.
	c.mu.running = true
	// start log services first
	if err := c.startLogServices(ctx); err != nil {
		return err
	}

	// start tn services
	if err := c.startTNServices(ctx); err != nil {
		return err
	}

	// start cn services
	if err := c.startCNServices(ctx); err != nil {
		return err
	}

	return nil
}
   356  
// Options returns the adjusted options the cluster was built with.
func (c *testCluster) Options() Options {
	return c.opt
}
   360  
// Clock returns the cluster clock (a unix-nano HLC clock by default, see
// NewCluster).
func (c *testCluster) Clock() clock.Clock {
	return c.clock
}
   364  
// Close tears the cluster down in reverse start order: cn services, then tn
// services, then log services. Unless opt.keepData is set, the per-test data
// directory is removed afterwards. Closing a non-running cluster is a no-op.
func (c *testCluster) Close() error {
	defer logutil.LogClose(c.logger, "tests-framework")()
	c.logger.Info("closing testCluster")

	c.mu.Lock()
	defer c.mu.Unlock()

	if !c.mu.running {
		return nil
	}

	// close all cn services first
	if err := c.closeCNServices(); err != nil {
		return err
	}

	// close all tn services
	if err := c.closeTNServices(); err != nil {
		return err
	}

	// close all log services
	if err := c.closeLogServices(); err != nil {
		return err
	}

	c.mu.running = false
	c.stopper.Stop()

	if !c.opt.keepData {
		if err := os.RemoveAll(c.opt.rootDataDir); err != nil {
			return err
		}
	}
	return nil
}
   401  
   402  // ----------------------------------------------------------
   403  // The following are implements for interface `ClusterState`.
   404  // ----------------------------------------------------------
   405  func (c *testCluster) ListTNShards(
   406  	ctx context.Context,
   407  ) ([]metadata.TNShardRecord, error) {
   408  	state, err := c.GetClusterState(ctx)
   409  	if err != nil {
   410  		return nil, err
   411  	}
   412  	return state.ClusterInfo.TNShards, nil
   413  }
   414  
   415  func (c *testCluster) ListLogShards(
   416  	ctx context.Context,
   417  ) ([]metadata.LogShardRecord, error) {
   418  	state, err := c.GetClusterState(ctx)
   419  	if err != nil {
   420  		return nil, err
   421  	}
   422  	return state.ClusterInfo.LogShards, nil
   423  }
   424  
   425  func (c *testCluster) GetTNStoreInfo(
   426  	ctx context.Context, uuid string,
   427  ) (logpb.TNStoreInfo, error) {
   428  	state, err := c.GetClusterState(ctx)
   429  	if err != nil {
   430  		return logpb.TNStoreInfo{}, err
   431  	}
   432  	stores := state.TNState.Stores
   433  	if storeInfo, ok := stores[uuid]; ok {
   434  		return storeInfo, nil
   435  	}
   436  	return logpb.TNStoreInfo{}, moerr.NewNoService(ctx, uuid)
   437  }
   438  
   439  func (c *testCluster) GetTNStoreInfoIndexed(
   440  	ctx context.Context, index int,
   441  ) (logpb.TNStoreInfo, error) {
   442  	ds, err := c.GetTNServiceIndexed(index)
   443  	if err != nil {
   444  		return logpb.TNStoreInfo{}, err
   445  	}
   446  	return c.GetTNStoreInfo(ctx, ds.ID())
   447  }
   448  
   449  func (c *testCluster) GetLogStoreInfo(
   450  	ctx context.Context, uuid string,
   451  ) (logpb.LogStoreInfo, error) {
   452  	state, err := c.GetClusterState(ctx)
   453  	if err != nil {
   454  		return logpb.LogStoreInfo{}, err
   455  	}
   456  	stores := state.LogState.Stores
   457  	if storeInfo, ok := stores[uuid]; ok {
   458  		return storeInfo, nil
   459  	}
   460  	return logpb.LogStoreInfo{}, moerr.NewNoService(ctx, uuid)
   461  }
   462  
   463  func (c *testCluster) GetLogStoreInfoIndexed(
   464  	ctx context.Context, index int,
   465  ) (logpb.LogStoreInfo, error) {
   466  	ls, err := c.GetLogServiceIndexed(index)
   467  	if err != nil {
   468  		return logpb.LogStoreInfo{}, err
   469  	}
   470  	return c.GetLogStoreInfo(ctx, ls.ID())
   471  }
   472  
   473  func (c *testCluster) GetCNStoreInfo(ctx context.Context, uuid string) (logpb.CNStoreInfo, error) {
   474  	state, err := c.GetClusterState(ctx)
   475  	if err != nil {
   476  		return logpb.CNStoreInfo{}, err
   477  	}
   478  	stores := state.CNState.Stores
   479  	if storeInfo, ok := stores[uuid]; ok {
   480  		return storeInfo, nil
   481  	}
   482  	return logpb.CNStoreInfo{}, moerr.NewNoService(ctx, uuid)
   483  }
   484  
   485  func (c *testCluster) GetCNStoreInfoIndexed(ctx context.Context, index int) (logpb.CNStoreInfo, error) {
   486  	ls, err := c.GetCNServiceIndexed(index)
   487  	if err != nil {
   488  		return logpb.CNStoreInfo{}, err
   489  	}
   490  	return c.GetCNStoreInfo(ctx, ls.ID())
   491  }
   492  
   493  func (c *testCluster) GetHAKeeperState() logpb.HAKeeperState {
   494  	state := c.getClusterState()
   495  	require.NotNil(c.t, state)
   496  	return state.State
   497  }
   498  
// GetHAKeeperConfig returns the hakeeper configuration derived from the
// cluster options.
func (c *testCluster) GetHAKeeperConfig() hakeeper.Config {
	return c.opt.BuildHAKeeperConfig()
}
   502  
   503  func (c *testCluster) TNStoreExpired(uuid string) (bool, error) {
   504  	state := c.getClusterState()
   505  	require.NotNil(c.t, state)
   506  
   507  	tnStore, ok := state.TNState.Stores[uuid]
   508  	if !ok {
   509  		return false, moerr.NewShardNotReportedNoCtx(uuid, 0xDEADBEEF)
   510  	}
   511  
   512  	hkcfg := c.GetHAKeeperConfig()
   513  	expired := hkcfg.TNStoreExpired(tnStore.Tick, state.Tick)
   514  
   515  	c.logger.Info(
   516  		"check tn store expired or not",
   517  		zap.Any("hakeeper config", hkcfg),
   518  		zap.Uint64("dn store tick", tnStore.Tick),
   519  		zap.Uint64("current tick", state.Tick),
   520  		zap.Bool("expired", expired),
   521  	)
   522  
   523  	return expired, nil
   524  }
   525  
   526  func (c *testCluster) TNStoreExpiredIndexed(index int) (bool, error) {
   527  	ds, err := c.GetTNServiceIndexed(index)
   528  	if err != nil {
   529  		return false, err
   530  	}
   531  	return c.TNStoreExpired(ds.ID())
   532  }
   533  
   534  func (c *testCluster) LogStoreExpired(uuid string) (bool, error) {
   535  	state := c.getClusterState()
   536  	require.NotNil(c.t, state)
   537  
   538  	logStore, ok := state.LogState.Stores[uuid]
   539  	if !ok {
   540  		return false, moerr.NewShardNotReportedNoCtx(uuid, 0xDEADBEEF)
   541  	}
   542  
   543  	hkcfg := c.GetHAKeeperConfig()
   544  	expired := hkcfg.LogStoreExpired(logStore.Tick, state.Tick)
   545  
   546  	c.logger.Info(
   547  		"check log store expired or not",
   548  		zap.Any("hakeeper config", hkcfg),
   549  		zap.Uint64("log store tick", logStore.Tick),
   550  		zap.Uint64("current tick", state.Tick),
   551  		zap.Bool("expired", expired),
   552  	)
   553  
   554  	return expired, nil
   555  }
   556  
   557  func (c *testCluster) LogStoreExpiredIndexed(index int) (bool, error) {
   558  	ls, err := c.GetLogServiceIndexed(index)
   559  	if err != nil {
   560  		return false, err
   561  	}
   562  	return c.LogStoreExpired(ls.ID())
   563  }
   564  
   565  func (c *testCluster) CNStoreExpired(uuid string) (bool, error) {
   566  	state := c.getClusterState()
   567  	require.NotNil(c.t, state)
   568  
   569  	cnStore, ok := state.CNState.Stores[uuid]
   570  	if !ok {
   571  		return false, moerr.NewShardNotReportedNoCtx(uuid, 0)
   572  	}
   573  
   574  	hkcfg := c.GetHAKeeperConfig()
   575  	expired := hkcfg.CNStoreExpired(cnStore.Tick, state.Tick)
   576  
   577  	c.logger.Info(
   578  		"check cn store expired or not",
   579  		zap.Any("hakeeper config", hkcfg),
   580  		zap.Uint64("cn store tick", cnStore.Tick),
   581  		zap.Uint64("current tick", state.Tick),
   582  		zap.Bool("expired", expired),
   583  	)
   584  
   585  	return expired, nil
   586  }
   587  
   588  func (c *testCluster) CNStoreExpiredIndexed(index int) (bool, error) {
   589  	cs, err := c.GetCNServiceIndexed(index)
   590  	if err != nil {
   591  		return false, err
   592  	}
   593  	return c.CNStoreExpired(cs.ID())
   594  }
   595  
   596  func (c *testCluster) IsClusterHealthy() bool {
   597  	hkcfg := c.GetHAKeeperConfig()
   598  	state := c.getClusterState()
   599  	_, healthy := syshealth.Check(
   600  		hkcfg,
   601  		state.GetClusterInfo(),
   602  		state.GetTNState(),
   603  		state.GetLogState(),
   604  		state.GetTick(),
   605  	)
   606  	return healthy
   607  }
   608  
   609  // --------------------------------------------------------------
   610  // The following are implements for interface `ClusterWaitState`.
   611  // --------------------------------------------------------------
   612  func (c *testCluster) WaitHAKeeperLeader(ctx context.Context) LogService {
   613  	for {
   614  		select {
   615  		case <-ctx.Done():
   616  			assert.FailNow(
   617  				c.t,
   618  				"terminated when waiting for hakeeper leader",
   619  				"error: %s", ctx.Err(),
   620  			)
   621  		default:
   622  			time.Sleep(defaultWaitInterval)
   623  
   624  			leader := c.getHAKeeperLeader()
   625  			if leader != nil {
   626  				return leader
   627  			}
   628  		}
   629  	}
   630  }
   631  
   632  func (c *testCluster) WaitHAKeeperState(
   633  	ctx context.Context, expected logpb.HAKeeperState,
   634  ) {
   635  	for {
   636  		select {
   637  		case <-ctx.Done():
   638  			assert.FailNow(
   639  				c.t,
   640  				"terminated when waiting for hakeeper state",
   641  				"error: %s", ctx.Err(),
   642  			)
   643  		default:
   644  			time.Sleep(defaultWaitInterval)
   645  
   646  			state := c.getClusterState()
   647  			if state == nil {
   648  				continue
   649  			}
   650  			if state.State == expected {
   651  				return
   652  			}
   653  		}
   654  	}
   655  }
   656  
   657  func (c *testCluster) WaitTNShardsReported(ctx context.Context) {
   658  	for {
   659  		select {
   660  		case <-ctx.Done():
   661  			assert.FailNow(
   662  				c.t,
   663  				"terminated when waiting for all tn shards reported",
   664  				"error: %s", ctx.Err(),
   665  			)
   666  		default:
   667  			time.Sleep(defaultWaitInterval)
   668  
   669  			state := c.getClusterState()
   670  			if state == nil {
   671  				continue
   672  			}
   673  
   674  			expected := ParseExpectedTNShardCount(state.ClusterInfo)
   675  			reported := ParseReportedTNShardCount(
   676  				state.TNState, c.GetHAKeeperConfig(), state.Tick,
   677  			)
   678  
   679  			// FIXME: what about reported larger than expected
   680  			if reported >= expected {
   681  				return
   682  			}
   683  		}
   684  	}
   685  }
   686  
   687  func (c *testCluster) WaitLogShardsReported(ctx context.Context) {
   688  	for {
   689  		select {
   690  		case <-ctx.Done():
   691  			assert.FailNow(
   692  				c.t,
   693  				"terminated when waiting for all log shards reported",
   694  				"error: %s", ctx.Err(),
   695  			)
   696  		default:
   697  			time.Sleep(defaultWaitInterval)
   698  
   699  			state := c.getClusterState()
   700  			if state == nil {
   701  				continue
   702  			}
   703  
   704  			expected := ParseExpectedLogShardCount(state.ClusterInfo)
   705  			reported := ParseReportedLogShardCount(
   706  				state.LogState, c.GetHAKeeperConfig(), state.Tick,
   707  			)
   708  			// FIXME: what about reported larger than expected
   709  			if reported >= expected {
   710  				return
   711  			}
   712  		}
   713  	}
   714  }
   715  
   716  func (c *testCluster) WaitTNReplicaReported(ctx context.Context, shardID uint64) {
   717  	for {
   718  		select {
   719  		case <-ctx.Done():
   720  			assert.FailNow(
   721  				c.t,
   722  				"terminated when waiting replica of tn shard reported",
   723  				"shard %d, error: %s", shardID, ctx.Err(),
   724  			)
   725  		default:
   726  			time.Sleep(defaultWaitInterval)
   727  
   728  			state := c.getClusterState()
   729  			if state == nil {
   730  				continue
   731  			}
   732  
   733  			reported := ParseTNShardReportedSize(
   734  				shardID, state.TNState, c.GetHAKeeperConfig(), state.Tick,
   735  			)
   736  			if reported >= TNShardExpectedSize {
   737  				return
   738  			}
   739  		}
   740  	}
   741  }
   742  
   743  func (c *testCluster) WaitLogReplicaReported(ctx context.Context, shardID uint64) {
   744  	for {
   745  		select {
   746  		case <-ctx.Done():
   747  			assert.FailNow(
   748  				c.t,
   749  				"terminated when waiting replica of log shard reported",
   750  				"shard %d, error: %s", shardID, ctx.Err(),
   751  			)
   752  		default:
   753  			time.Sleep(defaultWaitInterval)
   754  
   755  			state := c.getClusterState()
   756  			if state == nil {
   757  				continue
   758  			}
   759  
   760  			expected := ParseLogShardExpectedSize(shardID, state.ClusterInfo)
   761  			reported := ParseLogShardReportedSize(
   762  				shardID, state.LogState, c.GetHAKeeperConfig(), state.Tick,
   763  			)
   764  			if reported >= expected {
   765  				return
   766  			}
   767  		}
   768  	}
   769  }
   770  
   771  func (c *testCluster) WaitTNStoreTimeout(ctx context.Context, uuid string) {
   772  	for {
   773  		select {
   774  		case <-ctx.Done():
   775  			assert.FailNow(
   776  				c.t,
   777  				"terminated when waiting tn store timeout",
   778  				"dn store %s, error: %s", uuid, ctx.Err(),
   779  			)
   780  		default:
   781  			time.Sleep(defaultWaitInterval)
   782  
   783  			expired, err := c.TNStoreExpired(uuid)
   784  			if err != nil {
   785  				c.logger.Error("fail to check tn store expired or not",
   786  					zap.Error(err),
   787  					zap.String("uuid", uuid),
   788  				)
   789  				continue
   790  			}
   791  
   792  			if expired {
   793  				return
   794  			}
   795  		}
   796  	}
   797  }
   798  
   799  func (c *testCluster) WaitTNStoreTimeoutIndexed(ctx context.Context, index int) {
   800  	ds, err := c.GetTNServiceIndexed(index)
   801  	require.NoError(c.t, err)
   802  
   803  	c.WaitTNStoreTimeout(ctx, ds.ID())
   804  }
   805  
   806  func (c *testCluster) WaitTNStoreReported(ctx context.Context, uuid string) {
   807  	for {
   808  		select {
   809  		case <-ctx.Done():
   810  			assert.FailNow(
   811  				c.t,
   812  				"terminated when waiting tn store reported",
   813  				"dn store %s, error: %s", uuid, ctx.Err(),
   814  			)
   815  		default:
   816  			time.Sleep(defaultWaitInterval)
   817  
   818  			expired, err := c.TNStoreExpired(uuid)
   819  			if err != nil {
   820  				c.logger.Error("fail to check tn store expired or not",
   821  					zap.Error(err),
   822  					zap.String("uuid", uuid),
   823  				)
   824  				continue
   825  			}
   826  
   827  			if !expired {
   828  				return
   829  			}
   830  		}
   831  	}
   832  }
   833  
   834  func (c *testCluster) WaitTNStoreReportedIndexed(ctx context.Context, index int) {
   835  	ds, err := c.GetTNServiceIndexed(index)
   836  	require.NoError(c.t, err)
   837  
   838  	c.WaitTNStoreReported(ctx, ds.ID())
   839  }
   840  
   841  func (c *testCluster) WaitCNStoreReported(ctx context.Context, uuid string) {
   842  	for {
   843  		select {
   844  		case <-ctx.Done():
   845  			assert.FailNow(
   846  				c.t,
   847  				"terminated when waiting cn store reported",
   848  				"cn store %s, error: %s", uuid, ctx.Err(),
   849  			)
   850  		default:
   851  			time.Sleep(defaultWaitInterval)
   852  
   853  			expired, err := c.CNStoreExpired(uuid)
   854  			if err != nil {
   855  				c.logger.Error("fail to check cn store expired or not",
   856  					zap.Error(err),
   857  					zap.String("uuid", uuid),
   858  				)
   859  				continue
   860  			}
   861  
   862  			if !expired {
   863  				return
   864  			}
   865  		}
   866  	}
   867  }
   868  
   869  func (c *testCluster) WaitCNStoreReportedIndexed(ctx context.Context, index int) {
   870  	ds, err := c.GetCNServiceIndexed(index)
   871  	require.NoError(c.t, err)
   872  
   873  	c.WaitCNStoreReported(ctx, ds.ID())
   874  }
   875  
   876  func (c *testCluster) WaitCNStoreTaskServiceCreated(ctx context.Context, uuid string) {
   877  	ds, err := c.GetCNService(uuid)
   878  	require.NoError(c.t, err)
   879  
   880  	for {
   881  		select {
   882  		case <-ctx.Done():
   883  			assert.FailNow(
   884  				c.t,
   885  				"terminated when waiting task service created on cn store",
   886  				"cn store %s, error: %s", uuid, ctx.Err(),
   887  			)
   888  		default:
   889  			_, ok := ds.GetTaskService()
   890  			if ok {
   891  				return
   892  			}
   893  			time.Sleep(defaultWaitInterval)
   894  		}
   895  	}
   896  }
   897  
   898  func (c *testCluster) WaitCNStoreTaskServiceCreatedIndexed(ctx context.Context, index int) {
   899  	ds, err := c.GetCNServiceIndexed(index)
   900  	require.NoError(c.t, err)
   901  	c.WaitCNStoreTaskServiceCreated(ctx, ds.ID())
   902  }
   903  
   904  func (c *testCluster) WaitTNStoreTaskServiceCreated(ctx context.Context, uuid string) {
   905  	ds, err := c.GetTNService(uuid)
   906  	require.NoError(c.t, err)
   907  
   908  	for {
   909  		select {
   910  		case <-ctx.Done():
   911  			assert.FailNow(
   912  				c.t,
   913  				"terminated when waiting task service created on tn store",
   914  				"dn store %s, error: %s", uuid, ctx.Err(),
   915  			)
   916  		default:
   917  			_, ok := ds.GetTaskService()
   918  			if ok {
   919  				return
   920  			}
   921  			time.Sleep(defaultWaitInterval)
   922  		}
   923  	}
   924  }
   925  
   926  func (c *testCluster) WaitTNStoreTaskServiceCreatedIndexed(ctx context.Context, index int) {
   927  	ds, err := c.GetTNServiceIndexed(index)
   928  	require.NoError(c.t, err)
   929  	c.WaitTNStoreTaskServiceCreated(ctx, ds.ID())
   930  }
   931  
   932  func (c *testCluster) WaitLogStoreTaskServiceCreated(ctx context.Context, uuid string) {
   933  	ls, err := c.GetLogService(uuid)
   934  	require.NoError(c.t, err)
   935  
   936  	for {
   937  		select {
   938  		case <-ctx.Done():
   939  			assert.FailNow(
   940  				c.t,
   941  				"terminated when waiting task service created on log store",
   942  				"log store %s, error: %s", uuid, ctx.Err(),
   943  			)
   944  		default:
   945  			_, ok := ls.GetTaskService()
   946  			if ok {
   947  				return
   948  			}
   949  			time.Sleep(defaultWaitInterval)
   950  		}
   951  	}
   952  }
   953  
   954  func (c *testCluster) WaitLogStoreTaskServiceCreatedIndexed(ctx context.Context, index int) {
   955  	ds, err := c.GetLogServiceIndexed(index)
   956  	require.NoError(c.t, err)
   957  	c.WaitLogStoreTaskServiceCreated(ctx, ds.ID())
   958  }
   959  
   960  func (c *testCluster) WaitLogStoreTimeout(ctx context.Context, uuid string) {
   961  	for {
   962  		select {
   963  		case <-ctx.Done():
   964  			assert.FailNow(
   965  				c.t,
   966  				"terminated when waiting log store timeout",
   967  				"log store %s, error: %s", uuid, ctx.Err(),
   968  			)
   969  		default:
   970  			time.Sleep(defaultWaitInterval)
   971  
   972  			expired, err := c.LogStoreExpired(uuid)
   973  			if err != nil {
   974  				c.logger.Error("fail to check log store expired or not",
   975  					zap.Error(err),
   976  					zap.String("uuid", uuid),
   977  				)
   978  				continue
   979  			}
   980  
   981  			if expired {
   982  				return
   983  			}
   984  		}
   985  	}
   986  }
   987  
   988  func (c *testCluster) WaitLogStoreTimeoutIndexed(ctx context.Context, index int) {
   989  	ls, err := c.GetLogServiceIndexed(index)
   990  	require.NoError(c.t, err)
   991  
   992  	c.WaitLogStoreTimeout(ctx, ls.ID())
   993  }
   994  
   995  func (c *testCluster) WaitLogStoreReported(ctx context.Context, uuid string) {
   996  	for {
   997  		select {
   998  		case <-ctx.Done():
   999  			assert.FailNow(
  1000  				c.t,
  1001  				"terminated when waiting log store reported",
  1002  				"log store %s, error: %s", uuid, ctx.Err(),
  1003  			)
  1004  		default:
  1005  			time.Sleep(defaultWaitInterval)
  1006  
  1007  			expired, err := c.LogStoreExpired(uuid)
  1008  			if err != nil {
  1009  				c.logger.Error("fail to check log store expired or not",
  1010  					zap.Error(err),
  1011  					zap.String("uuid", uuid),
  1012  				)
  1013  				continue
  1014  			}
  1015  
  1016  			if !expired {
  1017  				return
  1018  			}
  1019  		}
  1020  	}
  1021  }
  1022  
  1023  func (c *testCluster) WaitLogStoreReportedIndexed(ctx context.Context, index int) {
  1024  	ls, err := c.GetLogServiceIndexed(index)
  1025  	require.NoError(c.t, err)
  1026  
  1027  	c.WaitLogStoreReported(ctx, ls.ID())
  1028  }
  1029  
  1030  // --------------------------------------------------------------
  1031  // The following are implements for interface `ClusterAwareness`.
  1032  // --------------------------------------------------------------
  1033  func (c *testCluster) ListTNServices() []string {
  1034  	ids := make([]string, 0, len(c.tn.svcs))
  1035  	for _, cfg := range c.tn.cfgs {
  1036  		ids = append(ids, cfg.UUID)
  1037  	}
  1038  	return ids
  1039  }
  1040  
  1041  func (c *testCluster) ListLogServices() []string {
  1042  	ids := make([]string, 0, len(c.log.svcs))
  1043  	for _, svc := range c.log.svcs {
  1044  		ids = append(ids, svc.ID())
  1045  	}
  1046  	return ids
  1047  }
  1048  
  1049  func (c *testCluster) ListCnServices() []string {
  1050  	ids := make([]string, 0, len(c.cn.svcs))
  1051  	for _, svc := range c.cn.svcs {
  1052  		ids = append(ids, svc.ID())
  1053  	}
  1054  	return ids
  1055  }
  1056  
// ListHAKeeperServices returns the log services that host hakeeper
// replicas, as chosen by selectHAkeeperServices.
func (c *testCluster) ListHAKeeperServices() []LogService {
	return c.selectHAkeeperServices()
}
  1060  
  1061  func (c *testCluster) GetTNService(uuid string) (TNService, error) {
  1062  	c.tn.Lock()
  1063  	defer c.tn.Unlock()
  1064  
  1065  	for i, cfg := range c.tn.cfgs {
  1066  		if cfg.UUID == uuid {
  1067  			return c.tn.svcs[i], nil
  1068  		}
  1069  	}
  1070  	return nil, moerr.NewNoServiceNoCtx(uuid)
  1071  }
  1072  
  1073  func (c *testCluster) GetLogService(uuid string) (LogService, error) {
  1074  	c.log.Lock()
  1075  	defer c.log.Unlock()
  1076  
  1077  	for _, svc := range c.log.svcs {
  1078  		if svc.ID() == uuid {
  1079  			return svc, nil
  1080  		}
  1081  	}
  1082  	return nil, moerr.NewNoServiceNoCtx(uuid)
  1083  }
  1084  
  1085  func (c *testCluster) GetCNService(uuid string) (CNService, error) {
  1086  	c.log.Lock()
  1087  	defer c.log.Unlock()
  1088  
  1089  	for _, svc := range c.cn.svcs {
  1090  		if svc.ID() == uuid {
  1091  			return svc, nil
  1092  		}
  1093  	}
  1094  	return nil, moerr.NewNoServiceNoCtx(uuid)
  1095  }
  1096  
  1097  func (c *testCluster) GetTNServiceIndexed(index int) (TNService, error) {
  1098  	c.tn.Lock()
  1099  	defer c.tn.Unlock()
  1100  
  1101  	if index >= len(c.tn.svcs) || index < 0 {
  1102  		return nil, moerr.NewInvalidServiceIndexNoCtx(index)
  1103  	}
  1104  	return c.tn.svcs[index], nil
  1105  }
  1106  
  1107  func (c *testCluster) GetLogServiceIndexed(index int) (LogService, error) {
  1108  	c.log.Lock()
  1109  	defer c.log.Unlock()
  1110  
  1111  	if index >= len(c.log.svcs) || index < 0 {
  1112  		return nil, moerr.NewInvalidServiceIndexNoCtx(index)
  1113  	}
  1114  	return c.log.svcs[index], nil
  1115  }
  1116  
  1117  func (c *testCluster) GetCNServiceIndexed(index int) (CNService, error) {
  1118  	c.log.Lock()
  1119  	defer c.log.Unlock()
  1120  
  1121  	if index >= len(c.cn.svcs) || index < 0 {
  1122  		return nil, moerr.NewInvalidServiceIndexNoCtx(index)
  1123  	}
  1124  	return c.cn.svcs[index], nil
  1125  }
  1126  
  1127  // NB: we could also fetch cluster state from non-leader hakeeper.
  1128  func (c *testCluster) GetClusterState(
  1129  	ctx context.Context,
  1130  ) (*logpb.CheckerState, error) {
  1131  	c.WaitHAKeeperState(ctx, logpb.HAKeeperRunning)
  1132  	leader := c.WaitHAKeeperLeader(ctx)
  1133  	return leader.GetClusterState()
  1134  }
  1135  
  1136  // --------------------------------------------------------------
  1137  // The following are implements for interface `ClusterOperation`.
  1138  // --------------------------------------------------------------
  1139  func (c *testCluster) CloseTNService(uuid string) error {
  1140  	ds, err := c.GetTNService(uuid)
  1141  	if err != nil {
  1142  		return err
  1143  	}
  1144  	return ds.Close()
  1145  }
  1146  
  1147  func (c *testCluster) StartTNService(uuid string) error {
  1148  	ds, err := c.GetTNService(uuid)
  1149  	if err != nil {
  1150  		return err
  1151  	}
  1152  	return ds.Start()
  1153  }
  1154  
  1155  func (c *testCluster) CloseTNServiceIndexed(index int) error {
  1156  	ds, err := c.GetTNServiceIndexed(index)
  1157  	if err != nil {
  1158  		return err
  1159  	}
  1160  	return ds.Close()
  1161  }
  1162  
  1163  func (c *testCluster) StartTNServiceIndexed(index int) error {
  1164  	ds, err := c.GetTNServiceIndexed(index)
  1165  	if err != nil {
  1166  		return err
  1167  	}
  1168  	return ds.Start()
  1169  }
  1170  
  1171  func (c *testCluster) CloseLogService(uuid string) error {
  1172  	ls, err := c.GetLogService(uuid)
  1173  	if err != nil {
  1174  		return err
  1175  	}
  1176  	return ls.Close()
  1177  }
  1178  
  1179  func (c *testCluster) StartLogService(uuid string) error {
  1180  	ls, err := c.GetLogService(uuid)
  1181  	if err != nil {
  1182  		return err
  1183  	}
  1184  	return ls.Start()
  1185  }
  1186  
  1187  func (c *testCluster) CloseLogServiceIndexed(index int) error {
  1188  	ls, err := c.GetLogServiceIndexed(index)
  1189  	if err != nil {
  1190  		return err
  1191  	}
  1192  	return ls.Close()
  1193  }
  1194  
  1195  func (c *testCluster) StartLogServiceIndexed(index int) error {
  1196  	ls, err := c.GetLogServiceIndexed(index)
  1197  	if err != nil {
  1198  		return err
  1199  	}
  1200  	return ls.Start()
  1201  }
  1202  
  1203  func (c *testCluster) CloseCNService(uuid string) error {
  1204  	cs, err := c.GetCNService(uuid)
  1205  	if err != nil {
  1206  		return err
  1207  	}
  1208  	return cs.Close()
  1209  }
  1210  
  1211  func (c *testCluster) StartCNService(uuid string) error {
  1212  	cs, err := c.GetCNService(uuid)
  1213  	if err != nil {
  1214  		return err
  1215  	}
  1216  	return cs.Start()
  1217  }
  1218  
  1219  func (c *testCluster) CloseCNServiceIndexed(index int) error {
  1220  	cs, err := c.GetCNServiceIndexed(index)
  1221  	if err != nil {
  1222  		return err
  1223  	}
  1224  	return cs.Close()
  1225  }
  1226  
  1227  func (c *testCluster) StartCNServiceIndexed(index int) error {
  1228  	cs, err := c.GetCNServiceIndexed(index)
  1229  	if err != nil {
  1230  		return err
  1231  	}
  1232  	return cs.Start()
  1233  }
  1234  
  1235  func (c *testCluster) StartCNServices(n int) error {
  1236  	offset := len(c.cn.svcs)
  1237  	c.buildCNConfigs(n)
  1238  	c.initCNServices(c.fileservices, offset)
  1239  
  1240  	for _, cs := range c.cn.svcs[offset:] {
  1241  		if err := cs.Start(); err != nil {
  1242  			return err
  1243  		}
  1244  	}
  1245  	return nil
  1246  }
  1247  
  1248  func (c *testCluster) NewNetworkPartition(
  1249  	tnIndexes, logIndexes, cnIndexes []uint32,
  1250  ) NetworkPartition {
  1251  	return newNetworkPartition(
  1252  		c.opt.initial.logServiceNum, logIndexes,
  1253  		c.opt.initial.tnServiceNum, tnIndexes,
  1254  		c.opt.initial.cnServiceNum, cnIndexes,
  1255  	)
  1256  }
  1257  
// RemainingNetworkPartition returns the partition made of every service
// not covered by the given partitions.
//
// NOTE(review): the cn service count is hard-coded to 0 here, whereas
// NewNetworkPartition forwards c.opt.initial.cnServiceNum — so the
// remaining partition never includes cn services. Confirm whether that
// asymmetry is intentional.
func (c *testCluster) RemainingNetworkPartition(
	partitions ...NetworkPartition,
) NetworkPartition {
	return remainingNetworkPartition(c.opt.initial.logServiceNum, c.opt.initial.tnServiceNum, 0, partitions...)
}
  1263  
  1264  func (c *testCluster) StartNetworkPartition(parts ...NetworkPartition) {
  1265  	c.network.Lock()
  1266  	defer c.network.Unlock()
  1267  
  1268  	addressSets := c.network.addresses.buildPartitionAddressSets(parts...)
  1269  	c.network.addressSets = addressSets
  1270  }
  1271  
  1272  func (c *testCluster) CloseNetworkPartition() {
  1273  	c.network.Lock()
  1274  	defer c.network.Unlock()
  1275  
  1276  	c.network.addressSets = nil
  1277  }
  1278  
  1279  // ------------------------------------------------------
  1280  // The following are private utilities for `testCluster`.
  1281  // ------------------------------------------------------
  1282  
// buildServiceAddresses builds addresses for all services, sized by the
// initial log/tn/cn service counts and bound to the configured host.
func (c *testCluster) buildServiceAddresses() *serviceAddresses {
	return newServiceAddresses(
		c.t,
		c.opt.initial.logServiceNum,
		c.opt.initial.tnServiceNum,
		c.opt.initial.cnServiceNum,
		c.opt.hostAddr)
}
  1292  
  1293  // buildTNConfigs builds configurations for all tn services.
  1294  func (c *testCluster) buildTNConfigs() ([]*tnservice.Config, []tnOptions) {
  1295  	batch := c.opt.initial.tnServiceNum
  1296  
  1297  	cfgs := make([]*tnservice.Config, 0, batch)
  1298  	opts := make([]tnOptions, 0, batch)
  1299  	for i := 0; i < batch; i++ {
  1300  		cfg := buildTNConfig(i, c.opt, c.network.addresses)
  1301  		cfgs = append(cfgs, cfg)
  1302  
  1303  		localAddr := cfg.ListenAddress
  1304  		opt := buildTNOptions(cfg, c.backendFilterFactory(localAddr))
  1305  		opts = append(opts, opt)
  1306  	}
  1307  	return cfgs, opts
  1308  }
  1309  
  1310  // buildLogConfigs builds configurations for all log services.
  1311  func (c *testCluster) buildLogConfigs() ([]logservice.Config, []logOptions) {
  1312  	batch := c.opt.initial.logServiceNum
  1313  
  1314  	cfgs := make([]logservice.Config, 0, batch)
  1315  	opts := make([]logOptions, 0, batch)
  1316  	for i := 0; i < batch; i++ {
  1317  		cfg := buildLogConfig(i, c.opt, c.network.addresses)
  1318  		cfgs = append(cfgs, cfg)
  1319  
  1320  		localAddr := cfg.LogServiceServiceAddr()
  1321  		opt := buildLogOptions(cfg, c.backendFilterFactory(localAddr))
  1322  		opts = append(opts, opt)
  1323  	}
  1324  	return cfgs, opts
  1325  }
  1326  
  1327  func (c *testCluster) buildCNConfigs(n int) {
  1328  	offset := len(c.cn.opts)
  1329  	batch := n
  1330  	c.network.addresses.buildCNAddress(c.t, batch, c.opt.hostAddr)
  1331  	for i := 0; i < batch; i++ {
  1332  		cfg := buildCNConfig(i+offset, c.opt, c.network.addresses)
  1333  		c.cn.cfgs = append(c.cn.cfgs, cfg)
  1334  		var opt cnOptions
  1335  		if c.opt.cn.optionFunc != nil {
  1336  			opt = c.opt.cn.optionFunc(i + offset)
  1337  		}
  1338  		opt = append(opt, cnservice.WithLogger(c.logger))
  1339  		c.cn.opts = append(c.cn.opts, opt)
  1340  
  1341  		c.fileservices.cnLocalFSs = append(c.fileservices.cnLocalFSs,
  1342  			c.createFS(context.Background(), filepath.Join(c.opt.rootDataDir, cfg.UUID), defines.LocalFileServiceName))
  1343  		c.fileservices.cnServiceNum++
  1344  	}
  1345  }
  1346  
  1347  // initTNServices builds all tn services.
  1348  //
  1349  // Before initializing tn service, log service must be started already.
  1350  func (c *testCluster) initTNServices(fileservices *fileServices) []TNService {
  1351  	batch := c.opt.initial.tnServiceNum
  1352  
  1353  	c.logger.Info("initialize tn services", zap.Int("batch", batch))
  1354  
  1355  	svcs := make([]TNService, 0, batch)
  1356  	for i := 0; i < batch; i++ {
  1357  		cfg := c.tn.cfgs[i]
  1358  		opt := c.tn.opts[i]
  1359  		fs, err := fileservice.NewFileServices(
  1360  			"",
  1361  			fileservices.getTNLocalFileService(i),
  1362  			fileservices.getS3FileService(),
  1363  		)
  1364  		if err != nil {
  1365  			panic(err)
  1366  		}
  1367  		ds, err := newTNService(
  1368  			cfg,
  1369  			c.newRuntime(),
  1370  			fs,
  1371  			opt)
  1372  		require.NoError(c.t, err)
  1373  
  1374  		c.logger.Info(
  1375  			"dn service initialized",
  1376  			zap.Int("index", i),
  1377  			zap.Any("config", cfg),
  1378  		)
  1379  
  1380  		svcs = append(svcs, ds)
  1381  	}
  1382  
  1383  	return svcs
  1384  }
  1385  
  1386  // initLogServices builds all log services.
  1387  func (c *testCluster) initLogServices() []LogService {
  1388  	batch := c.opt.initial.logServiceNum
  1389  
  1390  	c.logger.Info("initialize log services", zap.Int("batch", batch))
  1391  
  1392  	svcs := make([]LogService, 0, batch)
  1393  	for i := 0; i < batch; i++ {
  1394  		cfg := c.log.cfgs[i]
  1395  		opt := c.log.opts[i]
  1396  		ls, err := newLogService(cfg, testutil.NewFS(), opt)
  1397  		require.NoError(c.t, err)
  1398  
  1399  		c.logger.Info(
  1400  			"log service initialized",
  1401  			zap.Int("index", i),
  1402  			zap.Any("config", cfg),
  1403  		)
  1404  
  1405  		svcs = append(svcs, ls)
  1406  	}
  1407  	return svcs
  1408  }
  1409  
  1410  func (c *testCluster) initCNServices(
  1411  	fileservices *fileServices,
  1412  	offset int) {
  1413  	batch := len(c.cn.cfgs)
  1414  
  1415  	c.logger.Info("initialize cn services", zap.Int("batch", batch))
  1416  	for i := offset; i < batch; i++ {
  1417  		cfg := c.cn.cfgs[i]
  1418  		opt := c.cn.opts[i]
  1419  		fs, err := fileservice.NewFileServices(
  1420  			"",
  1421  			fileservices.getCNLocalFileService(i),
  1422  			fileservices.getS3FileService(),
  1423  			fileservices.getETLFileService(),
  1424  		)
  1425  		if err != nil {
  1426  			panic(err)
  1427  		}
  1428  		ctx, cancel := context.WithCancel(context.Background())
  1429  		cs, err := newCNService(cfg, ctx, fs, opt)
  1430  		if err != nil {
  1431  			panic(err)
  1432  		}
  1433  		cs.SetCancel(cancel)
  1434  
  1435  		c.logger.Info(
  1436  			"cn service initialized",
  1437  			zap.Int("index", i),
  1438  			zap.Any("config", cfg),
  1439  		)
  1440  
  1441  		c.cn.svcs = append(c.cn.svcs, cs)
  1442  	}
  1443  }
  1444  
  1445  // startTNServices initializes and starts all tn services.
  1446  func (c *testCluster) startTNServices(ctx context.Context) error {
  1447  	// initialize all tn services
  1448  	c.tn.svcs = c.initTNServices(c.fileservices)
  1449  
  1450  	// start tn services
  1451  	for _, ds := range c.tn.svcs {
  1452  		if err := ds.Start(); err != nil {
  1453  			return err
  1454  		}
  1455  	}
  1456  
  1457  	c.WaitTNShardsReported(ctx)
  1458  	return nil
  1459  }
  1460  
  1461  // startLogServices initializes and starts all log services.
  1462  func (c *testCluster) startLogServices(ctx context.Context) error {
  1463  	// initialize all log service
  1464  	c.log.svcs = c.initLogServices()
  1465  
  1466  	// start log services
  1467  	for _, ls := range c.log.svcs {
  1468  		if err := ls.Start(); err != nil {
  1469  			return err
  1470  		}
  1471  	}
  1472  
  1473  	// start hakeeper replicas
  1474  	if err := c.startHAKeeperReplica(); err != nil {
  1475  		return err
  1476  	}
  1477  
  1478  	// initialize cluster information
  1479  	if err := c.setInitialClusterInfo(); err != nil {
  1480  		return err
  1481  	}
  1482  
  1483  	c.WaitHAKeeperState(ctx, logpb.HAKeeperRunning)
  1484  	return nil
  1485  }
  1486  
  1487  func (c *testCluster) startCNServices(ctx context.Context) error {
  1488  	c.initCNServices(c.fileservices, 0)
  1489  
  1490  	for _, cs := range c.cn.svcs {
  1491  		if err := cs.Start(); err != nil {
  1492  			return err
  1493  		}
  1494  	}
  1495  
  1496  	return nil
  1497  }
  1498  
  1499  // closeTNServices closes all tn services.
  1500  func (c *testCluster) closeTNServices() error {
  1501  	c.logger.Info("start to close tn services")
  1502  
  1503  	for i, ds := range c.tn.svcs {
  1504  		c.logger.Info("close tn service", zap.Int("index", i))
  1505  		if err := ds.Close(); err != nil {
  1506  			return err
  1507  		}
  1508  		c.logger.Info("dn service closed", zap.Int("index", i))
  1509  	}
  1510  
  1511  	return nil
  1512  }
  1513  
  1514  // closeLogServices closes all log services.
  1515  func (c *testCluster) closeLogServices() error {
  1516  	defer logutil.LogClose(c.logger, "tests-framework/logservices")()
  1517  
  1518  	for i, ls := range c.log.svcs {
  1519  		c.logger.Info("close log service", zap.Int("index", i))
  1520  		if err := ls.Close(); err != nil {
  1521  			return err
  1522  		}
  1523  		c.logger.Info("log service closed", zap.Int("index", i))
  1524  	}
  1525  
  1526  	return nil
  1527  }
  1528  
  1529  func (c *testCluster) closeCNServices() error {
  1530  	defer logutil.LogClose(c.logger, "tests-framework/cnservices")()
  1531  
  1532  	for i, cs := range c.cn.svcs {
  1533  		c.logger.Info("close cn service", zap.Int("index", i))
  1534  		if err := cs.Close(); err != nil {
  1535  			return err
  1536  		}
  1537  		c.logger.Info("cn service closed", zap.Int("index", i))
  1538  	}
  1539  
  1540  	return nil
  1541  }
  1542  
  1543  // getClusterState fetches cluster state from arbitrary hakeeper.
  1544  //
  1545  // NB: it's possible that getClusterState returns nil value.
  1546  func (c *testCluster) getClusterState() *logpb.CheckerState {
  1547  	var state *logpb.CheckerState
  1548  	fn := func(index int, svc LogService) bool {
  1549  		s, err := svc.GetClusterState()
  1550  		if err != nil {
  1551  			c.logger.Error(
  1552  				"fail to get cluster state",
  1553  				zap.Error(err),
  1554  				zap.Int("index", index),
  1555  			)
  1556  			return false
  1557  		}
  1558  		state = s
  1559  		// XXX MPOOL
  1560  		// Too much logging can break CI.
  1561  		// c.logger.Info("current cluster state", zap.Any("state", s))
  1562  		return true
  1563  	}
  1564  	c.rangeHAKeeperService(fn)
  1565  	return state
  1566  }
  1567  
  1568  // getHAKeeperLeader gets log service which is hakeeper leader.
  1569  func (c *testCluster) getHAKeeperLeader() LogService {
  1570  	var leader LogService
  1571  	fn := func(index int, svc LogService) bool {
  1572  		isLeader, err := svc.IsLeaderHakeeper()
  1573  		if err != nil {
  1574  			c.logger.Error(
  1575  				"fail to check hakeeper",
  1576  				zap.Error(err),
  1577  				zap.Int("index", index),
  1578  			)
  1579  			return false
  1580  		}
  1581  		c.logger.Info(
  1582  			"hakeeper state",
  1583  			zap.Bool("isLeader", isLeader),
  1584  			zap.Int("index", index),
  1585  		)
  1586  
  1587  		if isLeader {
  1588  			leader = svc
  1589  			return true
  1590  		}
  1591  
  1592  		return false
  1593  	}
  1594  	c.rangeHAKeeperService(fn)
  1595  	return leader
  1596  }
  1597  
  1598  // rangeHAKeeperService iterates all hakeeper service until `fn` returns true.
  1599  func (c *testCluster) rangeHAKeeperService(
  1600  	fn func(index int, svc LogService) bool,
  1601  ) {
  1602  	for i, svc := range c.selectHAkeeperServices() {
  1603  		index := i
  1604  
  1605  		if svc.Status() != ServiceStarted {
  1606  			c.logger.Warn(
  1607  				"hakeeper service not started",
  1608  				zap.Int("index", index),
  1609  			)
  1610  			continue
  1611  		}
  1612  
  1613  		if fn(index, svc) {
  1614  			break
  1615  		}
  1616  	}
  1617  }
  1618  
// newRuntime builds a runtime bound to the cluster's logger and clock.
//
// NOTE(review): the service type is hard-coded to ServiceType_CN even
// though this runtime is also handed to tn services (see
// initTNServices) — confirm the type is irrelevant for these tests.
func (c *testCluster) newRuntime() runtime.Runtime {
	return runtime.NewRuntime(metadata.ServiceType_CN, "", c.logger, runtime.WithClock(c.clock))
}
  1622  
// FilterFunc returns true if traffic was allowed. It is consulted with
// the message being sent and the backend address it targets.
type FilterFunc func(morpc.Message, string) bool
  1625  
  1626  // backendFilterFactory constructs a closure with the type of FilterFunc.
  1627  func (c *testCluster) backendFilterFactory(localAddr string) FilterFunc {
  1628  	return func(_ morpc.Message, backendAddr string) bool {
  1629  		// NB: it's possible that partition takes effect once more after disabled.
  1630  		c.network.RLock()
  1631  		addressSets := c.network.addressSets
  1632  		c.network.RUnlock()
  1633  
  1634  		if len(addressSets) == 0 {
  1635  			return true
  1636  		}
  1637  
  1638  		for _, addrSet := range addressSets {
  1639  			if addrSet.contains(localAddr) &&
  1640  				addrSet.contains(backendAddr) {
  1641  				return true
  1642  			}
  1643  		}
  1644  
  1645  		c.logger.Info(
  1646  			"traffic not allowed",
  1647  			zap.String("local", localAddr),
  1648  			zap.String("backend", backendAddr),
  1649  		)
  1650  
  1651  		return false
  1652  	}
  1653  }