github.com/matrixorigin/matrixone@v0.7.0/pkg/tests/service/service_test.go (about)

     1  // Copyright 2021 - 2022 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package service
    16  
    17  import (
    18  	"context"
    19  	"testing"
    20  
    21  	"github.com/lni/goutils/leaktest"
    22  	"github.com/stretchr/testify/assert"
    23  	"github.com/stretchr/testify/require"
    24  
    25  	"github.com/matrixorigin/matrixone/pkg/logservice"
    26  	logpb "github.com/matrixorigin/matrixone/pkg/pb/logservice"
    27  )
    28  
    29  const (
    30  	supportMultiDN = false
    31  )
    32  
    33  func TestClusterStart(t *testing.T) {
    34  	defer leaktest.AfterTest(t)()
    35  	if testing.Short() {
    36  		t.Skip("skipping in short mode.")
    37  		return
    38  	}
    39  
    40  	// initialize cluster
    41  	c, err := NewCluster(t, DefaultOptions())
    42  	require.NoError(t, err)
    43  	// close the cluster
    44  	defer func(c Cluster) {
    45  		require.NoError(t, c.Close())
    46  	}(c)
    47  	// start the cluster
    48  	require.NoError(t, c.Start())
    49  }
    50  
    51  func TestAllocateID(t *testing.T) {
    52  	defer leaktest.AfterTest(t)()
    53  	if testing.Short() {
    54  		t.Skip("skipping in short mode.")
    55  		return
    56  	}
    57  
    58  	// initialize cluster
    59  	c, err := NewCluster(t, DefaultOptions())
    60  	require.NoError(t, err)
    61  
    62  	// close the cluster
    63  	defer func(c Cluster) {
    64  		require.NoError(t, c.Close())
    65  	}(c)
    66  	// start the cluster
    67  	require.NoError(t, c.Start())
    68  
    69  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
    70  	defer cancel()
    71  	c.WaitHAKeeperState(ctx, logpb.HAKeeperRunning)
    72  
    73  	cfg := logservice.HAKeeperClientConfig{
    74  		ServiceAddresses: []string{c.(*testCluster).network.addresses.logAddresses[0].listenAddr},
    75  		AllocateIDBatch:  10,
    76  	}
    77  	hc, err := logservice.NewCNHAKeeperClient(ctx, cfg)
    78  	require.NoError(t, err)
    79  	defer func() {
    80  		assert.NoError(t, hc.Close())
    81  	}()
    82  
    83  	last := uint64(0)
    84  	for i := 0; i < int(cfg.AllocateIDBatch)-1; i++ {
    85  		v, err := hc.AllocateID(ctx)
    86  		require.NoError(t, err)
    87  		assert.True(t, v > 0)
    88  		if last != 0 {
    89  			assert.Equal(t, v, last+1, i)
    90  		}
    91  		last = v
    92  	}
    93  }
    94  
    95  func TestClusterAwareness(t *testing.T) {
    96  	defer leaktest.AfterTest(t)()
    97  	if testing.Short() {
    98  		t.Skip("skipping in short mode.")
    99  		return
   100  	}
   101  
   102  	if !supportMultiDN {
   103  		t.Skip("skipping, multi db not support")
   104  		return
   105  	}
   106  
   107  	dnSvcNum := 2
   108  	logSvcNum := 3
   109  	opt := DefaultOptions().
   110  		WithDNServiceNum(dnSvcNum).
   111  		WithLogServiceNum(logSvcNum)
   112  
   113  	// initialize cluster
   114  	c, err := NewCluster(t, opt)
   115  	require.NoError(t, err)
   116  
   117  	// close the cluster
   118  	defer func(c Cluster) {
   119  		require.NoError(t, c.Close())
   120  	}(c)
   121  	// start the cluster
   122  	require.NoError(t, c.Start())
   123  
   124  	// -------------------------------------------
   125  	// the following would test `ClusterAwareness`
   126  	// -------------------------------------------
   127  	dsuuids := c.ListDNServices()
   128  	require.Equal(t, dnSvcNum, len(dsuuids))
   129  
   130  	lsuuids := c.ListLogServices()
   131  	require.Equal(t, logSvcNum, len(lsuuids))
   132  
   133  	hksvcs := c.ListHAKeeperServices()
   134  	require.NotZero(t, len(hksvcs))
   135  
   136  	dn, err := c.GetDNService(dsuuids[0])
   137  	require.NoError(t, err)
   138  	require.Equal(t, ServiceStarted, dn.Status())
   139  
   140  	log, err := c.GetLogService(lsuuids[0])
   141  	require.NoError(t, err)
   142  	require.Equal(t, ServiceStarted, log.Status())
   143  
   144  	ctx1, cancel1 := context.WithTimeout(context.Background(), defaultTestTimeout)
   145  	defer cancel1()
   146  	leader := c.WaitHAKeeperLeader(ctx1)
   147  	require.NotNil(t, leader)
   148  
   149  	// we must wait for hakeeper's running state, or hakeeper wouldn't receive hearbeat.
   150  	ctx2, cancel2 := context.WithTimeout(context.Background(), defaultTestTimeout)
   151  	defer cancel2()
   152  	c.WaitHAKeeperState(ctx2, logpb.HAKeeperRunning)
   153  
   154  	ctx3, cancel3 := context.WithTimeout(context.Background(), defaultTestTimeout)
   155  	defer cancel3()
   156  	state, err := c.GetClusterState(ctx3)
   157  	require.NoError(t, err)
   158  	require.Equal(t, dnSvcNum, len(state.DNState.Stores))
   159  	require.Equal(t, logSvcNum, len(state.LogState.Stores))
   160  }
   161  
   162  func TestClusterOperation(t *testing.T) {
   163  	defer leaktest.AfterTest(t)()
   164  	if testing.Short() {
   165  		t.Skip("skipping in short mode.")
   166  		return
   167  	}
   168  
   169  	if !supportMultiDN {
   170  		t.Skip("skipping, multi db not support")
   171  		return
   172  	}
   173  
   174  	dnSvcNum := 3
   175  	logSvcNum := 3
   176  	opt := DefaultOptions().
   177  		WithDNServiceNum(dnSvcNum).
   178  		WithLogServiceNum(logSvcNum)
   179  
   180  	// initialize cluster
   181  	c, err := NewCluster(t, opt)
   182  	require.NoError(t, err)
   183  
   184  	// close the cluster
   185  	defer func(c Cluster) {
   186  		require.NoError(t, c.Close())
   187  	}(c)
   188  	// start the cluster
   189  	require.NoError(t, c.Start())
   190  
   191  	// -------------------------------------------
   192  	// the following would test `ClusterOperation`
   193  	// -------------------------------------------
   194  
   195  	// 1. start/close dn services via different ways
   196  	dsuuids := c.ListDNServices()
   197  	require.Equal(t, dnSvcNum, len(dsuuids))
   198  	// 1.a start/close dn service by uuid
   199  	{
   200  		index := 0
   201  		dsuuid := dsuuids[index]
   202  
   203  		// get the instance of dn service
   204  		ds, err := c.GetDNService(dsuuid)
   205  		require.NoError(t, err)
   206  		require.Equal(t, ServiceStarted, ds.Status())
   207  
   208  		// start it
   209  		err = c.StartDNService(dsuuid)
   210  		require.NoError(t, err)
   211  		require.Equal(t, ServiceStarted, ds.Status())
   212  
   213  		// close it
   214  		err = c.CloseDNService(dsuuid)
   215  		require.NoError(t, err)
   216  		require.Equal(t, ServiceClosed, ds.Status())
   217  	}
   218  
   219  	// 1.b start/close dn service by index
   220  	{
   221  		index := 1
   222  
   223  		// get the instance of dn service
   224  		ds, err := c.GetDNServiceIndexed(index)
   225  		require.NoError(t, err)
   226  		require.Equal(t, ServiceStarted, ds.Status())
   227  
   228  		// start it
   229  		err = c.StartDNServiceIndexed(index)
   230  		require.NoError(t, err)
   231  		require.Equal(t, ServiceStarted, ds.Status())
   232  
   233  		// close it
   234  		err = c.CloseDNServiceIndexed(index)
   235  		require.NoError(t, err)
   236  		require.Equal(t, ServiceClosed, ds.Status())
   237  	}
   238  
   239  	// 1.c start/close dn service by instance
   240  	{
   241  		index := 2
   242  
   243  		// get the instance of dn service
   244  		ds, err := c.GetDNServiceIndexed(index)
   245  		require.NoError(t, err)
   246  		require.Equal(t, ServiceStarted, ds.Status())
   247  
   248  		// start it
   249  		err = ds.Start()
   250  		require.NoError(t, err)
   251  		require.Equal(t, ServiceStarted, ds.Status())
   252  
   253  		// close it
   254  		err = ds.Close()
   255  		require.NoError(t, err)
   256  		require.Equal(t, ServiceClosed, ds.Status())
   257  	}
   258  
   259  	// 2. start/close log services by different ways
   260  	lsuuids := c.ListLogServices()
   261  	require.Equal(t, logSvcNum, len(lsuuids))
   262  	// 2.a start/close log service by uuid
   263  	{
   264  		index := 0
   265  		lsuuid := lsuuids[index]
   266  
   267  		// get the instance of log service
   268  		ls, err := c.GetLogService(lsuuid)
   269  		require.NoError(t, err)
   270  		require.Equal(t, ServiceStarted, ls.Status())
   271  
   272  		// start it
   273  		err = c.StartLogService(lsuuid)
   274  		require.NoError(t, err)
   275  		require.Equal(t, ServiceStarted, ls.Status())
   276  
   277  		// close it
   278  		err = c.CloseLogService(lsuuid)
   279  		require.NoError(t, err)
   280  		require.Equal(t, ServiceClosed, ls.Status())
   281  	}
   282  
   283  	// 2.b start/close log service by index
   284  	{
   285  		index := 1
   286  
   287  		// get the instance of log service
   288  		ls, err := c.GetLogServiceIndexed(index)
   289  		require.NoError(t, err)
   290  		require.Equal(t, ServiceStarted, ls.Status())
   291  
   292  		// start it
   293  		err = c.StartLogServiceIndexed(index)
   294  		require.NoError(t, err)
   295  		require.Equal(t, ServiceStarted, ls.Status())
   296  
   297  		// close it
   298  		err = c.CloseLogServiceIndexed(index)
   299  		require.NoError(t, err)
   300  		require.Equal(t, ServiceClosed, ls.Status())
   301  	}
   302  
   303  	// 2.c start/close log service by instance
   304  	{
   305  		index := 2
   306  
   307  		// get the instance of log service
   308  		ls, err := c.GetLogServiceIndexed(index)
   309  		require.NoError(t, err)
   310  		require.Equal(t, ServiceStarted, ls.Status())
   311  
   312  		// start it
   313  		err = ls.Start()
   314  		require.NoError(t, err)
   315  		require.Equal(t, ServiceStarted, ls.Status())
   316  
   317  		// close it
   318  		err = ls.Close()
   319  		require.NoError(t, err)
   320  		require.Equal(t, ServiceClosed, ls.Status())
   321  	}
   322  }
   323  
   324  func TestClusterState(t *testing.T) {
   325  	defer leaktest.AfterTest(t)()
   326  	if testing.Short() {
   327  		t.Skip("skipping in short mode.")
   328  		return
   329  	}
   330  
   331  	if !supportMultiDN {
   332  		t.Skip("skipping, multi db not support")
   333  		return
   334  	}
   335  
   336  	dnSvcNum := 2
   337  	logSvcNum := 3
   338  	opt := DefaultOptions().
   339  		WithDNServiceNum(dnSvcNum).
   340  		WithLogServiceNum(logSvcNum)
   341  
   342  	// initialize cluster
   343  	c, err := NewCluster(t, opt)
   344  	require.NoError(t, err)
   345  
   346  	// close the cluster
   347  	defer func(c Cluster) {
   348  		require.NoError(t, c.Close())
   349  	}(c)
   350  	// start the cluster
   351  	require.NoError(t, c.Start())
   352  
   353  	// ----------------------------------------
   354  	// the following would test `ClusterState`.
   355  	// ----------------------------------------
   356  	ctx1, cancel1 := context.WithTimeout(context.Background(), defaultTestTimeout)
   357  	defer cancel1()
   358  	leader := c.WaitHAKeeperLeader(ctx1)
   359  	require.NotNil(t, leader)
   360  
   361  	dsuuids := c.ListDNServices()
   362  	require.Equal(t, dnSvcNum, len(dsuuids))
   363  
   364  	lsuuids := c.ListLogServices()
   365  	require.Equal(t, logSvcNum, len(lsuuids))
   366  
   367  	// we must wait for hakeeper's running state, or hakeeper wouldn't receive hearbeat.
   368  	ctx2, cancel2 := context.WithTimeout(context.Background(), defaultTestTimeout)
   369  	defer cancel2()
   370  	c.WaitHAKeeperState(ctx2, logpb.HAKeeperRunning)
   371  
   372  	hkstate := c.GetHAKeeperState()
   373  	require.Equal(t, logpb.HAKeeperRunning, hkstate)
   374  
   375  	// cluster should be healthy
   376  	require.True(t, c.IsClusterHealthy())
   377  
   378  	ctx3, cancel3 := context.WithTimeout(context.Background(), defaultTestTimeout)
   379  	defer cancel3()
   380  	state, err := c.GetClusterState(ctx3)
   381  	require.NoError(t, err)
   382  	require.Equal(t, dnSvcNum, len(state.DNState.Stores))
   383  	require.Equal(t, logSvcNum, len(state.LogState.Stores))
   384  
   385  	// FIXME: validate the result list of dn shards
   386  	ctx4, cancel4 := context.WithTimeout(context.Background(), defaultTestTimeout)
   387  	defer cancel4()
   388  	_, err = c.ListDNShards(ctx4)
   389  	require.NoError(t, err)
   390  
   391  	// FIXME: validate the result list of log shards
   392  	ctx5, cancel5 := context.WithTimeout(context.Background(), defaultTestTimeout)
   393  	defer cancel5()
   394  	_, err = c.ListLogShards(ctx5)
   395  	require.NoError(t, err)
   396  
   397  	// test for:
   398  	//   - GetDNStoreInfo
   399  	//   - GetDNStoreInfoIndexed
   400  	//   - DNStoreExpired
   401  	//   - DNStoreExpiredIndexed
   402  	{
   403  		dnIndex := 0
   404  		dsuuid := dsuuids[dnIndex]
   405  
   406  		ctx6, cancel6 := context.WithTimeout(context.Background(), defaultTestTimeout)
   407  		defer cancel6()
   408  		dnStoreInfo1, err := c.GetDNStoreInfo(ctx6, dsuuid)
   409  		require.NoError(t, err)
   410  
   411  		ctx7, cancel7 := context.WithTimeout(context.Background(), defaultTestTimeout)
   412  		defer cancel7()
   413  		dnStoreInfo2, err := c.GetDNStoreInfoIndexed(ctx7, dnIndex)
   414  		require.NoError(t, err)
   415  		require.Equal(t, dnStoreInfo1.Shards, dnStoreInfo2.Shards)
   416  
   417  		expired1, err := c.DNStoreExpired(dsuuid)
   418  		require.NoError(t, err)
   419  		require.False(t, expired1)
   420  
   421  		expired2, err := c.DNStoreExpiredIndexed(dnIndex)
   422  		require.NoError(t, err)
   423  		require.False(t, expired2)
   424  	}
   425  
   426  	// test for:
   427  	//   - GetLogStoreInfo
   428  	//   - GetLogStoreInfoIndexed
   429  	//   - LogStoreExpired
   430  	//   - LogStoreExpiredIndexed
   431  	{
   432  		logIndex := 1
   433  		lsuuid := lsuuids[logIndex]
   434  
   435  		ctx8, cancel8 := context.WithTimeout(context.Background(), defaultTestTimeout)
   436  		defer cancel8()
   437  		logStoreInfo1, err := c.GetLogStoreInfo(ctx8, lsuuid)
   438  		require.NoError(t, err)
   439  
   440  		ctx9, cancel9 := context.WithTimeout(context.Background(), defaultTestTimeout)
   441  		defer cancel9()
   442  		logStoreInfo2, err := c.GetLogStoreInfoIndexed(ctx9, logIndex)
   443  		require.NoError(t, err)
   444  		require.Equal(t, len(logStoreInfo1.Replicas), len(logStoreInfo2.Replicas)) // TODO: sort and compare detail.
   445  
   446  		expired1, err := c.LogStoreExpired(lsuuid)
   447  		require.NoError(t, err)
   448  		require.False(t, expired1)
   449  
   450  		expired2, err := c.LogStoreExpiredIndexed(logIndex)
   451  		require.NoError(t, err)
   452  		require.False(t, expired2)
   453  	}
   454  }
   455  
   456  func TestClusterWaitState(t *testing.T) {
   457  	defer leaktest.AfterTest(t)()
   458  	if testing.Short() {
   459  		t.Skip("skipping in short mode.")
   460  		return
   461  	}
   462  
   463  	if !supportMultiDN {
   464  		t.Skip("skipping, multi db not support")
   465  		return
   466  	}
   467  
   468  	dnSvcNum := 2
   469  	logSvcNum := 3
   470  	opt := DefaultOptions().
   471  		WithDNServiceNum(dnSvcNum).
   472  		WithLogServiceNum(logSvcNum)
   473  
   474  	// initialize cluster
   475  	c, err := NewCluster(t, opt)
   476  	require.NoError(t, err)
   477  
   478  	// close the cluster
   479  	defer func(c Cluster) {
   480  		require.NoError(t, c.Close())
   481  	}(c)
   482  	// start the cluster
   483  	require.NoError(t, c.Start())
   484  
   485  	// we must wait for hakeeper's running state, or hakeeper wouldn't receive hearbeat.
   486  	ctx1, cancel1 := context.WithTimeout(context.Background(), defaultTestTimeout)
   487  	defer cancel1()
   488  	c.WaitHAKeeperState(ctx1, logpb.HAKeeperRunning)
   489  
   490  	// --------------------------------------------
   491  	// the following would test `ClusterWaitState`.
   492  	// --------------------------------------------
   493  
   494  	// test WaitDNShardsReported
   495  	{
   496  		ctx2, cancel2 := context.WithTimeout(context.Background(), defaultTestTimeout)
   497  		defer cancel2()
   498  		c.WaitDNShardsReported(ctx2)
   499  	}
   500  
   501  	// test WaitLogShardsReported
   502  	{
   503  		ctx3, cancel3 := context.WithTimeout(context.Background(), defaultTestTimeout)
   504  		defer cancel3()
   505  		c.WaitLogShardsReported(ctx3)
   506  	}
   507  
   508  	// test WaitDNReplicaReported
   509  	{
   510  		ctx4, cancel4 := context.WithTimeout(context.Background(), defaultTestTimeout)
   511  		defer cancel4()
   512  		dnShards, err := c.ListDNShards(ctx4)
   513  		require.NoError(t, err)
   514  		require.NotZero(t, len(dnShards))
   515  
   516  		dnShardID := dnShards[0].ShardID
   517  		ctx5, cancel5 := context.WithTimeout(context.Background(), defaultTestTimeout)
   518  		defer cancel5()
   519  		c.WaitDNReplicaReported(ctx5, dnShardID)
   520  	}
   521  
   522  	// test WaitLogReplicaReported
   523  	{
   524  		ctx6, cancel6 := context.WithTimeout(context.Background(), defaultTestTimeout)
   525  		defer cancel6()
   526  		logShards, err := c.ListLogShards(ctx6)
   527  		require.NotZero(t, len(logShards))
   528  		require.NoError(t, err)
   529  
   530  		logShardID := logShards[0].ShardID
   531  		ctx7, cancel7 := context.WithTimeout(context.Background(), defaultTestTimeout)
   532  		defer cancel7()
   533  		c.WaitLogReplicaReported(ctx7, logShardID)
   534  	}
   535  }
   536  
   537  func TestNetworkPartition(t *testing.T) {
   538  	defer leaktest.AfterTest(t)()
   539  	if testing.Short() {
   540  		t.Skip("skipping in short mode.")
   541  		return
   542  	}
   543  
   544  	if !supportMultiDN {
   545  		t.Skip("skipping, multi db not support")
   546  		return
   547  	}
   548  
   549  	dnSvcNum := 2
   550  	logSvcNum := 4
   551  	opt := DefaultOptions().
   552  		WithDNServiceNum(dnSvcNum).
   553  		WithLogServiceNum(logSvcNum)
   554  
   555  	// initialize cluster
   556  	c, err := NewCluster(t, opt)
   557  	require.NoError(t, err)
   558  
   559  	// close the cluster
   560  	defer func(c Cluster) {
   561  		require.NoError(t, c.Close())
   562  	}(c)
   563  	// start the cluster
   564  	require.NoError(t, c.Start())
   565  
   566  	// we must wait for hakeeper's running state, or hakeeper wouldn't receive hearbeat.
   567  	ctx1, cancel1 := context.WithTimeout(context.Background(), defaultTestTimeout)
   568  	defer cancel1()
   569  	c.WaitHAKeeperState(ctx1, logpb.HAKeeperRunning)
   570  
   571  	// --------------------------------------------
   572  	// the following would test network partition
   573  	// --------------------------------------------
   574  
   575  	// dn service index: 0, 1
   576  	// log service index: 0, 1, 2, 3
   577  	// seperate dn service 1 from other services
   578  	partition1 := c.NewNetworkPartition([]uint32{1}, nil, nil)
   579  	require.Equal(t, []uint32{1}, partition1.ListDNServiceIndex())
   580  	require.Nil(t, partition1.ListLogServiceIndex())
   581  
   582  	partition2 := c.RemainingNetworkPartition(partition1)
   583  	require.Equal(t, []uint32{0}, partition2.ListDNServiceIndex())
   584  	require.Equal(t, []uint32{0, 1, 2, 3}, partition2.ListLogServiceIndex())
   585  
   586  	// enable network partition
   587  	c.StartNetworkPartition(partition1, partition2)
   588  	ctx2, cancel2 := context.WithTimeout(context.Background(), defaultTestTimeout)
   589  	defer cancel2()
   590  	c.WaitDNStoreTimeoutIndexed(ctx2, 1)
   591  
   592  	// disable network partition
   593  	c.CloseNetworkPartition()
   594  	ctx3, cancel3 := context.WithTimeout(context.Background(), defaultTestTimeout)
   595  	defer cancel3()
   596  	c.WaitDNStoreReportedIndexed(ctx3, 1)
   597  
   598  	// dn service index: 0, 1
   599  	// log service index: 0, 1, 2, 3
   600  	// seperate log service 3 from other services
   601  	partition3 := c.NewNetworkPartition(nil, []uint32{3}, nil)
   602  	require.Nil(t, partition3.ListDNServiceIndex())
   603  	require.Equal(t, []uint32{3}, partition3.ListLogServiceIndex())
   604  
   605  	partition4 := c.RemainingNetworkPartition(partition3)
   606  	require.Equal(t, []uint32{0, 1}, partition4.ListDNServiceIndex())
   607  	require.Equal(t, []uint32{0, 1, 2}, partition4.ListLogServiceIndex())
   608  
   609  	// enable network partition
   610  	c.StartNetworkPartition(partition3, partition4)
   611  	ctx4, cancel4 := context.WithTimeout(context.Background(), defaultTestTimeout)
   612  	defer cancel4()
   613  	c.WaitLogStoreTimeoutIndexed(ctx4, 3)
   614  
   615  	// disable network partition
   616  	c.CloseNetworkPartition()
   617  	ctx5, cancel5 := context.WithTimeout(context.Background(), defaultTestTimeout)
   618  	defer cancel5()
   619  	c.WaitLogStoreReportedIndexed(ctx5, 3)
   620  }