github.com/matrixorigin/matrixone@v0.7.0/pkg/logservice/service_test.go

// Copyright 2021 - 2022 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package logservice

import (
	"context"
	"fmt"
	"runtime/debug"
	"sync"
	"testing"
	"time"

	"github.com/google/uuid"
	"github.com/lni/dragonboat/v4"
	"github.com/lni/goutils/leaktest"
	"github.com/lni/vfs"
	"github.com/matrixorigin/matrixone/pkg/common/moerr"
	"github.com/matrixorigin/matrixone/pkg/common/morpc"
	hapkg "github.com/matrixorigin/matrixone/pkg/hakeeper"
	pb "github.com/matrixorigin/matrixone/pkg/pb/logservice"
	"github.com/matrixorigin/matrixone/pkg/testutil"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

const (
	testServiceAddress     = "127.0.0.1:9000"
	testGossipAddress      = "127.0.0.1:9010"
	dummyGossipSeedAddress = "127.0.0.1:9100"
	testServerMaxMsgSize   = 1000
)

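// getServiceTestConfig returns a single node Config suitable for unit
// tests: an in-memory strict vfs, the service and gossip endpoints defined
// above, background workers disabled and the tee LogDB enabled.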
func getServiceTestConfig() Config {
	c := Config{
		UUID:                 uuid.New().String(),
		RTTMillisecond:       10,
		GossipAddress:        testGossipAddress,
		GossipListenAddress:  testGossipAddress,
		GossipSeedAddresses:  []string{testGossipAddress, dummyGossipSeedAddress},
		DeploymentID:         1,
		FS:                   vfs.NewStrictMem(),
		ServiceListenAddress: testServiceAddress,
		ServiceAddress:       testServiceAddress,
		DisableWorkers:       true,
		UseTeeLogDB:          true,
	}
	c.RPC.MaxMessageSize = testServerMaxMsgSize
	c.Fill()
	return c
}

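// runServiceTest creates a Service from getServiceTestConfig, optionally
// starts a replica (an HAKeeper replica when hakeeper is true, otherwise a
// replica of log shard 1), waits for a leader to be elected, runs fn
// against the service and closes it afterwards.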
func runServiceTest(t *testing.T,
	hakeeper bool, startReplica bool, fn func(*testing.T, *Service)) {
	defer leaktest.AfterTest(t)()
	cfg := getServiceTestConfig()
	defer vfs.ReportLeakedFD(cfg.FS, t)
	service, err := NewService(cfg,
		testutil.NewFS(),
		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
			return true
		}),
	)
	require.NoError(t, err)
	defer func() {
		assert.NoError(t, service.Close())
	}()

	if startReplica {
		shardID := hapkg.DefaultHAKeeperShardID
		peers := make(map[uint64]dragonboat.Target)
		peers[1] = service.ID()
		if hakeeper {
			require.NoError(t, service.store.startHAKeeperReplica(1, peers, false))
		} else {
			shardID = 1
			require.NoError(t, service.store.startReplica(1, 1, peers, false))
		}

		// wait for leader to be elected
		done := false
		for i := 0; i < 1000; i++ {
			_, _, ok, err := service.store.nh.GetLeaderID(shardID)
			require.NoError(t, err)
			if ok {
				done = true
				break
			}
			time.Sleep(10 * time.Millisecond)
		}
		require.True(t, done)
	}

	fn(t, service)
}

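// TestNewService checks that a service can be created and cleanly closed.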
func TestNewService(t *testing.T) {
	defer leaktest.AfterTest(t)()
	cfg := getServiceTestConfig()
	defer vfs.ReportLeakedFD(cfg.FS, t)
	service, err := NewService(cfg,
		testutil.NewFS(),
		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
			return true
		}),
	)
	require.NoError(t, err)
	assert.NoError(t, service.Close())
}

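// TestServiceConnect checks that a CONNECT request from a DN succeeds
// against a running log shard.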
func TestServiceConnect(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.CONNECT,
			LogRequest: pb.LogRequest{
				ShardID: 1,
				DNID:    100,
			},
		}
		resp := s.handleConnect(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
	}
	runServiceTest(t, false, true, fn)
}

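// TestServiceConnectTimeout uses a 1ms context deadline so the connect is
// expected to fail inside dragonboat with a timeout error.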
func TestServiceConnectTimeout(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond)
		defer cancel()

		req := pb.Request{
			Method: pb.CONNECT,
			LogRequest: pb.LogRequest{
				ShardID: 1,
				DNID:    100,
			},
		}
		resp := s.handleConnect(ctx, req)
		assert.Equal(t, uint32(moerr.ErrDragonboatTimeout), resp.ErrorCode)
	}
	runServiceTest(t, false, true, fn)
}

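// TestServiceConnectRO is the read-only variant of TestServiceConnect,
// issuing a CONNECT_RO request instead of CONNECT.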
func TestServiceConnectRO(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.CONNECT_RO,
			LogRequest: pb.LogRequest{
				ShardID: 1,
				DNID:    100,
			},
		}
		resp := s.handleConnect(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
	}
	runServiceTest(t, false, true, fn)
}

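// getTestAppendCmd builds a user entry payload in the format expected by
// the log service: a headerSize byte header carrying the pb.UserEntryUpdate
// tag, an 8 byte lease holder (DN) ID, followed by the user data.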
func getTestAppendCmd(id uint64, data []byte) []byte {
	cmd := make([]byte, len(data)+headerSize+8)
	binaryEnc.PutUint32(cmd, uint32(pb.UserEntryUpdate))
	binaryEnc.PutUint64(cmd[headerSize:], id)
	copy(cmd[headerSize+8:], data)
	return cmd
}

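// TestServiceHandleLogHeartbeat checks that a log store heartbeat returns
// only the schedule commands addressed to the heartbeating store: sc1 and
// sc3 target "uuid1" and are returned, sc2 targets "uuid2" and is not.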
func TestServiceHandleLogHeartbeat(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.LOG_HEARTBEAT,
			LogHeartbeat: &pb.LogStoreHeartbeat{
				UUID: "uuid1",
			},
		}
		sc1 := pb.ScheduleCommand{
			UUID: "uuid1",
			ConfigChange: &pb.ConfigChange{
				Replica: pb.Replica{
					ShardID: 1,
				},
			},
		}
		sc2 := pb.ScheduleCommand{
			UUID: "uuid2",
			ConfigChange: &pb.ConfigChange{
				Replica: pb.Replica{
					ShardID: 2,
				},
			},
		}
		sc3 := pb.ScheduleCommand{
			UUID: "uuid1",
			ConfigChange: &pb.ConfigChange{
				Replica: pb.Replica{
					ShardID: 3,
				},
			},
		}
		require.NoError(t,
			s.store.addScheduleCommands(ctx, 1, []pb.ScheduleCommand{sc1, sc2, sc3}))
		resp := s.handleLogHeartbeat(ctx, req)
		require.Equal(t, []pb.ScheduleCommand{sc1, sc3}, resp.CommandBatch.Commands)
	}
	runServiceTest(t, true, true, fn)
}

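// TestServiceHandleCNHeartbeat checks that a CN store heartbeat succeeds
// and currently gets an empty command batch back.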
func TestServiceHandleCNHeartbeat(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.CN_HEARTBEAT,
			CNHeartbeat: &pb.CNStoreHeartbeat{
				UUID: "uuid1",
			},
		}
		resp := s.handleCNHeartbeat(ctx, req)
		assert.Equal(t, &pb.CommandBatch{}, resp.CommandBatch)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
	}
	runServiceTest(t, true, true, fn)
}

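// TestServiceHandleDNHeartbeat checks that schedule commands are filtered
// by store UUID for DN heartbeats in the same way as for log heartbeats.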
func TestServiceHandleDNHeartbeat(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.DN_HEARTBEAT,
			DNHeartbeat: &pb.DNStoreHeartbeat{
				UUID: "uuid1",
			},
		}
		sc1 := pb.ScheduleCommand{
			UUID: "uuid1",
			ConfigChange: &pb.ConfigChange{
				Replica: pb.Replica{
					ShardID: 1,
				},
			},
		}
		sc2 := pb.ScheduleCommand{
			UUID: "uuid2",
			ConfigChange: &pb.ConfigChange{
				Replica: pb.Replica{
					ShardID: 2,
				},
			},
		}
		sc3 := pb.ScheduleCommand{
			UUID: "uuid1",
			ConfigChange: &pb.ConfigChange{
				Replica: pb.Replica{
					ShardID: 3,
				},
			},
		}
		require.NoError(t,
			s.store.addScheduleCommands(ctx, 1, []pb.ScheduleCommand{sc1, sc2, sc3}))
		resp := s.handleDNHeartbeat(ctx, req)
		require.Equal(t, []pb.ScheduleCommand{sc1, sc3}, resp.CommandBatch.Commands)
	}
	runServiceTest(t, true, true, fn)
}

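// TestServiceHandleAppend appends a user entry after connecting. The
// returned Lsn is 4 because, as TestServiceHandleRead shows, the first
// three entries of the shard are two internal entries and a lease update.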
func TestServiceHandleAppend(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.CONNECT_RO,
			LogRequest: pb.LogRequest{
				ShardID: 1,
				DNID:    100,
			},
		}
		resp := s.handleConnect(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)

		data := make([]byte, 8)
		cmd := getTestAppendCmd(req.LogRequest.DNID, data)
		req = pb.Request{
			Method: pb.APPEND,
			LogRequest: pb.LogRequest{
				ShardID: 1,
			},
		}
		resp = s.handleAppend(ctx, req, cmd)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.Equal(t, uint64(4), resp.LogResponse.Lsn)
	}
	runServiceTest(t, false, true, fn)
}

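// TestServiceHandleAppendWhenNotBeingTheLeaseHolder checks that an append
// carrying a lease holder ID other than the connected DN's (DNID+1 here)
// is rejected with ErrNotLeaseHolder and no LSN is assigned.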
func TestServiceHandleAppendWhenNotBeingTheLeaseHolder(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.CONNECT_RO,
			LogRequest: pb.LogRequest{
				ShardID: 1,
				DNID:    100,
			},
		}
		resp := s.handleConnect(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)

		data := make([]byte, 8)
		cmd := getTestAppendCmd(req.LogRequest.DNID+1, data)
		req = pb.Request{
			Method: pb.APPEND,
			LogRequest: pb.LogRequest{
				ShardID: 1,
			},
		}
		resp = s.handleAppend(ctx, req, cmd)
		assert.Equal(t, uint32(moerr.ErrNotLeaseHolder), resp.ErrorCode)
		assert.Equal(t, uint64(0), resp.LogResponse.Lsn)
	}
	runServiceTest(t, false, true, fn)
}

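// TestServiceHandleRead appends a user entry and reads the shard back from
// Lsn 1, checking the type of each returned record and the payload of the
// user record.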
func TestServiceHandleRead(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.CONNECT_RO,
			LogRequest: pb.LogRequest{
				ShardID: 1,
				DNID:    100,
			},
		}
		resp := s.handleConnect(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)

		data := make([]byte, 8)
		cmd := getTestAppendCmd(req.LogRequest.DNID, data)
		req = pb.Request{
			Method: pb.APPEND,
			LogRequest: pb.LogRequest{
				ShardID: 1,
			},
		}
		resp = s.handleAppend(ctx, req, cmd)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.Equal(t, uint64(4), resp.LogResponse.Lsn)

		req = pb.Request{
			Method: pb.READ,
			LogRequest: pb.LogRequest{
				ShardID: 1,
				Lsn:     1,
				MaxSize: 1024 * 32,
			},
		}
		resp, records := s.handleRead(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.Equal(t, uint64(1), resp.LogResponse.LastLsn)
		require.Equal(t, 4, len(records.Records))
		assert.Equal(t, pb.Internal, records.Records[0].Type)
		assert.Equal(t, pb.Internal, records.Records[1].Type)
		assert.Equal(t, pb.LeaseUpdate, records.Records[2].Type)
		assert.Equal(t, pb.UserRecord, records.Records[3].Type)
		assert.Equal(t, cmd, records.Records[3].Data)
	}
	runServiceTest(t, false, true, fn)
}

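// TestServiceTruncate truncates the shard at the LSN of the appended user
// record, checks that GET_TRUNCATE reports that LSN, and checks that
// truncating to a smaller LSN fails with ErrInvalidTruncateLsn.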
func TestServiceTruncate(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.CONNECT_RO,
			LogRequest: pb.LogRequest{
				ShardID: 1,
				DNID:    100,
			},
		}
		resp := s.handleConnect(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)

		data := make([]byte, 8)
		cmd := getTestAppendCmd(req.LogRequest.DNID, data)
		req = pb.Request{
			Method: pb.APPEND,
			LogRequest: pb.LogRequest{
				ShardID: 1,
			},
		}
		resp = s.handleAppend(ctx, req, cmd)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.Equal(t, uint64(4), resp.LogResponse.Lsn)

		req = pb.Request{
			Method: pb.TRUNCATE,
			LogRequest: pb.LogRequest{
				ShardID: 1,
				Lsn:     4,
			},
		}
		resp = s.handleTruncate(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.Equal(t, uint64(0), resp.LogResponse.Lsn)

		req = pb.Request{
			Method: pb.GET_TRUNCATE,
			LogRequest: pb.LogRequest{
				ShardID: 1,
			},
		}
		resp = s.handleGetTruncatedIndex(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.Equal(t, uint64(4), resp.LogResponse.Lsn)

		req = pb.Request{
			Method: pb.TRUNCATE,
			LogRequest: pb.LogRequest{
				ShardID: 1,
				Lsn:     3,
			},
		}
		resp = s.handleTruncate(ctx, req)
		assert.Equal(t, uint32(moerr.ErrInvalidTruncateLsn), resp.ErrorCode)
	}
	runServiceTest(t, false, true, fn)
}

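// TestServiceTsoUpdate checks TSO allocation: each request returns the
// first value of the newly allocated range, so Count=100 starting from 1
// yields 1, and the following Count=1000 requests yield 101 and 1101.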
func TestServiceTsoUpdate(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.TSO_UPDATE,
			TsoRequest: &pb.TsoRequest{
				Count: 100,
			},
		}
		resp := s.handleTsoUpdate(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.Equal(t, uint64(1), resp.TsoResponse.Value)

		req.TsoRequest.Count = 1000
		resp = s.handleTsoUpdate(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.Equal(t, uint64(101), resp.TsoResponse.Value)

		resp = s.handleTsoUpdate(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.Equal(t, uint64(1101), resp.TsoResponse.Value)
	}
	runServiceTest(t, false, true, fn)
}

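// TestServiceCheckHAKeeper checks that CHECK_HAKEEPER reports false on a
// service without an HAKeeper replica and true once one has been started.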
func TestServiceCheckHAKeeper(t *testing.T) {
	fn := func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		req := pb.Request{
			Method: pb.CHECK_HAKEEPER,
		}
		resp := s.handleCheckHAKeeper(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.False(t, resp.IsHAKeeper)
	}
	runServiceTest(t, false, false, fn)

	fn = func(t *testing.T, s *Service) {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()

		init := make(map[uint64]dragonboat.Target)
		init[1] = s.ID()
		require.NoError(t, s.store.startHAKeeperReplica(1, init, false))
		req := pb.Request{
			Method: pb.CHECK_HAKEEPER,
		}
		resp := s.handleCheckHAKeeper(ctx, req)
		assert.Equal(t, uint32(moerr.Ok), resp.ErrorCode)
		assert.True(t, resp.IsHAKeeper)
	}
	runServiceTest(t, false, false, fn)
}

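// TestShardInfoCanBeQueried starts two services, each hosting a
// single-replica shard (shard 1 on service1, shard 2 on service2), and
// polls until gossip has made both shards' info visible on both services.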
func TestShardInfoCanBeQueried(t *testing.T) {
	defer leaktest.AfterTest(t)()
	cfg1 := Config{
		UUID:                uuid.New().String(),
		FS:                  vfs.NewStrictMem(),
		DeploymentID:        1,
		RTTMillisecond:      5,
		DataDir:             "data-1",
		ServiceAddress:      "127.0.0.1:9002",
		RaftAddress:         "127.0.0.1:9000",
		GossipAddress:       "127.0.0.1:9001",
		GossipSeedAddresses: []string{"127.0.0.1:9011"},
		DisableWorkers:      true,
	}
	cfg2 := Config{
		UUID:                uuid.New().String(),
		FS:                  vfs.NewStrictMem(),
		DeploymentID:        1,
		RTTMillisecond:      5,
		DataDir:             "data-2",
		ServiceAddress:      "127.0.0.1:9012",
		RaftAddress:         "127.0.0.1:9010",
		GossipAddress:       "127.0.0.1:9011",
		GossipSeedAddresses: []string{"127.0.0.1:9001"},
		DisableWorkers:      true,
	}
	cfg1.Fill()
	service1, err := NewService(cfg1,
		testutil.NewFS(),
		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
			return true
		}),
	)
	require.NoError(t, err)
	defer func() {
		assert.NoError(t, service1.Close())
	}()
	peers1 := make(map[uint64]dragonboat.Target)
	peers1[1] = service1.ID()
	assert.NoError(t, service1.store.startReplica(1, 1, peers1, false))
	cfg2.Fill()
	service2, err := NewService(cfg2,
		testutil.NewFS(),
		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
			return true
		}),
	)
	require.NoError(t, err)
	defer func() {
		assert.NoError(t, service2.Close())
	}()
	peers2 := make(map[uint64]dragonboat.Target)
	peers2[1] = service2.ID()
	assert.NoError(t, service2.store.startReplica(2, 1, peers2, false))

	nhID1 := service1.ID()
	nhID2 := service2.ID()

	done := false

	// FIXME:
	// as per #3478, this test is flaky. The loop count has been increased
	// to 6000 to see whether gossip can finish syncing within 6 seconds,
	// and some logging has been added to collect more details.
	for i := 0; i < 6000; i++ {
		si1, ok := service1.getShardInfo(1)
		if !ok || si1.LeaderID != 1 {
			testLogger.Error("shard 1 info missing on service 1")
			time.Sleep(time.Millisecond)
			continue
		}
		assert.Equal(t, 1, len(si1.Replicas))
		require.Equal(t, uint64(1), si1.ShardID)
		ri, ok := si1.Replicas[1]
		assert.True(t, ok)
		assert.Equal(t, nhID1, ri.UUID)
		assert.Equal(t, cfg1.ServiceAddress, ri.ServiceAddress)

		si2, ok := service1.getShardInfo(2)
		if !ok || si2.LeaderID != 1 {
			testLogger.Error("shard 2 info missing on service 1")
			time.Sleep(time.Millisecond)
			continue
		}
		assert.Equal(t, 1, len(si2.Replicas))
		require.Equal(t, uint64(2), si2.ShardID)
		ri, ok = si2.Replicas[1]
		assert.True(t, ok)
		assert.Equal(t, nhID2, ri.UUID)
		assert.Equal(t, cfg2.ServiceAddress, ri.ServiceAddress)

		si1, ok = service2.getShardInfo(1)
		if !ok || si1.LeaderID != 1 {
			testLogger.Error("shard 1 info missing on service 2")
			time.Sleep(time.Millisecond)
			continue
		}
		assert.Equal(t, 1, len(si1.Replicas))
		require.Equal(t, uint64(1), si1.ShardID)
		ri, ok = si1.Replicas[1]
		assert.True(t, ok)
		assert.Equal(t, nhID1, ri.UUID)
		assert.Equal(t, cfg1.ServiceAddress, ri.ServiceAddress)

		si2, ok = service2.getShardInfo(2)
		if !ok || si2.LeaderID != 1 {
			testLogger.Error("shard 2 info missing on service 2")
			time.Sleep(time.Millisecond)
			continue
		}
		assert.Equal(t, 1, len(si2.Replicas))
		require.Equal(t, uint64(2), si2.ShardID)
		ri, ok = si2.Replicas[1]
		assert.True(t, ok)
		assert.Equal(t, nhID2, ri.UUID)
		assert.Equal(t, cfg2.ServiceAddress, ri.ServiceAddress)

		done = true
		break
	}
	assert.True(t, done)
}

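// TestGossipInSimulatedCluster starts 24 services hosting 8 three-replica
// shards, waits for all leaders to become visible everywhere via gossip,
// adds a fourth replica to shard 1 and checks that the change propagates
// to all services, then restarts one service and waits until it has
// learned all shard info again.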
func TestGossipInSimulatedCluster(t *testing.T) {
	defer leaktest.AfterTest(t)()
	debug.SetMemoryLimit(1 << 30)
	// start all services
	nodeCount := 24
	shardCount := nodeCount / 3
	configs := make([]Config, 0)
	services := make([]*Service, 0)
	for i := 0; i < nodeCount; i++ {
		cfg := Config{
			FS:             vfs.NewStrictMem(),
			UUID:           uuid.New().String(),
			DeploymentID:   1,
			RTTMillisecond: 200,
			DataDir:        fmt.Sprintf("data-%d", i),
			ServiceAddress: fmt.Sprintf("127.0.0.1:%d", 26000+10*i),
			RaftAddress:    fmt.Sprintf("127.0.0.1:%d", 26000+10*i+1),
			GossipAddress:  fmt.Sprintf("127.0.0.1:%d", 26000+10*i+2),
			GossipSeedAddresses: []string{
				"127.0.0.1:26002",
				"127.0.0.1:26012",
				"127.0.0.1:26022",
				"127.0.0.1:26032",
				"127.0.0.1:26042",
				"127.0.0.1:26052",
				"127.0.0.1:26062",
				"127.0.0.1:26072",
				"127.0.0.1:26082",
				"127.0.0.1:26092",
			},
			DisableWorkers:  true,
			LogDBBufferSize: 1024 * 16,
		}
		cfg.GossipProbeInterval.Duration = 350 * time.Millisecond
		configs = append(configs, cfg)
		service, err := NewService(cfg,
			testutil.NewFS(),
			WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
				return true
			}),
		)
		require.NoError(t, err)
		services = append(services, service)
	}
	defer func() {
		testLogger.Info("going to close all services")
		var wg sync.WaitGroup
		for _, s := range services {
			if s != nil {
				selected := s
				wg.Add(1)
				go func() {
					require.NoError(t, selected.Close())
					wg.Done()
					testLogger.Info("closed a service")
				}()
			}
		}
		wg.Wait()
		time.Sleep(time.Second * 2)
	}()
	// start all replicas
	// shardID: [1, 8]
	id := uint64(100)
	for i := uint64(0); i < uint64(shardCount); i++ {
		shardID := i + 1
		r1 := id
		r2 := id + 1
		r3 := id + 2
		id += 3
		replicas := make(map[uint64]dragonboat.Target)
		replicas[r1] = services[i*3].ID()
		replicas[r2] = services[i*3+1].ID()
		replicas[r3] = services[i*3+2].ID()
		require.NoError(t, services[i*3+0].store.startReplica(shardID, r1, replicas, false))
		require.NoError(t, services[i*3+1].store.startReplica(shardID, r2, replicas, false))
		require.NoError(t, services[i*3+2].store.startReplica(shardID, r3, replicas, false))
	}
	wait := func() {
		time.Sleep(50 * time.Millisecond)
	}
	// check & wait for all leaders to be elected and known to all services
	cci := uint64(0)
	iterations := 1000
	for retry := 0; retry < iterations; retry++ {
		notReady := 0
		for i := 0; i < nodeCount; i++ {
			shardID := uint64(i/3 + 1)
			service := services[i]
			info, ok := service.getShardInfo(shardID)
			if !ok || info.LeaderID == 0 {
				notReady++
				wait()
				continue
			}
			if shardID == 1 && info.Epoch != 0 {
				cci = info.Epoch
			}
		}
		if notReady <= 1 {
			break
		}
		require.True(t, retry < iterations-1)
	}
	require.True(t, cci != 0)
	// all good now, add a replica to shard 1
	id++

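	// Retry the membership change: on transient dragonboat errors (timeout,
	// busy, bad deadline) the loop checks whether shard 1 already has 4
	// replicas and otherwise retries; ErrRejected is treated as the change
	// having already been applied.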
	for i := 0; i < iterations; i++ {
		err := services[0].store.addReplica(1, id, services[3].ID(), cci)
		if err == nil {
			break
		} else if err == dragonboat.ErrTimeout || err == dragonboat.ErrSystemBusy ||
			err == dragonboat.ErrInvalidDeadline || err == dragonboat.ErrTimeoutTooSmall {
			info, ok := services[0].getShardInfo(1)
			if ok && info.LeaderID != 0 && len(info.Replicas) == 4 {
				break
			}
			wait()
			continue
		} else if err == dragonboat.ErrRejected {
			break
		}
		t.Fatalf("failed to add replica, %v", err)
	}

	// check the above change can be observed by all services
	for retry := 0; retry < iterations; retry++ {
		notReady := 0
		for i := 0; i < nodeCount; i++ {
			service := services[i]
			info, ok := service.getShardInfo(1)
			if !ok || info.LeaderID == 0 || len(info.Replicas) != 4 {
				notReady++
				wait()
				continue
			}
		}
		if notReady <= 1 {
			break
		}
		require.True(t, retry < iterations-1)
	}
	// restart a service and watch how long it takes to get all required
	// shard info
	require.NoError(t, services[12].Close())
	services[12] = nil
	time.Sleep(2 * time.Second)
	service, err := NewService(configs[12],
		testutil.NewFS(),
		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
			return true
		}),
	)
	require.NoError(t, err)
	defer func() {
		require.NoError(t, service.Close())
	}()
	for retry := 0; retry < iterations; retry++ {
		notReady := 0
		for i := uint64(0); i < uint64(shardCount); i++ {
			shardID := i + 1
			info, ok := service.getShardInfo(shardID)
			if !ok || info.LeaderID == 0 {
				notReady++
				wait()
				continue
			}
		}
		if notReady <= 1 {
			break
		}
		require.True(t, retry < iterations-1)
	}
}