github.com/matrixorigin/matrixone@v1.2.0/pkg/logservice/store_hakeeper_check_test.go (about)

     1  // Copyright 2021 - 2022 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package logservice
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"testing"
    21  	"time"
    22  
    23  	"github.com/google/uuid"
    24  	"github.com/lni/dragonboat/v4"
    25  	"github.com/lni/goutils/leaktest"
    26  	"github.com/lni/vfs"
    27  	"github.com/stretchr/testify/assert"
    28  	"github.com/stretchr/testify/require"
    29  
    30  	"github.com/matrixorigin/matrixone/pkg/common/morpc"
    31  	"github.com/matrixorigin/matrixone/pkg/common/runtime"
    32  	"github.com/matrixorigin/matrixone/pkg/hakeeper"
    33  	pb "github.com/matrixorigin/matrixone/pkg/pb/logservice"
    34  	"github.com/matrixorigin/matrixone/pkg/pb/task"
    35  	"github.com/matrixorigin/matrixone/pkg/taskservice"
    36  )
    37  
    38  func TestIDAllocatorDefaultState(t *testing.T) {
    39  	alloc := newIDAllocator()
    40  	assert.Equal(t, uint64(0), alloc.Capacity())
    41  	v, ok := alloc.Next()
    42  	assert.False(t, ok)
    43  	assert.Equal(t, uint64(0), v)
    44  }
    45  
    46  func TestIDAllocatorCapacity(t *testing.T) {
    47  	tests := []struct {
    48  		next     uint64
    49  		last     uint64
    50  		capacity uint64
    51  	}{
    52  		{1, 1, 1},
    53  		{2, 1, 0},
    54  		{1, 2, 2},
    55  		{100, 200, 101},
    56  	}
    57  
    58  	for _, tt := range tests {
    59  		alloc := idAllocator{nextID: tt.next, lastID: tt.last}
    60  		assert.Equal(t, tt.capacity, alloc.Capacity())
    61  	}
    62  }
    63  
    64  func TestIDAllocatorSet(t *testing.T) {
    65  	alloc := idAllocator{nextID: 100, lastID: 200}
    66  	alloc.Set(hakeeper.K8SIDRangeEnd, hakeeper.K8SIDRangeEnd+100)
    67  	expected := idAllocator{
    68  		nextID: hakeeper.K8SIDRangeEnd,
    69  		lastID: hakeeper.K8SIDRangeEnd + 100,
    70  	}
    71  	assert.Equal(t, expected, alloc)
    72  }
    73  
    74  func TestIDAllocatorRejectInvalidSetInput(t *testing.T) {
    75  	alloc := idAllocator{nextID: 100, lastID: 200}
    76  	defer func() {
    77  		if r := recover(); r == nil {
    78  			t.Fatalf("failed to trigger panic")
    79  		}
    80  	}()
    81  	alloc.Set(300, 400)
    82  }
    83  
    84  func TestIDAllocatorNext(t *testing.T) {
    85  	tests := []struct {
    86  		next     uint64
    87  		last     uint64
    88  		capacity uint64
    89  	}{
    90  		{1, 1, 1},
    91  		{2, 1, 0},
    92  		{1, 2, 2},
    93  		{100, 200, 101},
    94  	}
    95  
    96  	for _, tt := range tests {
    97  		expected := tt.next
    98  		alloc := idAllocator{nextID: tt.next, lastID: tt.last}
    99  		for {
   100  			hasID := alloc.Capacity() != 0
   101  			v, ok := alloc.Next()
   102  			assert.Equal(t, hasID, ok)
   103  			if hasID {
   104  				assert.Equal(t, expected, v)
   105  				expected++
   106  			} else {
   107  				assert.Equal(t, uint64(0), v)
   108  				break
   109  			}
   110  		}
   111  	}
   112  }
   113  
   114  func TestHandleBootstrapFailure(t *testing.T) {
   115  	defer func() {
   116  		if r := recover(); r == nil {
   117  			t.Fatalf("failed to trigger panic")
   118  		}
   119  	}()
   120  	s := store{}
   121  	s.handleBootstrapFailure()
   122  }
   123  
   124  func runHAKeeperStoreTest(t *testing.T, startLogReplica bool, fn func(*testing.T, *store)) {
   125  	defer leaktest.AfterTest(t)()
   126  	cfg := getStoreTestConfig()
   127  	defer vfs.ReportLeakedFD(cfg.FS, t)
   128  	store, err := getTestStore(cfg, startLogReplica, nil)
   129  	assert.NoError(t, err)
   130  	defer func() {
   131  		assert.NoError(t, store.close())
   132  	}()
   133  	peers := make(map[uint64]dragonboat.Target)
   134  	peers[1] = store.id()
   135  	assert.NoError(t, store.startHAKeeperReplica(1, peers, false))
   136  	fn(t, store)
   137  }
   138  
   139  func runHakeeperTaskServiceTest(t *testing.T, fn func(*testing.T, *store, taskservice.TaskService)) {
   140  	defer leaktest.AfterTest(t)()
   141  	cfg := getStoreTestConfig()
   142  	cfg.HAKeeperConfig.CNStoreTimeout.Duration = 5 * time.Second
   143  	defer vfs.ReportLeakedFD(cfg.FS, t)
   144  
   145  	taskService := taskservice.NewTaskService(runtime.DefaultRuntime(), taskservice.NewMemTaskStorage())
   146  	defer taskService.StopScheduleCronTask()
   147  
   148  	store, err := getTestStore(cfg, false, taskService)
   149  	assert.NoError(t, err)
   150  	defer func() {
   151  		assert.NoError(t, store.close())
   152  	}()
   153  	peers := make(map[uint64]dragonboat.Target)
   154  	peers[1] = store.id()
   155  	assert.NoError(t, store.startHAKeeperReplica(1, peers, false))
   156  	fn(t, store, taskService)
   157  }
   158  
   159  func runHAKeeperClusterTest(t *testing.T, fn func(*testing.T, []*Service)) {
   160  	defer leaktest.AfterTest(t)()
   161  	cfg1 := DefaultConfig()
   162  	cfg1.UUID = uuid.New().String()
   163  	cfg1.FS = vfs.NewStrictMem()
   164  	cfg1.DeploymentID = 1
   165  	cfg1.RTTMillisecond = 5
   166  	cfg1.DataDir = "data-1"
   167  	cfg1.LogServicePort = 9002
   168  	cfg1.RaftPort = 9000
   169  	cfg1.GossipPort = 9001
   170  	cfg1.GossipSeedAddresses = []string{"127.0.0.1:9011", "127.0.0.1:9021", "127.0.0.1:9031"}
   171  	cfg1.DisableWorkers = true
   172  	cfg1.HAKeeperConfig.TickPerSecond = 10
   173  	cfg1.HAKeeperConfig.LogStoreTimeout.Duration = 5 * time.Second
   174  	cfg1.HAKeeperConfig.TNStoreTimeout.Duration = 10 * time.Second
   175  	cfg1.HAKeeperConfig.CNStoreTimeout.Duration = 5 * time.Second
   176  	cfg2 := DefaultConfig()
   177  	cfg2.UUID = uuid.New().String()
   178  	cfg2.FS = vfs.NewStrictMem()
   179  	cfg2.DeploymentID = 1
   180  	cfg2.RTTMillisecond = 5
   181  	cfg2.DataDir = "data-2"
   182  	cfg2.LogServicePort = 9012
   183  	cfg2.RaftPort = 9010
   184  	cfg2.GossipPort = 9011
   185  	cfg2.GossipSeedAddresses = []string{"127.0.0.1:9001", "127.0.0.1:9021", "127.0.0.1:9031"}
   186  	cfg2.DisableWorkers = true
   187  	cfg2.HAKeeperConfig.TickPerSecond = 10
   188  	cfg2.HAKeeperConfig.LogStoreTimeout.Duration = 5 * time.Second
   189  	cfg2.HAKeeperConfig.TNStoreTimeout.Duration = 10 * time.Second
   190  	cfg2.HAKeeperConfig.CNStoreTimeout.Duration = 5 * time.Second
   191  	cfg3 := DefaultConfig()
   192  	cfg3.UUID = uuid.New().String()
   193  	cfg3.FS = vfs.NewStrictMem()
   194  	cfg3.DeploymentID = 1
   195  	cfg3.RTTMillisecond = 5
   196  	cfg3.DataDir = "data-3"
   197  	cfg3.LogServicePort = 9022
   198  	cfg3.RaftPort = 9020
   199  	cfg3.GossipPort = 9021
   200  	cfg3.GossipSeedAddresses = []string{"127.0.0.1:9001", "127.0.0.1:9011", "127.0.0.1:9031"}
   201  	cfg3.DisableWorkers = true
   202  	cfg3.HAKeeperConfig.TickPerSecond = 10
   203  	cfg3.HAKeeperConfig.LogStoreTimeout.Duration = 5 * time.Second
   204  	cfg3.HAKeeperConfig.TNStoreTimeout.Duration = 10 * time.Second
   205  	cfg3.HAKeeperConfig.CNStoreTimeout.Duration = 5 * time.Second
   206  	cfg4 := DefaultConfig()
   207  	cfg4.UUID = uuid.New().String()
   208  	cfg4.FS = vfs.NewStrictMem()
   209  	cfg4.DeploymentID = 1
   210  	cfg4.RTTMillisecond = 5
   211  	cfg4.DataDir = "data-4"
   212  	cfg4.LogServicePort = 9032
   213  	cfg4.RaftPort = 9030
   214  	cfg4.GossipPort = 9031
   215  	cfg4.GossipSeedAddresses = []string{"127.0.0.1:9001", "127.0.0.1:9011", "127.0.0.1:9021"}
   216  	cfg4.DisableWorkers = true
   217  	cfg4.HAKeeperConfig.TickPerSecond = 10
   218  	cfg4.HAKeeperConfig.LogStoreTimeout.Duration = 5 * time.Second
   219  	cfg4.HAKeeperConfig.TNStoreTimeout.Duration = 10 * time.Second
   220  	cfg4.HAKeeperConfig.CNStoreTimeout.Duration = 5 * time.Second
   221  	service1, err := NewService(cfg1,
   222  		newFS(),
   223  		nil,
   224  		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
   225  			return true
   226  		}),
   227  	)
   228  	require.NoError(t, err)
   229  	defer func() {
   230  		assert.NoError(t, service1.Close())
   231  	}()
   232  	service2, err := NewService(cfg2,
   233  		newFS(),
   234  		nil,
   235  		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
   236  			return true
   237  		}),
   238  	)
   239  	require.NoError(t, err)
   240  	defer func() {
   241  		assert.NoError(t, service2.Close())
   242  	}()
   243  	service3, err := NewService(cfg3,
   244  		newFS(),
   245  		nil,
   246  		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
   247  			return true
   248  		}),
   249  	)
   250  	require.NoError(t, err)
   251  	defer func() {
   252  		assert.NoError(t, service3.Close())
   253  	}()
   254  	service4, err := NewService(cfg4,
   255  		newFS(),
   256  		nil,
   257  		WithBackendFilter(func(msg morpc.Message, backendAddr string) bool {
   258  			return true
   259  		}),
   260  	)
   261  	require.NoError(t, err)
   262  	defer func() {
   263  		assert.NoError(t, service4.Close())
   264  	}()
   265  
   266  	peers := make(map[uint64]dragonboat.Target)
   267  	peers[1] = service1.ID()
   268  	peers[2] = service2.ID()
   269  	peers[3] = service3.ID()
   270  	assert.NoError(t, service1.store.startHAKeeperReplica(1, peers, false))
   271  	assert.NoError(t, service2.store.startHAKeeperReplica(2, peers, false))
   272  	assert.NoError(t, service3.store.startHAKeeperReplica(3, peers, false))
   273  	fn(t, []*Service{service1, service2, service3, service4})
   274  }
   275  
// TestHAKeeperCanBootstrapAndRepairShards drives a 4 node cluster (3
// HAKeeper replicas plus one spare log store) through bootstrap, then
// closes the first store and verifies that HAKeeper repairs both the
// Log/HAKeeper shards and the TN shard using the remaining nodes.
func TestHAKeeperCanBootstrapAndRepairShards(t *testing.T) {
	fn := func(t *testing.T, services []*Service) {
		// bootstrap the cluster, 1 TN 1 Log shard, Log and HAKeeper have
		// 3 replicas
		hakeeperDefaultTimeout = 10 * time.Second

		store1 := services[0].store
		state, err := store1.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperCreated, state.State)
		nextIDByKey := map[string]uint64{"a": 1, "b": 2}
		require.NoError(t, store1.setInitialClusterInfo(1, 1, 3, hakeeper.K8SIDRangeEnd+10, nextIDByKey))
		state, err = store1.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperBootstrapping, state.State)
		assert.Equal(t, hakeeper.K8SIDRangeEnd+10, state.NextId)
		assert.Equal(t, nextIDByKey, state.NextIDByKey)

		// sendHeartbeat reports each service's log store heartbeat to the
		// HAKeeper, retrying up to 10 times on dragonboat.ErrTimeout.
		sendHeartbeat := func(ss []*Service) {
			for _, s := range ss {
				done := false
				for i := 0; i < 10; i++ {
					m := s.store.getHeartbeatMessage()
					ctx, cancel := context.WithTimeout(context.Background(), time.Second)
					defer cancel()
					_, err := s.store.addLogStoreHeartbeat(ctx, m)
					if err == dragonboat.ErrTimeout {
						time.Sleep(100 * time.Millisecond)
					} else {
						if err == nil {
							done = true
							break
						} else {
							t.Fatalf("failed to add heartbeat %v", err)
						}
					}
				}
				if !done {
					t.Fatalf("failed to add heartbeat after 10 retries")
				}
			}
		}
		sendHeartbeat(services[:3])

		// fake a TN store
		tnMsg := pb.TNStoreHeartbeat{
			UUID:   uuid.New().String(),
			Shards: make([]pb.TNShardInfo, 0),
		}
		ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
		defer cancel()
		_, err = services[0].store.addTNStoreHeartbeat(ctx, tnMsg)
		require.NoError(t, err)

		// find out the leader HAKeeper store as we need the term value
		var term uint64
		var leaderStore *store
		for _, s := range services[:3] {
			isLeader, curTerm, err := s.store.isLeaderHAKeeper()
			require.NoError(t, err)
			if isLeader {
				term = curTerm
				leaderStore = s.store
				break
			}
		}
		require.NotNil(t, leaderStore)
		require.True(t, term > 0)

		// bootstrap the cluster
		state, err = leaderStore.getCheckerState()
		require.NoError(t, err)
		leaderStore.bootstrap(term, state)

		state, err = leaderStore.getCheckerState()
		require.NoError(t, err)
		assert.Equal(t, pb.HAKeeperBootstrapCommandsReceived, state.State)
		assert.Equal(t, uint64(checkBootstrapCycles), leaderStore.bootstrapCheckCycles)
		require.NotNil(t, leaderStore.bootstrapMgr)
		assert.False(t, leaderStore.bootstrapMgr.CheckBootstrap(state.LogState))

		// get and apply all bootstrap schedule commands
		for _, s := range services[:3] {
			cb, err := s.store.getCommandBatch(ctx, s.store.id())
			require.NoError(t, err)
			if len(cb.Commands) > 0 {
				s.handleStartReplica(cb.Commands[0])
			}
		}

		// check bootstrap can be completed
		for i := 0; i < 100; i++ {
			sendHeartbeat(services[:3])
			state, err = leaderStore.getCheckerState()
			require.NoError(t, err)
			leaderStore.checkBootstrap(state)

			state, err = leaderStore.getCheckerState()
			require.NoError(t, err)
			if state.State != pb.HAKeeperRunning {
				// FIXME: why wait here?
				time.Sleep(50 * time.Millisecond)
			} else {
				break
			}
			if i == 99 {
				t.Fatalf("failed to complete bootstrap")
			}
		}

		// get the TN bootstrap command, it contains TN shard and replica ID
		cb, err := leaderStore.getCommandBatch(ctx, tnMsg.UUID)
		require.NoError(t, err)
		require.Equal(t, 1, len(cb.Commands))
		cmd := cb.Commands[0]
		assert.True(t, cmd.Bootstrapping)
		assert.Equal(t, pb.TNService, cmd.ServiceType)
		tnShardInfo := pb.TNShardInfo{
			ShardID:   cmd.ConfigChange.Replica.ShardID,
			ReplicaID: cmd.ConfigChange.Replica.ReplicaID,
		}
		tnMsg.Shards = append(tnMsg.Shards, tnShardInfo)
		// as if TN is running
		_, err = services[0].store.addTNStoreHeartbeat(ctx, tnMsg)
		require.NoError(t, err)
		// fake a free TN store
		tnMsg2 := pb.TNStoreHeartbeat{
			UUID:   uuid.New().String(),
			Shards: make([]pb.TNShardInfo, 0),
		}
		_, err = services[0].store.addTNStoreHeartbeat(ctx, tnMsg2)
		require.NoError(t, err)

		// stop store 1
		require.NoError(t, services[0].Close())
		// no service.Close can be repeatedly called
		services[0].store = nil
		services = services[1:]

		// wait for HAKeeper to repair the Log & HAKeeper shards
		tnRepaired := false
		for i := 0; i < 5000; i++ {
			testLogger.Debug(fmt.Sprintf("iteration %d", i))
			tn := func() (bool, error) {
				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
				defer cancel()
				m := services[0].store.getHeartbeatMessage()
				if cb, err := services[0].store.addLogStoreHeartbeat(ctx, m); err != nil {
					return false, err
				} else {
					services[0].handleCommands(cb.Commands)
				}
				m = services[1].store.getHeartbeatMessage()
				if cb, err := services[1].store.addLogStoreHeartbeat(ctx, m); err != nil {
					return false, err
				} else {
					services[1].handleCommands(cb.Commands)
				}
				m = services[2].store.getHeartbeatMessage()
				// NOTE(review): services[2]'s heartbeat is deliberately
				// proposed through services[0]'s store, presumably because
				// services[2] (the spare node) may not host a HAKeeper
				// replica yet — confirm before "fixing" the asymmetry.
				if cb, err := services[0].store.addLogStoreHeartbeat(ctx, m); err != nil {
					return false, err
				} else {
					services[2].handleCommands(cb.Commands)
				}
				if _, err := services[0].store.addTNStoreHeartbeat(ctx, tnMsg2); err != nil {
					return false, err
				}

				for _, s := range services {
					// shard 0 is the HAKeeper shard; only nodes hosting it
					// can tick/check the cluster state
					if hasShard(s.store, 0) {
						s.store.hakeeperTick()
						s.store.hakeeperCheck()
					}

					// the TN shard is repaired once the free TN store is told
					// to start a replica of the same shard with a newer
					// replica ID than the one lost with store 1
					cb, err = services[0].store.getCommandBatch(ctx, tnMsg2.UUID)
					if err != nil {
						return false, err
					}
					if len(cb.Commands) > 0 {
						cmd := cb.Commands[0]
						if cmd.ServiceType == pb.TNService {
							if cmd.ConfigChange != nil && cmd.ConfigChange.Replica.ShardID == tnShardInfo.ShardID &&
								cmd.ConfigChange.Replica.ReplicaID > tnShardInfo.ReplicaID {
								tnRepaired = true
							}
						}
					}
				}

				// the log side is repaired once every remaining node hosts
				// both the HAKeeper shard (0) and the log shard (1)
				logRepaired := true
				for _, s := range services {
					if !hasShard(s.store, 0) || !hasShard(s.store, 1) {
						logRepaired = false
						break
					}
				}
				// "dnRepaired" in this message looks like a leftover of the
				// DN -> TN rename; the value logged is tnRepaired
				testLogger.Debug(fmt.Sprintf("dnRepaired %t, logRepaired %t", tnRepaired, logRepaired))
				if !logRepaired || !tnRepaired {
					return false, nil
				} else {
					testLogger.Debug(fmt.Sprintf("repair completed, i: %d", i))
					return true, nil
				}
			}
			completed, err := tn()
			// transient dragonboat timeouts are expected while the cluster
			// reconfigures; only unexpected errors fail the test
			if err != nil && err != dragonboat.ErrTimeout &&
				err != dragonboat.ErrInvalidDeadline && err != dragonboat.ErrTimeoutTooSmall {
				t.Fatalf("unexpected error %v", err)
			}
			if completed {
				for _, s := range services[:3] {
					_ = s.task.holder.Close()
				}
				return
			}
			time.Sleep(5 * time.Millisecond)
		}
		t.Fatalf("failed to repair shards")
	}
	runHAKeeperClusterTest(t, fn)
}
   497  
   498  func TestGetCheckerStateFromLeader(t *testing.T) {
   499  	fn := func(t *testing.T, store *store) {
   500  		ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(time.Second*10))
   501  		defer cancel()
   502  
   503  		for {
   504  			select {
   505  			case <-ctx.Done():
   506  				t.Error("test deadline reached")
   507  				return
   508  
   509  			default:
   510  				isLeader, termA, err := store.isLeaderHAKeeper()
   511  				state, termB := store.getCheckerStateFromLeader()
   512  				require.NoError(t, err)
   513  				assert.Equal(t, termB, termA)
   514  
   515  				if !isLeader {
   516  					assert.Equal(t, (*pb.CheckerState)(nil), state)
   517  				} else {
   518  					assert.NotEqual(t, (*pb.CheckerState)(nil), state)
   519  					return
   520  				}
   521  				time.Sleep(time.Second)
   522  			}
   523  		}
   524  	}
   525  
   526  	runHAKeeperStoreTest(t, false, fn)
   527  }
   528  
   529  func TestGetCheckerState(t *testing.T) {
   530  	fn := func(t *testing.T, store *store) {
   531  		state, err := store.getCheckerState()
   532  		require.NoError(t, err)
   533  		assert.Equal(t, pb.HAKeeperCreated, state.State)
   534  	}
   535  	runHAKeeperStoreTest(t, false, fn)
   536  }
   537  
   538  func TestSetInitialClusterInfo(t *testing.T) {
   539  	fn := func(t *testing.T, store *store) {
   540  		state, err := store.getCheckerState()
   541  		require.NoError(t, err)
   542  		assert.Equal(t, pb.HAKeeperCreated, state.State)
   543  		nextIDByKey := map[string]uint64{"a": 1, "b": 2}
   544  		require.NoError(t, store.setInitialClusterInfo(1, 1, 1, hakeeper.K8SIDRangeEnd+10, nextIDByKey))
   545  		state, err = store.getCheckerState()
   546  		require.NoError(t, err)
   547  		assert.Equal(t, pb.HAKeeperBootstrapping, state.State)
   548  		assert.Equal(t, hakeeper.K8SIDRangeEnd+10, state.NextId)
   549  		assert.Equal(t, nextIDByKey, state.NextIDByKey)
   550  	}
   551  	runHAKeeperStoreTest(t, false, fn)
   552  }
   553  
// TestFailedBootstrap runs the bootstrap flow with fail=true, expecting
// the HAKeeper to end up in the HAKeeperBootstrapFailed state.
func TestFailedBootstrap(t *testing.T) {
	testBootstrap(t, true)
}

// TestBootstrap runs the bootstrap flow with fail=false, expecting the
// HAKeeper to reach the HAKeeperRunning state.
func TestBootstrap(t *testing.T) {
	testBootstrap(t, false)
}
   561  
   562  func testBootstrap(t *testing.T, fail bool) {
   563  	fn := func(t *testing.T, store *store) {
   564  		state, err := store.getCheckerState()
   565  		require.NoError(t, err)
   566  		assert.Equal(t, pb.HAKeeperCreated, state.State)
   567  		nextIDByKey := map[string]uint64{"a": 1, "b": 2}
   568  		require.NoError(t, store.setInitialClusterInfo(1, 1, 1, hakeeper.K8SIDRangeEnd+10, nextIDByKey))
   569  		state, err = store.getCheckerState()
   570  		require.NoError(t, err)
   571  		assert.Equal(t, pb.HAKeeperBootstrapping, state.State)
   572  		assert.Equal(t, hakeeper.K8SIDRangeEnd+10, state.NextId)
   573  		assert.Equal(t, nextIDByKey, state.NextIDByKey)
   574  		m := store.getHeartbeatMessage()
   575  		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
   576  		defer cancel()
   577  		_, err = store.addLogStoreHeartbeat(ctx, m)
   578  		assert.NoError(t, err)
   579  
   580  		tnMsg := pb.TNStoreHeartbeat{
   581  			UUID:   uuid.New().String(),
   582  			Shards: make([]pb.TNShardInfo, 0),
   583  		}
   584  		_, err = store.addTNStoreHeartbeat(ctx, tnMsg)
   585  		assert.NoError(t, err)
   586  
   587  		_, term, err := store.isLeaderHAKeeper()
   588  		require.NoError(t, err)
   589  
   590  		state, err = store.getCheckerState()
   591  		require.NoError(t, err)
   592  		store.bootstrap(term, state)
   593  
   594  		state, err = store.getCheckerState()
   595  		require.NoError(t, err)
   596  		assert.Equal(t, pb.HAKeeperBootstrapCommandsReceived, state.State)
   597  		assert.Equal(t, uint64(checkBootstrapCycles), store.bootstrapCheckCycles)
   598  		require.NotNil(t, store.bootstrapMgr)
   599  		assert.False(t, store.bootstrapMgr.CheckBootstrap(state.LogState))
   600  
   601  		if fail {
   602  			// keep checking, bootstrap will eventually be set as failed
   603  			for i := 0; i <= checkBootstrapCycles; i++ {
   604  				store.checkBootstrap(state)
   605  			}
   606  
   607  			state, err = store.getCheckerState()
   608  			require.NoError(t, err)
   609  			assert.Equal(t, pb.HAKeeperBootstrapFailed, state.State)
   610  		} else {
   611  			cb, err := store.getCommandBatch(ctx, tnMsg.UUID)
   612  			require.NoError(t, err)
   613  			require.Equal(t, 1, len(cb.Commands))
   614  			assert.True(t, cb.Commands[0].Bootstrapping)
   615  			assert.Equal(t, pb.TNService, cb.Commands[0].ServiceType)
   616  			assert.True(t, cb.Commands[0].ConfigChange.Replica.ReplicaID > 0)
   617  
   618  			cb, err = store.getCommandBatch(ctx, store.id())
   619  			require.NoError(t, err)
   620  			require.Equal(t, 1, len(cb.Commands))
   621  			assert.True(t, cb.Commands[0].Bootstrapping)
   622  			service := &Service{store: store}
   623  			service.handleStartReplica(cb.Commands[0])
   624  
   625  			for i := 0; i < 100; i++ {
   626  				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
   627  				defer cancel()
   628  				m := store.getHeartbeatMessage()
   629  				_, err = store.addLogStoreHeartbeat(ctx, m)
   630  				assert.NoError(t, err)
   631  
   632  				state, err = store.getCheckerState()
   633  				require.NoError(t, err)
   634  				store.checkBootstrap(state)
   635  
   636  				state, err = store.getCheckerState()
   637  				require.NoError(t, err)
   638  				if state.State != pb.HAKeeperRunning {
   639  					time.Sleep(50 * time.Millisecond)
   640  				} else {
   641  					return
   642  				}
   643  				if i == 2999 {
   644  					t.Fatalf("failed to complete bootstrap")
   645  				}
   646  			}
   647  		}
   648  	}
   649  	runHAKeeperStoreTest(t, false, fn)
   650  }
   651  
   652  func TestTaskSchedulerCanScheduleTasksToCNs(t *testing.T) {
   653  	fn := func(t *testing.T, store *store, taskService taskservice.TaskService) {
   654  		state, err := store.getCheckerState()
   655  		require.NoError(t, err)
   656  		assert.Equal(t, pb.HAKeeperCreated, state.State)
   657  		nextIDByKey := map[string]uint64{"a": 1, "b": 2}
   658  		require.NoError(t, store.setInitialClusterInfo(1, 1, 1, hakeeper.K8SIDRangeEnd+10, nextIDByKey))
   659  		state, err = store.getCheckerState()
   660  		require.NoError(t, err)
   661  		assert.Equal(t, pb.HAKeeperBootstrapping, state.State)
   662  		assert.Equal(t, hakeeper.K8SIDRangeEnd+10, state.NextId)
   663  		assert.Equal(t, nextIDByKey, state.NextIDByKey)
   664  		m := store.getHeartbeatMessage()
   665  		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
   666  		defer cancel()
   667  		_, err = store.addLogStoreHeartbeat(ctx, m)
   668  		assert.NoError(t, err)
   669  
   670  		_, term, err := store.isLeaderHAKeeper()
   671  		require.NoError(t, err)
   672  
   673  		state, err = store.getCheckerState()
   674  		require.NoError(t, err)
   675  		store.bootstrap(term, state)
   676  
   677  		state, err = store.getCheckerState()
   678  		require.NoError(t, err)
   679  		assert.Equal(t, pb.HAKeeperBootstrapCommandsReceived, state.State)
   680  		assert.Equal(t, uint64(checkBootstrapCycles), store.bootstrapCheckCycles)
   681  		require.NotNil(t, store.bootstrapMgr)
   682  		assert.False(t, store.bootstrapMgr.CheckBootstrap(state.LogState))
   683  
   684  		cb, err := store.getCommandBatch(ctx, store.id())
   685  		require.NoError(t, err)
   686  		require.Equal(t, 1, len(cb.Commands))
   687  		assert.True(t, cb.Commands[0].Bootstrapping)
   688  		service := &Service{store: store}
   689  		service.handleStartReplica(cb.Commands[0])
   690  
   691  		for i := 0; i < 100; i++ {
   692  			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
   693  			defer cancel()
   694  			m := store.getHeartbeatMessage()
   695  			_, err = store.addLogStoreHeartbeat(ctx, m)
   696  			assert.NoError(t, err)
   697  
   698  			state, err = store.getCheckerState()
   699  			require.NoError(t, err)
   700  			store.checkBootstrap(state)
   701  
   702  			state, err = store.getCheckerState()
   703  			require.NoError(t, err)
   704  			if state.State != pb.HAKeeperRunning {
   705  				time.Sleep(50 * time.Millisecond)
   706  			} else {
   707  				break
   708  			}
   709  			if i == 2999 {
   710  				t.Fatalf("failed to complete bootstrap")
   711  			}
   712  		}
   713  
   714  		cnUUID1 := uuid.New().String()
   715  		cnMsg1 := pb.CNStoreHeartbeat{UUID: cnUUID1}
   716  		_, err = store.addCNStoreHeartbeat(ctx, cnMsg1)
   717  		assert.NoError(t, err)
   718  		err = taskService.CreateAsyncTask(ctx, task.TaskMetadata{ID: "a"})
   719  		assert.NoError(t, err)
   720  		state, err = store.getCheckerState()
   721  		require.NoError(t, err)
   722  		tasks, err := taskService.QueryAsyncTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID1))
   723  		assert.NoError(t, err)
   724  		assert.Equal(t, 0, len(tasks))
   725  		store.taskSchedule(state)
   726  		// update state
   727  		state, err = store.getCheckerState()
   728  		require.NoError(t, err)
   729  		store.taskSchedule(state)
   730  		tasks, err = taskService.QueryAsyncTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID1))
   731  		assert.NoError(t, err)
   732  		assert.Equal(t, 1, len(tasks))
   733  
   734  		cnUUID2 := uuid.New().String()
   735  		cnMsg2 := pb.CNStoreHeartbeat{UUID: cnUUID2}
   736  		_, err = store.addCNStoreHeartbeat(ctx, cnMsg2)
   737  		assert.NoError(t, err)
   738  		err = taskService.CreateAsyncTask(ctx, task.TaskMetadata{ID: "b"})
   739  		assert.NoError(t, err)
   740  		state, err = store.getCheckerState()
   741  		require.NoError(t, err)
   742  		tasks, err = taskService.QueryAsyncTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID2))
   743  		assert.NoError(t, err)
   744  		assert.Equal(t, 0, len(tasks))
   745  		store.taskSchedule(state)
   746  		tasks, err = taskService.QueryAsyncTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID2))
   747  		assert.NoError(t, err)
   748  		assert.Equal(t, 1, len(tasks))
   749  	}
   750  	runHakeeperTaskServiceTest(t, fn)
   751  }
   752  
   753  func TestTaskSchedulerCanReScheduleExpiredTasks(t *testing.T) {
   754  	fn := func(t *testing.T, store *store, taskService taskservice.TaskService) {
   755  		state, err := store.getCheckerState()
   756  		require.NoError(t, err)
   757  		assert.Equal(t, pb.HAKeeperCreated, state.State)
   758  		nextIDByKey := map[string]uint64{"a": 1, "b": 2}
   759  		require.NoError(t, store.setInitialClusterInfo(1, 1, 1, hakeeper.K8SIDRangeEnd+10, nextIDByKey))
   760  		state, err = store.getCheckerState()
   761  		require.NoError(t, err)
   762  		assert.Equal(t, pb.HAKeeperBootstrapping, state.State)
   763  		assert.Equal(t, hakeeper.K8SIDRangeEnd+10, state.NextId)
   764  		assert.Equal(t, nextIDByKey, state.NextIDByKey)
   765  		m := store.getHeartbeatMessage()
   766  		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
   767  		defer cancel()
   768  		_, err = store.addLogStoreHeartbeat(ctx, m)
   769  		assert.NoError(t, err)
   770  
   771  		_, term, err := store.isLeaderHAKeeper()
   772  		require.NoError(t, err)
   773  
   774  		state, err = store.getCheckerState()
   775  		require.NoError(t, err)
   776  		store.bootstrap(term, state)
   777  
   778  		state, err = store.getCheckerState()
   779  		require.NoError(t, err)
   780  		assert.Equal(t, pb.HAKeeperBootstrapCommandsReceived, state.State)
   781  		assert.Equal(t, uint64(checkBootstrapCycles), store.bootstrapCheckCycles)
   782  		require.NotNil(t, store.bootstrapMgr)
   783  		assert.False(t, store.bootstrapMgr.CheckBootstrap(state.LogState))
   784  
   785  		cb, err := store.getCommandBatch(ctx, store.id())
   786  		require.NoError(t, err)
   787  		require.Equal(t, 1, len(cb.Commands))
   788  		assert.True(t, cb.Commands[0].Bootstrapping)
   789  		service := &Service{store: store}
   790  		service.handleStartReplica(cb.Commands[0])
   791  
   792  		for i := 0; i < 100; i++ {
   793  			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
   794  			defer cancel()
   795  			m := store.getHeartbeatMessage()
   796  			_, err = store.addLogStoreHeartbeat(ctx, m)
   797  			assert.NoError(t, err)
   798  
   799  			state, err = store.getCheckerState()
   800  			require.NoError(t, err)
   801  			store.checkBootstrap(state)
   802  
   803  			state, err = store.getCheckerState()
   804  			require.NoError(t, err)
   805  			if state.State != pb.HAKeeperRunning {
   806  				time.Sleep(50 * time.Millisecond)
   807  			} else {
   808  				break
   809  			}
   810  			if i == 2999 {
   811  				t.Fatalf("failed to complete bootstrap")
   812  			}
   813  		}
   814  
   815  		cnUUID1 := uuid.New().String()
   816  		cnMsg1 := pb.CNStoreHeartbeat{UUID: cnUUID1}
   817  		_, err = store.addCNStoreHeartbeat(ctx, cnMsg1)
   818  		assert.NoError(t, err)
   819  		err = taskService.CreateAsyncTask(ctx, task.TaskMetadata{ID: "a"})
   820  		assert.NoError(t, err)
   821  		state, err = store.getCheckerState()
   822  		require.NoError(t, err)
   823  		tasks, err := taskService.QueryAsyncTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID1))
   824  		assert.NoError(t, err)
   825  		assert.Equal(t, 0, len(tasks))
   826  		store.taskSchedule(state)
   827  		// update state
   828  		state, err = store.getCheckerState()
   829  		require.NoError(t, err)
   830  		store.taskSchedule(state)
   831  		tasks, err = taskService.QueryAsyncTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID1))
   832  		assert.NoError(t, err)
   833  		assert.Equal(t, 1, len(tasks))
   834  
   835  		cnUUID2 := uuid.New().String()
   836  		for i := 0; i < 1000; i++ {
   837  			testLogger.Debug(fmt.Sprintf("iteration %d", i))
   838  			tn := func() bool {
   839  				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
   840  				defer cancel()
   841  				cnMsg2 := pb.CNStoreHeartbeat{UUID: cnUUID2}
   842  				_, err = store.addCNStoreHeartbeat(ctx, cnMsg2)
   843  				assert.NoError(t, err)
   844  				state, err = store.getCheckerState()
   845  				require.NoError(t, err)
   846  				store.taskSchedule(state)
   847  				tasks, err = taskService.QueryAsyncTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID2))
   848  				assert.NoError(t, err)
   849  				if len(tasks) == 0 {
   850  					testLogger.Info("no task found")
   851  					time.Sleep(50 * time.Millisecond)
   852  				} else {
   853  					tasks, err = taskService.QueryAsyncTask(ctx, taskservice.WithTaskRunnerCond(taskservice.EQ, cnUUID1))
   854  					assert.Equal(t, 0, len(tasks))
   855  					return true
   856  				}
   857  				return false
   858  			}
   859  			completed := tn()
   860  			if completed {
   861  				store.taskScheduler.StopScheduleCronTask()
   862  				return
   863  			}
   864  			time.Sleep(100 * time.Millisecond)
   865  		}
   866  		t.Fatalf("failed to reschedule expired tasks")
   867  	}
   868  	runHakeeperTaskServiceTest(t, fn)
   869  }
   870  
   871  func TestGetTaskTableUserFromEnv(t *testing.T) {
   872  	t.Setenv(moAdminUser, "root")
   873  	user, ok := getTaskTableUserFromEnv()
   874  	require.False(t, ok)
   875  	require.Equal(t, pb.TaskTableUser{}, user)
   876  
   877  	t.Setenv(moAdminPassword, "")
   878  	user, ok = getTaskTableUserFromEnv()
   879  	require.False(t, ok)
   880  	require.Equal(t, pb.TaskTableUser{}, user)
   881  
   882  	t.Setenv(moAdminPassword, "root")
   883  	user, ok = getTaskTableUserFromEnv()
   884  	require.True(t, ok)
   885  	require.Equal(t, pb.TaskTableUser{Username: "root", Password: "root"}, user)
   886  }